# Term Frequency (All Comments)

Description: Term frequency (TF) is used in connection with information retrieval and shows how frequently an expression (term, word) occurs in a document.

Term frequency indicates how significant a particular term is within the overall document. In this notebook, the tokens of all comments are pooled and counted together as a single document.

## Import Libraries

### Main Libraries

In [1]:
import pandas as pd
import re
import warnings
warnings.filterwarnings("ignore")

### NLP Libraries

In [2]:
# NLTK
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
# Gensim
from gensim.parsing.porter import PorterStemmer

## Load Dataset

In [3]:
# Read the comments CSV into a DataFrame, then preview its first rows
df = pd.read_csv(filepath_or_buffer='comments_nonull.csv')
df.head(n=5)

Unnamed: 0,is_reply,comment_message,comment_published,comment_like_count,attachment_type
0,1,Why?,2019-06-18T00:08:26+0000,0,
1,0,The worst card ever it’s lawsuit time,2019-06-04T19:53:28+0000,6,
2,0,I think some needs to hear this. Over the year...,2019-05-14T03:28:34+0000,0,
3,0,"How long does it take to get refunded money, P...",2019-04-20T20:44:07+0000,1,
4,0,REALLY? Recipes From Heaven,2019-03-15T22:37:37+0000,0,


## Data Pre-processing

In [4]:
# Discard the metadata columns that the term-frequency analysis does not use
drop_columns = ['is_reply', 'comment_published', 'comment_like_count', 'attachment_type']
df = df.drop(columns=drop_columns)

In [5]:
# Preview the frame after the column drop
df.head(n=5)

Unnamed: 0,comment_message
0,Why?
1,The worst card ever it’s lawsuit time
2,I think some needs to hear this. Over the year...
3,"How long does it take to get refunded money, P..."
4,REALLY? Recipes From Heaven


In [6]:
# Split each comment string into a list of tokens with NLTK's word_tokenize
df['comment_message'] = df['comment_message'].apply(word_tokenize)

In [7]:
def remove_nonalpha(text):
    '''
    Return ``text`` with every character other than a-z / A-Z deleted.

    Digits, punctuation, whitespace and non-ASCII letters are all removed,
    so a string containing no ASCII letters comes back as ''.
    '''
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub('', text)

In [8]:
# Strip non-letter characters from every token; tokens without letters become ''
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: list(map(remove_nonalpha, tokens))
)

In [9]:
# Drop the empty-string tokens from every token list
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: [token for token in tokens if token]
)

In [10]:
# Lowercase every token so counting is case-insensitive
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: list(map(str.lower, tokens))
)

In [11]:
# Remove stopwords: NLTK's English list plus a hand-picked set of extra terms.
# A set is used so each membership test below is a constant-time lookup.
stop_words = set(stopwords.words('english'))
stop_words.update(["green", "dot", "go", "would", "get", "use", "u"])
# NOTE(review): the top-10 output further down still contains 'nt', presumably the
# "n't" contraction suffix left after apostrophes were stripped. Consider adding
# it to the extra terms above if it should not be counted.
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [12]:
# Reduce each token to its WordNet lemma (lemmatize is called with the word only)
lem = WordNetLemmatizer()
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: list(map(lem.lemmatize, tokens))
)

In [13]:
# Stem every token with gensim's PorterStemmer
p = PorterStemmer()
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: list(map(p.stem, tokens))
)

In [14]:
# Flatten the per-comment token lists into one list holding every token
all_words = [token for tokens in df['comment_message'] for token in tokens]

## Word-Frequency Pair

In [15]:
# Count how often each token occurs, then show the ten most frequent ones
fdist = FreqDist(all_words)
fdist.most_common(n=10)

[('sweepstak', 4010),
 ('card', 2862),
 ('monei', 2716),
 ('nt', 1817),
 ('pai', 1421),
 ('need', 1259),
 ('make', 1198),
 ('help', 1149),
 ('save', 1085),
 ('year', 1036)]

In [16]:
# Build the word-frequency table, ordered from most to least frequent so the
# rows match the table's name (fdist.items() alone keeps insertion order).
# The rows and columns are the same as before; only the row order changes.
most_frequently_used = pd.DataFrame(fdist.most_common(), columns=["Word", "Frequency"])

## Save as most_frequently_used.tsv

In [17]:
# Write the table as tab-separated values (the DataFrame index is written as well)
most_frequently_used.to_csv(path_or_buf='most_frequently_used.tsv', sep='\t')