# Term Frequency (Positive Comments)

Description: Term frequency (TF) is used in connection with information retrieval and shows how frequently an expression (term, word) occurs in a document.

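Term frequency indicates the significance of a particular term within the overall document. In this notebook the "document" is the full set of positive comments, and the term frequency is the raw count of each stemmed word across those comments.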

## Import Libraries

### Main Libraries

In [1]:
import pandas as pd
import re
import warnings
warnings.filterwarnings("ignore")

### NLP Libraries

In [2]:
# NLTK
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# gensim
from gensim.parsing.porter import PorterStemmer

## Load Dataset

In [3]:
# Read the tab-separated comments file into a DataFrame
df = pd.read_csv('commentsSVM.tsv', delimiter='\t')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,comment_message,sentiments
0,[],Neutral
1,"['worst', 'card', 'ever', 'lawsuit', 'time']",Negative
2,"['think', 'need', 'hear', 'year', 'think', 'bi...",Neutral
3,"['long', 'take', 'get', 'refund', 'monei', 'pa...",Negative
4,"['realli', 'recip', 'heaven']",Neutral


## Data Pre-processing

In [4]:
# Keep only the rows whose sentiment label is exactly 'Positive'
filter_comments = df.loc[df['sentiments'].eq('Positive')]

In [5]:
# Preview the first five positive comments
filter_comments.head(n=5)

Unnamed: 0,comment_message,sentiments
22,"['love', 'green', 'dot', 'never', 'ever', 'pro...",Positive
36,"['god', 'bless', 'bless', 'know', 'famili', 'h...",Positive
40,"['moo', 'moo', 'ye']",Positive
50,"['happi', 'kwanzaa']",Positive
51,"['merri', 'christma', 'green', 'dot']",Positive


In [6]:
# The sentiment label is constant after filtering, so remove that column
drop_columns = ['sentiments']
df = filter_comments.drop(columns=drop_columns)

In [7]:
# Preview the first five rows of the reduced DataFrame
df.head(n=5)

Unnamed: 0,comment_message
22,"['love', 'green', 'dot', 'never', 'ever', 'pro..."
36,"['god', 'bless', 'bless', 'know', 'famili', 'h..."
40,"['moo', 'moo', 'ye']"
50,"['happi', 'kwanzaa']"
51,"['merri', 'christma', 'green', 'dot']"


In [8]:
# Split each comment into tokens with NLTK's word tokenizer.
# NOTE(review): each cell appears to hold the text form of a list
# (e.g. "['love', 'green']"), so brackets, quotes and commas also become
# tokens or token fragments at this point - confirm against the input file.
df['comment_message'] = df['comment_message'].apply(word_tokenize)

In [9]:
def remove_nonalpha(text):
    """
    Return ``text`` with every character other than a-z / A-Z deleted.

    Digits, punctuation, whitespace and non-ASCII letters are all dropped;
    a string containing no ASCII letters becomes the empty string.
    """
    non_letters = re.compile('[^a-zA-Z]')
    return non_letters.sub('', text)

In [10]:
# Strip non-letter characters from every token of every comment
df['comment_message'] = df['comment_message'].apply(lambda tokens: list(map(remove_nonalpha, tokens)))

In [11]:
# Discard tokens that are empty (falsy) strings
df['comment_message'] = df['comment_message'].apply(lambda tokens: [token for token in tokens if token])

In [12]:
# Normalise every token to lowercase
df['comment_message'] = df['comment_message'].apply(lambda tokens: list(map(str.lower, tokens)))

In [17]:
# Remove stopwords
# Start from NLTK's English stopword list, held in a set so that the
# membership test in the filter below is a constant-time lookup.
stop_words = set(stopwords.words('english'))
# Add extra words that should also be excluded from the frequency counts.
stop_words.update(["green", "dot", "go", "would", "get", "use", "mygreendotadv"])
# Keep only the tokens that are not in the stopword set.
df['comment_message'] = df['comment_message'].apply(lambda list_words: [word for word in list_words if word not in stop_words])

In [18]:
# Reduce each token to its Porter stem so that inflected forms of the same
# word are counted together (this stems the tokens; it does not reorder them)
pem = PorterStemmer()
df['comment_message'] = df['comment_message'].apply(lambda tokens: list(map(pem.stem, tokens)))

In [19]:
# Flatten the per-comment token lists into one flat list of all tokens
all_words = [token for tokens in df['comment_message'] for token in tokens]

## Word-Frequency Pair

In [20]:
# Count how often each stemmed token occurs across all positive comments
fdist = FreqDist(samples=all_words)
# Show the ten most frequent tokens with their counts
fdist.most_common(n=10)

[('sweepstak', 1023),
 ('love', 604),
 ('save', 424),
 ('win', 410),
 ('thank', 366),
 ('monei', 365),
 ('pai', 361),
 ('good', 298),
 ('alwai', 257),
 ('make', 232)]

In [21]:
# Turn the frequency distribution into a two-column table: one row per
# distinct word, with its occurrence count
word_frequency_pairs = [(word, count) for word, count in fdist.items()]
most_frequently_used_positive = pd.DataFrame(word_frequency_pairs, columns=["Word", "Frequency"])

## Save as most_frequently_used_positive.tsv

In [22]:
# Write the word/frequency table as a tab-separated file.
# NOTE: the row index is written too (pandas' default), so the file gets an
# unnamed leading column.
most_frequently_used_positive.to_csv(path_or_buf='most_frequently_used_positive.tsv', sep='\t')