# Term Frequency (Positive Comments)

Description: Term frequency (TF) is used in connection with information retrieval and shows how frequently an expression (term, word) occurs in a document.

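Term frequency indicates the significance of a particular term within the overall document. In this notebook, the "document" is the set of positive comments (sentiment score above 0.5), and the frequency of a term is the raw count of its stemmed form across all of those comments.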

## Import Libraries

### Main Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

### NLP Libraries

In [2]:
# NLTK
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist

# gensim
from gensim.parsing.porter import PorterStemmer

## Load Dataset

In [3]:
# Read the tab-separated comments file into a DataFrame
df = pd.read_csv('commentsVaderSentiments.tsv', delimiter='\t')
# Preview the first rows to check the columns that were loaded
df.head()

Unnamed: 0,comment_message,sentiment_score
0,Why?,0.0
1,The worst card ever it’s lawsuit time,-0.7184
2,I think some needs to hear this. Over the year...,0.355367
3,"How long does it take to get refunded money, P...",0.0
4,REALLY? Recipes From Heaven,0.2553


## Data Pre-processing

In [4]:
# Keep only the comments treated as positive: sentiment score strictly above 0.5
positive_mask = df['sentiment_score'].gt(0.5)
filter_comments = df.loc[positive_mask]

In [5]:
# Preview the first rows of the comments that passed the positive-score filter
filter_comments.head()

Unnamed: 0,comment_message,sentiment_score
22,I love green dot never ever had a problem,0.7549
36,God bless and have a blessed 2019 I know I wil...,0.9509
50,Happy kwanzaa,0.5719
51,Merry Christmas 🎄 to Green Dot!,0.5848
54,God please bless us Amen 🙏🏿,0.7351


In [6]:
# The score was only needed for filtering; keep the remaining columns.
# NOTE: this rebinds `df` to a frame without 'sentiment_score', so the cell that
# selects positive comments cannot be re-run afterwards without reloading the data.
drop_columns = ['sentiment_score']
df = filter_comments.drop(columns=drop_columns)

In [7]:
# Inspect the dataset after dropping the sentiment score column
df.head()

Unnamed: 0,comment_message
22,I love green dot never ever had a problem
36,God bless and have a blessed 2019 I know I wil...
50,Happy kwanzaa
51,Merry Christmas 🎄 to Green Dot!
54,God please bless us Amen 🙏🏿


In [8]:
# Split every comment string into a list of tokens with NLTK's word tokenizer
df['comment_message'] = df['comment_message'].apply(word_tokenize)

In [9]:
# Matches any single character outside the ASCII letters a-z / A-Z
_NON_ALPHA = re.compile(r'[^a-zA-Z]')


def remove_nonalpha(text):
    '''
    Strip every character that is not an ASCII letter.

    Digits, punctuation, whitespace, emoji and accented letters are all
    deleted; the remaining letters keep their original order and case.

    Parameters:
        text (str): the string to clean.

    Returns:
        str: `text` with only the characters a-z and A-Z left
        (possibly the empty string).
    '''
    return _NON_ALPHA.sub('', text)

In [10]:
# Strip non-letter characters from every token of every comment
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: list(map(remove_nonalpha, tokens))
)

In [11]:
# Discard tokens that are empty (e.g. tokens that consisted only of non-letters)
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: [token for token in tokens if token]
)

In [12]:
# Lowercase every token so that counting is case-insensitive
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: list(map(str.lower, tokens))
)

In [13]:
# Remove stopwords.
# Start from NLTK's English stopword list, held in a set for fast membership
# tests, and extend it with extra words that should not be counted
# (presumably the brand name "green dot" plus a few generic verbs — confirm).
stop_words = set(stopwords.words('english'))
stop_words.update(["green", "dot", "go", "would", "get", "use"])
# Tokens are already lowercase at this point, matching the lowercase stopword entries.
df['comment_message'] = df['comment_message'].apply(
    lambda list_words: [word for word in list_words if word not in stop_words]
)

In [14]:
# Reduce every token to its Porter stem so that inflected forms of the same
# word are counted together (nothing is sorted here, only stemmed)
pem = PorterStemmer()
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: list(map(pem.stem, tokens))
)

In [15]:
# Flatten the per-comment token lists into one list holding every token,
# in row order, so the tokens can be counted across all comments
all_words = [word for words in df['comment_message'] for word in words]

## Word-Frequency Pair

In [16]:
# Count the occurrences of every word in all_words, then show the ten most frequent words
fdist = FreqDist(all_words)
fdist.most_common(10)

[('sweepstak', 512),
 ('love', 506),
 ('help', 322),
 ('monei', 317),
 ('win', 308),
 ('card', 249),
 ('save', 226),
 ('like', 222),
 ('thank', 213),
 ('make', 203)]

In [17]:
# Build a two-column table with one row per distinct word and its count.
# Rows follow the order in which fdist yields its entries; they are not
# sorted by frequency.
most_frequently_used_positive = pd.DataFrame({
    "Word": list(fdist.keys()),
    "Frequency": list(fdist.values()),
})

## Save as most_frequently_used_positive.tsv

In [18]:
# Write the word-frequency table to a tab-separated file.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default; pass index=False if that column is not wanted — confirm with whatever
# reads this file downstream.
most_frequently_used_positive.to_csv('most_frequently_used_positive.tsv', sep='\t')