# Term Frequency (Negative Comments)

Description: Term frequency (TF) is used in connection with information retrieval and shows how frequently an expression (term, word) occurs in a document.

Term frequency indicates the significance of a particular term within the overall document. 

## Import Libraries

### Main Libraries

In [1]:
import pandas as pd
import re
import warnings
warnings.filterwarnings("ignore")

### NLP Libraries

In [2]:
# NLTK
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist

# gensim
from gensim.parsing.porter import PorterStemmer

## Load Dataset

In [3]:
# Load the tab-separated comments file into a DataFrame
df = pd.read_csv('commentsSVM.tsv', delimiter='\t')
# Preview the first rows
df.head()

Unnamed: 0,comment_message,sentiments
0,[],Neutral
1,"['worst', 'card', 'ever', 'lawsuit', 'time']",Negative
2,"['think', 'need', 'hear', 'year', 'think', 'bi...",Neutral
3,"['long', 'take', 'get', 'refund', 'monei', 'pa...",Negative
4,"['realli', 'recip', 'heaven']",Neutral


## Data Pre-processing

In [4]:
# Keep only the rows whose sentiment label is exactly 'Negative'
filter_comments = df.loc[df['sentiments'] == 'Negative']

In [5]:
# Preview the first rows of the negative-only subset
filter_comments.head()

Unnamed: 0,comment_message,sentiments
1,"['worst', 'card', 'ever', 'lawsuit', 'time']",Negative
3,"['long', 'take', 'get', 'refund', 'monei', 'pa...",Negative
5,"['hei', 'son', 'new', 'custom', 'card', 'direc...",Negative
9,"['transfer', 'monei', 'greendot', 'refund', 'd...",Negative
10,"['stolen', 'inform']",Negative


In [6]:
# Discard the label column; only the comment text is used from here on
drop_columns = ['sentiments']
df = filter_comments.drop(columns=drop_columns)

In [7]:
# Preview the first rows after dropping the 'sentiments' column
df.head()

Unnamed: 0,comment_message
1,"['worst', 'card', 'ever', 'lawsuit', 'time']"
3,"['long', 'take', 'get', 'refund', 'monei', 'pa..."
5,"['hei', 'son', 'new', 'custom', 'card', 'direc..."
9,"['transfer', 'monei', 'greendot', 'refund', 'd..."
10,"['stolen', 'inform']"


In [8]:
# Split every comment into tokens with NLTK's word_tokenize.
# NOTE(review): the preview output suggests each cell is the string form of a
# list (e.g. "['worst', 'card', ...]"), in which case brackets, quotes and
# commas also come out as tokens here - confirm against the TSV.
df['comment_message'] = df['comment_message'].apply(word_tokenize)

In [9]:
def remove_nonalpha(text):
    """Return *text* with every character outside A-Z / a-z deleted."""
    non_letters = re.compile(r'[^a-zA-Z]')
    return non_letters.sub('', text)

In [10]:
# Strip non-letter characters from every token of every comment
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: list(map(remove_nonalpha, tokens))
)

In [11]:
# Discard tokens that are empty strings
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: [token for token in tokens if token]
)

In [12]:
# Normalise every token to lowercase
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: list(map(str.lower, tokens))
)

In [13]:
# Remove stopwords: NLTK's English list plus a few extra words.
# The list is loaded once and kept as a set so the per-token membership
# test in the filter below is a constant-time lookup.
stop_words = set(stopwords.words('english'))
stop_words.update(["green", "dot", "go", "would", "get", "use", "us"])
df['comment_message'] = df['comment_message'].apply(
    lambda list_words: [word for word in list_words if word not in stop_words]
)

In [14]:
# Reduce each token to its WordNet lemma (lemmatize's default part of speech)
lem = WordNetLemmatizer()
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: list(map(lem.lemmatize, tokens))
)

In [15]:
# Stem each token with gensim's PorterStemmer
p = PorterStemmer()
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: list(map(p.stem, tokens))
)

In [16]:
# Flatten the per-comment token lists into one list holding every token
all_words = []
for words in df['comment_message']:
    all_words.extend(words)

## Word-Frequency Pair

In [17]:
# Count how often each token occurs, then show the ten most common tokens
fdist = FreqDist(all_words)
fdist.most_common(10)

[('card', 1299),
 ('monei', 1276),
 ('nt', 911),
 ('peopl', 597),
 ('call', 507),
 ('compani', 462),
 ('need', 424),
 ('back', 397),
 ('account', 395),
 ('never', 365)]

In [18]:
# Build a two-column table with one row per distinct token and its count.
# Rows are taken in the order fdist.items() yields them; no sorting is applied.
most_frequently_used_negative = pd.DataFrame(
    data=list(fdist.items()),
    columns=["Word", "Frequency"],
)

## Save as most_frequently_used_negative.tsv

In [19]:
# Write the word-frequency table to a tab-separated file.
# NOTE: `index` is not disabled, so pandas also writes the row index as the
# first (unnamed) column.
most_frequently_used_negative.to_csv('most_frequently_used_negative.tsv', sep='\t')