In [82]:
'''
TF-IDF - Term Frequency - Inverse Data Frequency
    Term Frequency: frequency of the word in a document
    Inverse Data Frequency: used to calculatee the weight of rare words in the document
    i.e. rare words have a high IDF score
    
This notebook calculates the TF-IDF score for each word in a given message.
'''

import pandas as pd
import numpy as np
import string
import os

project_root = os.path.dirname(os.getcwd())
fall_18_data_path = project_root + '/Data/Fall_2018/'
fall_19_data_path = project_root + '/Data/Fall_2019/'
tfidf_path = project_root + '/tfidf/'

In [83]:
df_fall_18_messages = pd.read_excel(fall_18_data_path + 'FA18_messages_with_race_gender.xlsx', index_col=0)

In [84]:
df_fall_18_messages.head()

Unnamed: 0,User ID,Race,Gender,Text
0,42683026,WHITE,F,Hey @Katie Poteet I know you said we should em...
1,25501571,WHITE,F,@Mary Cassell I would email Dr. K anyway with ...
2,42683026,WHITE,F,"Ok, thanks a lot. I have the email typed but w..."
3,25501571,WHITE,F,Upstairs from our lecture hall
4,25501571,WHITE,F,Sherman 207!!!


In [85]:
# 2D array holding an array of words for each message
all_messages = []
# A set of all words used in all messages (no duplicates)
word_set = set()

for index, row in df_fall_18_messages.iterrows():
    message = row['Text'].split(" ")
    for i in range(len(message)):
        message[i] = message[i].translate(str.maketrans('', '', string.punctuation)).lower()
    word_set = set(word_set).union(set(message))
    all_messages.append(message)

In [86]:
# Array holding a dictionary of words for each message
# Dictionary consists of a word and its counts in the message (init to 0)
word_dicts = []
for message in all_messages:
    word_dicts.append(dict.fromkeys(word_set,0))

In [87]:
# Word dictionary for message 1 (counts initialized to 0)
word_dicts[0]

{'': 0,
 'catmeif': 0,
 'poorerless': 0,
 '😂😂😂': 0,
 'severe': 0,
 'stacey’s': 0,
 '125': 0,
 'lollll': 0,
 'order': 0,
 'there\n': 0,
 'basis': 0,
 'stealing': 0,
 'mrs': 0,
 'stacys': 0,
 'max': 0,
 'spends': 0,
 'amidst': 0,
 'statementsean': 0,
 'flat': 0,
 'reserve': 0,
 'induction': 0,
 'uniqueness': 0,
 'minimalist': 0,
 'presemester': 0,
 'alliance': 0,
 'hard': 0,
 'midterm': 0,
 'portfolio': 0,
 'moving': 0,
 'from': 0,
 'food\nwhat': 0,
 '695': 0,
 'garden': 0,
 'wrong': 0,
 'light': 0,
 'kettle': 0,
 'sec': 0,
 'ballot': 0,
 'plz': 0,
 'around': 0,
 'firearm': 0,
 'itchange': 0,
 'linked': 0,
 'yep': 0,
 'successful”\n\nclubs': 0,
 'blown': 0,
 '\njulie': 0,
 '1215': 0,
 'coordinated': 0,
 'content': 0,
 'anna': 0,
 'preference': 0,
 'divide': 0,
 'grey': 0,
 'initiatives': 0,
 'today': 0,
 'stood': 0,
 'tomrooe': 0,
 'prospective': 0,
 'diabetes': 0,
 'wheneverwherever': 0,
 'taught': 0,
 'oh': 0,
 'gotcha': 0,
 'violence': 0,
 'naught': 0,
 'start': 0,
 'barring': 0,
 'de

In [88]:
# Add counts of each word in a message to it's dictionary
for i in range(len(all_messages)):
    for word in all_messages[i]:
        word_dicts[i][word] += 1        

In [89]:
# Word dictionary for message 1 (some counts are now non-zero)
print(word_dicts[0])



In [47]:
'''
df_word_counts = pd.DataFrame(word_dicts)
counter = 0
for dictionary in word_dicts:
    print(all_messages[counter])
    for i in dictionary:
        if (dictionary[i] >= 1):
            print(dictionary[i])
    counter += 1
'''

'\ndf_word_counts = pd.DataFrame(word_dicts)\ncounter = 0\nfor dictionary in word_dicts:\n    print(all_messages[counter])\n    for i in dictionary:\n        if (dictionary[i] >= 1):\n            print(dictionary[i])\n    counter += 1\n'

In [73]:
#df_word_counts.to_excel(tfidf_path+"FA18_word_count_vectors.xlsx")

In [90]:
# Compute term frequency
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

In [91]:
word_dict_tf = []

In [92]:
for index, row in df_fall_18_messages.iterrows():
    word_dict_tf.append(computeTF(word_dicts[index], all_messages[index]))

In [93]:
word_dict_tf[0]

{'': 0.0,
 'catmeif': 0.0,
 'poorerless': 0.0,
 '😂😂😂': 0.0,
 'severe': 0.0,
 'stacey’s': 0.0,
 '125': 0.0,
 'lollll': 0.0,
 'order': 0.0,
 'there\n': 0.0,
 'basis': 0.0,
 'stealing': 0.0,
 'mrs': 0.0,
 'stacys': 0.0,
 'max': 0.0,
 'spends': 0.0,
 'amidst': 0.0,
 'statementsean': 0.0,
 'flat': 0.0,
 'reserve': 0.0,
 'induction': 0.0,
 'uniqueness': 0.0,
 'minimalist': 0.0,
 'presemester': 0.0,
 'alliance': 0.0,
 'hard': 0.0,
 'midterm': 0.0,
 'portfolio': 0.0,
 'moving': 0.0,
 'from': 0.0,
 'food\nwhat': 0.0,
 '695': 0.0,
 'garden': 0.0,
 'wrong': 0.0,
 'light': 0.0,
 'kettle': 0.0,
 'sec': 0.0,
 'ballot': 0.0,
 'plz': 0.0,
 'around': 0.0,
 'firearm': 0.0,
 'itchange': 0.0,
 'linked': 0.0,
 'yep': 0.0,
 'successful”\n\nclubs': 0.0,
 'blown': 0.0,
 '\njulie': 0.0,
 '1215': 0.0,
 'coordinated': 0.0,
 'content': 0.0,
 'anna': 0.0,
 'preference': 0.0,
 'divide': 0.0,
 'grey': 0.0,
 'initiatives': 0.0,
 'today': 0.0,
 'stood': 0.0,
 'tomrooe': 0.0,
 'prospective': 0.0,
 'diabetes': 0.0,
 'wh

In [94]:
df_word_frequencies = pd.DataFrame(word_dict_tf)
df_word_frequencies.to_excel(tfidf_path+"FA18_word_frequency_vectors.xlsx")

In [95]:
# Compute inverse data frequency
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

In [96]:
idfs = computeIDF(word_dicts)

In [98]:
# Computes tf x idf
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

In [99]:
word_dict_tfidfs = []
for index, row in df_fall_18_messages.iterrows():
    word_dict_tfidfs.append(computeTFIDF(word_dict_tf[index], idfs))

In [100]:
df_word_tfidfs = pd.DataFrame(word_dict_tfidfs)

In [101]:
df_word_tfidfs.to_excel(tfidf_path+"FA18_tfidf_vectors.xlsx")