In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from glob import glob
import html2text

## Loading Data

In [5]:
# load data
# glob() returns paths in arbitrary, OS-dependent order; sort them so that the
# order of `dataframes` (and of the rows produced by the helper functions
# below, which depends on it) is the same on every run and every machine.
# The sort is lexicographic, so it only guarantees determinism, not numeric order.
filenames = sorted(glob("data/interim/df_*.csv"))
dataframes = [pd.read_csv(f, encoding='latin-1') for f in filenames]

In [19]:
# Remove the 'Unnamed: 0' column from every loaded frame (presumably the index
# that was written out when the interim CSVs were saved -- TODO confirm).
# Iterating over the list itself avoids hard-coding the number of files, and
# errors='ignore' makes the cell safe to re-run after the column is already gone.
dataframes = [
    frame.drop(columns='Unnamed: 0', errors='ignore')
    for frame in dataframes
]

## Some useful functions

In [6]:
# takes a post id and returns a dataframe with that post
def getPost(id, frames=None):
    """Return every row whose ``id`` column equals ``id`` as a single DataFrame.

    Parameters
    ----------
    id
        The post id to look up. (The name shadows the builtin but is kept so
        existing keyword callers keep working.)
    frames : iterable of pandas.DataFrame, optional
        Frames to search. Defaults to the notebook-level ``dataframes`` list.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels. Rows found in
        later frames come first, which is the order the previous
        implementation produced. All of the standard columns listed in
        ``schema`` are present; if nothing matches, the result is an empty
        frame with exactly those columns.
    """
    schema = ['by', 'score', 'time', 'timestamp', 'title', 'type', 'url', 'text',
              'parent', 'deleted', 'dead', 'descendants', 'id', 'ranking']

    if frames is None:
        frames = dataframes

    # Collect the matches and concatenate once, instead of growing a frame
    # inside the loop. Empty selections are skipped so they cannot influence
    # the dtypes of the result.
    hits = []
    for frame in reversed(list(frames)):
        match = frame[frame["id"] == id]
        if not match.empty:
            hits.append(match)

    if not hits:
        return pd.DataFrame(columns=schema)

    post = pd.concat(hits)

    # Guarantee the standard columns even when the source frames lack some.
    missing = [column for column in schema if column not in post.columns]
    if missing:
        post = post.reindex(columns=list(post.columns) + missing)

    return post

In [7]:
# takes a list of post ids and returns the comments on those posts
def getComments(parents, family=None, frames=None):
    """Return all descendants (comments, replies to comments, ...) of ``parents``.

    The search proceeds one generation at a time: rows whose ``parent`` is in
    the current set of ids are collected, and their ``id`` values become the
    set of parents for the next generation.

    Parameters
    ----------
    parents : iterable of ids or None
        Ids whose descendants should be collected.
    family : pandas.DataFrame, optional
        Rows gathered so far; they are placed after the newly found rows.
        When ``parents`` is empty and ``family`` is given, ``family`` is
        returned unchanged.
    frames : iterable of pandas.DataFrame, optional
        Frames to search. Defaults to the notebook-level ``dataframes`` list.

    Returns
    -------
    pandas.DataFrame
        Deepest generation first and, within a generation, rows from later
        frames first (the order the previous recursive implementation
        produced), followed by ``family``. Always a DataFrame: when nothing is
        found and no ``family`` was given, an empty frame with the standard
        columns is returned instead of ``None``.
    """
    schema = ['by', 'score', 'time', 'timestamp', 'title', 'type', 'url', 'text',
              'parent', 'deleted', 'dead', 'descendants', 'id', 'ranking']

    if frames is None:
        frames = dataframes
    frames = list(frames)

    # list() also accepts arrays / Series, for which a bare truth test raises
    frontier = [] if parents is None else list(parents)

    # base case: nothing to expand
    if not frontier:
        return family if family is not None else pd.DataFrame(columns=schema)

    expanded = set()   # ids already used as parents; guards against cyclic data
    found = []         # non-empty selections, in the order they were discovered

    # walk the thread one generation at a time
    while frontier:
        expanded.update(frontier)
        nextgen = []

        for frame in frames:
            children = frame[frame["parent"].isin(frontier)]
            if children.empty:
                continue
            found.append(children)

            # NaN ids are dropped: isin() treats NaN as equal to NaN, so a NaN
            # id would match every row that has no parent at all
            for child_id in children["id"].dropna().tolist():
                if child_id not in expanded:
                    nextgen.append(child_id)

        frontier = nextgen

    # newest discoveries first, then whatever the caller passed in
    pieces = found[::-1]
    if family is not None:
        pieces.append(family)

    if not pieces:
        return pd.DataFrame(columns=schema)

    result = pd.concat(pieces)

    # Guarantee the standard columns even when the source frames lack some.
    missing = [column for column in schema if column not in result.columns]
    if missing:
        result = result.reindex(columns=list(result.columns) + missing)

    return result

In [8]:
def getThread(id):
    """Return the post with the given id followed by the comments found for it.

    The rows returned by ``getPost(id)`` come first, then the rows returned by
    ``getComments([id])``, concatenated into one DataFrame.
    """
    post = getPost(id)
    comments = getComments([id])
    return pd.concat([post, comments])

In [9]:
def makeDocument(id):
    """Turn the thread for ``id`` into a single text document.

    Every non-missing value of the thread's ``text`` column is joined with a
    single space and the result is passed through ``html2text.html2text``
    (the column presumably holds HTML fragments -- verify against the data).
    """
    texts = getThread(id)["text"].dropna()
    joined = " ".join(list(texts))
    return html2text.html2text(joined)

## Testing the waters with TF-IDF

In [22]:
# Unigram TF-IDF over English text with the built-in stop-word list.
# min_df=1 keeps every term that occurs in at least one document, which is what
# the former integer min_df=0 did as well; recent scikit-learn releases reject
# an integer min_df below 1 during parameter validation.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df=1, stop_words='english')

In [23]:
# define test documents
# three post ids per topic; each list is named after its topic
sexism = [
    14988086,
    13784509,
    14967505,
]
selfDriving = [
    15644680,
    13718586,
    14338328,
]
augmentedReality = [
    15124809,
    14490239,
    14713679,
]

In [24]:
# one document per test post, in topic order: sexism, self-driving, augmented reality
corpus = list(map(makeDocument, sexism + selfDriving + augmentedReality))

In [25]:
# find our most descriptive words!
top_n = 4  # number of highest-scoring terms printed per document

tfidf_matrix = tf.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name only on old releases
if hasattr(tf, "get_feature_names_out"):
    feature_names = list(tf.get_feature_names_out())
else:
    feature_names = tf.get_feature_names()

dense_matrix = tfidf_matrix.todense()
num_documents = len(corpus)
for i in range(num_documents):
    # row i of the dense matrix as a flat list of scores, one per vocabulary term
    document_words = dense_matrix[i].tolist()[0]

    # (feature index, score) pairs for the terms that occur in this document
    document_scores = [(index, score) for index, score in enumerate(document_words) if score > 0]

    # highest score first; sorted() is stable, so ties keep vocabulary order
    sorted_scores = sorted(document_scores, key=lambda pair: -pair[1])[:top_n]

    print("Top words in document {}".format(i + 1))
    for feature_index, score in sorted_scores:
        print("\tWord: {}, TF-IDF: {}".format(feature_names[feature_index], round(score, 5)))

Top words in document 1
	Word: women, TF-IDF: 0.54592
	Word: men, TF-IDF: 0.25394
	Word: people, TF-IDF: 0.19514
	Word: gender, TF-IDF: 0.15832
Top words in document 2
	Word: sexism, TF-IDF: 0.2909
	Word: manager, TF-IDF: 0.25308
	Word: tank, TF-IDF: 0.23651
	Word: dress, TF-IDF: 0.23221
Top words in document 3
	Word: women, TF-IDF: 0.3793
	Word: people, TF-IDF: 0.24035
	Word: salary, TF-IDF: 0.20347
	Word: memo, TF-IDF: 0.17917
Top words in document 4
	Word: car, TF-IDF: 0.52935
	Word: driving, TF-IDF: 0.29048
	Word: cars, TF-IDF: 0.24951
	Word: waymo, TF-IDF: 0.20291
Top words in document 5
	Word: google, TF-IDF: 0.34588
	Word: uber, TF-IDF: 0.27355
	Word: laptop, TF-IDF: 0.20365
	Word: waymo, TF-IDF: 0.19234
Top words in document 6
	Word: lyft, TF-IDF: 0.51379
	Word: uber, TF-IDF: 0.47412
	Word: driving, TF-IDF: 0.18652
	Word: drivers, TF-IDF: 0.17458
Top words in document 7
	Word: apple, TF-IDF: 0.33966
	Word: ar, TF-IDF: 0.33961
	Word: android, TF-IDF: 0.33788
	Word: ios, TF-IDF: 