In [None]:
import pandas as pd
import numpy as np
from time import time
import textacy
import spacy
# Load spaCy's English model. The return value is discarded (not bound to a name).
# NOTE(review): presumably this only warms up / verifies the 'en' model before
# textacy uses it below — confirm the call is still needed.
spacy.load('en')

In [None]:
# Read the essay table from CSV; the path is relative, so it resolves against
# the notebook's working directory.
essay = pd.read_csv('../Assets/A/one_long_essay.csv')

In [None]:
# Remove the 'Unnamed: 0' column.
# NOTE(review): presumably this is the index column written by an earlier
# to_csv call — confirm.
# errors='ignore' keeps the cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising.
essay = essay.drop('Unnamed: 0', axis=1, errors='ignore')

In [87]:
# Display the frame's (n_rows, n_columns) as a quick sanity check.
essay.shape

(52041, 32)

In [None]:
# Keep only the rows whose `essays` value is present (non-null)
essay = essay.loc[essay['essays'].notnull()]

### Consolidate all essays by men into one long essay, and all essays by women into another

In [None]:
# Module-level accumulators: one concatenated text per sex, both starting empty.
essay_m, essay_f = '', ''

def add_essay_m(text):
    """Prepend `text` to the global `essay_m` string.

    Each call places the new text in front of whatever has been accumulated
    so far, so the most recently added text ends up first. Returns None.
    """
    global essay_m
    combined = text + essay_m
    essay_m = combined

In [None]:
m = essay[essay.sex == 'm']

# Build the men's combined text in a single pass instead of prepending one row
# at a time through Series.apply:
#   * str.join is linear, whereas prepending re-copies the whole accumulated
#     string for every row;
#   * assigning (rather than accumulating) makes the cell idempotent, so
#     re-running it no longer duplicates every essay;
#   * no Series of None values is echoed as cell output.
# Reversing the list reproduces the order that prepending gave (last row first).
# NOTE(review): essays are joined with no separator, so the last word of one
# essay runs straight into the first word of the next — confirm this is intended.
essay_m = ''.join(m.essays.tolist()[::-1])

In [None]:
# Decode the UTF-8 byte string into text.
# `bytes` is an alias of `str` on Python 2, so this matches unicode(essay_m, 'utf-8')
# there. The isinstance guard also makes the cell safe to re-run (or to run on
# Python 3, where the text is already decoded) instead of raising.
if isinstance(essay_m, bytes):
    essay_m = essay_m.decode('utf-8')

In [None]:
t0 = time()

f = essay[essay.sex == 'f']

def add_essay_f(text):
    """Prepend `text` to the global `essay_f` string. Returns None.

    Retained so that any existing caller keeps working; this cell itself
    builds `essay_f` with a single join below.
    """
    global essay_f
    essay_f = text + essay_f

# Build the women's combined text in a single pass instead of prepending one
# row at a time through Series.apply:
#   * str.join is linear, whereas prepending re-copies the whole accumulated
#     string for every row;
#   * assigning (rather than accumulating) makes the cell idempotent, so
#     re-running it no longer duplicates every essay.
# Reversing the list reproduces the order that prepending gave (last row first).
# NOTE(review): essays are joined with no separator, so the last word of one
# essay runs straight into the first word of the next — confirm this is intended.
essay_f = ''.join(f.essays.tolist()[::-1])
print (time() - t0)

In [None]:
# Decode the UTF-8 byte string into text.
# `bytes` is an alias of `str` on Python 2, so this matches unicode(essay_f, 'utf-8')
# there. The isinstance guard also makes the cell safe to re-run (or to run on
# Python 3, where the text is already decoded) instead of raising.
if isinstance(essay_f, bytes):
    essay_f = essay_f.decode('utf-8')

In [None]:
# Build corpus
# Two documents are passed, in this order: the men's combined text, then the
# women's. The DataFrame built later labels the matrix rows ['men', 'women'],
# which assumes the rows follow this order.
# The elapsed wall-clock time (seconds) is printed for reference.
t0 = time()
corpus = textacy.TextCorpus.from_texts('en', [essay_m, essay_f], n_threads=1)
print (time() - t0)

In [None]:
# Build the tf-idf weighted document-term matrix from each document's terms
# (single words and named entities, no n-grams).
# NOTE(review): max_n_terms=3000 presumably caps the vocabulary size; the
# DataFrame cell below depends on that width — confirm against the textacy docs.
t0 = time()
doc_term_matrix = corpus.as_doc_term_matrix(
    (doc.as_terms_list(words=True, ngrams=False, named_entities=True) for doc in corpus),
    weighting='tfidf', normalize=True, smooth_idf=True, max_n_terms=3000)
# Parenthesised to match the other timing cells; the bare `print expr` form is
# a syntax error on Python 3.
print (time() - t0)

In [88]:
# doc_term_matrix[1] is a dictionary so I cannot expect .values() to return terms in order
# Save list of features in column order by looking each column id up in that dictionary.
# The column count is read from the matrix itself instead of hard-coding 3000,
# so this still works when the vocabulary has fewer terms than the cap
# (a fixed range(3000) would raise KeyError in that case).
features = [doc_term_matrix[1][i] for i in range(doc_term_matrix[0].shape[1])]

# Save dtm as dataframe with labels as columns
dtm = pd.DataFrame(doc_term_matrix[0].toarray(), columns=features, index=['men', 'women'])
# Transposed view: one row per term, one column per sex
dtmt = dtm.T

In [89]:
# Sort by top tf-idf terms for men
# DataFrame.sort(columns=...) is deprecated and removed in later pandas
# releases; sort_values(by=...) is the supported equivalent.
dtmt.sort_values(by="men", ascending=False).T

  from ipykernel import kernelapp as app


Unnamed: 0,m,like,love,t,good,friend,music,work,thing,time,...,chapstick,billie,nursing,trashy,interior,giggle,sassy,makeup,sewing,girly
men,0.40284,0.290207,0.218463,0.211028,0.181754,0.178192,0.172721,0.16895,0.164796,0.164306,...,0.000619,0.000604,0.000604,0.0006,0.000577,0.00057,0.000509,0.000448,0.000376,0.000235
women,0.385551,0.265082,0.324402,0.206457,0.171039,0.204648,0.156986,0.147034,0.156364,0.158334,...,0.002333,0.001384,0.001389,0.001431,0.001451,0.00155,0.002032,0.001876,0.002079,0.001959


In [90]:
# Sort by top tf-idf terms for women
# DataFrame.sort(columns=...) is deprecated and removed in later pandas
# releases; sort_values(by=...) is the supported equivalent.
dtmt.sort_values(by="women", ascending=False).T

  from ipykernel import kernelapp as app


Unnamed: 0,m,love,like,t,friend,good,time,people,music,thing,...,metallica,2008,programming,audio,stephenson,samurai,kubrick,chess,skateboard,meu
men,0.40284,0.218463,0.290207,0.211028,0.178192,0.181754,0.164306,0.158748,0.172721,0.164796,...,0.001523,0.001466,0.002373,0.001249,0.001488,0.00175,0.001504,0.001431,0.001686,0.001921
women,0.385551,0.324402,0.265082,0.206457,0.204648,0.171039,0.158334,0.157862,0.156986,0.156364,...,0.00056,0.000534,0.000529,0.000524,0.000477,0.000456,0.000456,0.000435,0.000384,1.6e-05


Top tf-idf terms for both sexes are pretty generic.  Men say "like" more frequently than "love", and mention "work" more than "people", while for women the opposite is true in both cases.

The least popular terms are actually more interesting than the most popular.  Men are unlikely to say "chapstick", "giggle", "sassy", "makeup", "sewing" or "girly", while women are less likely to mention "metallica", "samurai", "chess" or "skateboard".

### Try comparing differences in tf-idf scores

In [91]:
# Per-term gap between the two tf-idf scores: negative means the term scores
# higher in the women's text, positive means higher in the men's.
dtmt['diff'] = dtmt['men'] - dtmt['women']
# Ascending sort, so the terms that lean most towards women come first.
# DataFrame.sort(columns=...) is deprecated and removed in later pandas
# releases; sort_values(by=...) is the supported equivalent.
dtmt.sort_values(by='diff')

  from ipykernel import kernelapp as app


Unnamed: 0,men,women,diff
love,0.218463,0.324402,-0.105939
friend,0.178192,0.204648,-0.026456
family,0.071368,0.092478,-0.021110
dance,0.035263,0.056261,-0.020999
laugh,0.047124,0.067769,-0.020644
girl,0.024547,0.044619,-0.020072
smile,0.030414,0.043655,-0.013241
travel,0.055045,0.067701,-0.012656
favorite,0.058439,0.070220,-0.011781
wine,0.019277,0.030722,-0.011445


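**TODO:** write up an interpretation of these differences. As a starting point: the ten most negative differences (terms that score higher in the women's text than in the men's) are "love", "friend", "family", "dance", "laugh", "girl", "smile", "travel", "favorite" and "wine". "love" shows by far the largest gap (-0.106); the remaining nine all fall between roughly -0.011 and -0.026.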