In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the essay data; the CSV's 'Unnamed: 0' column is used as the row index.
okc = pd.read_csv(
    '../Assets/A/one_long_essay.csv',
    index_col='Unnamed: 0',
)

In [3]:
def denull(essay):
    """Replace a missing essay with an empty string.

    Missing text cells read by pandas normally show up as the float NaN, so
    any float value is treated as "no essay". ``isinstance`` is used rather
    than an exact type comparison so that float subclasses such as
    ``numpy.float64`` are caught as well.

    Parameters
    ----------
    essay : object
        A single cell from the essays column.

    Returns
    -------
    object
        ``''`` when ``essay`` is a float, otherwise ``essay`` unchanged.
    """
    if isinstance(essay, float):
        return ''
    return essay
    
# Blank out float (missing) entries so every essay is a string.
okc['essays'] = okc['essays'].apply(denull)

In [4]:
# Build the TF-IDF representation of the essays: English stop words are
# dropped and the vocabulary is capped at 2000 terms.
vec = TfidfVectorizer(encoding='utf-8', stop_words='english', max_features=2000)
tfidf_sparse = vec.fit_transform(okc.essays)

# TODO: add custom stop words? (ve, don, ll, im)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
feature_names_fn = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names

# Dense frame with one row per essay and one column per vocabulary term.
tf = pd.DataFrame(tfidf_sparse.toarray(), columns=feature_names_fn())

In [5]:
# Average TF-IDF weight of every term over all essays; this is the baseline
# that word-specific subsets are compared against.

mean_words = tf.mean(axis='index')

In [6]:
# Load the categorical data, encoded the same way as for the Tableau dashboard.
cats = pd.read_csv(
    '../Assets/Tableau/encoded_categories.csv',
    index_col='Unnamed: 0',
)

In [7]:
# Show which categorical fields are available in cats.
cats.columns

Index([u'age', u'diet', u'drinks', u'drugs', u'education', u'income', u'job',
       u'orientation', u'religion', u'religiousness', u'sex', u'smokes'],
      dtype='object')

In [8]:
# The indexes of cats and okc don't match, so give cats a plain 0..n-1 index.
n_rows = len(cats)
cats.index = range(n_rows)

# Element-wise comparison of the two indexes (displayed as a boolean array).
cats.index == okc.index

array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)

In [9]:
# Consider only those people who use the word zombie

df_filtered = tf[tf.zombie > 0]
mean_filtered = df_filtered.mean(axis=0)
# Per-term difference between the zombie subset and the whole corpus, sorted
# ascending: most under-represented terms first, most over-represented last.
diff = (mean_filtered - mean_words).sort_values()

# Top list: the 99 largest differences, biggest first (this includes the word
# "zombie" itself). Bottom list: the 100 smallest differences.
# Slicing + join avoids an IndexError on a small vocabulary and the repeated
# string concatenation of a manual loop.
top_list = ', '.join(diff.index[::-1][:99])
bottom_list = ', '.join(diff.index[:100])

separator = '##########################################################################################'

# Single-argument print(...) calls behave identically on Python 2 and 3.
print("People who use the word ZOMBIE in their profiles use the following other words")
print("most frequently compared to the average user of OKCupid:")
print("       ")
print(top_list)
print('   ')
print(separator)
print('    ')
print("They use the following words much less frequently than the general population:")
print(' ')
print(bottom_list)
print('   ')
print(separator)
print('    ')
print('In total %i individuals mentioned the word ZOMBIE in their profiles.' % df_filtered.shape[0])

People who use the word ZOMBIE in their profiles use the following other words
most frequently compared to the average user of OKCupid:
       
zombie, apocalypse, dead, like, movies, games, don, video, horror, really, game, walking, just, ve, bad, ll, stuff, think, punk, star, awesome, war, pretty, books, probably, metal, guide, rock, want, make, fight, shit, black, series, random, shows, know, dark, thing, people, actually, ass, art, wars, big, nerd, lot, music, playing, thrones, club, world, way, things, science, death, evil, oh, park, geek, beer, sci, fi, firefly, man, comic, fantasy, tattoos, watch, white, hell, weird, time, tv, lost, band, nerdy, hate, super, comics, yeah, history, vampire, head, look, anime, eat, day, days, guy, cool, order, dr, play, films, scott, flicks, say, making
   
##########################################################################################
    
They use the following words much less frequently than the general population:
 
family, friends,

In [10]:
# Number of profiles in the zombie subset. Derived from df_filtered instead of
# being hard-coded so it cannot go stale when the data or the filter changes.
length = df_filtered.shape[0]

In [11]:
# Fraction of the zombie subset whose sex is 'm'. `.ix` was removed in
# pandas 1.0; `.loc` is the label-based equivalent and selects the same rows
# here because cats carries a plain integer index.
float(cats.loc[df_filtered.index, 'sex'].value_counts().loc['m']) / length

0.6968838526912181

In [39]:
# Generalize to any word:

def analyze(word, n_top=99, n_bottom=100):
    """Print which words co-occur unusually often or rarely with ``word``.

    Selects the profiles whose TF-IDF weight for ``word`` is positive,
    compares their mean term weights with the corpus-wide means and prints
    the most over- and under-represented terms, the number of matching
    profiles and their male/female split.

    Relies on the notebook-level objects ``tf`` (TF-IDF frame),
    ``mean_words`` (per-term corpus means) and ``cats`` (categorical data
    sharing ``tf``'s integer row index).

    Parameters
    ----------
    word : str
        A term that is a column of ``tf``.
    n_top : int, optional
        Number of over-represented terms to list (default 99, the original
        hard-coded count).
    n_bottom : int, optional
        Number of under-represented terms to list (default 100).

    Returns
    -------
    None
        The report is printed.

    Raises
    ------
    KeyError
        If ``word`` is not a column of ``tf``.
    """
    df_filtered = tf[tf[word] > 0]
    length = df_filtered.shape[0]

    # Ascending sort: most under-represented terms first, most
    # over-represented last.
    diff = (df_filtered.mean(axis=0) - mean_words).sort_values()
    ranked = diff.index

    # Walk the ranking from the top but skip the very first entry, which is
    # expected to be `word` itself. Slicing (rather than fixed indices) also
    # copes with vocabularies smaller than the requested list sizes.
    top_list = ', '.join(ranked[::-1][1:n_top + 1])
    bottom_list = ', '.join(ranked[:n_bottom])

    # Count sexes once; .get() yields 0 when a label is absent from the
    # subset instead of raising KeyError. `.loc` replaces `.ix`, which was
    # removed in pandas 1.0.
    sex_counts = cats.loc[df_filtered.index, 'sex'].value_counts()
    male = 100 * sex_counts.get('m', 0) / length
    female = 100 * sex_counts.get('f', 0) / length

    separator = '##########################################################################################'

    # Single-argument print(...) calls behave identically on Python 2 and 3.
    print("People who use the word %s in their profiles use the following other words" % word)
    print("most frequently compared to the average user of OKCupid:")
    print("       ")
    print(top_list)
    print('   ')
    print(separator)
    print('    ')
    print("They use the following words much less frequently than the general population:")
    print(' ')
    print(bottom_list)
    print('   ')
    print(separator)
    print('    ')
    print('In total %i individuals mentioned the word %s in their profiles.' % (length, word))
    print('   ')
    # %i truncates the percentages to whole numbers.
    print('Approximately %i%% of those people are male, and %i%% are female.' % (male, female))
    

In [40]:
# Example: report on the profiles that mention "yoga".
analyze(word='yoga')

People who use the word yoga in their profiles use the following other words
most frequently compared to the average user of OKCupid:
       
love, dancing, life, meditation, hiking, practice, healthy, nature, dance, travel, live, running, spiritual, world, active, wine, reading, learning, dinner, art, ve, cooking, class, enjoy, body, creative, health, great, years, chocolate, biking, bike, new, doing, partner, open, time, fresh, exploring, feel, exercise, energy, loving, week, climbing, passionate, heart, good, important, connection, food, balance, swimming, living, like, relationships, laughing, music, deep, practicing, organic, massage, teaching, beautiful, hike, community, happy, traveling, spirit, run, lived, hot, adventure, mind, vegan, outdoors, seeing, people, sunshine, getting, sf, teacher, make, soul, tea, teach, listening, warm, thai, daily, amazing, friends, recently, especially, camping, curious, appreciate, present, taking
   
#############################################