Combines all essays from the okc dataframe into one long essay per profile, with HTML markup and non-letter characters removed.
Saves result to new .csv

In [1]:
import pandas as pd
from bs4 import BeautifulSoup    
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Load the raw profile data; the path is relative to the notebook's working directory.
okc = pd.read_csv('../profiles.csv')

In [3]:
# Names of the ten essay columns: 'essay0' through 'essay9'
essay_list = ['essay%d' % idx for idx in range(10)]

In [4]:
# Replace missing essays with a single space so the essay columns can be
# concatenated as plain strings later on.
# `.ix` is deprecated (and removed in pandas 1.0); selecting the columns directly
# and calling `fillna` performs the same NaN -> ' ' substitution.
okc[essay_list] = okc[essay_list].fillna(' ')

In [5]:
def essay_to_words(raw_essay):
    """Convert one raw essay into a cleaned, space-separated string of words.

    Steps:
      1. Strip HTML markup with BeautifulSoup (lxml parser).
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case the text and split it on whitespace.
      4. Re-join the words with single spaces.

    Stop words are not removed by this function.

    Parameters
    ----------
    raw_essay : str
        The raw essay text, possibly containing HTML.

    Returns
    -------
    str
        The lower-cased words separated by single spaces; an empty string
        when the essay contains no letters at all.
    """
    # 1. Remove HTML
    essay_text = BeautifulSoup(raw_essay, 'lxml').get_text()

    # 2. Remove non-letters (digits, punctuation and non-ASCII characters become spaces)
    letters_only = re.sub("[^a-zA-Z]", " ", essay_text)

    # 3. Lower-case and split; split() with no argument also collapses runs of whitespace
    words = letters_only.lower().split()

    # 4. Join the words back into one string separated by single spaces
    return " ".join(words)

In [6]:
# Build one combined essay per row: join the ten essay columns with single
# spaces (in essay0..essay9 order), then clean the result with essay_to_words.
combined = okc[essay_list[0]]
for col in essay_list[1:]:
    combined = combined + ' ' + okc[col]
okc['essays'] = combined
okc['essays'] = okc['essays'].apply(essay_to_words)

In [7]:
# Inspect the column names; the new combined 'essays' column is appended at the end.
okc.columns

Index([u'age', u'body_type', u'diet', u'drinks', u'drugs', u'education',
       u'essay0', u'essay1', u'essay2', u'essay3', u'essay4', u'essay5',
       u'essay6', u'essay7', u'essay8', u'essay9', u'ethnicity', u'height',
       u'income', u'job', u'last_online', u'location', u'offspring',
       u'orientation', u'pets', u'religion', u'sex', u'sign', u'smokes',
       u'speaks', u'status', u'essays'],
      dtype='object')

In [8]:
# The ten individual essay columns are no longer needed once 'essays' exists
okc = okc.drop(labels=essay_list, axis='columns')

In [9]:
# Row/column counts after dropping the ten individual essay columns.
okc.shape

(59946, 22)

In [15]:
# Keep only rows whose cleaned essay text is non-empty
# (essay_to_words returns '' when an essay contains no letters).
okc = okc.loc[okc['essays'] != '']

In [16]:
# Row/column counts after removing rows with an empty combined essay.
okc.shape

(57809, 22)

In [17]:
# Persist the combined-essay frame.
# NOTE(review): to_csv writes the index as an unnamed first column, which read_csv
# brings back as 'Unnamed: 0' (visible in the later okc.columns output). Confirm
# whether that row id is needed downstream before switching to index=False.
okc.to_csv('one_long_essay.csv')

## Save Top N-grams per Essay with tf-idf vectorization

In [87]:
# Reload the combined-essay frame written by the previous section.
okc = pd.read_csv('one_long_essay.csv')

In [3]:
# Tf-idf vectorization of the combined essays: unigrams and bigrams, English
# stop words removed, vocabulary capped at the 500 most frequent terms.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    encoding='utf-8',
    stop_words='english',
    ngram_range=(1, 2),
    binary=False,
    max_features=500,
)
top_ngrams = vectorizer.fit_transform(okc['essays'])

In [4]:
# (number of essays, number of tf-idf features)
top_ngrams.shape

(57809, 500)

In [5]:
# Dense table of tf-idf scores: one row per essay, one column per n-gram feature.
# Note that todense() materializes the full rows x features matrix in memory.
df = pd.DataFrame(data=top_ngrams.todense(), columns=vectorizer.get_feature_names())

In [22]:
# Write the tf-idf score table to disk (the row index is written as the first column).
df.to_csv('top_500_ngrams.csv')

## Do top N-grams, eliminating common features (terms that appear in more than 50 essays)

In [88]:

from sklearn.feature_extraction.text import TfidfVectorizer

# An integer max_df is an absolute document count: any term found in more than
# 50 essays is dropped before the 500 most frequent remaining terms are kept.
vectorizer = TfidfVectorizer(
    encoding='utf-8',
    stop_words='english',
    ngram_range=(1, 2),
    binary=False,
    max_df=50,
    max_features=500,
)
top_ngrams = vectorizer.fit_transform(okc['essays'])

In [89]:
# (number of essays, number of tf-idf features) for the max_df=50 run
top_ngrams.shape

(57809, 500)

In [90]:
# Dense table of tf-idf scores: one row per essay, one column per retained feature
df = pd.DataFrame(
    data=top_ngrams.todense(),
    columns=vectorizer.get_feature_names(),
)

In [91]:
# Preview the first rows of the tf-idf table and the feature names that survived max_df=50.
df.head()

Unnamed: 0,abnormal,accommodating,ache,acrobatics,afghanistan,afro cuban,afrobeat,aha,airlines,ali khan,...,xxx,yada,yada yada,yadda,yadda yadda,year went,young adults,young lady,zap,zevon
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Do Top N-grams Adding more features

In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same setup as the max_df=50 run above, but the vocabulary cap is raised to 2000 terms.
# (Integer max_df: terms appearing in more than 50 essays are excluded.)
vectorizer = TfidfVectorizer(
    encoding='utf-8',
    stop_words='english',
    ngram_range=(1, 2),
    binary=False,
    max_df=50,
    max_features=2000,
)
top_ngrams = vectorizer.fit_transform(okc['essays'])

In [96]:
# Dense table of tf-idf scores: one row per essay, one column per retained feature
df = pd.DataFrame(
    data=top_ngrams.todense(),
    columns=vectorizer.get_feature_names(),
)

In [97]:
# (number of essays, number of tf-idf features) for the 2000-feature run
df.shape

(57809, 2000)

In [98]:
# Write the 2000-feature, max_df=50 tf-idf table to disk (index included as first column).
df.to_csv('top_2000_ngrams_max_50.csv')

## Do Top N-grams, eliminating terms that appear in more than 20 essays (max_df=20)

In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams + bigrams, capped at 2000 features.
# Integer max_df=20 is a document-count threshold: terms that occur in more than
# 20 essays are discarded (it does not mean "drop the 20 most common words").
vectorizer = TfidfVectorizer(
    encoding='utf-8',
    stop_words='english',
    ngram_range=(1, 2),
    binary=False,
    max_df=20,
    max_features=2000,
)
top_ngrams = vectorizer.fit_transform(okc['essays'])

# One row per essay, one column per retained feature, values are tf-idf scores
df = pd.DataFrame(
    data=top_ngrams.todense(),
    columns=vectorizer.get_feature_names(),
)

df.to_csv('top_2000_ngrams_max_20.csv')

## Do Top Single WORDS

In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Single words only (ngram_range=(1, 1)), capped at 2000 features.
# Integer max_df=20: terms that occur in more than 20 essays are discarded.
vectorizer = TfidfVectorizer(
    encoding='utf-8',
    stop_words='english',
    ngram_range=(1, 1),
    binary=False,
    max_df=20,
    max_features=2000,
)
top_ngrams = vectorizer.fit_transform(okc['essays'])

# One row per essay, one column per retained word, values are tf-idf scores
df = pd.DataFrame(
    data=top_ngrams.todense(),
    columns=vectorizer.get_feature_names(),
)

df.to_csv('top_2000_words_max_20.csv')

## Do Top N-grams up to 5-grams

In [103]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 1- to 5-grams, capped at 2000 features.
# Integer max_df=20: terms that occur in more than 20 essays are discarded.
vectorizer = TfidfVectorizer(ngram_range = (1, 5), encoding='utf-8', stop_words = 'english', binary = False, max_df=20 ,max_features = 2000)
top_ngrams = vectorizer.fit_transform(okc['essays'])

# Save dataframe with feature names and tf-idf scores for each user
df  = pd.DataFrame(top_ngrams.todense(), columns=vectorizer.get_feature_names())

# Use a dedicated filename: the original wrote to 'top_2000_words_max_20.csv',
# which would overwrite the single-word output saved by the previous section.
df.to_csv('top_2000_ngrams_1_to_5_max_20.csv')

KeyboardInterrupt: 

## Apply Porter Stemmer to OKC['essays']

In [104]:
# Import only the name that is used; a wildcard import hides where names come from.
from nltk.stem.porter import PorterStemmer

In [105]:
# Single shared Porter stemmer instance; stem() below looks it up as a global.
stemmer = PorterStemmer()

In [139]:
def stem(essay):
    """Lower-case `essay`, stem each whitespace-separated token with the
    global `stemmer`, and return the stems joined by single spaces."""
    tokens = essay.lower().split()
    return ' '.join(map(stemmer.stem, tokens))

In [140]:
# Add a stemmed copy of every combined essay alongside the original text
okc['stemmed_essays'] = okc['essays'].map(stem)

In [141]:
# Inspect the columns; 'stemmed_essays' should now be present next to 'essays'.
okc.columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'age', u'body_type', u'diet',
       u'drinks', u'drugs', u'education', u'ethnicity', u'height', u'income',
       u'job', u'last_online', u'location', u'offspring', u'orientation',
       u'pets', u'religion', u'sex', u'sign', u'smokes', u'speaks', u'status',
       u'essays', u'stemmed_essays'],
      dtype='object')

In [142]:
# Persist the frame including the new 'stemmed_essays' column.
# NOTE(review): the index is written again here, so each save/reload cycle adds
# another 'Unnamed: 0*' column (two are already visible in the okc.columns output
# above). Confirm whether index=False is appropriate.
okc.to_csv('stemmed_essays.csv')

## TF-IDF Vectorize Stemmed Essays
### Single Words

In [144]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Single words, capped at 2000 features; integer max_df=20 discards terms that
# occur in more than 20 essays.
vectorizer = TfidfVectorizer(ngram_range = (1, 1), encoding='utf-8', stop_words = 'english', binary = False, max_df=20 ,max_features = 2000)
# Fit on the stemmed text: the original fitted on okc['essays'], so the
# '*_stemmed.csv' output contained unstemmed features.
top_ngrams = vectorizer.fit_transform(okc['stemmed_essays'])

# Save dataframe with feature names and tf-idf scores for each user
df  = pd.DataFrame(top_ngrams.todense(), columns=vectorizer.get_feature_names())

df.to_csv('top_2000_words_max_20_stemmed.csv')

## 1 and 2 n-grams

In [145]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams + bigrams, capped at 2000 features; integer max_df=20 discards terms
# that occur in more than 20 essays.
vectorizer = TfidfVectorizer(ngram_range = (1, 2), encoding='utf-8', stop_words = 'english', binary = False, max_df=20 ,max_features = 2000)
# Fit on the stemmed text: the original fitted on okc['essays'], so the
# '*_stemmed.csv' output contained unstemmed features.
top_ngrams = vectorizer.fit_transform(okc['stemmed_essays'])

# Save dataframe with feature names and tf-idf scores for each user
df  = pd.DataFrame(top_ngrams.todense(), columns=vectorizer.get_feature_names())

df.to_csv('top_2000_ngrams_max_20_stemmed.csv')

In [81]:
# Feature names of the current tf-idf table as a plain list
top_words = list(df.columns)

In [86]:
# Average value of the 'friends' column of df (attribute-style column access).
np.average(df.friends)

0.063164702304183903

In [146]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Single words, capped at 2000 features, with no max_df filter.
vectorizer = TfidfVectorizer(ngram_range = (1, 1), encoding='utf-8', stop_words = 'english', binary = False, max_features = 2000)
# Fit on the stemmed text: the original fitted on okc['essays'], so the
# '*_stemmed.csv' output contained unstemmed features.
top_ngrams = vectorizer.fit_transform(okc['stemmed_essays'])

# Save dataframe with feature names and tf-idf scores for each user
df  = pd.DataFrame(top_ngrams.todense(), columns=vectorizer.get_feature_names())

df.to_csv('top_2000_words_nomax_stemmed.csv')

In [147]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams + bigrams, capped at 2000 features, with no max_df filter.
vectorizer = TfidfVectorizer(ngram_range = (1, 2), encoding='utf-8', stop_words = 'english', binary = False, max_features = 2000)
# Fit on the stemmed text: the original fitted on okc['essays'], so the
# '*_stemmed.csv' output contained unstemmed features.
top_ngrams = vectorizer.fit_transform(okc['stemmed_essays'])

# Save dataframe with feature names and tf-idf scores for each user
df  = pd.DataFrame(top_ngrams.todense(), columns=vectorizer.get_feature_names())

df.to_csv('top_2000_ngrams_nomax_stemmed.csv')

## Do Truncated SVD on Top Ngrams

In [23]:
# Scale the tf-idf features in top_ngrams before the PCA-like SVD reduction below.
# NOTE(review): an earlier note here said to use `scale` instead of StandardScaler for
# sparse input, but the next cell does use StandardScaler, with with_mean=False.

from sklearn.preprocessing import StandardScaler

In [24]:
# with_mean=False skips centering, so the sparse tf-idf matrix is only rescaled
xStand = StandardScaler(with_mean=False).fit_transform(top_ngrams)

In [26]:
# Scaling keeps the shape of top_ngrams: (number of essays, number of features)
xStand.shape

(57809, 500)

In [37]:
import scipy

In [40]:
# Calculate the 6 largest-magnitude singular values of the scaled matrix together
# with their singular vectors; the call returns the tuple (u, s, vt).
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.sparse.linalg` has been loaded.
import scipy.sparse.linalg

scipy.sparse.linalg.svds(xStand, k=6, ncv=None, tol=0, which='LM', v0=None, maxiter=None, return_singular_vectors=True)

(array([[-0.00351685, -0.00371659,  0.00552388, -0.00031655,  0.0054565 ,
         -0.00462528],
        [ 0.00027163, -0.00288121,  0.00255474,  0.00405496,  0.00041981,
         -0.00364063],
        [ 0.00181193,  0.00220358, -0.0025966 , -0.0009496 ,  0.0047058 ,
         -0.00520181],
        ..., 
        [-0.00013359, -0.00501633, -0.00167309, -0.00745094, -0.00962984,
         -0.00443072],
        [ 0.01331878,  0.00159634,  0.00642336, -0.00197511,  0.00261046,
         -0.00432818],
        [-0.00312386, -0.00310407,  0.00327641,  0.00282965,  0.00536482,
         -0.00504021]]),
 array([  411.87572825,   458.15123759,   473.31633777,   504.87980922,
          556.95614136,  2013.45540178]),
 array([[ 0.01899238,  0.00978826,  0.00208965, ..., -0.00096789,
         -0.00013366, -0.01577653],
        [-0.01425346, -0.00219207, -0.00190312, ...,  0.00262209,
          0.0800805 ,  0.0408114 ],
        [-0.02423098, -0.03156108, -0.02289064, ..., -0.01374537,
         -0.055700

In [None]:
# TODO: plot the singular values against the number of components?

# TODO: I am not sure how to pick the number of components for the SVD.


In [29]:
# perform truncated svd
from sklearn.decomposition import TruncatedSVD

In [30]:
# Reduce to 2 components. The 'randomized' solver is stochastic, so the seed is
# fixed (random_state=42 instead of None) to make the components reproducible.
tsvd = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=5, random_state=42, tol=0.0)

In [34]:
# Fit the SVD on the scaled matrix and project every row onto the components.
tsvdf = tsvd.fit_transform(xStand)

In [41]:
# (number of essays, n_components)
tsvdf.shape

(57809, 2)

In [42]:
# Reduce to 20 components. The 'randomized' solver is stochastic, so the seed is
# fixed (random_state=42 instead of None) to make the components reproducible.
tsvd = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=5, random_state=42, tol=0.0)

In [43]:
# Fit the 20-component SVD on the scaled matrix and project every row onto the components.
tsvdf = tsvd.fit_transform(xStand)

In [45]:
# (number of essays, n_components)
tsvdf.shape

(57809, 20)