In [18]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import *
from time import time

In [2]:
# Load the encoded profile features; the CSV's 'Unnamed: 0' column is used as the index.
okc = pd.read_csv(
    '../Assets/A/encoded_features.csv',
    index_col='Unnamed: 0',
)
# Disabled: load a term table from disk (presumably the precomputed top-2000
# stemmed words, judging by the file name) instead of rebuilding it in this notebook.
# tf = pd.read_csv('../Assets/A/top_2000_words_nomax_stemmed.csv', index_col='Unnamed: 0')
# Stemming was done on all data.  Redo later?

In [3]:
okc.shape

(53951, 33)

In [4]:
# stem essays
t0 = time()
stemmer = PorterStemmer()


def stem(essay):
    """Return `essay` lower-cased, split on whitespace and Porter-stemmed.

    Parameters
    ----------
    essay : object
        One cell of the 'essays' column. Any value that cannot be
        lower-cased, split and stemmed is treated as an empty essay.

    Returns
    -------
    string
        The stems joined by single spaces, or '' if processing failed.
    """
    try:
        stems = [stemmer.stem(word) for word in essay.lower().split()]
        return ' '.join(stems)
    except Exception:
        # Deliberately best-effort: an essay that cannot be stemmed becomes ''.
        # Catch Exception instead of using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate and this
        # multi-minute cell can actually be interrupted.
        return ''


# NOTE: this overwrites the 'essays' column, so re-running the cell will
# stem text that has already been stemmed.
okc['essays'] = okc['essays'].apply(stem)
print("stemmed essay in %g seconds" % (time() - t0))

# vectorize essays
# TF-IDF weights over unigrams and bigrams, capped at 2000 features.
# NOTE(review): stop words are removed from text that was stemmed earlier,
# so stemmed forms of English stop words may not match the list -- confirm.
t0 = time()
vectorizer = TfidfVectorizer(
    encoding='utf-8',
    stop_words='english',
    ngram_range=(1, 2),
    max_features=2000,
)
top_ngrams = vectorizer.fit_transform(okc['essays'])
print("vectorized essays in %g seconds" % (time() - t0))

stemmed essay in 171.668 seconds
vectorized essays in 59.502 seconds


In [5]:
# Feature matrix: the TF-IDF matrix returned by vectorizer.fit_transform.
X = top_ngrams

In [6]:
type(X)

scipy.sparse.csr.csr_matrix

In [7]:
okc.columns

Index([u'age', u'body_type', u'diet', u'drinks', u'drugs', u'education',
       u'essay0', u'essay1', u'essay2', u'essay3', u'essay4', u'essay5',
       u'essay6', u'essay7', u'essay8', u'essay9', u'ethnicity', u'height',
       u'income', u'job', u'last_online', u'location', u'offspring',
       u'orientation', u'pets', u'religion', u'sex', u'sign', u'smokes',
       u'speaks', u'status', u'essays', u'strict_diet'],
      dtype='object')

In [8]:
# Target vector: the `drinks` column flattened to a 1-D array.
y = np.ravel(okc['drinks'])

In [45]:
okc.drinks.value_counts()

1    37616
0     8317
2     8018
Name: drinks, dtype: int64

In [49]:
# Share of profiles in drinks class 2, taken from the data rather than
# from a count copied out of an earlier cell's output.
float(okc.drinks.value_counts()[2]) / len(okc)

0.1486163370465793

In [58]:
# The priors are the rounded shares of drinks labels 0, 1 and 2
# (8317, 37616 and 8018 of 53951 rows in the value counts).
# Open question: how should the class priors be picked?
nb = MultinomialNB(
    class_prior=[0.154, 0.697, 0.149],
)

In [59]:
# The target has three classes, so the F1 averaging has to be named
# explicitly: plain 'f1' is the binary scorer, which only warned on
# multiclass targets in old scikit-learn and raises in later releases.
# 'f1_weighted' averages the per-class F1 scores weighted by class support.
cv = cross_val_score(nb, X, y, scoring='f1_weighted')

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


In [60]:
# Show the per-fold cross-validation scores.
print(cv)

[ 0.03739872  0.03859712  0.03859295]


In [61]:
# Produce classification report for "drinks"
# Rebuild the target here so this cell does not depend on whichever cell
# last assigned `y` (other cells rebind it to strict_diet and body_type).
y = np.ravel(okc.drinks)
# Fixed seed so the split, and therefore the report, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [62]:
# Fit on the training split, then label the held-out split.
y_pred = nb.fit(X_train, y_train).predict(X_test)

In [63]:
pd.Series(y_pred).value_counts()

1    13430
2       58
dtype: int64

In [64]:
# Per-class precision / recall / F1 on the held-out split.
report = metrics.classification_report(y_test, y_pred)
print(report)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00      5999
          1       0.13      1.00      0.24      1796
          2       0.88      0.01      0.02      5693

avg / total       0.39      0.14      0.04     13488



In [71]:
# Tamper with thresholds?

okc.strict_diet.value_counts()[0]

47918

In [75]:
# Fraction of profiles whose strict_diet value is 0.
float(okc['strict_diet'].value_counts()[0]) / len(okc)

0.8881763081314527

In [85]:
# Model strict_diet
# Grid search prior to maximize f1-score?

y = np.ravel(okc.strict_diet)

# Equal priors for the two strict_diet classes, although class 0 makes up
# roughly 89% of the rows.
nb = MultinomialNB(class_prior=[.5, .5])

# Fixed seed so the split and the report below are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.93      0.65      0.76     12019
          1       0.17      0.60      0.27      1469

avg / total       0.85      0.64      0.71     13488



In [86]:
pd.Series(y_pred).value_counts()

0    8342
1    5146
dtype: int64

In [40]:
# predict body type from essays!

y = np.ravel(okc.body_type)

# Build a fresh classifier for this target. Reusing whatever `nb` is left in
# the kernel is unsafe: the strict_diet cell leaves one with a two-element
# class_prior, which cannot be fitted to the three body_type classes.
# With default arguments the class priors are learned from the training data.
nb = MultinomialNB()

# Fixed seed so the split and the report below are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

print(metrics.classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.51      0.73      0.60      5975
          1       0.00      0.00      0.00      1832
          2       0.59      0.51      0.54      5681

avg / total       0.47      0.54      0.49     13488



In [44]:
pd.Series(y_pred).value_counts()

0    8593
2    4895
dtype: int64