In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from bs4 import BeautifulSoup    
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

In [2]:
# find null values for target column and remove from rows from tf
# outputs: target column, new tf matrix

def denull(tfidf, df, target_col):
    target = df[df[target_col].isnull()==False][target_col]
    tf_target = tfidf.ix[target.index]
    return (target, tf_target)

# Convert tf to sparse matrix
def sparse_df_to_array(tf_target):
    num_rows = tf_target.shape[0]   

    data = []
    row = []
    col = []

    for i, col_name in enumerate(tf_target.columns):
        if isinstance(tf_target[col_name], pd.SparseSeries):
            column_index = tf_target[col_name].sp_index
            if isinstance(column_index, BlockIndex):
                column_index = column_index.to_int_index()

            ix = column_index.indices
            data.append(tf_target[col_name].sp_values)
            row.append(ix)
            col.append(len(tf_target[col_name].sp_values) * [i])
        else:
            data.append(tf_target[col_name].values)
            row.append(np.array(range(0, num_rows)))
            col.append(np.array(num_rows * [i]))

    data_f = np.concatenate(data)
    row_f = np.concatenate(row)
    col_f = np.concatenate(col)

    arr = sp.sparse.coo_matrix((data_f, (row_f, col_f)), tf_target.shape, dtype=np.float64)
    return arr.tocsr()    

In [3]:
# encoder must be a function

def process(tfidf_matrix, df, target, encoder=lambda x:LabelEncoder().fit_transform(x)):
    # denull tf and df
    (target_col, target_tf) = denull(tf, okc, target)
    # encode values
    target_col = encoder(target_col)
    # Make tf sparse
    sparse_tf = sparse_df_to_array(target_tf)
    # Initiate train test split
    X_train, X_test, y_train, y_test = train_test_split(sparse_tf, target_col)
    return (X_train, X_test, y_train, y_test)
    

## Test preprocessing function
### Predict sex of user based on long essay (top 2000 words, stemmed, and tfidf vectorized)

In [12]:
# Test on sex column

okc = pd.read_csv('../Assets/A/one_long_essay.csv')
tf = pd.read_csv('../Assets/A/Tfidf_Variations/Long_Essay/top_2000_words_nomax_stemmed.csv')

In [13]:
(X_train, X_test, y_train, y_test) = process(tf, okc, "sex")

In [14]:
nb = MultinomialNB().fit(X_train, y_train)

In [15]:
y_pred = nb.predict(X_test)

In [16]:
print metrics.accuracy_score(y_pred, y_test)

0.722203002837


In [17]:
print metrics.classification_report(y_pred, y_test)

             precision    recall  f1-score   support

          0       0.41      0.80      0.54      2954
          1       0.93      0.70      0.80     11499

avg / total       0.82      0.72      0.75     14453

