In [1]:
import pandas as pd
import numpy as np
np.random.seed(1337)
import lda
from scipy.stats import expon
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split, ShuffleSplit
from sklearn.metrics import make_scorer, f1_score
from sklearn.grid_search import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing, svm, metrics
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 




In [2]:
def create_dat(df_ess, df_dem):
    """Build the Connecticut essay table joined with student demographics.

    Parameters
    ----------
    df_ess : pandas.DataFrame
        Essay records. Must provide the columns ``Study``, ``ID``,
        ``Intervention_number``, ``Essay``, ``Condition``,
        ``Intervention_Date`` and ``corrected``.
    df_dem : pandas.DataFrame
        Demographic records. Must provide the columns ``Study``, ``ID``,
        ``Ethnicity`` and ``Gender``.

    Returns
    -------
    pandas.DataFrame
        The Connecticut essay rows (six essay columns) left-joined on ``ID``
        with ``Ethnicity`` and ``Gender``, exact duplicate rows removed.
        ``Condition`` is recoded to ``'Control'`` / ``'Treatment'``; the
        ambiguous code ``'c/t'`` becomes NaN. Essays without a complete
        demographic row keep NaN in ``Ethnicity`` and ``Gender``.

    Neither input frame is modified.
    """
    control_codes = ['c', 'c2', 'c1', 'c3', 'ca', 'cb', '3']
    treatment_codes = ['t', 't2', 't3', 't1', '1', '2', 'ta', 'tb']

    # Work on explicit copies: calling ``replace(..., inplace=True)`` on a
    # column of a filtered slice raises SettingWithCopyWarning and is not
    # guaranteed to write through to the frame that gets merged below.
    essays = df_ess[df_ess.Study == 'Connecticut'].copy()
    essays['Condition'] = (essays['Condition']
                           .replace(control_codes, 'Control')
                           .replace(treatment_codes, 'Treatment')
                           .replace(['c/t'], np.nan))

    demog = df_dem[df_dem.Study == 'Connecticut'].copy()
    demog['Ethnicity'] = (demog['Ethnicity']
                          .replace('Asian', 'Asian American')
                          .replace('Other/Mixed', 'Other'))
    # Only students with a complete demographic record take part in the join.
    demog = demog[['ID', 'Ethnicity', 'Gender']].dropna()

    essay_cols = ['ID', 'Intervention_number', 'Essay', 'Condition',
                  'Intervention_Date', 'corrected']
    outdat = pd.merge(essays[essay_cols], demog,
                      how='left', on='ID').drop_duplicates()

    return outdat

def return_grid_scores(grid, clf='NB'):
    """Return the third field of the ``grid_scores_`` entry for the best model.

    Parameters
    ----------
    grid : fitted search object
        Must expose ``grid_scores_`` (a sequence of 3-item entries whose first
        item is the parameter dict) and ``best_estimator_``.
    clf : str, default 'NB'
        ``'SVM'`` matches entries on the ``C`` parameter; any other value
        matches on ``alpha``.

    Returns
    -------
    The third item of the matching ``grid_scores_`` entry (presumably the
    per-split validation scores -- confirm against the scikit-learn version
    in use). If several entries share the best parameter value, the last one
    is returned.

    Raises
    ------
    ValueError
        If no entry in ``grid_scores_`` carries the best estimator's value.
    """
    param_name = 'C' if clf == 'SVM' else 'alpha'
    best_value = getattr(grid.best_estimator_, param_name)

    idx = None
    for i, (params, _, _) in enumerate(grid.grid_scores_):
        if params[param_name] == best_value:
            idx = i  # no break: the last matching entry wins
    if idx is None:
        raise ValueError(
            'no grid_scores_ entry has %s == %r' % (param_name, best_value))
    return grid.grid_scores_[idx][2]

class LemmaTokenizer(object):
    """Callable that tokenizes a document and lemmatizes every token.

    Instances are passed as the ``tokenizer=`` argument of the vectorizers
    built later in this notebook.
    """

    def __init__(self):
        # One lemmatizer is created up front and reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc`` as a list, in token order."""
        lemmas = []
        for token in word_tokenize(doc):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

def get_most_informative_features(vocab, coef, n=20, clf='svm'):
    """Rank features by absolute coefficient and return the extremes.

    Parameters
    ----------
    vocab : sequence
        Feature names, aligned with ``coef``.
    coef : array-like supporting ``abs()`` (e.g. a numpy array)
        One coefficient per feature.
    n : int, default 20
        Number of features taken from each end of the ranking.
    clf : str, default 'svm'
        ``'svm'`` returns only the ``n`` largest-magnitude features; any
        other value additionally appends the ``n`` smallest-magnitude ones.

    Returns
    -------
    list of tuple
        ``(abs(coefficient), feature, coefficient)`` triples. The
        largest-magnitude entries come first in descending order; for
        non-``'svm'`` the smallest-magnitude entries follow in ascending order.
    """
    ranked = list(zip(abs(coef), vocab, coef))
    ranked.sort()  # ascending by |coef|, ties broken by the remaining fields

    # Walk backwards from the end to get the n largest, biggest first.
    strongest = ranked[:-(n + 1):-1]
    if clf == 'svm':
        return strongest
    return strongest + ranked[0:n]

In [3]:
# Load the raw essay and demographics tables and build the modelling frame.
print('Processing text dataset')
df_ess = pd.read_csv('../Data/3 CSV Files/essays1.23.16.csv', sep='|')
df_dem = pd.read_csv('../Data/3 CSV Files/demog3.2.16.csv')

# Keep only essays that have both corrected text and a resolved condition.
df = create_dat(df_ess, df_dem).dropna(axis=0, subset=['corrected', 'Condition'])

# Class label = condition + gender; it is NaN when gender is missing.
df = df.assign(Class=df.Condition + df.Gender)

Processing text dataset


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


# Classification for Gender+treatment

In [4]:
# Essays without a Class label cannot be used for classification.
df = df.dropna(axis=0, subset=['Class'])

# Topic model is only configured here; it is fitted in the next cell.
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)

# Raw term counts over lemmatized tokens; these are the input to the LDA model.
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(),
                             stop_words='english',
                             min_df=4)
X = vectorizer.fit_transform(df.corrected)
X

<6224x1629 sparse matrix of type '<type 'numpy.int64'>'
	with 93914 stored elements in Compressed Sparse Row format>

In [5]:
# Fit the topic model on the count matrix; afterwards model.doc_topic_
# contains the topic distributions.
model.fit(X)

# tf-idf features built with the same tokenizer settings as the count matrix.
tf = TfidfVectorizer(tokenizer=LemmaTokenizer(),
                     stop_words='english',
                     min_df=4)
X_tf = tf.fit_transform(df.corrected)

# Final design matrix: tf-idf columns first, topic columns appended after them.
X_con = np.concatenate((X_tf.todense(), model.doc_topic_), axis=1)

In [6]:
# Macro-averaged F1 is the model-selection criterion.
f1macro = make_scorer(f1_score, average='macro', pos_label=None)

# Encode the string class labels as integers.
le = preprocessing.LabelEncoder().fit(df.Class)
Y = le.transform(df.Class)

# Hold out 15% of the essays for the final evaluation.
# NOTE(review): no random_state is passed here or below; presumably the
# split and the search rely on np.random.seed(1337) from the import cell,
# so keep the order of these statements unchanged.
dtm_train, dtm_test, label_train, label_true = train_test_split(
    X_con, Y, test_size=0.15)

# Randomized search over the SVM penalty C, drawn from an exponential
# distribution with scale 100, scored on shuffled 85/15 splits of the
# training portion.
clf = svm.LinearSVC()
pardist = {'C': expon(scale=100)}
cv = ShuffleSplit(dtm_train.shape[0], test_size=0.15)
rs = RandomizedSearchCV(
    estimator=clf,
    param_distributions=pardist,
    n_iter=20,
    scoring=f1macro,
    cv=cv,
    n_jobs=1,
    verbose=0,
)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)

## Results

In [7]:
# Macro-averaged F1 of the selected model on the held-out 15%.
f1_score(y_true=label_true, y_pred=predicted, average='macro', pos_label=None)

0.53245034333289998

In [8]:
# grid_scores_ entry recorded for the best C found by the search.
return_grid_scores(rs, clf='SVM')

array([ 0.5590973 ,  0.55879693,  0.57162138,  0.58476495,  0.56341972,
        0.56290676,  0.5501746 ,  0.55537836,  0.58236781,  0.56343473])

In [9]:
# Confusion matrix on the held-out set (true labels first, predictions second).
metrics.confusion_matrix(y_true=label_true, y_pred=predicted)

array([[168, 110,  16,  14],
       [102, 114,   7,  14],
       [ 17,   4, 129,  52],
       [ 15,  11,  73,  88]])

In [10]:
# Number of held-out essays per class, with the original string labels restored.
pd.Series(data=le.inverse_transform(label_true)).value_counts()

Controlf      308
Controlm      237
Treatmentf    202
Treatmentm    187
dtype: int64

In [12]:
# Export the per-class SVM coefficients in long format: one row per
# (class, feature) pair.

# Column names of X_con: the tf-idf vocabulary followed by one name per topic.
# The topic count is read from the fitted model instead of being hard-coded,
# so it cannot drift from the n_topics used to build the LDA model.
n_topics = model.doc_topic_.shape[1]
features = list(tf.get_feature_names())
features.extend('topic_' + str(i) for i in range(n_topics))

# Output label for each row of coef_.
# NOTE(review): this assumes the coefficient rows are ordered Controlf,
# Controlm, Treatmentf, Treatmentm (the order of le.classes_) -- confirm.
class_labels = ['control_f', 'control_m', 'aff_f', 'aff_m']

coef_matrix = rs.best_estimator_.coef_
# Fail loudly if the classifier and the labels/feature names are out of step.
assert coef_matrix.shape == (len(class_labels), len(features)), \
    'coef_ shape %r does not match %d classes x %d features' % (
        coef_matrix.shape, len(class_labels), len(features))

coef_frames = [
    pd.DataFrame({'feature': features,
                  'coef': coef_matrix[row],
                  'class': [label] * len(features)})
    for row, label in enumerate(class_labels)
]
# Keep the original per-class names available for any cell that uses them.
df1, df2, df3, df4 = coef_frames

df_coefs = pd.concat(coef_frames)
# NOTE(review): the extension '.cvs' looks like a typo for '.csv'; it is left
# unchanged here because other code may read this exact path.
df_coefs.to_csv('output/classifier_features.cvs', index=False, encoding='utf-8')