In [363]:
import pandas as pd
import numpy as np
import lda
from scipy.stats import expon
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split, ShuffleSplit
from sklearn.metrics import make_scorer, f1_score
from sklearn.grid_search import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing, svm, metrics
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 

In [361]:
def create_dat(df_ess, df_dem):
    df_ess = df_ess[df_ess.Study=='Connecticut']
    df_ess.Condition.replace(['c', 'c2', 'c1', 'c3', 'ca', 'cb', '3'], 'Control', inplace=True)
    df_ess.Condition.replace(['t', 't2', 't3', 't1', '1', '2', 'ta', 'tb'], 'Treatment', inplace=True)
    df_ess.Condition.replace(['c/t'], np.nan, inplace=True)
    
    df_dem = df_dem[df_dem.Study=='Connecticut']
    df_dem.Ethnicity.replace('Asian', 'Asian American', inplace=True)
    df_dem.Ethnicity.replace('Other/Mixed', 'Other', inplace=True)
    df_dem = df_dem[['ID', 'Ethnicity', 'Gender']].dropna()
    
    outdat = pd.merge(df_ess[['ID', 'Intervention_number', 'Essay', 'Condition', 'Intervention_Date', 'corrected']], 
                    df_dem, how='left', on='ID').drop_duplicates()
    
    return(outdat)

def return_grid_scores(grid, clf='NB'):
    if clf == 'SVM':
        for i, (x, _, _) in enumerate(grid.grid_scores_):
            if x['C'] == grid.best_estimator_.C:
                if x['gamma'] == grid.best_estimator_.gamma:
                    idx = i
    else:
        for i, (x, _, _) in enumerate(grid.grid_scores_):
            if x['alpha'] == grid.best_estimator_.alpha:
                idx = i
    return grid.grid_scores_[idx][2]

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [406]:
df_ess = pd.read_csv('../Data/3 CSV Files/essays1.23.16.csv', sep='|')
df_dem = pd.read_csv('../Data/3 CSV Files/demog3.2.16.csv')
df = create_dat(df_ess, df_dem)
df.dropna(axis=0, subset=['corrected'], inplace=True)

Connecticut               6258
CO/CA Latino              1161
Physics                    798
Awareness&Affirmation      227
Surgeons                   212
CO BMI Study               205
Belonging                  165
Debate                     143
NeuroBioThreat Study 1     129
Group Affirmation          104
Political Affirmation       93
Blindness                   35
Name: Study, dtype: int64

# Classification for Gender+treatment

In [461]:
df['Class'] = df.Condition+df.Gender
df.dropna(axis=0, subset=['Class'], inplace=True)
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), min_df=4, stop_words='english')
X = vectorizer.fit_transform(df.Essay)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
X

<6224x1763 sparse matrix of type '<type 'numpy.int64'>'
	with 90161 stored elements in Compressed Sparse Row format>

In [463]:
tf = TfidfVectorizer(tokenizer=LemmaTokenizer(), min_df=4, stop_words='english')
X_tf = tf.fit_transform(df.Essay)

In [464]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(6224, 50)

In [466]:
X_con = np.concatenate((X_tf.todense(), model.doc_topic_), axis=1)

In [467]:
f1weighted = make_scorer(f1_score, average='macro', pos_label=None)
le = preprocessing.LabelEncoder()
le.fit(df.Class)
Y = le.transform(df.Class)
dtm_train, dtm_test, label_train, label_true = train_test_split(X_con, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring=f1weighted,
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [468]:
f1_score(label_true, predicted, average='macro', pos_label=None)

0.56879661968141648

In [469]:
return_grid_scores(rs, 'SVM')

array([ 0.60461716,  0.59968427,  0.57324741,  0.57748363,  0.61110184,
        0.58098241,  0.57603159,  0.56745881,  0.60653227,  0.59098896])

In [470]:
metrics.confusion_matrix(label_true, predicted)

array([[209, 136,  16,  13],
       [155, 141,   5,  22],
       [ 10,   7, 211,  64],
       [ 17,  14,  81, 144]])

# Classification for Ethnicity+treatment

In [476]:
df['Class'] = df.Condition+df.Ethnicity
df_sub = df[df.Ethnicity.isin(['African American','White'])]
df_sub.Class.value_counts()
df_sub.dropna(axis=0, subset=['Class'], inplace=True)
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), min_df=4, stop_words='english')
X = vectorizer.fit_transform(df_sub.Essay)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
X

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


<5330x1595 sparse matrix of type '<type 'numpy.int64'>'
	with 76328 stored elements in Compressed Sparse Row format>

In [486]:
tf = TfidfVectorizer(tokenizer=LemmaTokenizer(), min_df=4, stop_words='english')
X_tf = tf.fit_transform(df_sub.Essay)

In [478]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(5330, 50)

In [487]:
X_con = np.concatenate((X_tf.todense(), model.doc_topic_), axis=1)

In [488]:
f1weighted = make_scorer(f1_score, average='macro', pos_label=None)
le = preprocessing.LabelEncoder()
le.fit(df_sub.Class)
Y = le.transform(df_sub.Class)
dtm_train, dtm_test, label_train, label_true = train_test_split(X_con, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring=f1weighted,
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [489]:
f1_score(label_true, predicted, average='macro', pos_label=None)

0.50997476296536237

In [490]:
return_grid_scores(rs, 'SVM')

array([ 0.54504722,  0.52821296,  0.5762358 ,  0.52269957,  0.52935073,
        0.52481078,  0.53319855,  0.53534185,  0.54517801,  0.52500375])

In [491]:
metrics.confusion_matrix(label_true, predicted)

array([[159, 117,  17,  12],
       [127, 157,  10,  18],
       [ 32,   5, 136,  76],
       [ 22,   9,  76,  93]])

- look at how the transitions reflect ideas from the coding manual
- look at how the transitions change depending on words previously encountered in the essay.
- look at words and topics that have high weights in dtm and topic model

# Classification for Gender using just treatment

In [313]:
df_treat = df[df.Condition=='Treatment']
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(df_treat.corrected)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
X

<2751x3988 sparse matrix of type '<type 'numpy.int64'>'
	with 85774 stored elements in Compressed Sparse Row format>

In [314]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(2751, 50)

In [315]:
f1weighted = make_scorer(f1_score, average='weighted', pos_label=None)
le = preprocessing.LabelEncoder()
le.fit(df_treat.Gender)
Y = le.transform(df_treat.Gender)
dtm_train, dtm_test, label_train, label_true = train_test_split(model.doc_topic_, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring=f1weighted,
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [316]:
f1_score(label_true, predicted, average='weighted', pos_label=None)

0.64327853945525504

In [317]:
return_grid_scores(rs, 'SVM')

array([ 0.6264599 ,  0.62518655,  0.65060202,  0.63995001,  0.65522849,
        0.66152959,  0.65731158,  0.60691516,  0.65494457,  0.62833656])

In [318]:
metrics.confusion_matrix(label_true, predicted)

array([[221,  64],
       [129, 137]])

# Classification for Ethnicity using just treatment
Stop words removed

In [319]:
df_treat = df[df.Condition=='Treatment']
df_treat = df_treat[df_treat.Ethnicity.isin(['African American','White'])]
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(df_treat.corrected)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
X

<2354x3707 sparse matrix of type '<type 'numpy.int64'>'
	with 73332 stored elements in Compressed Sparse Row format>

In [320]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(2354, 50)

In [321]:
f1weighted = make_scorer(f1_score, average='weighted', pos_label=None)
le = preprocessing.LabelEncoder()
le.fit(df_treat.Ethnicity)
Y = le.transform(df_treat.Ethnicity)
dtm_train, dtm_test, label_train, label_true = train_test_split(model.doc_topic_, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring=f1weighted,
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [322]:
f1_score(label_true, predicted, average='weighted', pos_label=None)

0.52241789166998698

In [323]:
return_grid_scores(rs, 'SVM')

array([ 0.55655523,  0.55805491,  0.55119211,  0.58935521,  0.59697025,
        0.56874672,  0.55973747,  0.57786331,  0.57052214,  0.56452561])

In [324]:
metrics.confusion_matrix(label_true, predicted)

array([[124, 127],
       [ 98, 122]])

# Classification for Gender using just treatment on uncorrected essays

In [325]:
df_treat = df[df.Condition=='Treatment']
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(df_treat.Essay)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)

In [326]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(2751, 50)

In [327]:
f1weighted = make_scorer(f1_score, average='weighted', pos_label=None)
le = preprocessing.LabelEncoder()
le.fit(df_treat.Gender)
Y = le.transform(df_treat.Gender)
dtm_train, dtm_test, label_train, label_true = train_test_split(model.doc_topic_, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring=f1weighted,
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [328]:
f1_score(label_true, predicted, average='weighted', pos_label=None)

0.63516589331739215

In [329]:
return_grid_scores(rs, 'SVM')

array([ 0.63586695,  0.64908694,  0.66059795,  0.62553971,  0.66296212,
        0.65344806,  0.6632204 ,  0.66922941,  0.65810586,  0.66743357])

In [330]:
metrics.confusion_matrix(label_true, predicted)

array([[217,  71],
       [127, 136]])

# Classification for Ethnicity using just treatment on uncorrected essays

In [331]:
df_treat = df[df.Condition=='Treatment']
df_treat = df_treat[df_treat.Ethnicity.isin(['African American','White'])]
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(df_treat.Essay)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)

In [332]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(2354, 50)

In [333]:
f1weighted = make_scorer(f1_score, average='weighted', pos_label=None)
le = preprocessing.LabelEncoder()
le.fit(df_treat.Ethnicity)
Y = le.transform(df_treat.Ethnicity)
dtm_train, dtm_test, label_train, label_true = train_test_split(model.doc_topic_, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring=f1weighted,
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [334]:
f1_score(label_true, predicted, average='weighted', pos_label=None)

0.56687117128084508

In [335]:
return_grid_scores(rs, 'SVM')

array([ 0.58219607,  0.55533223,  0.55435784,  0.61147402,  0.58295604,
        0.53037179,  0.60747543,  0.57737173,  0.56268108,  0.55048015])

In [336]:
metrics.confusion_matrix(label_true, predicted)

array([[133, 104],
       [100, 134]])

# Classification for Gender using just treatment
This time, with stop words removed

In [337]:
df_treat = df[df.Condition=='Treatment']
vectorizer = CountVectorizer(min_df=1, stop_words='english')
X = vectorizer.fit_transform(df_treat.corrected)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
X

<2751x3744 sparse matrix of type '<type 'numpy.int64'>'
	with 40738 stored elements in Compressed Sparse Row format>

In [338]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(2751, 50)

In [339]:
f1weighted = make_scorer(f1_score, average='weighted', pos_label=None)
le = preprocessing.LabelEncoder()
le.fit(df_treat.Gender)
Y = le.transform(df_treat.Gender)
dtm_train, dtm_test, label_train, label_true = train_test_split(model.doc_topic_, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring=f1weighted,
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [340]:
f1_score(label_true, predicted, average='weighted', pos_label=None)

0.62236389580409557

In [341]:
return_grid_scores(rs, 'SVM')

array([ 0.65283782,  0.68994757,  0.62815756,  0.62079374,  0.64410789,
        0.64591894,  0.65167141,  0.60965648,  0.62048311,  0.63037087])

In [342]:
metrics.confusion_matrix(label_true, predicted)

array([[230,  69],
       [134, 118]])

# Classification for Ethnicity using just treatment
stop words removed

In [343]:
df_treat = df[df.Condition=='Treatment']
df_treat = df_treat[df_treat.Ethnicity.isin(['African American','White'])]
vectorizer = CountVectorizer(min_df=1, stop_words='english')
X = vectorizer.fit_transform(df_treat.corrected)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
X

<2354x3465 sparse matrix of type '<type 'numpy.int64'>'
	with 34852 stored elements in Compressed Sparse Row format>

In [344]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(2354, 50)

In [345]:
f1weighted = make_scorer(f1_score, average='weighted', pos_label=None)
le = preprocessing.LabelEncoder()
le.fit(df_treat.Ethnicity)
Y = le.transform(df_treat.Ethnicity)
dtm_train, dtm_test, label_train, label_true = train_test_split(model.doc_topic_, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring=f1weighted,
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [346]:
f1_score(label_true, predicted, average='weighted', pos_label=None)

0.54782370334516284

In [347]:
return_grid_scores(rs, 'SVM')

array([ 0.5535605 ,  0.54359946,  0.50924238,  0.54640633,  0.55934063,
        0.5629885 ,  0.53038494,  0.57885964,  0.56765751,  0.54973257])

In [348]:
metrics.confusion_matrix(label_true, predicted)

array([[130, 113],
       [100, 128]])

# Classification for Gender using just treatment
This time, with stop words removed, cutoff for words

In [349]:
df_treat = df[df.Condition=='Treatment']
vectorizer = CountVectorizer(min_df=4, stop_words='english')
X = vectorizer.fit_transform(df_treat.corrected)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
X

<2751x1091 sparse matrix of type '<type 'numpy.int64'>'
	with 37050 stored elements in Compressed Sparse Row format>

In [350]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(2751, 50)

In [351]:
f1weighted = make_scorer(f1_score, average='weighted', pos_label=None)
le = preprocessing.LabelEncoder()
le.fit(df_treat.Gender)
Y = le.transform(df_treat.Gender)
dtm_train, dtm_test, label_train, label_true = train_test_split(model.doc_topic_, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring=f1weighted,
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [352]:
f1_score(label_true, predicted, average='weighted', pos_label=None)

0.63654226033800221

In [353]:
return_grid_scores(rs, 'SVM')

array([ 0.65341867,  0.67957963,  0.63833251,  0.64159417,  0.68558761,
        0.65321563,  0.65764778,  0.68221721,  0.64382311,  0.65191065])

In [354]:
metrics.confusion_matrix(label_true, predicted)

array([[226,  65],
       [131, 129]])

# Classification for Ethnicity using just treatment
stop words removed, cutoff imposed

In [355]:
df_treat = df[df.Condition=='Treatment']
df_treat = df_treat[df_treat.Ethnicity.isin(['African American','White'])]
vectorizer = CountVectorizer(min_df=4, stop_words='english')
X = vectorizer.fit_transform(df_treat.corrected)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
X

<2354x976 sparse matrix of type '<type 'numpy.int64'>'
	with 31388 stored elements in Compressed Sparse Row format>

In [356]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(2354, 50)

In [357]:
f1weighted = make_scorer(f1_score, average='weighted', pos_label=None)
le = preprocessing.LabelEncoder()
le.fit(df_treat.Ethnicity)
Y = le.transform(df_treat.Ethnicity)
dtm_train, dtm_test, label_train, label_true = train_test_split(model.doc_topic_, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring=f1weighted,
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [358]:
f1_score(label_true, predicted, average='weighted', pos_label=None)

0.5240207328708838

In [359]:
return_grid_scores(rs, 'SVM')

array([ 0.53563225,  0.57273382,  0.52687446,  0.56211866,  0.54109468,
        0.59311932,  0.5251564 ,  0.54011511,  0.52736837,  0.54758504])

In [360]:
metrics.confusion_matrix(label_true, predicted)

array([[112, 116],
       [108, 135]])

# Classification for Gender using just treatment
This time, with stop words removed, cutoff for words, words lemmatized

In [364]:
df_treat = df[df.Condition=='Treatment']
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), min_df=4, stop_words='english')
X = vectorizer.fit_transform(df_treat.corrected)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
X

<2751x1033 sparse matrix of type '<type 'numpy.int64'>'
	with 43873 stored elements in Compressed Sparse Row format>

In [368]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(2751, 50)

In [369]:
f1weighted = make_scorer(f1_score, average='weighted', pos_label=None)
le = preprocessing.LabelEncoder()
le.fit(df_treat.Gender)
Y = le.transform(df_treat.Gender)
dtm_train, dtm_test, label_train, label_true = train_test_split(model.doc_topic_, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring=f1weighted,
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [370]:
f1_score(label_true, predicted, average='weighted', pos_label=None)

0.65300992806208391

In [371]:
return_grid_scores(rs, 'SVM')

array([ 0.63779364,  0.64365065,  0.62457871,  0.63209512,  0.63010158,
        0.63897407,  0.65709186,  0.63632035,  0.63138774,  0.65236811])

In [372]:
metrics.confusion_matrix(label_true, predicted)

array([[256,  41],
       [141, 113]])

# Classification for Ethnicity using just treatment
stop words removed, cutoff imposed

In [374]:
df_treat = df[df.Condition=='Treatment']
df_treat = df_treat[df_treat.Ethnicity.isin(['African American','White'])]
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), min_df=4, stop_words='english')
X = vectorizer.fit_transform(df_treat.corrected)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
X

<2354x926 sparse matrix of type '<type 'numpy.int64'>'
	with 37250 stored elements in Compressed Sparse Row format>

In [375]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(2354, 50)

In [376]:
f1weighted = make_scorer(f1_score, average='weighted', pos_label=None)
le = preprocessing.LabelEncoder()
le.fit(df_treat.Ethnicity)
Y = le.transform(df_treat.Ethnicity)
dtm_train, dtm_test, label_train, label_true = train_test_split(model.doc_topic_, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring=f1weighted,
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [377]:
f1_score(label_true, predicted, average='weighted', pos_label=None)

0.5631101534798848

In [378]:
return_grid_scores(rs, 'SVM')

array([ 0.53716929,  0.56999538,  0.56511569,  0.56006847,  0.56703488,
        0.52972051,  0.53981785,  0.52568264,  0.51148267,  0.51759903])

In [380]:
metrics.confusion_matrix(label_true, predicted)

array([[118, 133],
       [ 71, 149]])

# Classification for Gender using just treatment
This time, with stop words removed, cutoff for words, words lemmatized, fiddling with scoring metric

In [392]:
df_treat = df[df.Condition=='Treatment']
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), min_df=4, stop_words='english')
X = vectorizer.fit_transform(df_treat.corrected)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
X

<2751x1033 sparse matrix of type '<type 'numpy.int64'>'
	with 43873 stored elements in Compressed Sparse Row format>

In [393]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(2751, 50)

In [394]:
le = preprocessing.LabelEncoder()
le.fit(df_treat.Gender)
Y = le.transform(df_treat.Gender)
dtm_train, dtm_test, label_train, label_true = train_test_split(model.doc_topic_, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring='f1',
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [398]:
f1_score(label_true, predicted, average='weighted', pos_label=None)

0.5880515767247777

In [399]:
return_grid_scores(rs, 'SVM')

array([ 0.56119403,  0.5443787 ,  0.53205128,  0.53674121,  0.56687898,
        0.52887538,  0.56697819,  0.56657224,  0.53246753,  0.59459459])

In [400]:
metrics.confusion_matrix(label_true, predicted)

array([[226,  45],
       [171, 109]])

# Classification for Ethnicity using just treatment
stop words removed, cutoff imposed words lemmatized, fiddling with scoring metric

In [407]:
df_treat = df[df.Condition=='Treatment']
df_treat = df_treat[df_treat.Ethnicity.isin(['African American','White'])]
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), min_df=4, stop_words='english')
X = vectorizer.fit_transform(df_treat.corrected)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
X

<2354x926 sparse matrix of type '<type 'numpy.int64'>'
	with 37250 stored elements in Compressed Sparse Row format>

In [408]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(2354, 50)

In [409]:
le = preprocessing.LabelEncoder()
le.fit(df_treat.Ethnicity)
Y = le.transform(df_treat.Ethnicity)
dtm_train, dtm_test, label_train, label_true = train_test_split(model.doc_topic_, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring='f1',
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [410]:
f1_score(label_true, predicted, average='weighted', pos_label=None)

0.55643934214555535

In [411]:
return_grid_scores(rs, 'SVM')

array([ 0.55639098,  0.57721519,  0.52849741,  0.56613757,  0.57441253,
        0.54347826,  0.56852792,  0.56533333,  0.58524173,  0.58097686])

In [412]:
metrics.confusion_matrix(label_true, predicted)

array([[127,  99],
       [110, 135]])

# Classification for Gender using just treatment
This time, with stop words removed, cutoff for words, words lemmatized, tf-idf appended

In [413]:
df_treat = df[df.Condition=='Treatment']
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), min_df=4, stop_words='english')
X = vectorizer.fit_transform(df_treat.corrected)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
X

<2751x1033 sparse matrix of type '<type 'numpy.int64'>'
	with 43873 stored elements in Compressed Sparse Row format>

In [423]:
tf = TfidfVectorizer(tokenizer=LemmaTokenizer(), min_df=4, stop_words='english')
X_tf = tf.fit_transform(df_treat.corrected)

In [424]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions


(2751, 50)

In [428]:
X_con = np.concatenate((X_tf.todense(), model.doc_topic_), axis=1)

(2751, 1083)

In [429]:
f1weighted = make_scorer(f1_score, average='weighted', pos_label=None)
le = preprocessing.LabelEncoder()
le.fit(df_treat.Gender)
Y = le.transform(df_treat.Gender)
dtm_train, dtm_test, label_train, label_true = train_test_split(X_con, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring=f1weighted,
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [430]:
f1_score(label_true, predicted, average='weighted', pos_label=None)

0.65279725027128777

In [431]:
return_grid_scores(rs, 'SVM')

array([ 0.65198276,  0.69108415,  0.67945644,  0.6432505 ,  0.69545455,
        0.69822429,  0.66382432,  0.63582178,  0.65040661,  0.59386005])

In [432]:
metrics.confusion_matrix(label_true, predicted)

array([[207,  91],
       [100, 153]])

# Classification for Ethnicity using just treatment
stop words removed, cutoff imposed, words lemmatized, tf-idf appended

In [433]:
df_treat = df[df.Condition=='Treatment']
df_treat = df_treat[df_treat.Ethnicity.isin(['African American','White'])]
vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), min_df=4, stop_words='english')
X = vectorizer.fit_transform(df_treat.corrected)
model = lda.LDA(n_topics=50, n_iter=1500, random_state=1)
X

<2354x926 sparse matrix of type '<type 'numpy.int64'>'
	with 37250 stored elements in Compressed Sparse Row format>

In [434]:
tf = TfidfVectorizer(tokenizer=LemmaTokenizer(), min_df=4, stop_words='english')
X_tf = tf.fit_transform(df_treat.corrected)

In [435]:
model.fit(X)
model.doc_topic_.shape # model.doc_topic_ contains the topic distributions

(2354, 50)

In [436]:
X_con = np.concatenate((X_tf.todense(), model.doc_topic_), axis=1)

In [437]:
f1weighted = make_scorer(f1_score, average='weighted', pos_label=None)
le = preprocessing.LabelEncoder()
le.fit(df_treat.Ethnicity)
Y = le.transform(df_treat.Ethnicity)
dtm_train, dtm_test, label_train, label_true = train_test_split(X_con, Y, test_size=.2)

clf = svm.SVC(kernel='linear', probability=True)
pardist = {'C':expon(scale=100), 'gamma':expon(scale=.1)}
cv = ShuffleSplit(dtm_train.shape[0], n_iter=10, test_size=.2)
rs = RandomizedSearchCV(clf, param_distributions=pardist, scoring=f1weighted,
                       n_jobs=1, cv=cv, n_iter=25, verbose=0)
rs.fit(dtm_train, label_train)
predicted = rs.predict(dtm_test)


## Results

In [438]:
f1_score(label_true, predicted, average='weighted', pos_label=None)

0.57724699014058889

In [439]:
return_grid_scores(rs, 'SVM')

array([ 0.59311177,  0.62107963,  0.61019483,  0.62076974,  0.63145554,
        0.58361298,  0.57840366,  0.61460697,  0.62057222,  0.5427291 ])

In [440]:
metrics.confusion_matrix(label_true, predicted)

array([[131, 106],
       [ 93, 141]])