In [64]:
#Libraries we'll need 
from collections import Counter
from sklearn.linear_model import LogisticRegression,SGDClassifier
from nltk.tokenize import word_tokenize
from sklearn import datasets, neighbors, linear_model
from sklearn.svm import SVC
import scipy.stats
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np 
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import defaultdict
import operator
import csv
from PorterStemmer import PorterStemmer
import random
from sklearn.naive_bayes import GaussianNB
from nltk import pos_tag

Reading in data - train and dev

In [2]:
df = pd.read_csv('fixed_combined_train_files_plscommas.csv',encoding='utf8')

In [3]:
df_dev = pd.read_csv('fixed_combined_dev_files.csv',encoding='utf8')

Selecting appropriate columns - train and dev

In [4]:
data_train = df[['sentence','gender']]

In [5]:
data_dev = df_dev[['sentence','gender']]

In [6]:
data_train.shape, data_dev.shape

((869536, 2), (125950, 2))

Preprocessing - Removing punctuation, Porter Stemming, and lowercasing 

Note: Must recount unigram and bigram frequencies for each type of preprocessed data

In [7]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        #text = str(text).encode('utf-8').replace(punctuation, ' ')
        text = text.replace(punctuation, ' ')
    #text = re.sub( '\s+', ' ', text ).encode('utf-8').strip()
    text = re.sub( '\s+', ' ', text ).strip()
    return text

In [8]:
def porter_stem(text):
    p = PorterStemmer()
    return p.stem(text)

In [9]:
def lower(text):
    return text.lower()

Cell below: exclusively for choosing a preprocessing technique

5/28/2018 1:35pm REMOVE PUNCTUATION AS PREPROCESSING

06/03/2016 REMOVE PUNCTUATION, LOWERCASE

In [10]:
data_train.loc[:, 'sentence'] = data_train.loc[:, 'sentence'].apply(remove_punctuations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [11]:
data_dev.loc[:, 'sentence'] = data_dev.loc[:, 'sentence'].apply(remove_punctuations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [12]:
data_train.loc[:, 'sentence'] = data_train.loc[:, 'sentence'].apply(lower)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [13]:
data_dev.loc[:, 'sentence'] = data_dev.loc[:, 'sentence'].apply(lower)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Functions for collecting unigram and bigram frequencies - must call every time a new preprocessing technique is applied 

In [14]:
def sort_unigrams():
    all_unigrams = {}
    #for every row in the dataframe, get the text of the sentence column, tokenize it, and count every token in it 
    for index, row in data_train.iterrows():
        text = row['sentence']
        words = word_tokenize(text)
        for word in words:
            all_unigrams[word] = all_unigrams.get(word,0) + 1
    sorted_unigrams = sorted(all_unigrams.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_unigrams

In [15]:
def sort_bigrams():
    all_bigrams = {}
    for index, row in data_train.iterrows():
        text = row['sentence']
        words = ['<S>'] + word_tokenize(text) + ['</S>']
        for i in range(1, len(words)):
            bi = words[i-1] + " " + words[i]
            all_bigrams[bi] = all_bigrams.get(bi,0) + 1
    sorted_bigrams = sorted(all_bigrams.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_bigrams

Actually collecting frequencies for model being run

In [16]:
unigram_frequencies = sort_unigrams()

In [17]:
bigram_frequencies = sort_bigrams()

In [18]:
len(unigram_frequencies)

115933

In [19]:
len(bigram_frequencies)

3146374

Getting the top 5k unigrams for use as features in the model

In [20]:
top_unigrams = {uni[0]: uni[1] for uni in unigram_frequencies[:5000]}

In [21]:
top_bigrams = {bi[0]: bi[1] for bi in bigram_frequencies[:10000]}

In [22]:
len(top_unigrams)

5000

In [23]:
len(top_bigrams)

10000

Featurizing functions: unigrams, bigrams, unigrams + bigrams, all the rest applied to POS and lemma, LIWC lists, HBR lexicons

In [24]:
# given sentence, and list of top unigrams and bigrams
# returns feature dicts for unigrams and bigrams
def unigrams(text):
    words = word_tokenize(text)
    final = {}
    
    for word in words:
        if word in top_unigrams:
            final[word] = final.get(word,0) + 1
        else:
            final['UNK'] = final.get('UNK',0) + 1
    return final

In [25]:
def all_unigrams(text):
    words = word_tokenize(text)
    final = {}
    
    for word in words:
        final[word] = final.get(word,0) + 1
    return final

In [48]:
def all_bigrams(text):
    words = ['<S>'] + word_tokenize(text) + ['</S>']
    final = {}
    
    for i in range(1, len(words)):
        bi = words[i-1] + " " + words[i]
        final[bi] = final.get(bi,0) + 1
    return final 

In [26]:
def bigrams(text):
    words = ['<S>'] + word_tokenize(text) + ['</S>']
    final = {}
    
    for i in range(1, len(words)):
        bi = words[i-1] + " " + words[i]
        if bi in top_bigrams:
            final[bi] = final.get(bi,0) + 1
        else:
            final['UNK UNK'] = final.get('UNK UNK',0) + 1
    return final 

In [27]:
def pos_unigrams(text):
    words = word_tokenize(text)
    tags = [pair[1] for pair in pos_tag(words)]
    final = {}
    for tag in tags:
        final[tag] = final.get(tag,0)+1
    return final

Actually featurizing the data

In [49]:
feat_dicts_train = []
labels_train = []
for index, row in data_train.iterrows():
    text = row['sentence']
    label = row['gender']
    feature_dict = {**all_unigrams(text),**all_bigrams(text)}
    feat_dicts_train.append(feature_dict)
    labels_train.append(label)

In [50]:
feat_dicts_dev = []
labels_dev = []
for index, row in data_dev.iterrows():
    text = row['sentence']
    label = row['gender']
    feature_dict = {**all_unigrams(text),**all_bigrams(text)}
    feat_dicts_dev.append(feature_dict)
    labels_dev.append(label)

Converting dictionary of dictionaries to matrices

In [51]:
vectorizer = DictVectorizer(sparse=True)
features_train = vectorizer.fit_transform(feat_dicts_train)

In [52]:
features_dev = vectorizer.transform(feat_dicts_dev)

In [53]:
features_train.shape, len(labels_train), features_dev.shape, len(labels_dev)

((869536, 3262307), 869536, (125950, 3262307), 125950)

Optional: re-weighted features_train and features_dev with tfidf

In [54]:
tfidf_transformer = TfidfTransformer()

In [55]:
features_train_tfidf = tfidf_transformer.fit_transform(features_train)

In [56]:
features_dev_tfidf = tfidf_transformer.transform(features_dev)

In [57]:
features_train_tfidf.shape,features_dev_tfidf.shape

((869536, 3262307), (125950, 3262307))

Instantiating and training the model

In [58]:
logistic = LogisticRegression(verbose=3,solver='sag',class_weight = {"male":0.25, "female":0.75})
log_model = logistic.fit(features_train_tfidf,labels_train)

convergence after 20 epochs took 53 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   54.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   54.1s finished


Model 2: SGD with hinge loss and batch generator (SVM)

In [93]:
def iter_minibatches2(chunksize, datasize):
    chunkstartmarker = 0
    while chunkstartmarker < datasize:
        if chunkstartmarker + chunksize < datasize:
            X_chunk = features_train_tfidf[chunkstartmarker:chunkstartmarker+chunksize]
            y_chunk = labels_train_tf[chunkstartmarker:chunkstartmarker+chunksize]
            chunkstartmarker+=chunksize
        else:
            X_chunk = features_train[chunkstartmarker:datasize]
            y_chunk = labels_train[chunkstartmarker]
            chunkstartmarker = datasize
        yield X_chunk,y_chunk

In [124]:
def iter_minibatches(chunksize, datasize):
    chunkstartmarker = 0
    while chunkstartmarker < datasize:
        X_chunk = features_train_tfidf[chunkstartmarker:chunkstartmarker+chunksize]
        y_chunk = labels_train[chunkstartmarker:chunkstartmarker+chunksize]
        chunkstartmarker+=chunksize
        yield X_chunk,y_chunk

In [107]:
batcherator = iter_minibatches(chunksize=108692,datasize=869536)

In [129]:
sgd = SGDClassifier(loss='hinge',verbose=3,class_weight = {"male":0.25, "female":0.75},max_iter=100)

In [130]:
sgd_model = sgd.fit(features_train_tfidf,labels_train)

-- Epoch 1
Norm: 20.40, NNZs: 3215895, Bias: -0.271719, T: 869536, Avg. loss: 0.892494
Total training time: 0.81 seconds.
-- Epoch 2
Norm: 20.41, NNZs: 3220752, Bias: -0.279386, T: 1739072, Avg. loss: 0.890188
Total training time: 1.53 seconds.
-- Epoch 3
Norm: 20.43, NNZs: 3221461, Bias: -0.279286, T: 2608608, Avg. loss: 0.890033
Total training time: 2.24 seconds.
-- Epoch 4
Norm: 20.44, NNZs: 3221664, Bias: -0.276347, T: 3478144, Avg. loss: 0.891505
Total training time: 2.97 seconds.
-- Epoch 5
Norm: 20.43, NNZs: 3221723, Bias: -0.281534, T: 4347680, Avg. loss: 0.888259
Total training time: 3.70 seconds.
-- Epoch 6
Norm: 20.42, NNZs: 3221758, Bias: -0.284463, T: 5217216, Avg. loss: 0.887753
Total training time: 4.51 seconds.
-- Epoch 7
Norm: 20.43, NNZs: 3221871, Bias: -0.280602, T: 6086752, Avg. loss: 0.891570
Total training time: 5.24 seconds.
-- Epoch 8
Norm: 20.43, NNZs: 3221886, Bias: -0.280328, T: 6956288, Avg. loss: 0.890111
Total training time: 5.95 seconds.
-- Epoch 9
Norm: 

Norm: 20.43, NNZs: 3221901, Bias: -0.282053, T: 58258912, Avg. loss: 0.889321
Total training time: 57.50 seconds.
-- Epoch 68
Norm: 20.43, NNZs: 3221901, Bias: -0.282042, T: 59128448, Avg. loss: 0.889490
Total training time: 58.21 seconds.
-- Epoch 69
Norm: 20.43, NNZs: 3221901, Bias: -0.282147, T: 59997984, Avg. loss: 0.889254
Total training time: 58.92 seconds.
-- Epoch 70
Norm: 20.43, NNZs: 3221901, Bias: -0.282080, T: 60867520, Avg. loss: 0.889504
Total training time: 59.66 seconds.
-- Epoch 71
Norm: 20.43, NNZs: 3221901, Bias: -0.282026, T: 61737056, Avg. loss: 0.889498
Total training time: 60.37 seconds.
-- Epoch 72
Norm: 20.43, NNZs: 3221901, Bias: -0.282160, T: 62606592, Avg. loss: 0.889225
Total training time: 61.10 seconds.
-- Epoch 73
Norm: 20.43, NNZs: 3221901, Bias: -0.282174, T: 63476128, Avg. loss: 0.889426
Total training time: 61.81 seconds.
-- Epoch 74
Norm: 20.43, NNZs: 3221901, Bias: -0.282099, T: 64345664, Avg. loss: 0.889470
Total training time: 62.53 seconds.
-- E

In [125]:
batcherator = iter_minibatches(chunksize=108692,datasize=869536)
for X_chunk, y_chunk in batcherator:
    print(X_chunk.shape,len(y_chunk))
    sgd_model = sgd.partial_fit(X_chunk, y_chunk,classes=np.unique(labels_train))

(108692, 3262307) 108692
-- Epoch 1




Norm: 21.45, NNZs: 2546972, Bias: -0.174077, T: 108692, Avg. loss: 0.993869
Total training time: 0.12 seconds.
(108692, 3262307) 108692
-- Epoch 1
Norm: 21.03, NNZs: 2678254, Bias: -0.203962, T: 108692, Avg. loss: 1.004631
Total training time: 0.13 seconds.
(108692, 3262307) 108692
-- Epoch 1
Norm: 20.80, NNZs: 2792711, Bias: -0.150260, T: 108692, Avg. loss: 0.964628
Total training time: 0.23 seconds.
(108692, 3262307) 108692
-- Epoch 1
Norm: 20.73, NNZs: 2899863, Bias: -0.066163, T: 108692, Avg. loss: 0.883742
Total training time: 0.11 seconds.
(108692, 3262307) 108692
-- Epoch 1
Norm: 20.69, NNZs: 2999927, Bias: -0.076528, T: 108692, Avg. loss: 0.839622
Total training time: 0.23 seconds.
(108692, 3262307) 108692
-- Epoch 1
Norm: 20.70, NNZs: 3087096, Bias: -0.063502, T: 108692, Avg. loss: 0.820627
Total training time: 0.11 seconds.
(108692, 3262307) 108692
-- Epoch 1
Norm: 20.76, NNZs: 3170707, Bias: -0.121287, T: 108692, Avg. loss: 0.838387
Total training time: 0.22 seconds.
(108692

Predicting on test set

In [119]:
logistic_predictions = logistic.predict(features_dev_tfidf)

In [131]:
sgd_predictions = sgd_model.predict(features_dev_tfidf)

Getting the numbers!

In [77]:
print("Logistic regression result: ")
print(classification_report(labels_dev, logistic_predictions))

Logistic regression result: 
             precision    recall  f1-score   support

     female       0.45      0.65      0.53     31498
       male       0.86      0.74      0.80     94452

avg / total       0.76      0.72      0.73    125950



In [132]:
print("SVM result: ")
print(classification_report(labels_dev, sgd_predictions))

SVM result: 
             precision    recall  f1-score   support

     female       0.34      0.70      0.46     31498
       male       0.85      0.54      0.66     94452

avg / total       0.72      0.58      0.61    125950



Figuring out the most informative features

In [78]:
weights = list(log_model.coef_.transpose())

In [79]:
names = vectorizer.get_feature_names()

In [80]:
weight_dict = {}

In [81]:
for name,weight in zip(names,weights):
    weight_dict[name] = weight[0]

In [82]:
sorted_weights = sorted(weight_dict.items(), key=operator.itemgetter(1))

In [83]:
sorted_weights[:50]

[('woman', -15.28234998805659),
 ('ms', -14.410394364231394),
 ('it husband', -14.0561093838102),
 ('husband', -11.449207288063292),
 ('girl', -10.013808216583808),
 ('mother', -8.729180563359714),
 ('mrs', -8.099146146971655),
 ('actress', -8.0961787054303),
 ('hillary', -7.903183282896676),
 ('daughter', -7.8972067366029215),
 ('lady', -7.8659130067956085),
 ('female', -7.407729584877959),
 ('devos', -6.997176335836824),
 ('women', -6.434847439771052),
 ('mrs names', -6.368574044697234),
 ('mom', -6.102296303229272),
 ('server', -5.979593459020139),
 ('beyoncé', -5.80772439226515),
 ('clinton', -5.784430928882867),
 ('<S> ms', -5.643238260721736),
 ('pregnant', -5.490622542304049),
 ('girls', -5.3118364176968385),
 ('sister', -5.184267153166717),
 ('feminist', -5.123609512959551),
 ('pregnancy', -4.973782757806087),
 ('boyfriend', -4.967090357230357),
 ('queen', -4.900113576404142),
 ('emails', -4.79800945117063),
 ('lock it', -4.700528759089891),
 ('as secretary', -4.616822771514494

In [47]:
sorted_weights[-50:]

[('breitbart', 3.029555373229729),
 ('it inauguration', 3.034438422704227),
 ('warriors', 3.0545243040283894),
 ('players', 3.1148744381856477),
 ('indiana', 3.1283241604414855),
 ('chairman', 3.1468328759835833),
 ('scalia', 3.1549902213016616),
 ('priebus', 3.1592727298344805),
 ('league', 3.163718699604721),
 ('<S> president', 3.164317479983277),
 ('duterte', 3.164744116316276),
 ('nato', 3.178603467516678),
 ('christie', 3.195267414715662),
 ('comey', 3.2028962576833746),
 ('father', 3.231586623583917),
 ('wife', 3.234055969562576),
 ('facebook </S>', 3.238604308041458),
 ('twitter bobpricebbtx', 3.244548638584041),
 ('knicks', 3.2893605096940175),
 ('football', 3.3263408650220674),
 ('congressman', 3.327704910980327),
 ('baseball', 3.4093706037143767),
 ('nfl', 3.551173167960581),
 ('ryan', 3.5585256705591646),
 ('milo', 3.5712744501348963),
 ('yankees', 3.57216836471881),
 ('businessman', 3.6482757380133988),
 ('it girlfriend', 3.6512450488367034),
 ('boy', 3.7157066011985482),
 