In [157]:
#Libraries we'll need 
from collections import Counter
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import word_tokenize
from sklearn import datasets, neighbors, linear_model
from sklearn.svm import SVC
import scipy.stats
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np 
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from collections import defaultdict
import operator
import csv
from PorterStemmer import PorterStemmer
import random
from sklearn.naive_bayes import GaussianNB
from nltk import pos_tag

Reading in data - train and dev

In [102]:
df = pd.read_csv('fixed_combined_train_files_plscommas.csv',encoding='utf8')

In [103]:
df_dev = pd.read_csv('fixed_combined_dev_files.csv',encoding='utf8')

Selecting appropriate columns - train and dev

In [104]:
data_train = df[['sentence','gender']]

In [105]:
data_dev = df_dev[['sentence','gender']]

In [106]:
data_train.shape, data_dev.shape

((869536, 2), (125950, 2))

Preprocessing - Removing punctuation, Porter Stemming, and lowercasing 

Note: Must recount unigram and bigram frequencies for each type of preprocessed data

In [107]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        #text = str(text).encode('utf-8').replace(punctuation, ' ')
        text = text.replace(punctuation, ' ')
    #text = re.sub( '\s+', ' ', text ).encode('utf-8').strip()
    text = re.sub( '\s+', ' ', text ).strip()
    return text

In [108]:
def porter_stem(text):
    p = PorterStemmer()
    return p.stem(text)

In [109]:
def lower(text):
    return text.lower()

Cell below: exclusively for choosing a preprocessing technique

5/28/2018 1:35pm REMOVE PUNCTUATION AS PREPROCESSING

In [110]:
data_train.loc[:, 'sentence'] = data_train.loc[:, 'sentence'].apply(remove_punctuations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [111]:
data_dev.loc[:, 'sentence'] = data_dev.loc[:, 'sentence'].apply(remove_punctuations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [112]:
data_train.loc[:, 'sentence'] = data_train.loc[:, 'sentence'].apply(lower)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [113]:
data_dev.loc[:, 'sentence'] = data_dev.loc[:, 'sentence'].apply(lower)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Functions for collecting unigram and bigram frequencies - must call every time a new preprocessing technique is applied 

In [114]:
def sort_unigrams():
    all_unigrams = {}
    #for every row in the dataframe, get the text of the sentence column, tokenize it, and count every token in it 
    for index, row in data_train.iterrows():
        text = row['sentence']
        words = word_tokenize(text)
        for word in words:
            all_unigrams[word] = all_unigrams.get(word,0) + 1
    sorted_unigrams = sorted(all_unigrams.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_unigrams

In [115]:
def sort_bigrams():
    all_bigrams = {}
    for index, row in data_train.iterrows():
        text = row['sentence']
        words = ['<S>'] + word_tokenize(text) + ['</S>']
        for i in range(1, len(words)):
            bi = words[i-1] + " " + words[i]
            all_bigrams[bi] = all_bigrams.get(bi,0) + 1
    sorted_bigrams = sorted(all_bigrams.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_bigrams

Actually collecting frequencies for model being run

In [116]:
unigram_frequencies = sort_unigrams()

In [117]:
bigram_frequencies = sort_bigrams()

In [118]:
len(unigram_frequencies)

115933

In [119]:
len(bigram_frequencies)

3146374

Getting the top 5k unigrams for use as features in the model

In [179]:
top_unigrams = {uni[0]: uni[1] for uni in unigram_frequencies[:10000]}

In [180]:
top_bigrams = {bi[0]: bi[1] for bi in bigram_frequencies[:10000]}

In [181]:
len(top_unigrams)

10000

In [182]:
len(top_bigrams)

10000

Featurizing functions: unigrams, bigrams, unigrams + bigrams, all the rest applied to POS and lemma, LIWC lists, HBR lexicons

In [183]:
# given sentence, and list of top unigrams and bigrams
# returns feature dicts for unigrams and bigrams
def unigrams(text):
    words = word_tokenize(text)
    final = {}
    
    for word in words:
        if word in top_unigrams:
            final[word] = final.get(word,0) + 1
        else:
            final['UNK'] = final.get('UNK',0) + 1
    return final

In [198]:
def all_unigrams(text):
    words = word_tokenize(text)
    final = {}
    
    for word in words:
        final[word] = final.get(word,0) + 1
    return final

In [184]:
def bigrams(text):
    words = ['<S>'] + word_tokenize(text) + ['</S>']
    final = {}
    
    for i in range(1, len(words)):
        bi = words[i-1] + " " + words[i]
        if bi in top_bigrams:
            final[bi] = final.get(bi,0) + 1
        else:
            final['UNK UNK'] = final.get('UNK UNK',0) + 1
    return final 

In [185]:
def pos_unigrams(text):
    words = word_tokenize(text)
    tags = [pair[1] for pair in pos_tag(words)]
    final = {}
    for tag in tags:
        final[tag] = final.get(tag,0)+1
    return final

Actually featurizing the data

In [199]:
feat_dicts_train = []
labels_train = []
for index, row in data_train.iterrows():
    text = row['sentence']
    label = row['gender']
    feature_dict = all_unigrams(text)
    feat_dicts_train.append(feature_dict)
    labels_train.append(label)

In [200]:
feat_dicts_dev = []
labels_dev = []
for index, row in data_dev.iterrows():
    text = row['sentence']
    label = row['gender']
    feature_dict = all_unigrams(text)
    feat_dicts_dev.append(feature_dict)
    labels_dev.append(label)

Converting dictionary of dictionaries to matrices

In [201]:
vectorizer = DictVectorizer(sparse=True)
features_train = vectorizer.fit_transform(feat_dicts_train)

In [202]:
features_dev = vectorizer.transform(feat_dicts_dev)

In [203]:
features_train.shape, len(labels_train), features_dev.shape, len(labels_dev)

((869536, 115933), 869536, (125950, 115933), 125950)

Optional: re-weighted features_train and features_dev with tfidf

In [204]:
tfidf_transformer = TfidfTransformer()

In [205]:
features_train_tfidf = tfidf_transformer.fit_transform(features_train)

In [206]:
features_dev_tfidf = tfidf_transformer.transform(features_dev)

In [207]:
features_train_tfidf.shape,features_dev_tfidf.shape

((869536, 115933), (125950, 115933))

Instantiating and training the model

In [208]:
logistic = LogisticRegression(verbose=3,solver='sag')
log_model = logistic.fit(features_train_tfidf,labels_train)

convergence after 24 epochs took 24 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.5s finished


Predicting on test set

In [209]:
logistic_predictions = logistic.predict(features_dev_tfidf)

Getting the numbers!

In [211]:
print("Logistic regression result: ")
print(classification_report(labels_dev, logistic_predictions))

Logistic regression result: 
             precision    recall  f1-score   support

     female       0.69      0.25      0.37     31498
       male       0.79      0.96      0.87     94452

avg / total       0.77      0.78      0.75    125950



Figuring out the most informative features

In [212]:
weights = list(log_model.coef_.transpose())

In [213]:
names = vectorizer.get_feature_names()

In [214]:
weight_dict = {}

In [215]:
for name,weight in zip(names,weights):
    weight_dict[name] = weight[0]

In [216]:
sorted_weights = sorted(weight_dict.items(), key=operator.itemgetter(1))

In [217]:
sorted_weights[:50]

[('husband', -12.519585344458351),
 ('ms', -10.866351627147916),
 ('woman', -9.413209045501345),
 ('mrs', -8.549631262560471),
 ('actress', -7.535076350705512),
 ('devos', -7.395482305563533),
 ('lady', -7.271073993325328),
 ('girl', -6.7886979195848385),
 ('beyoncé', -6.3699398872525235),
 ('hijab', -6.186432974792667),
 ('server', -5.991395252412696),
 ('pregnancy', -5.633379477612983),
 ('lanashadwick2', -5.477710084907814),
 ('pregnant', -5.36039868700924),
 ('madonna', -5.325174553374608),
 ('boyfriend', -5.315579805206431),
 ('erdely', -5.293358539656793),
 ('lock', -5.255381022274753),
 ('feminist', -5.213592571622445),
 ('female', -5.178309871689569),
 ('hillary', -5.121758370390941),
 ('haley', -5.067249478548496),
 ('queen', -4.839592975104978),
 ('chairwoman', -4.826944014459833),
 ('daughter', -4.698547272526667),
 ('meldonium', -4.674419945066522),
 ('congresswoman', -4.5618650476939075),
 ('chancellor', -4.430826971163066),
 ('heroine', -4.417530207189577),
 ('streep', -4

In [218]:
sorted_weights[-50:]

[('warmbier', 3.090945335205684),
 ('potro', 3.097216598590564),
 ('scalia', 3.1117348648052103),
 ('blasio', 3.1128110112194856),
 ('priebus', 3.137806778589711),
 ('netanyahu', 3.138601069914844),
 ('zuckerberg', 3.152385919129663),
 ('christie', 3.1773271754428096),
 ('farage', 3.205149555119673),
 ('nba', 3.2725419916498395),
 ('comey', 3.356732063840973),
 ('ryan', 3.3589080888458263),
 ('nato', 3.4560851202670824),
 ('joelpollak', 3.4771038785849613),
 ('duterte', 3.482991006568131),
 ('xi', 3.4837291793789564),
 ('president', 3.5061725669599078),
 ('congressman', 3.5368469000280203),
 ('innings', 3.54729249139483),
 ('awr', 3.56266112833162),
 ('kaepernick', 3.6031980801854857),
 ('treasury', 3.607637968008508),
 ('jets', 3.6163108595596394),
 ('warriors', 3.623724305272961),
 ('boy', 3.643661624302862),
 ('beard', 3.673143384117225),
 ('romney', 3.7489711234958896),
 ('nfl', 3.8010419736926266),
 ('businessman', 3.8114877695025084),
 ('cruz', 3.881700131894629),
 ('giants', 3.9