CS224U Final Project: Analysis of Gender Roles and Bias in Literary Portrayal of Characters
===

In [2]:
__authors__ = "Stephanie Wang, Megha Srivastava, Sarai Gould"
__version__="CS224u, Stanford, Spring 2016 term"

In [3]:
import numpy as np
from collections import Counter
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.grid_search import GridSearchCV
from pandas import DataFrame
import scipy.stats
import utils

### Overview
##### Features
The input to these functions is a list of features to train on. Ideally this is a list of dictionaries.
We are using unigrams as our features, for which we have sparse vector representations for each sentence in the text. We are also using the dependency parses as a feature set. 
##### Binary Classification
We are looking at binary tasks -- classifying as either male/ female
##### Train and Test
We need to split our data into two different sets -- one for training our classifier, and one for testing the model for accuracy.
#### TO DO:
 - Add a function to read in the features as a tuple -- (dictionary, label) where dictionary is the dictionary for a sentence, and label is "female" or "male".
 - Account for the fact that a sentence could have both male and female labels
 - Add hyperparameter tuning for better results. Try out other classification models for better accuracy.

In [4]:
import re
import codecs 

#labels are: Male Char, Female Auth: MCFA; Female Char, Female Auth: FCFA; Male Char, Male Auth: MCMA;
#Female Char, Male Auth: FCMA; Male Char, Female Char, Female Auth: MCFCFA; Male Char, Female Char Male Auth: MCFCMA
def train_file_reader(src_filename):
    curr_author_gender = "FA"
    curr_genre = "Default_Genre"
    curr_pub = "0000"
    all_lines_file = codecs.open(src_filename, 'r', 'utf8')
    iterable_file = iter(all_lines_file)
    for line in iterable_file:
        if "#####" in line:   
            #line represents a new book in the dataset, indicating a change in author gender, pub year, and genre
            if "GENDER:MALE" in line:
                curr_author_gender = "MA"
            else:
                curr_author_gender = "FA"
            pub_year_pattern = re.compile('#PUB:(\d+)#') #extracts publication year
            pub_matches = pub_year_pattern.findall(line)
            if pub_matches:
                curr_pub = pub_matches[0]
            genre_pattern = re.compile('#GENRE:(.+)#PUB') #extracts genre
            genre_matches = genre_pattern.findall(line)
            if genre_matches:
                curr_genre = genre_matches[0]
        if line.startswith("SENTENCE: "): #sentence to label
            label = ""
            if "00MALE00" in line:
                label += "MC"
            if "00FEMALE00" in line:
                label += "FC"
            if label == "":
                continue
            label += curr_author_gender
            unigrams_list = []
            for word in line.split(" "):
                if (word != "00MALE00" and word != "00FEMALE00" and word != "SENTENCE:"):
                    unigrams_list.append(word) #gets all unigrams except those obviously indicating gender
            dependency_list = []
            line = next(iterable_file)
            if line.startswith("DEPENDENCY: "): #dependencies corresponding to above sentence
                for dep in line.split(" "):
                    if (dep != "DEPENDENCY:"):
                        dependency_list.append(dep)
            if not dependency_list:
                continue
            yield(unigrams_list, dependency_list, curr_pub, curr_genre, label) #returns a generator
        
def train_reader():
    #hard-code file HERE
    return train_file_reader("replaced_all_lines.txt.sentences.extracted.dep")    

def features_phi(unigrams, dependencies, pub, genre):
    features_list = []
    features_list.extend(unigrams)
    features_list.extend(dependencies)
    #publication date feature is currently divided into buckets by century. Change this code to 
    #use buckets of decades, centuries, etc. 
    if pub.startswith("18"):
        features_list.append("1800")
    elif pub.startswith("19"):
        features_list.append("1900")
    elif pub.startswith("17"):
        features_list.append("1700")
    elif pub.startswith("16"):
        features_list.append("1600")
    else:
        features_list.append("0000")
    #appends genre feature
    features_list.append(genre)
    #turns feature list into a dictionary
    return Counter(features_list)

## Build Dataset for experiments

In [5]:
def build_dataset(reader, phi, vectorizer=None):
    """
    Parameters
    ----------
    reader : iterator
        This is the dataset that we are featurizing
        
    vectorizer : sklearn.feature_extraction.DictVectorizer
        If this is None, then a new `DictVectorizer` is created and
        used to turn the list of dicts created by `phi` into a 
        feature matrix. This happens when we are training.

        If this is not None, then it's assumed to be a `DictVectorizer` 
        and used to transform the list of dicts. This happens in 
        assessment, when we take in new instances and need to 
        featurize them as we did in training.
    Returns
    -------
    dict
        A dict with keys 'X' (the feature matrix), 'y' (the list of labels), 
        'vectorizer' (the 'DictVectorizer'), and 'raw_examples'(for error analysis).
    """
    labels = []
    feat_dicts = []
    raw_examples = []
    for unigrams, deps, pub, genre, label in reader():
        labels.append(label)
        feat_dicts.append(phi(unigrams, deps, pub, genre))
        raw_examples.append(unigrams)
    feat_matrix = None
    #In training, we want a new vectorizer:
    if vectorizer == None:
        vectorizer = DictVectorizer(sparse=True)
        feat_matrix = vectorizer.fit_transform(feat_dicts)
        feat_names = vectorizer.get_feature_names()
    #In assessment, we featurize using the existing vectorizer:
    else:
        feat_matrix = vectorizer.fit_transform(feat_dicts)
    
    return {'X': feat_matrix,
            'y': labels,
            'featureNames': feat_names,
            'vectorizer': vectorizer}

## Fit the classifier on our feature matrix and labels

In [6]:
def fit_maxent_classifier(X, y):    
    """Wrapper for `sklearn.linear.model.LogisticRegression`. This is also 
    called a Maximum Entropy (MaxEnt) Classifier, which is more fitting 
    for the multiclass case.
    
    Parameters
    ----------
    X : 2d np.array
        The matrix of features, one example per row.
        
    y : list
        The list of labels for rows in `X`.
    
    Returns
    -------
    sklearn.linear.model.LogisticRegression
        A trained `LogisticRegression` instance.
    
    """
    mod = LogisticRegression(fit_intercept=True)
    mod.fit(X, y)
    return mod

## Run experiment to train and test our model

In [14]:
def experiment(
            train_reader=train_reader,
            assess_reader=None,
            train_size=0.7,
            phi = features_phi,
            train_func=fit_maxent_classifier,
            score_func=utils.safe_macro_f1,
            verbose=True):
    """
    Parameters
    ==========
    train_reader : Iterator for training data.
    
    assess_reader : Iterator for assessment data.
    
    train_size : float
        If 'assess_reader' is None, then this is percentage of 
        'train_reader' devoted to training. Else this is ignored.
    train_func : model wrapper
        Any function taking in a feature matrix and label list
        and returns a fitted model with a 'predict' function
        that operates on feature matrices.
    score_metric : Scoring function, default is weighted average F1
    verbose : bool
        Whether to print out model assessment to standard output.
    
    Prints
    ======
    Model accuracy and model precision/recall/F1 report.
    
    Returns
    =======
    Float
        Overall scoring metric determined by 'score_metric'.
    """
    # Train dataset:
    train = build_dataset(train_reader, phi, vectorizer=None)
    #Manage the assessment set-up:
    X_train = train['X']
    y_train = train['y']
    feature_names = train['featureNames']
    X_assess = None
    y_assess = None
    if assess_reader == None:
        #'train_test_split' is a sklearn function that splits arrays or matrices into random train and test subsets
        X_train, X_assess, y_train, y_assess = train_test_split(
                X_train, y_train, train_size=train_size)
    else:
        #Assessment dataset using the training vectorizer:
        assess = build_dataset(assess_reader, phi, vectorizer=None)
        X_assess, y_assess = assess['X'], assess['y']
    #Train:
    mod = train_func(X_train, y_train)
    #Predictions:
    predictions = mod.predict(X_assess)
    #Print features and feature weights
    coef = np.matrix(mod.coef_)
    df = DataFrame(coef.transpose(), index = feature_names)
    print('Feature names and their weights:')
    print (df.sort_values([2], ascending=[False]))
    #Report:
    if verbose:
        print('Accuracy: %0.03f' % accuracy_score(y_assess, predictions))
        print(classification_report(y_assess, predictions, digits=3))
    # Return the overall score:
    return score_func(y_assess, predictions) 
        

In [15]:
_ = experiment()

Feature names and their weights:
                                     0         1         2
Mr.                          -0.559809 -0.597556  0.847500
himself                      -1.014760 -0.028129  0.820026
brother                      -0.537486 -0.274900  0.653254
who                          -0.837737 -0.049916  0.640674
Africa                       -0.654916 -1.112743  0.612274
!                            -0.623025 -0.005867  0.561063
compound_hust_00unknown00    -0.372820 -0.249542  0.550899
Hust                         -0.372820 -0.249542  0.550899
Bramble                      -0.698615  0.002973  0.540497
if                           -0.275452 -0.351671  0.535393
:                            -0.536173 -0.259557  0.532985
St.                          -0.550859 -0.056552  0.532510
from                         -0.371317 -0.391054  0.509517
whose                        -0.401558 -0.179912  0.504576
they                         -0.428538 -0.206128  0.487361
1800                   

## Hyperparameter Search

In [16]:
def fit_classifier_with_crossvalidation(X, y, basemod, cv, param_grid, scoring='accuracy'):
    #Find the best model within param_grid:
    crossvalidator = GridSearchCV(basemod, param_grid, cv=cv, scoring=scoring)
    crossvalidator.fit(X, y)
    print("Best params", crossvalidator.best_params_)
    print("Best score: %0.03f" % crossvalidator.best_score_)
    return crossvalidator.best_estimator_

In [21]:
def fit_maxent_with_crossvalidation(X, y):
    basemod = LogisticRegression()
    cv = 5
    param_grid = {'fit_intercept': [True, False],
                 'C': [0.4, 0.6, 0.8, 1.0, 2.0, 3.0, 0.9, 0.5],
                 'penalty': ['l1', 'l2']}
    return fit_classifier_with_crossvalidation(X, y, basemod, cv, param_grid)

In [22]:
_ = experiment(
        train_func=fit_maxent_with_crossvalidation)

('Best params', {'penalty': 'l1', 'C': 0.6, 'fit_intercept': False})
Best score: 0.741
Feature names and their weights:
                                      0         1         2
Corrard                        0.000000 -0.862152  1.085090
himself                       -1.638061 -0.359759  1.007214
heard                         -0.106515 -0.175148  0.986680
king                          -0.088170  0.000000  0.961549
guide                          0.000000 -0.341831  0.955942
servant                        0.000000 -0.094716  0.947406
if                            -0.193304 -0.651099  0.890419
received                      -0.692499  0.000000  0.864009
root_root_00unknown00         -0.213864  0.000000  0.863680
1800                           0.000000 -2.918631  0.840286
compound_00unknown00_st.       0.000000  0.000000  0.839233
Hust                          -0.353934 -0.208110  0.819039
orders                         0.000000 -0.289849  0.800969
n't                           -0.290470 