CS224U Final Project
===

In [1]:
# Notebook metadata: the project authors and the course/term it was written for.
__authors__ = "Stephanie Wang, Megha Srivastava, Sarai Gould"
__version__="CS224u, Stanford, Spring 2016 term"

In [2]:
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
import scipy.stats
import utils

### Overview
##### Features
The input to these functions is a list of features to train on. Ideally this is a list of dictionaries.
We are using unigrams as our features, for which we have sparse vector representations for each sentence in the text. We are also using the dependency parses as a feature set.
##### Binary Classification
We are looking at binary tasks -- classifying as either male or female.

## Build Dataset for experiments

In [3]:
def build_dataset(reader, vectorizer=None):
    """Turn a labeled dataset into a feature matrix and a list of labels.

    Parameters
    ----------
    reader : iterable
        Yields `(features, label)` pairs, one per example. Each `features`
        item is handed to the vectorizer unchanged, so it must be something
        a `DictVectorizer` accepts (a dict mapping feature names to values).

    vectorizer : sklearn.feature_extraction.DictVectorizer or None
        If this is None, a new `DictVectorizer` is created and fitted on
        the examples from `reader`. This happens when we are training.

        If this is not None, it is assumed to be an already fitted
        `DictVectorizer` and is only used to transform the examples,
        without refitting. This happens in assessment, when new instances
        must be featurized exactly as they were in training.

    Returns
    -------
    dict
        A dict with keys 'X' (the feature matrix), 'y' (the list of labels)
        and 'vectorizer' (the `DictVectorizer` that produced 'X').
    """
    labels = []
    feat_dicts = []
    for features, label in reader:
        # TODO: What if the sentence contains both male and female?
        feat_dicts.append(features)
        labels.append(label)
    if vectorizer is None:
        # In training, we want a new vectorizer, fitted on this data:
        vectorizer = DictVectorizer(sparse=True)
        feat_matrix = vectorizer.fit_transform(feat_dicts)
    else:
        # In assessment, reuse the existing feature space. Calling
        # `fit_transform` here would refit the vectorizer on the new data
        # and change the column layout the trained model relies on.
        feat_matrix = vectorizer.transform(feat_dicts)
    return {'X': feat_matrix,
            'y': labels,
            'vectorizer': vectorizer}

In [4]:
def fit_maxent_classifier(X, y):
    """Train a logistic regression model on `X` and `y`.

    Thin wrapper around `sklearn.linear_model.LogisticRegression`. The same
    model is also known as a Maximum Entropy (MaxEnt) classifier, a name
    that is more fitting for the multiclass case.

    Parameters
    ----------
    X : 2d np.array
        Feature matrix with one example per row.

    y : list
        Labels, one for each row of `X`.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        The fitted `LogisticRegression` instance.

    """
    # `fit` returns the estimator itself, so build and train in one step.
    return LogisticRegression(fit_intercept=True).fit(X, y)

In [7]:
def experiment(
            train_reader=None,
            assess_reader=None,
            train_size=0.7,
            train_func=fit_maxent_classifier,
            score_func=utils.safe_macro_f1,
            verbose=True):
    """Train a model, assess it, and return its score.

    Parameters
    ----------
    train_reader : iterable
        Training data, in the form accepted by `build_dataset`. Required,
        despite the None default.

    assess_reader : iterable or None
        Assessment data, in the form accepted by `build_dataset`. If this
        is None, a random portion of the training data is held out for
        assessment instead.

    train_size : float
        If `assess_reader` is None, this is the proportion of the
        `train_reader` data devoted to training. Else it is ignored.

    train_func : model wrapper
        Any function taking a feature matrix and a label list and
        returning a fitted model with a `predict` method that operates
        on feature matrices.

    score_func : scoring function
        Called as `score_func(y_assess, predictions)`; its result is the
        return value. Defaults to `utils.safe_macro_f1`.

    verbose : bool
        Whether to print the model assessment to standard output.

    Prints
    ------
    Model accuracy and a precision/recall/F1 report (if `verbose`).

    Returns
    -------
    The overall score, as computed by `score_func`.

    Raises
    ------
    TypeError
        If `train_reader` is None.
    """
    if train_reader is None:
        # Fail early with an actionable message rather than the opaque
        # "'NoneType' object is not iterable" from deep inside the loop.
        raise TypeError(
            "experiment() requires a train_reader; got None")
    # Train dataset (fits a fresh vectorizer):
    train = build_dataset(train_reader, vectorizer=None)
    X_train, y_train = train['X'], train['y']
    # Manage the assessment set-up:
    if assess_reader is None:
        # No separate assessment data: randomly split the training
        # examples into train and assessment subsets.
        X_train, X_assess, y_train, y_assess = train_test_split(
                X_train, y_train, train_size=train_size)
    else:
        # Assessment dataset using the training vectorizer, so that its
        # columns line up with the features the model was trained on:
        assess = build_dataset(assess_reader, vectorizer=train['vectorizer'])
        X_assess, y_assess = assess['X'], assess['y']
    # Train:
    mod = train_func(X_train, y_train)
    # Predictions:
    predictions = mod.predict(X_assess)
    # Report:
    if verbose:
        print('Accuracy: %0.03f' % accuracy_score(y_assess, predictions))
        print(classification_report(y_assess, predictions, digits=3))
    # Return the overall score:
    return score_func(y_assess, predictions)
        

In [8]:
# NOTE(review): no readers are passed here, so `train_reader` is None and the
# call fails with the TypeError shown below. Supply a `train_reader`.
_ = experiment()

TypeError: 'NoneType' object is not iterable