# Introduction to statistical NLP with Python

In [None]:
__author__ = 'Chris Potts'

## Contents

1. [Set-up](#Set-up)
1. [Data readers](#Data-readers)
1. [Feature functions](#Feature-functions)
1. [Vectorizing](#Vectorizing)
1. [Classifier training](#Classifier-training)
1. [Test-set assessment](#Test-set-assessment)
1. [Cross-validation](#Cross-validation)
1. [Bake-off](#Bake-off)

## Set-up

In [None]:
from collections import defaultdict, Counter
import csv
import numpy as np
import os
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold

## Data readers

In [None]:
def _cheese_disease_iterator(filename):
    """Yield `(text, label)` pairs from a tab-separated cheese/disease file.

    Each non-blank row must hold exactly two tab-separated fields: a
    label code followed by the example text. The code '1' is mapped to
    the label 'cheese'; every other code is mapped to 'disease'.

    Parameters
    ----------
    filename : str
        Path to the UTF-8 encoded, tab-delimited data file.

    Yields
    ------
    tuple of (str, str)
        The example text and its label, 'cheese' or 'disease'.

    Raises
    ------
    ValueError
        If a non-blank row does not have exactly two fields.
    """
    # `newline=''` is what the `csv` module asks for, so that the reader
    # can do its own newline handling.
    with open(filename, encoding='utf8', newline='') as f:
        for row in csv.reader(f, delimiter='\t'):
            # Blank lines (e.g., a trailing one) come back as empty rows;
            # skip them rather than failing on the unpacking below.
            if not row:
                continue
            label, text = row
            label = 'cheese' if label == '1' else 'disease'
            yield text, label
            
def train_iterator(filename=os.path.join('data', 'cheeseDisease.train.txt')):
    """Return an iterator over the `(text, label)` pairs in the train file.

    Parameters
    ----------
    filename : str
        Path to the train file. Defaults to `cheeseDisease.train.txt`
        inside the `data` directory.

    Returns
    -------
    iterator
        Whatever `_cheese_disease_iterator` produces for `filename`.
    """
    examples = _cheese_disease_iterator(filename)
    return examples
   
def test_iterator(filename=os.path.join('data', 'cheeseDisease.test.txt')):
    """Return an iterator over the `(text, label)` pairs in the test file.

    Parameters
    ----------
    filename : str
        Path to the test file. Defaults to `cheeseDisease.test.txt`
        inside the `data` directory.

    Returns
    -------
    iterator
        Whatever `_cheese_disease_iterator` produces for `filename`.
    """
    examples = _cheese_disease_iterator(filename)
    return examples

In [None]:
# Create an iterator over the train examples, read from the default
# train-file path:
train = train_iterator()

In [None]:
# Peek at one `(text, label)` pair. Note that this consumes the pair:
# later iteration over `train` will not see it again.
next(train)

In [None]:
# How many examples do we have of each kind in the train set?



In [None]:
# What is the average string length of train examples of each kind?



## Feature functions

In [None]:
def featurize(s):
    """Build the feature dict for a single example `s`.

    The result combines the features from `example_length` with those
    from `character_unigrams`.

    Parameters
    ----------
    s : str
        The example to process.

    Returns
    -------
    dict
        Maps feature names to feature values -- int, float, or bool.
    """
    length_feats = example_length(s)
    unigram_feats = character_unigrams(s)
    # Merge the unigram counts into the dict returned by `example_length`.
    length_feats.update(unigram_feats)
    return length_feats

In [None]:
def example_length(s):
    """Return a one-feature dict recording the length of `s`.

    Parameters
    ----------
    s : str
        The example to process.

    Returns
    -------
    dict
        A dict with the single key 'LENGTH' mapped to `len(s)`.
    """
    length = len(s)
    return {'LENGTH': length}

In [None]:
def character_unigrams(s):
    """Count how often each character occurs in `s`.

    Parameters
    ----------
    s : str
        The example to process.

    Returns
    -------
    collections.Counter
        Maps each character of `s` to its number of occurrences.
    """
    counts = Counter()
    for char in s:
        counts[char] += 1
    return counts

## Vectorizing

In [None]:
# Featurize each train example:



In [None]:
# Instantiate a `DictVectorizer`:



In [None]:
# Use the vectorizer's `fit_transform` to map the feature 
# dicts to a matrix:



In [None]:
# Get the list of train labels:



## Classifier training

In [None]:
# Instantiate a `LogisticRegression` model:



In [None]:
# Run `fit` on the training data:



In [None]:
# Other models to try:

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

## Test-set assessment

In [None]:
# Featurize each test example:



In [None]:
# Use the vectorizer's `transform` (NOT `fit_transform`!) to map 
# the feature dicts to a matrix:



In [None]:
# Use the model to make predictions on the new examples:



In [None]:
# Get the list of test labels:




In [None]:
# Use `classification_report` to see how we did:



## Cross-validation

$$
\begin{array}{c c c c}
\textbf{Splits} & \textbf{Experiment 1} & \textbf{Experiment 2} & \textbf{Experiment 3} \\
\begin{array}{|c|}
\hline
\textrm{fold } 1  \\\hline
\textrm{fold } 2  \\\hline
\textrm{fold } 3  \\\hline
\end{array}
& 
\begin{array}{|c c|}
\hline
\textbf{Test} & \textrm{fold } 1  \\\hline
\textbf{Train} & \textrm{fold } 2  \\
& \textrm{fold } 3  \\\hline
\end{array}
&
\begin{array}{|c c|}
\hline
\textbf{Test} & \textrm{fold } 2  \\\hline
\textbf{Train} & \textrm{fold } 1  \\
& \textrm{fold } 3  \\\hline
\end{array}
&
\begin{array}{|c c|}
\hline
\textbf{Test} & \textrm{fold } 3  \\\hline
\textbf{Train} & \textrm{fold } 1  \\
& \textrm{fold } 2  \\\hline
\end{array}
\end{array}
$$

In [None]:
# Splitter for 5-fold cross-validation (the diagram above illustrates
# the same idea with 3 folds):
cv = StratifiedKFold(n_splits=5)

In [None]:
# Cross-validation scores, one per split produced by `cv`. Replace the
# `None` placeholders before running: `estimator` is the model to
# evaluate, `X` the train feature matrix, and `y` the train labels.
scores = cross_val_score(
    estimator=None,  # Fill this in.
    X=None,          # Fill this in.
    y=None,          # Fill this in.
    cv=cv,           # The `StratifiedKFold` splitter defined above.
    scoring='f1_macro')  # Macro-averaged F1.

## Bake-off

1. The test set is off-limits during development! We will assess on it only once development is complete.

1. So, using just cross-validation runs on the train data, design a model that does really well at the cheese-disease task. You can fiddle with the featurizer, the choice of model, and pretty much anything else, as long as you don't look at the test data.

1. When you are all done, run the code below, where `your_featurize` is the featurizer you created, `your_vec` is the vectorizer you use, and `your_model` is the model that you developed, and then report the macro-F1 score to Chris.

In [None]:
# Final, one-time assessment on the held-out test set. `your_featurize`,
# `your_vec`, and `your_model` must already be defined (see the
# instructions above).

# Read the test file once, so that the features and the labels below
# come from the same pass over the data:
test_examples = list(test_iterator())

test_feats = [your_featurize(text) for text, label in test_examples]

# `transform`, NOT `fit_transform`: the vectorizer must keep the feature
# space it was fitted with during development.
X_test = your_vec.transform(test_feats)

y_test = [label for text, label in test_examples]

test_preds = your_model.predict(X_test)

print(classification_report(y_test, test_preds))