# IEOR 242 Assignment 06
Classify MDA sections of 10-K reports with scikit learn Naive Bayes and cross validation

In [1]:
import glob
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import cross_validation, preprocessing
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import label_binarize

In [2]:
# Path to MDA section files
REPORT_PATH = 'mdna_sections/*'

# Path to classification file
CLASSIFICATION_FILE = 'MDNA_auto_classification_week6.2.csv'

# Maximum number of features for the Bayes classifier
MAX_FEATURE_COUNT = 5000

# Minimum and maximum n for n-grams
N_GRAMS_MIN = 1
N_GRAMS_MAX = 3

## Data Loading
Loading the MDA extracts and auto classification file from last week's assignment.

In [3]:
# Load file with manual classifications
class_df = pd.read_csv(CLASSIFICATION_FILE)
class_df.head()

Unnamed: 0,File Name,Ticker,Year,Val1,Val2,Review
0,1999_Q1_1000697_WATERS CORP -DE-_10-K_1999-03-31,WAT,1999,1.18534,68.470588,pos
1,1999_Q1_1030339_NANOGEN INC_10-K_1999-03-29,NGEN,1999,1.18534,-7.272727,neg
2,1999_Q1_1038133_HESKA CORP_10-K_1999-03-29,HSKA,1999,1.18534,86.486486,pos
3,1999_Q1_741815_HOOPER HOLMES INC_10-K_1999-03-31,HH,1999,1.18534,-26.993865,neg
4,1999_Q1_749647_CELSION CORP_10-K_1999-01-13,CLSN,1999,1.18534,11.538462,pos


## Data Preparation

In [4]:
# List with all file contents
file_contents = list()

# List with all classification labels
labels = list()

for fname in glob.iglob(REPORT_PATH):
    if fname != '':
        label_row = class_df.loc[class_df['File Name'] == fname.split('/')[-1].replace('mdna_', ''), 'Review']
        if len(label_row) > 0:
            labels.append(label_row.values[0])
            with open(fname, 'r') as file:
                file_contents.append(file.read())
print('Number of reports: %d' % len(labels))

Number of reports: 536


## Build Classifier with Cross Validation

In [5]:
# Initialize Naive Bayes classifier (use alpha for regularization)
classifier = MultinomialNB(fit_prior=False)               

# Initialize vectorizer module
vectorizer = CountVectorizer(analyzer='word',
                             stop_words='english',
                             max_features=MAX_FEATURE_COUNT,
                             ngram_range=(N_GRAMS_MIN, N_GRAMS_MAX))

In [6]:
# Do cross validation with k=10
cv = cross_validation.KFold(len(file_contents), n_folds=10, shuffle=True, random_state=None)

In [7]:
# Pandas dataframe to store the cross validation results
eval_df = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1 Score'])

for traincv, testcv in cv:
    # Transform the data and train the classifier
    matrix = vectorizer.fit_transform(file_contents)
    classifier.fit(matrix[traincv[0]:traincv[len(traincv)-1]], labels[traincv[0]:traincv[len(traincv)-1]])
    print('Number of features: %d' % len(vectorizer.get_feature_names()))

    # Build predicted and true classification labels as binary vectors
    y_true = label_binarize(labels[testcv[0]:testcv[len(testcv)-1]], classes=['neg', 'pos'])
    y_pred = label_binarize(classifier.predict(matrix[testcv[0]:testcv[len(testcv)-1]]), classes=['neg', 'pos'])

    # Evaluate the classifier performance
    print(confusion_matrix(y_true, y_pred))
    eval_df.loc[len(eval_df)] = [accuracy_score(y_true, y_pred),
                                 precision_score(y_true, y_pred),
                                 recall_score(y_true, y_pred),
                                 f1_score(y_true, y_pred)]

Number of features: 5000
[[177  60]
 [100 198]]
Number of features: 5000
[[172  60]
 [ 99 198]]
Number of features: 5000
[[168  57]
 [ 93 185]]
Number of features: 5000
[[168  58]
 [ 95 188]]
Number of features: 5000
[[175  60]
 [ 99 195]]
Number of features: 5000
[[167  58]
 [ 96 180]]
Number of features: 5000
[[172  59]
 [ 96 184]]
Number of features: 5000
[[159  55]
 [ 91 174]]
Number of features: 5000
[[163  56]
 [ 92 175]]
Number of features: 5000
[[172  60]
 [ 98 195]]


In [8]:
eval_df

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
0,0.700935,0.767442,0.66443,0.71223
1,0.699433,0.767442,0.666667,0.713514
2,0.701789,0.764463,0.665468,0.711538
3,0.699411,0.764228,0.664311,0.710775
4,0.699433,0.764706,0.663265,0.710383
5,0.692615,0.756303,0.652174,0.700389
6,0.696673,0.757202,0.657143,0.703633
7,0.695198,0.759825,0.656604,0.704453
8,0.695473,0.757576,0.655431,0.702811
9,0.699048,0.764706,0.665529,0.711679


In [9]:
# Average classifier performance
eval_df.mean()

Accuracy     0.698001
Precision    0.762389
Recall       0.661102
F1 Score     0.708141
dtype: float64