# IEOR 242 Assignment 05
Classify MDA sections

In [1]:
import nltk
import pandas as pd
import numpy as np
from sklearn import cross_validation

# File_Sampler.py provides a helper function to read extracted MDA sections from a directory in a Pandas dataframe.
from File_Sampler import get_data_frame

# mda_tokenization.py provides a helper function to tokenize text (includign POS tagging)
from mda_tokenization import tokenize_text

# Compute_Performance_Metrics.py provides helper functions to evaluate classifier performance
from Compute_Performance_Metrics import compute_confusion_matrix, print_classification_report

# Cross_Validate_Bayes_Classifier.py provides helper functions to do cross validation
from Cross_Validate_Bayes_Classifier import compute_cross_validation

In [2]:
# Specify path to MDA section files
REPORT_PATH = 'assignment-05-report-mda/*'

# Specify path to manual MDA classification file
CLASSIFICATION_FILE = 'MDA Manual Classification.csv'

# Specify path to stock price MDA classification file
STOCK_CLASSIFICATION_FILE = 'MDA Auto Classification.csv'

# Maximum number of features
MAX_FEATURE_COUNT = 500

## Data Preparation

In [3]:
# Load reports
# The MDA sections of the reports were extracted with the functions in extract_comp_name.py
report_df = get_data_frame(REPORT_PATH, 0.75)
report_df['File Name'] = report_df['MDNA_FILE_NAMES'].map(lambda r: r.split('/')[-1])
report_df.head()

Unnamed: 0,MDNA_FILE_NAMES,MDNA_TEXT_BLOB,?Training,File Name
63,mdna_2013_Q1_1356576_SUPERNUS PHARMACEUTICALS ...,ITEM 7.MANAGEMENT'S DISCUSSION AND ANALYSIS OF...,True,mdna_2013_Q1_1356576_SUPERNUS PHARMACEUTICALS ...
36,mdna_2012_Q1_911326_SYNAGEVA BIOPHARMA CORP_10...,MANAGEMENTS DISCUSSION AND ANALYSIS OF FINANC...,True,mdna_2012_Q1_911326_SYNAGEVA BIOPHARMA CORP_10...
54,mdna_2015_Q1_1178253_SCYNEXIS INC_10-K_2015-03-30,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS...,True,mdna_2015_Q1_1178253_SCYNEXIS INC_10-K_2015-03-30
62,"mdna_2013_Q1_1360214_Imprimis Pharmaceuticals,...",ITEM 7. MANAGEMENTS DISCUSSION AND ANALYSIS OF...,True,"mdna_2013_Q1_1360214_Imprimis Pharmaceuticals,..."
78,"mdna_2015_Q1_1157602_Vitae Pharmaceuticals, In...",Item7.MANAGEMENT'S DISCUSSION AND ANALYSIS OF ...,True,"mdna_2015_Q1_1157602_Vitae Pharmaceuticals, In..."


In [4]:
# Load file with manual classifications
class_df = pd.read_csv(CLASSIFICATION_FILE)
class_df.head()

Unnamed: 0,Filing Date,Ticker Symbol,File Name,Reviewer,Review
0,2014-03-28,RCAR,"mdna_2014_Q1_1016708_RenovaCare, Inc._10-K_201...",Stefan,neg
1,2014-03-07,AXDX,"mdna_2014_Q1_727207_Accelerate Diagnostics, In...",Stefan,neg
2,2014-03-31,AMDA,mdna_2014_Q1_1269026_AMEDICA Corp_10-K_2014-03-31,Stefan,pos
3,2014-03-10,RTIX,"mdna_2014_Q1_1100441_RTI SURGICAL, INC._10-K_2...",Stefan,neg
4,2014-03-20,AMBI,mdna_2014_Q1_1131543_AMBIT BIOSCIENCES CORP_10...,Stefan,pos


In [5]:
# Merge both dataframes
df = pd.merge(report_df, class_df, how='inner', on='File Name')
print('Total number of reports:', len(df))
df.head()

Total number of reports: 65


Unnamed: 0,MDNA_FILE_NAMES,MDNA_TEXT_BLOB,?Training,File Name,Filing Date,Ticker Symbol,Reviewer,Review
0,mdna_2013_Q1_1356576_SUPERNUS PHARMACEUTICALS ...,ITEM 7.MANAGEMENT'S DISCUSSION AND ANALYSIS OF...,True,mdna_2013_Q1_1356576_SUPERNUS PHARMACEUTICALS ...,2013-03-15,SUPN,Ted,pos
1,mdna_2012_Q1_911326_SYNAGEVA BIOPHARMA CORP_10...,MANAGEMENTS DISCUSSION AND ANALYSIS OF FINANC...,True,mdna_2012_Q1_911326_SYNAGEVA BIOPHARMA CORP_10...,,,Lakshmi,neg
2,mdna_2015_Q1_1178253_SCYNEXIS INC_10-K_2015-03-30,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS...,True,mdna_2015_Q1_1178253_SCYNEXIS INC_10-K_2015-03-30,2015-03-30,SCYX,Yvonne,neg
3,"mdna_2013_Q1_1360214_Imprimis Pharmaceuticals,...",ITEM 7. MANAGEMENTS DISCUSSION AND ANALYSIS OF...,True,"mdna_2013_Q1_1360214_Imprimis Pharmaceuticals,...",2013-03-18,IMMY,Ted,neg
4,"mdna_2015_Q1_1157602_Vitae Pharmaceuticals, In...",Item7.MANAGEMENT'S DISCUSSION AND ANALYSIS OF ...,True,"mdna_2015_Q1_1157602_Vitae Pharmaceuticals, In...",2015-03-31,VTAE,Yvonne,neg


In [6]:
# Split reports in training and test set
train_set = df.loc[df['?Training'] == True]
print('Training set size:', len(train_set))
test_set = df.loc[df['?Training'] == False]
print('Test set size:', len(test_set))

Training set size: 51
Test set size: 14


## Build Classifier

In [7]:
# Build the feature list based on the training set
train_set_tokens = []
for index, row in train_set.iterrows():
    tokens = tokenize_text(row['MDNA_TEXT_BLOB'], nouns=True)
    train_set_tokens.extend(tokens)

In [8]:
# Calculate frequency distribution for all tokens
all_tokens = nltk.FreqDist(train_set_tokens)
print('Training set number of tokens: ', all_tokens.N())
print('Training set number of unique tokens: ', all_tokens.B())
print('Features used: ', MAX_FEATURE_COUNT)

Training set number of tokens:  114341
Training set number of unique tokens:  4438
Features used:  500


In [9]:
# Function to calculate features for a given text
def calc_features(text):
    text_tokens = text.split()
    features = {}
    for token, count in all_tokens.most_common(MAX_FEATURE_COUNT):
        features[token] = (token in text_tokens)
    return features

In [10]:
# Build features for training set
feature_train_set = train_set.apply(lambda r: (calc_features(r['MDNA_TEXT_BLOB']), r['Review']), axis=1)
feature_train_set.head()

0    ({'discount': True, 'evidence': True, 'content...
1    ({'discount': False, 'evidence': True, 'conten...
2    ({'discount': True, 'evidence': True, 'content...
3    ({'discount': False, 'evidence': False, 'conte...
4    ({'discount': False, 'evidence': True, 'conten...
dtype: object

In [11]:
# Build features for test set
feature_test_set = test_set.apply(lambda r: (calc_features(r['MDNA_TEXT_BLOB']), r['Review']), axis=1)
feature_test_set.head()

51    ({'discount': False, 'evidence': False, 'conte...
52    ({'discount': True, 'evidence': True, 'content...
53    ({'discount': False, 'evidence': False, 'conte...
54    ({'discount': False, 'evidence': True, 'conten...
55    ({'discount': True, 'evidence': True, 'content...
dtype: object

In [12]:
# Train Naive Bayes Classifier
classifier = nltk.NaiveBayesClassifier.train(feature_train_set)
classifier.show_most_informative_features()

Most Informative Features
                   rates = False             neg : pos    =      3.9 : 1.0
                    risk = False             neg : pos    =      3.6 : 1.0
         pharmaceuticals = True              pos : neg    =      3.5 : 1.0
                  states = True              pos : neg    =      3.5 : 1.0
                    size = True              pos : neg    =      3.5 : 1.0
                 pricing = False             neg : pos    =      3.3 : 1.0
                   trade = True              pos : neg    =      3.3 : 1.0
             corporation = True              neg : pos    =      3.0 : 1.0
                revenues = False             pos : neg    =      3.0 : 1.0
               agreement = False             pos : neg    =      3.0 : 1.0


## Performance Evaluation Test Set

In [13]:
# Evaluate the classifier with the test set
test_eval_actual = np.array(feature_test_set.map(lambda r: r[1]) == 'pos')
print('Test set actual:', test_eval_actual)
test_eval_predicted = np.array(feature_test_set.map(lambda r: classifier.classify(r[0]) == 'pos'))
print('Test set predicted:', test_eval_actual)

Test set actual: [ True  True False False  True  True  True False False  True False  True
 False False]
Test set predicted: [ True  True False False  True  True  True False False  True False  True
 False False]


In [14]:
# Print confusion matrix
compute_confusion_matrix(test_eval_actual, test_eval_predicted)

Confusion Matrix 

*Rows represent Actuals and Columns represent Predicted 

True Positive    False Negative 

False Positive   True Negative 

[[6 1]
 [2 5]]


In [15]:
# Print classification report
print_classification_report(test_eval_actual, test_eval_predicted)


Out of Sample Accuracy: 0.785714285714

     Classification Report 

             precision    recall  f1-score   support

      False       0.75      0.86      0.80         7
       True       0.83      0.71      0.77         7

avg / total       0.79      0.79      0.78        14





### Results
Number of reports: 65 (51 training, 14 test)

File_Sampler.py seed value: 1234

Word Class / Training Tokens / Training Unique Tokens / Features / Test Accuracy

nouns / 114343 / 4438 / 100 / 0.786

nouns / 114343 / 4438 / 500 / 0.786

nouns / 114343 / 4438 / 1000 / 0.714

nouns / 114343 / 4438 / 2500 / 0.571

adj / 38906 / 2654 / 100 / 0.5

adj / 38906 / 2654 / 500 / 0.571

adj / 38906 / 2654 / 1000 / 0.571

adj / 38906 / 2654 / 2500 / 0.571

verbs / 40569 / 2942 / 100 / 0.643

verbs / 40569 / 2942 / 500 / 0.714

verbs / 40569 / 2942 / 1000 / 0.714

verbs / 40569 / 2942 / 2500 / 0.5

other / 246342 / 8904 / 100 / 0.643

other / 246342 / 8904 / 1000 / 0.714

other / 246342 / 8904 / 2500 / 0.643

all / 246342 / 8904 / 100 / 0.643

all / 246342 / 8904 / 1000 / 0.714

all / 246342 / 8904 / 2500 / 0.642

### Best Classifier
The highest out of sample accuracy (0.785714285714) was achived by using only the 1000 most common nouns as features.

The confusion matrix is [[6, 1], [2, 5]], the precision is 0.79, the recall is 0.79, and the F1 score is 0.78.

The 5 most informative features of this classifier are:

rates = False (neg : pos = 3.9 : 1.0)

risk = False (neg : pos = 3.6 : 1.0)

pharmaceuticals = True (pos : neg = 3.5 : 1.0)

size = True (pos : neg = 3.5 : 1.0)

states = True (pos : neg = 3.5 : 1.0)

## Performance Evaluation Stock Price Data Set

In [16]:
# Load file with classifications based on stock prices
# Generated with the functions in get_mdna_sentiment.py
stock_class_df = pd.read_csv(STOCK_CLASSIFICATION_FILE)
stock_class_df.head()

Unnamed: 0,File Name,Review
0,"2011_Q1_1017491_APRICUS BIOSCIENCES, INC._10-K...",pos
1,"2011_Q1_1024126_PERNIX THERAPEUTICS HOLDINGS, ...",neg
2,2011_Q1_1030916_PREMIER HOLDING CORP._10-K_201...,neg
3,"2011_Q1_1055726_INOVIO PHARMACEUTICALS, INC._1...",neg
4,2011_Q1_1096738_EPOCRATES INC_10-K_2011-03-31,neg


In [17]:
# Merge both report dataframe with new classification dataframe
report_df['File Name'] = report_df['File Name'].map(lambda r: r.replace('mdna_', ''))
df2 = pd.merge(report_df, stock_class_df, how='inner', on='File Name')
print('Total number of reports:', len(df2))
df2.head()

Total number of reports: 86


Unnamed: 0,MDNA_FILE_NAMES,MDNA_TEXT_BLOB,?Training,File Name,Review
0,mdna_2013_Q1_1356576_SUPERNUS PHARMACEUTICALS ...,ITEM 7.MANAGEMENT'S DISCUSSION AND ANALYSIS OF...,True,2013_Q1_1356576_SUPERNUS PHARMACEUTICALS INC_1...,pos
1,mdna_2012_Q1_911326_SYNAGEVA BIOPHARMA CORP_10...,MANAGEMENTS DISCUSSION AND ANALYSIS OF FINANC...,True,2012_Q1_911326_SYNAGEVA BIOPHARMA CORP_10-K_20...,neg
2,mdna_2015_Q1_1178253_SCYNEXIS INC_10-K_2015-03-30,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS...,True,2015_Q1_1178253_SCYNEXIS INC_10-K_2015-03-30,pos
3,"mdna_2013_Q1_1360214_Imprimis Pharmaceuticals,...",ITEM 7. MANAGEMENTS DISCUSSION AND ANALYSIS OF...,True,"2013_Q1_1360214_Imprimis Pharmaceuticals, Inc....",neg
4,"mdna_2015_Q1_1157602_Vitae Pharmaceuticals, In...",Item7.MANAGEMENT'S DISCUSSION AND ANALYSIS OF ...,True,"2015_Q1_1157602_Vitae Pharmaceuticals, Inc_10-...",neg


In [18]:
# Build features for stock price data set
stock_price_set = df2.apply(lambda r: (calc_features(r['MDNA_TEXT_BLOB']), r['Review']), axis=1)
stock_price_set.head()

0    ({'discount': True, 'evidence': True, 'content...
1    ({'discount': False, 'evidence': True, 'conten...
2    ({'discount': True, 'evidence': True, 'content...
3    ({'discount': False, 'evidence': False, 'conte...
4    ({'discount': False, 'evidence': True, 'conten...
dtype: object

In [19]:
# Evaluate the classifier with the test set
stock_price_eval_actual = np.array(stock_price_set.map(lambda r: r[1]) == 'pos')
print('Stock price data set actual:', test_eval_actual)
stock_price_eval_predicted = np.array(stock_price_set.map(lambda r: classifier.classify(r[0]) == 'pos'))
print('Stock price data set predicted:', test_eval_actual)

Stock price data set actual: [ True  True False False  True  True  True False False  True False  True
 False False]
Stock price data set predicted: [ True  True False False  True  True  True False False  True False  True
 False False]


In [20]:
# Print confusion matrix
compute_confusion_matrix(stock_price_eval_actual, stock_price_eval_predicted)

Confusion Matrix 

*Rows represent Actuals and Columns represent Predicted 

True Positive    False Negative 

False Positive   True Negative 

[[36 28]
 [ 9 13]]


In [21]:
# Print classification report
print_classification_report(stock_price_eval_actual, stock_price_eval_predicted)


Out of Sample Accuracy: 0.56976744186

     Classification Report 

             precision    recall  f1-score   support

      False       0.80      0.56      0.66        64
       True       0.32      0.59      0.41        22

avg / total       0.68      0.57      0.60        86





### Results
Number of reports: 86

The best classifier from the manual classified training set (based on the the 500 most common nouns) was used to classify the stock price data set. The accuracy of that data set is 0.56976744186.

The confusion matrix is [[36, 28], [9, 13]], the precision is 0.68, the recall is 0.57, and the F1 score is 0.6.

## Cross Validation
Cross validation based on the manual classified training set. The average accuracy is 0.8439818295739349.

In [22]:
compute_cross_validation(feature_train_set)


 Begin Cross Validation 

accuracy: 0.868421052631579
accuracy: 0.875
accuracy: 0.8809523809523809
accuracy: 0.875
accuracy: 0.7857142857142857
accuracy: 0.8666666666666667
accuracy: 0.75
accuracy: 0.8571428571428571
accuracy: 0.8125
accuracy: 0.868421052631579

 The average accuracy is:  0.8439818295739349
