# IEOR 242 Assignment 06
Classify MDA sections of 10-K reports with tf-idf and the Loughran and McDonald dictionary

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# File_Sampler.py provides a helper function that reads the extracted MD&A sections from a directory into a pandas DataFrame
from File_Sampler import get_data_frame

In [2]:
# ---- Input locations ----
# Wildcard pattern for the extracted MD&A section files
REPORT_PATH = 'mdna_sections/*'
# Excel workbook with the Loughran-McDonald master dictionary
MASTER_DICT_PATH = '../lecture/LoughranMcDonald_MasterDictionary_2014.xlsx'
# CSV file holding the classification of each report
CLASSIFICATION_FILE = 'MDNA_auto_classification_week6.2.csv'

# ---- tf-idf settings ----
# Upper bound on the number of tf-idf features (vocabulary size)
MAX_FEATURE_COUNT = 10000
# Smallest and largest n-gram length to extract
N_GRAMS_MIN, N_GRAMS_MAX = 1, 3

## Data Preparation
Load the extracted MD&A sections and the automatically generated classification file from last week's assignment.

In [3]:
# Read the extracted MD&A sections (produced earlier with the functions in extract_comp_name.py)
# NOTE(review): 0.75 is presumably the training fraction behind the '?Training' column - confirm in File_Sampler.py
report_df = get_data_frame(REPORT_PATH, 0.75)

# Build the merge key: keep only the last path component and remove the 'mdna_' marker
report_df['File Name'] = [
    path.rsplit('/', 1)[-1].replace('mdna_', '')
    for path in report_df['MDNA_FILE_NAMES']
]
report_df.head()

Unnamed: 0,MDNA_FILE_NAMES,MDNA_TEXT_BLOB,?Training,File Name
78,mdna_1997_Q1_1000185_STERLING HOUSE CORP_10-K_...,ITEM 7. Management's Discussion and Analysis o...,True,1997_Q1_1000185_STERLING HOUSE CORP_10-K_1997-...
366,mdna_2014_Q2_1493212_MEDIJANE HOLDINGS INC._10...,Item 7. Management's Discussion and Analysis o...,True,2014_Q2_1493212_MEDIJANE HOLDINGS INC._10-K_20...
351,mdna_1997_Q1_867572_AMERICAN BIOMED INC_10-K_1...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,True,1997_Q1_867572_AMERICAN BIOMED INC_10-K_1997-0...
755,"mdna_2012_Q4_1317880_Plandai Biotechnology, In...",ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,True,"2012_Q4_1317880_Plandai Biotechnology, Inc._10..."
564,mdna_1997_Q1_949173_AVIRON_10-K_1997-03-26,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS...,True,1997_Q1_949173_AVIRON_10-K_1997-03-26


In [4]:
# Load the classification file; the 'Review' column holds the pos/neg label of each report.
# NOTE(review): the file name and the section text above describe these labels as automatically
# generated ("auto classification"), not manual - confirm how they were produced.
class_df = pd.read_csv(CLASSIFICATION_FILE)
class_df.head()

Unnamed: 0,File Name,Ticker,Year,Val1,Val2,Review
0,1999_Q1_1000697_WATERS CORP -DE-_10-K_1999-03-31,WAT,1999,1.18534,68.470588,pos
1,1999_Q1_1030339_NANOGEN INC_10-K_1999-03-29,NGEN,1999,1.18534,-7.272727,neg
2,1999_Q1_1038133_HESKA CORP_10-K_1999-03-29,HSKA,1999,1.18534,86.486486,pos
3,1999_Q1_741815_HOOPER HOLMES INC_10-K_1999-03-31,HH,1999,1.18534,-26.993865,neg
4,1999_Q1_749647_CELSION CORP_10-K_1999-01-13,CLSN,1999,1.18534,11.538462,pos


In [5]:
# Attach the labels to the reports; the inner join keeps only reports whose 'File Name' appears in both frames
df = report_df.merge(class_df, on='File Name', how='inner')
print('Total number of reports: %d' % df.shape[0])
df.head()

Total number of reports: 536


Unnamed: 0,MDNA_FILE_NAMES,MDNA_TEXT_BLOB,?Training,File Name,Ticker,Year,Val1,Val2,Review
0,"mdna_2012_Q4_1317880_Plandai Biotechnology, In...",ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,True,"2012_Q4_1317880_Plandai Biotechnology, Inc._10...",PLPL,2012,1.879148,65.217391,pos
1,mdna_2007_Q2_1008586_STREAMLINE HEALTH SOLUTIO...,Item7.\n\n\nManagements\n Discussion and An...,True,2007_Q2_1008586_STREAMLINE HEALTH SOLUTIONS IN...,STRM,2007,-1.688245,23.866348,pos
2,mdna_2001_Q1_1108205_CURIS INC_10-K_2001-03-30,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS ...,True,2001_Q1_1108205_CURIS INC_10-K_2001-03-30,CRIS,2001,2.646647,3.506339,pos
3,mdna_2014_Q3_1583771_ContraVir Pharmaceuticals...,ITEM 7.MANAGEMENT'S DISCUSSION AND ANALYSIS OF...,True,"2014_Q3_1583771_ContraVir Pharmaceuticals, Inc...",CTRV,2014,5.05585,8.108108,pos
4,mdna_2010_Q3_1443242_Bohai Pharmaceuticals Gro...,Item 7.Managements Discussion\nand Analysis or...,True,"2010_Q3_1443242_Bohai Pharmaceuticals Group, I...",BOPH,2010,-8.214596,-4.977376,pos


## Calculate Weights
TfidfVectorizer: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [6]:
# Create the tf-idf vectorizer: word-level n-grams of length N_GRAMS_MIN..N_GRAMS_MAX,
# English stop words removed, vocabulary capped at MAX_FEATURE_COUNT features
tf = TfidfVectorizer(analyzer='word',
                     stop_words='english',
                     max_features=MAX_FEATURE_COUNT,
                     ngram_range=(N_GRAMS_MIN, N_GRAMS_MAX))
# Learn the vocabulary and compute one tf-idf row per report
tfidf_matrix = tf.fit_transform(df['MDNA_TEXT_BLOB'].tolist())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); use the new method when available and fall back otherwise
if hasattr(tf, 'get_feature_names_out'):
    feature_names = tf.get_feature_names_out().tolist()
else:
    feature_names = tf.get_feature_names()
print('Number of features: %d' % len(feature_names))

Number of features: 10000


In [7]:
# Dense tf-idf table: one row per report, one column per feature.
# Feature names are upper-cased so they can be matched against the dictionary's 'Word' column.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=[term.upper() for term in feature_names]
)
tfidf_df.head()

Unnamed: 0,00,00 12,00 12 00,00 21,00 FSL,00 FSL WORKSTATION,00 SHARE,000,000 000,000 000 000,...,YIELD,YIELD CURVE,YIELDS,YORK,ZANAFLEX,ZANAFLEX TABLETS,ZEGERID,ZEQ,ZEQ SEQ,ZERO
0,0.0,0,0,0,0,0,0.0,0.008128,0.0,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
1,0.0,0,0,0,0,0,0.0,0.081188,0.018775,0.011966,...,0,0,0,0,0,0,0,0.0,0.0,0.0
2,0.011625,0,0,0,0,0,0.011195,0.230309,0.009237,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
3,0.0,0,0,0,0,0,0.0,0.02256,0.007019,0.0,...,0,0,0,0,0,0,0,0.05965,0.05965,0.004635
4,0.0,0,0,0,0,0,0.0,0.054002,0.013202,0.0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


## Scoring with Finance Dictionary
Loughran-McDonald dictionary source: http://www3.nd.edu/~mcdonald/Word_Lists.html

In [8]:
# Load the Loughran-McDonald master dictionary from the Excel workbook
# NOTE(review): the name `dict` shadows Python's built-in `dict` for the rest of the notebook;
# renaming it also requires updating the cell that builds `minidict`.
dict = pd.read_excel(MASTER_DICT_PATH)
dict.head()

Unnamed: 0,Word,Sequence Number,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Constraining,Superfluous,Interesting,Modal,Irr_Verb,Harvard_IV,Syllables,Source
0,AARDVARK,1,81,5.690194e-09,3.06874e-09,5.779943e-07,45,0,0,0,0,0,0,0,0,0,0,2,12of12inf
1,AARDVARKS,2,2,1.404986e-10,8.217606e-12,7.84187e-09,1,0,0,0,0,0,0,0,0,0,0,2,12of12inf
2,ABACI,3,8,5.619945e-10,1.686149e-10,7.09624e-08,7,0,0,0,0,0,0,0,0,0,0,3,12of12inf
3,ABACK,4,5,3.512466e-10,1.727985e-10,7.532677e-08,5,0,0,0,0,0,0,0,0,0,0,2,12of12inf
4,ABACUS,5,1752,1.230768e-07,1.198634e-07,1.110293e-05,465,0,0,0,0,0,0,0,0,0,0,3,12of12inf


In [9]:
# Keep only the dictionary entries whose word is a tf-idf feature of the reports,
# and index the result by word
minidict = (
    dict[dict['Word'].isin(tfidf_df.columns)]
    .set_index('Word')
)

In [10]:
# Normalise the polarity columns: any value above zero becomes +1 for 'Positive'
# and -1 for 'Negative'; all other entries are left as they are
for column, flag in (('Positive', 1), ('Negative', -1)):
    minidict.loc[minidict[column] > 0, column] = flag
minidict.head()

Unnamed: 0_level_0,Sequence Number,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Constraining,Superfluous,Interesting,Modal,Irr_Verb,Harvard_IV,Syllables,Source
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ABANDONED,11,174298,1.2e-05,1.2e-05,8.8e-05,83234,-1,0,0,0,0,0,0,0,0,2,3,12of12inf
ABILITY,101,8083865,0.000568,0.0006,0.000541,754403,0,0,0,0,0,0,0,0,0,0,4,12of12inf
ABLE,126,3253260,0.000229,0.000232,0.000343,553588,0,1,0,0,0,0,0,0,0,0,2,12of12inf
ABSENCE,247,845276,5.9e-05,3.6e-05,0.000109,226760,-1,0,0,0,0,0,0,0,0,1,2,12of12inf
ABSOLUTE,266,660860,4.6e-05,3.4e-05,0.000105,212346,0,0,0,0,0,0,0,0,0,0,3,12of12inf


In [11]:
# Reshape to one row per word (index named 'Word') and one column per report,
# so the frame can be merged with the dictionary on its index.
# The guard keeps this cell idempotent: without it, re-running the cell would
# transpose the frame back and silently break the merge below.
if tfidf_df.index.name != 'Word':
    tfidf_df = tfidf_df.T
    tfidf_df.index.name = 'Word'
tfidf_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,526,527,528,529,530,531,532,533,534,535
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00,0,0,0.011625,0,0,0.004898,0.015678,0.003996,0,0,...,0.003233,0,0.041172,0,0.02063,0,0.01649,0.00791,0,0.004246
00 12,0,0,0.0,0,0,0.0,0.0,0.0,0,0,...,0.0,0,0.0,0,0.0,0,0.008355,0.0,0,0.0
00 12 00,0,0,0.0,0,0,0.0,0.0,0.0,0,0,...,0.0,0,0.0,0,0.0,0,0.0,0.0,0,0.0
00 21,0,0,0.0,0,0,0.0,0.0,0.0,0,0,...,0.0,0,0.0,0,0.0,0,0.0,0.0,0,0.0
00 FSL,0,0,0.0,0,0,0.0,0.0,0.0,0,0,...,0.0,0,0.0,0,0.0,0,0.0,0.0,0,0.0


In [12]:
# Join the per-word tf-idf values with the dictionary attributes on the shared 'Word' index;
# the inner join drops features that are not dictionary words
senti_df = tfidf_df.merge(minidict, left_index=True, right_index=True, how='inner')
senti_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,Uncertainty,Litigious,Constraining,Superfluous,Interesting,Modal,Irr_Verb,Harvard_IV,Syllables,Source
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABANDONED,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,2,3,12of12inf
ABILITY,0.027094,0.002438,0.033987,0.01823,0.0,0.010108,0.0,0.012369,0.012731,0.006546,...,0,0,0,0,0,0,0,0,4,12of12inf
ABLE,0.0,0.008696,0.052293,0.005419,0.003397,0.016024,0.004274,0.006536,0.015137,0.003891,...,0,0,0,0,0,0,0,0,2,12of12inf
ABSENCE,0.0,0.0,0.004355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,2,12of12inf
ABSOLUTE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,3,12of12inf


In [13]:
# Calculate sentiments for each report.
# senti_df holds one column per report (labelled with the report's index in df) next to the
# dictionary columns. Each report column is weighted by the word polarity and summed over
# all words, for all reports at once instead of looping over df row by row.
report_weights = senti_df[list(df.index)]
# skipna=False keeps the NaN-propagating behaviour of the builtin sum();
# .values assigns positionally, and the columns were selected in df.index order
df['senti_pos'] = report_weights.mul(senti_df['Positive'], axis=0).sum(axis=0, skipna=False).values
df['senti_neg'] = report_weights.mul(senti_df['Negative'], axis=0).sum(axis=0, skipna=False).values
df.head()

Unnamed: 0,MDNA_FILE_NAMES,MDNA_TEXT_BLOB,?Training,File Name,Ticker,Year,Val1,Val2,Review,senti_pos,senti_neg
0,"mdna_2012_Q4_1317880_Plandai Biotechnology, In...",ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,True,"2012_Q4_1317880_Plandai Biotechnology, Inc._10...",PLPL,2012,1.879148,65.217391,pos,0.088082,-0.305968
1,mdna_2007_Q2_1008586_STREAMLINE HEALTH SOLUTIO...,Item7.\n\n\nManagements\n Discussion and An...,True,2007_Q2_1008586_STREAMLINE HEALTH SOLUTIONS IN...,STRM,2007,-1.688245,23.866348,pos,0.09686,-0.096045
2,mdna_2001_Q1_1108205_CURIS INC_10-K_2001-03-30,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS ...,True,2001_Q1_1108205_CURIS INC_10-K_2001-03-30,CRIS,2001,2.646647,3.506339,pos,0.483604,-0.588864
3,mdna_2014_Q3_1583771_ContraVir Pharmaceuticals...,ITEM 7.MANAGEMENT'S DISCUSSION AND ANALYSIS OF...,True,"2014_Q3_1583771_ContraVir Pharmaceuticals, Inc...",CTRV,2014,5.05585,8.108108,pos,0.085023,-0.173237
4,mdna_2010_Q3_1443242_Bohai Pharmaceuticals Gro...,Item 7.Managements Discussion\nand Analysis or...,True,"2010_Q3_1443242_Bohai Pharmaceuticals Group, I...",BOPH,2010,-8.214596,-4.977376,pos,0.124318,-0.094662


## Result Validation

In [14]:
def senti_label(pos, neg):
    """Turn a positive and a negative sentiment value into a class label.

    pos -- positive sentiment value
    neg -- negative sentiment value; it is added to pos as-is

    Returns 'pos' when pos + neg is zero or greater, otherwise 'neg'.
    """
    return 'pos' if (pos + neg) >= 0 else 'neg'

In [15]:
# Predicted label for every report, kept on the same index as df
y_pred = pd.Series(
    [senti_label(pos, neg) for pos, neg in zip(df['senti_pos'], df['senti_neg'])],
    index=df.index
)

In [16]:
# Share of reports whose predicted label equals the label in 'Review'
print('Accuracy classification score: %f' % accuracy_score(y_true=df['Review'], y_pred=y_pred))

Accuracy classification score: 0.457090


In [17]:
# Rows are the true labels and columns the predicted labels, each in the order neg, pos
print('Confusion matrix:')
print(confusion_matrix(y_true=df['Review'], y_pred=y_pred))

Confusion matrix:
[[170  68]
 [223  75]]


In [18]:
# Per-class precision, recall, f1-score and support
print('Classification report:')
print(classification_report(y_true=df['Review'], y_pred=y_pred))

Classification report:
             precision    recall  f1-score   support

        neg       0.43      0.71      0.54       238
        pos       0.52      0.25      0.34       298

avg / total       0.48      0.46      0.43       536

