# IEOR 242 Assignment 09
Prepare data by reading MDA extracts from files and merging them with labels. Result is saved in a Pickle file.

In [1]:
import glob
import re
import pandas as pd
import pickle

In [2]:
import pymysql
from sqlalchemy import create_engine
from sqlalchemy.dialects import mysql

# Connect to team database
engine = create_engine('mysql+pymysql://<user>:<password>@<host>[:<port>]/<dbname>')

In [3]:
# Path to MDA section files
REPORT_PATH = 'assignment-07-report-mda/*'

# Regex for parsing the file list
sec_regex = re.compile(r'mdna_(\d{4})_Q(\d{1})_(\d+)_.+_10-K_\d{4}-\d{2}-\d{2}')

# Path to the Pickle file
PICKLE_FILE = 'prepared_subsector.pickle'

## Label Loading
Load the table with the labels.

In [4]:
# Read labels from database
fin_label_df = pd.read_sql_query('SELECT a.cik AS cik, a.fyearq AS year, a.fqtr AS quarter, a.saleq, ' \
                                 'a.saleq_change, b.gsubind AS subsector, b.conm AS name, b.tic ' \
                                 'FROM Compustat_Health_Quarterly_Sentiment_With_Change a ' \
                                 'JOIN Compustat_sub b ON a.cik = b.cik AND a.fyearq = b.fyear', engine)

print('Number of reports: %d' % len(fin_label_df))
fin_label_df.head()

Number of reports: 21018


Unnamed: 0,cik,year,quarter,saleq,saleq_change,subsector,name,tic
0,1800.0,2010,1,7698.354,14.64806633,35101010,ABBOTT LABORATORIES,ABT
1,1800.0,2010,2,8826.014,-1.716618623,35101010,ABBOTT LABORATORIES,ABT
2,1800.0,2010,3,8674.505,14.90970378,35101010,ABBOTT LABORATORIES,ABT
3,1800.0,2010,4,9967.848,-9.299880977,35101010,ABBOTT LABORATORIES,ABT
4,1800.0,2011,1,9040.85,6.364899318,35101010,ABBOTT LABORATORIES,ABT


## Reading Files & Matching
Read the file contents and match them to the labels.

In [5]:
# Dataframe with file content and labels
class_df = pd.DataFrame(columns=['content', 'cik', 'year', 'quarter', 'subsector', 'name', 'tic',
                                 'saleq', 'saleq_change'])

# Iterate through data directory
for path in glob.iglob(REPORT_PATH):
    file_name = path.split('/')[-1]
    year = sec_regex.search(file_name).group(1)
    quarter = sec_regex.search(file_name).group(2)
    cik = sec_regex.search(file_name).group(3)

    # Match file with label
    label_row = fin_label_df.loc[(fin_label_df['cik'] == int(cik)) & \
                             (fin_label_df['year'] == int(year)) & \
                             (fin_label_df['quarter'] == int(quarter))]
    if len(label_row) > 0:
        # Add file content and labels to dataframe
        with open(path, 'r') as file:
            class_df.loc[len(class_df)] = [file.read(),
                                           int(label_row.iloc[0]['cik']),
                                           int(label_row.iloc[0]['year']),
                                           int(label_row.iloc[0]['quarter']),
                                           int(label_row.iloc[0]['subsector']),
                                           label_row.iloc[0]['name'],
                                           label_row.iloc[0]['tic'],
                                           label_row.iloc[0]['saleq'],
                                           label_row.iloc[0]['saleq_change']]

print('Number of reports matched: %d' % len(class_df))
class_df.head()

Number of reports matched: 3625


Unnamed: 0,content,cik,year,quarter,subsector,name,tic,saleq,saleq_change
0,Item7.Management's Discussion and Analysis of ...,855654.0,2012.0,3.0,35201010.0,IMMUNOGEN INC,IMGN,3.252,-9.225092251
1,The following discussion and analysis provide...,788920.0,2011.0,3.0,35101010.0,PRO-DEX INC/CO,PDEX,6.876,-1.89063409
2,Item 7. Managements Discussion and Analysis of...,795551.0,2011.0,1.0,35101010.0,THERAGENICS CORP,TGX,20.253,6.334863971
3,Managements Discussion and Analysis of Financ...,352915.0,2012.0,1.0,35102020.0,UNIVERSAL HEALTH SVCS INC,UHS,1755.536,-1.882217169
4,MANAGEMENTS DISCUSSION AND ANALYSIS OF FINANC...,884731.0,2013.0,1.0,35201010.0,ARIAD PHARMACEUTICALS INC,ARIA,6.464,116.769802


In [6]:
# Save data as Pickle
with open(PICKLE_FILE, 'wb') as f:
    pickle.dump(class_df, f, pickle.HIGHEST_PROTOCOL)