# IEOR 242 Assignment 10
Prepare data by reading MDA extracts from files and merging them with labels. Result is saved in a Pickle file.

In [1]:
import glob
import re
import pandas as pd
import pickle

from clean_helper import cleanup_mda

In [2]:
import pymysql
from sqlalchemy import create_engine
from sqlalchemy.dialects import mysql

# Connect to team database
engine = create_engine('mysql+pymysql://<user>:<password>@<host>[:<port>]/<dbname>')

In [3]:
# Path to MDA section files
REPORT_PATH = 'assignment-07-report-mda/*'

# Regex for parsing the file list
sec_regex = re.compile(r'mdna_(\d{4})_Q(\d{1})_(\d+)_.+_10-K_\d{4}-\d{2}-\d{2}')

# Path to the Pickle file
PICKLE_FILE = 'prepared_reports_final.pickle'

## Label Loading
Load the table with the labels.

In [4]:
# Read labels from database
fin_label_df = pd.read_sql_query('SELECT a.cik AS cik, a.fyearq AS year, a.fqtr AS quarter, a.saleq, ' \
                                 'a.oiadpq_posneg, a.atq_posneg, a.ceqq_posneg, a.saleq_posneg, ' \
                                 'a.oiadpq_change, a.atq_change, a.ceqq_change, a.saleq_change, ' \
                                 'b.gsubind AS subsector, b.conm AS name, b.tic ' \
                                 'FROM Compustat_Health_Quarterly_Sentiment_With_Change a ' \
                                 'LEFT JOIN Compustat_sub b ON a.cik = b.cik AND a.fyearq = b.fyear', engine)

print('Number of reports: %d' % len(fin_label_df))
fin_label_df.head()

Number of reports: 114444


Unnamed: 0,cik,year,quarter,saleq,oiadpq_posneg,atq_posneg,ceqq_posneg,saleq_posneg,oiadpq_change,atq_change,ceqq_change,saleq_change,subsector,name,tic
0,1800.0,2010,1,7698.354,pos,pos,neg,pos,22.83181209,1.797138,-4.546430964,14.64806633,35101010.0,ABBOTT LABORATORIES,ABT
1,1800.0,2010,2,8826.014,pos,pos,pos,neg,6.578140464,0.042692,7.494198514,-1.716618623,35101010.0,ABBOTT LABORATORIES,ABT
2,1800.0,2010,3,8674.505,pos,pos,pos,pos,19.79259504,7.478668,4.598234567,14.90970378,35101010.0,ABBOTT LABORATORIES,ABT
3,1800.0,2010,4,9967.848,neg,pos,pos,neg,-28.10679154,3.640429,10.01206666,-9.299880977,35101010.0,ABBOTT LABORATORIES,ABT
4,1800.0,2011,1,9040.85,pos,pos,pos,pos,22.80394491,3.681489,7.102021344,6.364899318,35101010.0,ABBOTT LABORATORIES,ABT


## Reading Files & Matching
Read the file contents and match them to the labels.

In [5]:
# Dataframe with file content and labels
class_df = pd.DataFrame(columns=['content', 'cik', 'year', 'quarter', 'subsector', 'name', 'tic',
                                 'SALES_value', 'OIADP', 'AT', 'CEQ', 'SALES', 'OIADP_change',
                                 'AT_change', 'CEQ_change', 'SALES_change'])

# Iterate through data directory
for path in glob.iglob(REPORT_PATH):
    file_name = path.split('/')[-1]
    year = sec_regex.search(file_name).group(1)
    quarter = sec_regex.search(file_name).group(2)
    cik = sec_regex.search(file_name).group(3)

    # Match file with label
    label_row = fin_label_df.loc[(fin_label_df['cik'] == int(cik)) & \
                             (fin_label_df['year'] == int(year)) & \
                             (fin_label_df['quarter'] == int(quarter))]
    if len(label_row) > 0:
        # Add cleaned file content and labels to dataframe
        with open(path, 'r') as file:
            class_df.loc[len(class_df)] = [file.read(),
                                           int(label_row.iloc[0]['cik']),
                                           int(label_row.iloc[0]['year']),
                                           int(label_row.iloc[0]['quarter']),
                                           label_row.iloc[0]['subsector'],
                                           label_row.iloc[0]['name'],
                                           label_row.iloc[0]['tic'],
                                           label_row.iloc[0]['saleq'],
                                           label_row.iloc[0]['oiadpq_posneg'],
                                           label_row.iloc[0]['atq_posneg'],
                                           label_row.iloc[0]['ceqq_posneg'],
                                           label_row.iloc[0]['saleq_posneg'],
                                           label_row.iloc[0]['oiadpq_change'],
                                           label_row.iloc[0]['atq_change'],
                                           label_row.iloc[0]['ceqq_change'],
                                           label_row.iloc[0]['saleq_change']]

print('Number of reports matched: %d' % len(class_df))
class_df.head()

Number of reports matched: 12504


Unnamed: 0,content,cik,year,quarter,subsector,name,tic,SALES_value,OIADP,AT,CEQ,SALES,OIADP_change,AT_change,CEQ_change,SALES_change
0,Item7.Management's Discussion and Analysis of ...,855654.0,2012.0,3.0,35201010.0,IMMUNOGEN INC,IMGN,3.252,pos,neg,neg,neg,19.30809539,-5.710814,-16.88298821,-9.225092251
1,The following discussion and analysis provide...,788920.0,2011.0,3.0,35101010.0,PRO-DEX INC/CO,PDEX,6.876,neg,pos,pos,neg,-14.5187602,14.640276,11.23801315,-1.89063409
2,Item 7.\nManagements Discussion and Analysis o...,70487.0,2008.0,1.0,,,,13.454,neg,pos,pos,neg,-22.56208359,0.525303,2.764397345,-11.54303553
3,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,1027915.0,1999.0,2.0,,,,7.873,neg,neg,pos,neg,-814.6341463,-56.183942,15.27777778,-65.41343833
4,Item 7. Management's Discussion and Analy...,749660.0,2003.0,1.0,,,,2.214,pos,neg,neg,neg,328.5234899,-1.733328,-5.815035355,-39.56639566


In [6]:
# Clean up the data
class_df['content'] = cleanup_mda(class_df['content'])
class_df.head()

Unnamed: 0,content,cik,year,quarter,subsector,name,tic,SALES_value,OIADP,AT,CEQ,SALES,OIADP_change,AT_change,CEQ_change,SALES_change
0,Item Management s Discussion and Analysis of ...,855654.0,2012.0,3.0,35201010.0,IMMUNOGEN INC,IMGN,3.252,pos,neg,neg,neg,19.30809539,-5.710814,-16.88298821,-9.225092251
1,The following discussion and analysis provide...,788920.0,2011.0,3.0,35101010.0,PRO-DEX INC/CO,PDEX,6.876,neg,pos,pos,neg,-14.5187602,14.640276,11.23801315,-1.89063409
2,Item Managements Discussion and Analysis of...,70487.0,2008.0,1.0,,,,13.454,neg,pos,pos,neg,-22.56208359,0.525303,2.764397345,-11.54303553
3,BEGIN PRIVACY ENHANCED MESSAGE Proc ...,1027915.0,1999.0,2.0,,,,7.873,neg,neg,pos,neg,-814.6341463,-56.183942,15.27777778,-65.41343833
4,Item Management s Discussion and Analy...,749660.0,2003.0,1.0,,,,2.214,pos,neg,neg,neg,328.5234899,-1.733328,-5.815035355,-39.56639566


In [7]:
# Save data as Pickle
with open(PICKLE_FILE, 'wb') as f:
    pickle.dump(class_df, f, pickle.HIGHEST_PROTOCOL)