# IEOR 242 Assignment 07
Prepare the data by reading the MD&A (Management's Discussion and Analysis) extracts from files and merging them with their labels. The result is saved in a pickle file. This version of the script also includes additional columns for the change values.

In [1]:
import glob
import os
import pickle
import re

import pandas as pd

In [2]:
import pymysql
from sqlalchemy import create_engine
from sqlalchemy.dialects import mysql

# Connect to team database.
# The connection URL is taken from the IEOR242_DB_URL environment variable so
# that real credentials do not have to be written into the notebook. When the
# variable is unset, the URL template below (user, password, host, optional
# port and database name still to be filled in) is used as the fallback.
DB_URL = os.environ.get('IEOR242_DB_URL',
                        'mysql+pymysql://<user>:<password>@<host>[:<port>]/<dbname>')
engine = create_engine(DB_URL)

In [3]:
# Glob pattern that selects every file in the MDA report directory
REPORT_PATH = 'assignment-07-report-mda/*'

# Destination file for the pickled result
PICKLE_FILE = 'prepared_class2.pickle'

# Regex for parsing the file list; the three capture groups are read
# downstream as year, quarter and CIK (in that order)
sec_regex = re.compile(
    r'mdna_(\d{4})'                  # group 1: four-digit year
    r'_Q(\d{1})'                     # group 2: single-digit quarter
    r'_(\d+)'                        # group 3: CIK
    r'_.+_10-K_\d{4}-\d{2}-\d{2}'    # free-form part, then 10-K marker and a date-like suffix
)

## Label Loading
Load the table with the labels.

In [5]:
# Query for the label table: the key columns (cik, fyearq, fqtr), the four
# pos/neg labels and the four matching change values
label_query = (
    'SELECT cik, fyearq, fqtr, oiadpq_posneg, atq_posneg, ceqq_posneg, saleq_posneg, '
    'oiadpq_change, atq_change, ceqq_change, saleq_change '
    'FROM Compustat_Health_Quarterly_Sentiment_With_Change'
)

# Read labels from database
fin_label_df = pd.read_sql_query(label_query, engine)

print('Number of labels: %d' % fin_label_df.shape[0])
fin_label_df.head()

Number of labels: 114212


Unnamed: 0,cik,fyearq,fqtr,oiadpq_posneg,atq_posneg,ceqq_posneg,saleq_posneg,oiadpq_change,atq_change,ceqq_change,saleq_change
0,319126,1990,3,neg,neg,pos,pos,4.232804233,-3.592531,6.880860452,9.44540018
1,319126,1990,4,neg,neg,neg,neg,-150.9306261,-7.149258,-7.753838214,-34.8502994
2,319126,1991,1,neg,neg,neg,pos,-124.2524917,-1.211159,-2.657342657,11.74172794
3,319126,1991,2,pos,neg,neg,neg,61.64383562,-3.009865,-2.614942529,-2.920008225
4,319126,1991,3,neg,neg,neg,pos,-3.813559322,-3.541951,-3.290056064,7.593730142


## Reading Files & Matching
Read the file contents and match them to the labels.

In [6]:
# Columns of the prepared dataset: report text, the four pos/neg labels and
# the four change values
class_columns = ['content', 'OIADP', 'AT', 'CEQ', 'SALES',
                 'OIADP_change', 'AT_change', 'CEQ_change', 'SALES_change']

# Label-table columns copied into each record, in the same order as
# class_columns[1:]
source_label_columns = ['oiadpq_posneg', 'atq_posneg', 'ceqq_posneg', 'saleq_posneg',
                        'oiadpq_change', 'atq_change', 'ceqq_change', 'saleq_change']

# One record per matched report. The dataframe is built once after the loop
# instead of being grown row by row, which re-allocates on every insert.
records = []

# Iterate through data directory
for path in glob.iglob(REPORT_PATH):
    file_name = os.path.basename(path)

    # Parse the file name once; skip anything that does not follow the
    # expected naming scheme instead of failing on a missing match
    name_match = sec_regex.search(file_name)
    if name_match is None:
        continue
    year, quarter, cik = (int(part) for part in name_match.groups())

    # Match file with label
    label_rows = fin_label_df.loc[(fin_label_df['cik'] == cik) &
                                  (fin_label_df['fyearq'] == year) &
                                  (fin_label_df['fqtr'] == quarter)]
    if label_rows.empty:
        # No label for this report: leave it out of the dataset
        continue

    # If several label rows match, the first one is used
    label = label_rows.iloc[0]

    # Add file content and labels to the record list
    # NOTE(review): the file is decoded with the platform default encoding,
    # as before - confirm that this suits the report files
    with open(path, 'r') as file:
        records.append([file.read()] + [label[col] for col in source_label_columns])

# Dataframe with file content and labels (column dtypes are inferred by pandas
# from the collected values)
class_df = pd.DataFrame(records, columns=class_columns)

print('Number of reports: %d' % len(class_df))
class_df.head()

Number of reports: 12504


Unnamed: 0,content,OIADP,AT,CEQ,SALES,OIADP_change,AT_change,CEQ_change,SALES_change
0,Item7.Management's Discussion and Analysis of ...,pos,neg,neg,neg,19.30809539,-5.710814,-16.88298821,-9.225092251
1,The following discussion and analysis provide...,neg,pos,pos,neg,-14.5187602,14.640276,11.23801315,-1.89063409
2,Item 7.\nManagements Discussion and Analysis o...,neg,pos,pos,neg,-22.56208359,0.525303,2.764397345,-11.54303553
3,-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...,neg,neg,pos,neg,-814.6341463,-56.183942,15.27777778,-65.41343833
4,Item 7. Management's Discussion and Analy...,pos,neg,neg,neg,328.5234899,-1.733328,-5.815035355,-39.56639566


In [7]:
# Serialize the prepared dataframe to PICKLE_FILE with the highest pickle
# protocol available
with open(PICKLE_FILE, mode='wb') as pickle_out:
    pickle.dump(class_df, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)