## Team 1 - Regime Modeling For Prinicipal Financial Group 

### Data Pre-Processing Logic

In [1]:
# importing required packages

import pandas as pd
from dateutil.parser import parse
import csv
import numpy as np
import psycopg2 
from psycopg2.extras import RealDictCursor
from pandas.tseries.offsets import *

In [2]:
# counting and displaying number of lines in raw data file

In [3]:
with open('table.csv') as f:
    num_rows = sum(1 for line in f)

In [4]:
num_rows

5755967

### Pre-processing Algorithm

Since we have over 5 million rows and 300 columns, the logic implemented follows the following steps:
* Reads 50,000 rows at one time
* Converts 'date' column to date type (it is currently stores as a string)
* Creates a column 'week_ending' that holds the date of the nearest Friday
* Drops the 'date' column
* Aggregates columns by 'week_ending', adds up all of the text features

In [5]:
i = 0
read_size = 50000
final_df = pd.DataFrame()

while ((read_size*(i+1)) < num_rows):
    print("Processing piece: ",i+1)
    col_list = [j for j in range(4,313)]
    col_list.insert(0,0)
    df = pd.read_csv("table.csv", skiprows=read_size*i, nrows=read_size, usecols = col_list)
    col_names = pd.read_excel('ColNames.xlsx')
    df.columns = col_names[0]
    df['date']  = pd.to_datetime(df['date'])
    df['one'] = 1
    df['week_ending'] = df['date'] + Week(weekday=4)
    df.drop(['date'], axis = 1, inplace=True)
    df_group = pd.DataFrame(df.groupby('week_ending').sum())
    final_df = final_df.append(df_group)
    i = i + 1


Processing piece:  1
Processing piece:  2
Processing piece:  3
Processing piece:  4
Processing piece:  5
Processing piece:  6
Processing piece:  7
Processing piece:  8
Processing piece:  9
Processing piece:  10
Processing piece:  11
Processing piece:  12
Processing piece:  13
Processing piece:  14
Processing piece:  15
Processing piece:  16
Processing piece:  17
Processing piece:  18
Processing piece:  19
Processing piece:  20
Processing piece:  21
Processing piece:  22
Processing piece:  23
Processing piece:  24
Processing piece:  25
Processing piece:  26
Processing piece:  27
Processing piece:  28
Processing piece:  29
Processing piece:  30
Processing piece:  31
Processing piece:  32
Processing piece:  33
Processing piece:  34
Processing piece:  35
Processing piece:  36
Processing piece:  37
Processing piece:  38
Processing piece:  39
Processing piece:  40
Processing piece:  41
Processing piece:  42
Processing piece:  43
Processing piece:  44
Processing piece:  45
Processing piece:  

In [6]:
# checking length of dataframe

In [7]:
len(final_df)

90439

In [8]:
final_df.head()

Unnamed: 0_level_0,free_cash_flow,competitive_environment,strong_demand_bool,product_cycle,epidemic,higher_prices_bool,government_dissatisfaction,weak_strategy_bool,corporate_governance,management,...,industry_neg_headwinds,management_defensiveness,industry_pos_general,finance_neg_revenue_down,industry_neg_demand_unstable,economy_neg_yield_curve_inverting,management_buzz_growth,industry_specific_ltip_drivers,industry_neg_margin_larger,one
week_ending,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2003-01-17,40.0,52.0,52.0,0.0,0.0,32.0,0.0,40.0,0.0,270.0,...,12.0,4.0,16.0,56.0,4.0,0.0,16.0,0.0,0.0,30
2003-01-24,46.0,44.0,56.0,0.0,0.0,8.0,0.0,0.0,0.0,459.0,...,0.0,0.0,0.0,64.0,11.0,0.0,0.0,0.0,0.0,57
2003-01-31,30.0,22.0,34.0,0.0,0.0,10.0,0.0,10.0,6.0,94.0,...,0.0,3.0,0.0,45.0,21.0,0.0,0.0,0.0,0.0,50
2003-02-07,48.0,46.0,58.0,4.0,0.0,8.0,0.0,8.0,0.0,272.0,...,0.0,4.0,0.0,85.0,20.0,0.0,0.0,0.0,0.0,68
2003-02-14,34.0,36.0,36.0,0.0,0.0,2.0,0.0,0.0,0.0,137.0,...,2.0,0.0,7.0,34.0,2.0,0.0,0.0,0.0,0.0,50


In [9]:
# dropping the index i.e. week_ending 

In [10]:
final_df.reset_index(inplace=True)

In [11]:
# re-aggregating by week_ending and summing up all the text features

In [12]:
df_final = pd.DataFrame(final_df.groupby('week_ending').sum())

In [13]:
# final number of rows

In [14]:
len(df_final)

808

In [15]:
# saving results to csv

In [16]:
df_final.to_csv('Processed_Stage1.csv')