In [65]:
import pandas as pd
import os
import sys
import yaml
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [66]:
# Rolling-window lengths passed to `.rolling(...)` by the feature functions.
# They are counted in rows of the price file (7, 14, ..., 42).
# NOTE(review): 7 rows is only a calendar week if the file has one row per
# calendar day -- confirm against the raw data.
weeks_list = [7 * multiple for multiple in range(1, 7)]
week, weeks2, weeks3, weeks4, weeks5, weeks6 = weeks_list

# Directories, relative to the working directory of the kernel.
raw_data_path = 'raw/'                  # read by add_features
original_path = 'ourpped/'              # read by changeOurppedFiles
new_data_path = 'preprocessed_rania/'   # written by the final loop

# Inclusive bounds compared against the 'Date' column in add_features.
start_date = '2013-06-03'
end_date = '2015-12-31'

In [67]:
def add_gm_features(stock_sample, windows=None):
    """Add rolling-mean deviation features (``gm_*``) to ``stock_sample`` in place.

    For every window length ``w`` the rolling mean of ``'Adj Close'`` over the
    last ``w`` rows is computed and stored as its percentage deviation from the
    current row's value: ``100 * (rolling_mean / adj_close - 1)``.
    The first ``w - 1`` rows of each column are NaN because the window is not
    yet full.

    Parameters
    ----------
    stock_sample : pandas.DataFrame
        Frame with a numeric ``'Adj Close'`` column. Modified in place.
    windows : sequence of int, optional
        Window lengths in rows. Defaults to the module-level ``weeks_list``.
        The first window is written to ``'gm_week'`` and the k-th window
        (k >= 2) to ``'gm_<k>_weeks'``.

    Returns
    -------
    None
        The new columns are added to ``stock_sample`` itself.
    """
    if windows is None:
        # Looked up at call time so the notebook's config cell stays the
        # single source of truth for the default windows.
        windows = weeks_list

    mult_percentage = 100
    adj_close = stock_sample.loc[:, 'Adj Close']

    for position, window in enumerate(windows, start=1):
        column = 'gm_week' if position == 1 else 'gm_{}_weeks'.format(position)
        rolling_mean = adj_close.rolling(window).mean()
        stock_sample.loc[:, column] = mult_percentage * (rolling_mean / adj_close - 1)

In [68]:
def add_pr_features(stock_sample, windows=None):
    """Add rolling positive-share features (``pr_*``) to ``stock_sample`` in place.

    For every window length ``w`` this computes, over the last ``w`` rows, the
    fraction of ``'Adj Close'`` values that are strictly greater than zero, and
    then min-max scales that column. In this notebook the function is called
    after ``'Adj Close'`` has been replaced by row-over-row relative changes,
    so the value is the share of positive changes in the window.

    The scaler is fitted on every row of the frame that is passed in.
    NOTE(review): rows with an incomplete window are NaN; scikit-learn >= 0.20
    is expected to ignore NaN when fitting and keep it when transforming --
    verify for the installed version.

    Parameters
    ----------
    stock_sample : pandas.DataFrame
        Frame with a numeric ``'Adj Close'`` column. Modified in place.
    windows : sequence of int, optional
        Window lengths in rows. Defaults to the module-level ``weeks_list``.
        The first window is written to ``'pr_week'`` and the k-th window
        (k >= 2) to ``'pr_<k>_weeks'``.

    Returns
    -------
    None
        The new columns are added to ``stock_sample`` itself.
    """
    if windows is None:
        windows = weeks_list

    adj_close = stock_sample.loc[:, 'Adj Close']

    for position, window in enumerate(windows, start=1):
        column = 'pr_week' if position == 1 else 'pr_{}_weeks'.format(position)
        positive_share = adj_close.rolling(window).agg(lambda x: (x > 0).mean())
        scaled = MinMaxScaler().fit_transform(positive_share.values.reshape(-1, 1))
        # fit_transform returns shape (n, 1); flatten it so a plain 1-D vector
        # is assigned to the single target column.
        stock_sample.loc[:, column] = scaled.ravel()

In [69]:
def add_features(stock):
    """Build the gm_* / pr_* feature matrix for one raw price file.

    Reads ``raw_data_path/<stock>`` as CSV, adds the six ``gm_*`` columns from
    the raw prices, normalises the price columns, adds the six ``pr_*`` columns
    from the normalised ``'Adj Close'``, and keeps only the rows whose
    ``'Date'`` lies between ``start_date`` and ``end_date`` (both inclusive).

    Parameters
    ----------
    stock : str
        File name inside ``raw_data_path``. The CSV must provide the columns
        ``Date``, ``Open``, ``High``, ``Low``, ``Close``, ``Adj Close`` and
        ``Volume``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_rows_in_range, 12)``; columns are the six ``gm_*``
        features followed by the six ``pr_*`` features.
    """
    stock_path = os.path.join(raw_data_path, str(stock))
    stock_sample = pd.read_csv(stock_path)

    # gm_* features are computed from the raw prices, so this has to run
    # before 'Adj Close' is overwritten below.
    add_gm_features(stock_sample)

    #### Adjusting accordingly to the txt ####

    # Adjusted close of the previous row. The first row has no predecessor, so
    # it is back-filled and therefore normalised by its own adjusted close.
    # `.bfill()` replaces the deprecated `fillna(method='bfill')`.
    norm_vector = stock_sample['Adj Close'].shift(1).bfill()

    # Normalise by the close of the same row. These columns are not part of
    # the returned matrix; they are kept as in the original procedure.
    for col in ['Open', 'High', 'Low', 'Volume']:
        stock_sample[col] = stock_sample[col] / stock_sample['Close'] - 1

    # Normalise by the previous row's adjusted close (note: both 'Close' and
    # 'Adj Close' use the *adjusted* value as the denominator).
    for col in ['Close', 'Adj Close']:
        stock_sample[col] = stock_sample[col] / norm_vector - 1

    # pr_* features are computed from the normalised 'Adj Close'.
    add_pr_features(stock_sample)

    # 'Date' is compared as read from the CSV against the bound strings.
    # NOTE(review): this assumes ISO 'YYYY-MM-DD' text so that string order
    # equals chronological order -- confirm against the raw files.
    in_range = (stock_sample['Date'] >= start_date) & (stock_sample['Date'] <= end_date)

    feature_columns = [
        'gm_week', 'gm_2_weeks', 'gm_3_weeks', 'gm_4_weeks', 'gm_5_weeks', 'gm_6_weeks',
        'pr_week', 'pr_2_weeks', 'pr_3_weeks', 'pr_4_weeks', 'pr_5_weeks', 'pr_6_weeks',
    ]
    return stock_sample.loc[in_range, feature_columns].values

In [70]:
def changeOurppedFiles(stock, num_rows=0):
    """Insert the gm_* / pr_* features into an already preprocessed file.

    Reads ``original_path/<stock>`` (CSV without header) and returns a new
    frame in which the 12 columns produced by ``add_features(stock)`` are
    placed before the last two original columns:
    ``[original columns except last two] + [12 new features] + [last two]``.

    Parameters
    ----------
    stock : str
        File name; the same name is used in ``original_path`` and, through
        ``add_features``, in ``raw_data_path``.
    num_rows : int, optional
        Only ``0`` (process the whole file) is supported.

    Returns
    -------
    pandas.DataFrame
        The widened frame. Column labels are the integer labels of the three
        concatenated parts, so they are not unique.

    Raises
    ------
    NotImplementedError
        If ``num_rows`` is not ``0``. That code path was an unfinished TODO
        which previously failed with a ``NameError`` on a misspelled variable.
    ValueError
        If the number of feature rows differs from the number of rows in the
        original file. A column-wise concat would otherwise silently pad the
        shorter side with NaN and misalign the data.
    """
    if num_rows != 0:
        raise NotImplementedError(
            'changeOurppedFiles: partial update (num_rows={!r}) is not implemented'.format(num_rows)
        )

    stock_path = os.path.join(original_path, str(stock))
    df_orig = pd.read_csv(stock_path, header=None)

    leading_columns = df_orig.iloc[:, :-2]
    last_two_columns = df_orig.iloc[:, -2:]
    new_features = pd.DataFrame(add_features(stock))

    if len(new_features) != len(df_orig):
        raise ValueError(
            'row count mismatch for {}: {} original rows vs {} feature rows'.format(
                stock, len(df_orig), len(new_features))
        )

    # Both frames carry a default 0..n-1 index, so the concat aligns row by row.
    return pd.concat([leading_columns, new_features, last_two_columns], axis=1)
  

In [72]:
# Rebuild every file found in `original_path` and write the widened version,
# under the same file name and without header or index, to `new_data_path`.

# to_csv does not create missing directories; without this every file would
# fail and only show up as a printed exception below.
os.makedirs(new_data_path, exist_ok=True)

for stock in os.listdir(original_path):
    try:
        df_new = changeOurppedFiles(stock)
        df_new.to_csv(os.path.join(new_data_path, stock), header=False, index=False)
    except Exception as e:
        # Best-effort batch run: report the failing file and continue with the
        # remaining ones.
        print(e)
        print('except:', stock)