# ML Applied Analysis
Steve Donahue, www.github.com/sdonahue0132

This Notebook makes an initial analysis of how useful the trained ML models are at guiding an investor in managing their All Weather Portfolio. Data for this notebook comes from the ML Preprocessing notebook and the algorithms used were developed in the ML SMOTE and ML ADASYN notebooks.

In [None]:
# Import essential libraries #

import pandas as pd
import numpy as np
from numpy import percentile
import math
import datetime
from datetime import date
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from functools import reduce
import warnings
from random import randint
import seaborn as sns
%matplotlib inline

warnings.filterwarnings(action='once')

In [4]:
# CSV_Formatter prepares a dataframe of values and a dataframe of percentages for each date of the fund's history. 
# It cleans the csv by isolating one measurement per day, at the Close of business, and then creating
# a shifted column for comparison and creating a column of the daily percent change in value. 
# Both daily value and percent change are retained in two separate dataframes. 

def CSV_Formatter (folder_name, filename):
    df_init = pd.read_csv('raw_data/' + str(folder_name) +'/' + str(filename) +'.csv')
    df = df_init.set_index('Date')[['Close']]
    df.columns = [str(filename)]
    df['Next Day Values'] = df[filename].shift(-1)
    df['Percentages'] = df['Next Day Values']/df[str(filename)]
    df_final = df[['Percentages']]
    df_final.columns = [str(filename)]
    return(df[[filename]], df_final)
    
def clean_label (labels):
    cleaned = str(labels).strip("Index'").replace("_y'", '').replace('Index([', '').replace(']', '').replace(
        ", dtype='object')", '').replace('[', '').replace("'", '').replace("_x'", '').replace('"', '')
    return cleaned

# The last function is a weekly master function which collects the percent changes week to week for a domain dataframe
# Domain dataframe should consist only of a date time index and a value series

def weekly_master(df, cutoff_date):
    
# A little setup is necessary to ensure fidelity across weekly data.  In the original yahoo finance data downloads,
# Weekends and holidays are not counted in the Datetime Index.  The following code creates a working df that
# Can be broken into calendar weeks at regular 7 day intervals, to better reflect paycheck contributions and 
# subsequent analyses can be done on a week to week basis.

# Note that for days where data is unavailable, I've filled in the value 1, since the method of assessing portfolios
# Is multiplication across daily percentage changes.  In this way, days when no percent changes are documented do not
# affect the value of the investment.

    # NOTE THAT THIS FUNCTION WILL ONLY ACCOMODATE DATAFRAMES WITH UP TO 8 COLUMNS AS WRITTEN! #

    cutoff_date = pd.to_datetime(cutoff_date)
    cutoff = cutoff_date - datetime.timedelta(days=cutoff_date.weekday())
    
    if cutoff_date.weekday() == 6:
        td = timedelta(1)
        cutoff_date = cutoff_date - td
    
    datelist = pd.to_datetime(df.index.values)
    df['Datetime'] = datelist
    df_timed = df.set_index('Datetime')
    labels = df_timed.columns

    df_segmented = pd.DataFrame(columns = labels)

    daterange = int(str(df_timed.index.max() - df_timed.index.min()).replace(" days 00:00:00", ''))
    all_dates = pd.date_range(df_timed.index.min(), periods=daterange).tolist()
    
    index_df = pd.DataFrame(all_dates)
    index_df.columns = ['Datetime']
    
    working_df = index_df.merge(df_timed, how = 'outer', left_on = 'Datetime', right_on ='Datetime')
    working_df = working_df.fillna(1).set_index('Datetime').sort_values('Datetime', ascending = False)
    working_df = working_df.loc[working_df.index <= cutoff_date]
    
    days = len(working_df.index)
    number_of_weeks = int(np.floor(days/7))

    df_progress = pd.DataFrame(index=[0,1,2,3,4])
    weekly_eval = pd.DataFrame()

    for i in range(0, number_of_weeks):
        portfolio_segment = working_df.iloc[i*7:(i+1)*7-1]
        
        products =[]
        prod_1 = portfolio_segment.iloc[:, 0].product()
        products.append(prod_1)

        if len(labels) > 1:
            prod_2 = portfolio_segment.iloc[:, 1].product()
            products.append(prod_2)
        if len(labels) > 2:        
            prod_3 = portfolio_segment.iloc[:, 2].product()
            products.append(prod_3)
        if len(labels) > 3:
            prod_4 = portfolio_segment.iloc[:, 3].product()
            products.append(prod_4)
        if len(labels) > 4:
            prod_5 = portfolio_segment.iloc[:, 4].product()
            products.append(prod_5)
        if len(labels) > 5:
            prod_6 = portfolio_segment.iloc[:, 5].product()
            products.append(prod_6)
        if len(labels) > 6:
            prod_7 = portfolio_segment.iloc[:, 6].product()
            products.append(prod_7)
        if len(labels) > 7:
            prod_8 = portfolio_segment.iloc[:, 7].product()
            products.append(prod_8)
   
    # Can we just make this into a loop?
    # For j in range(0, len(labels)):
    #    temp_prod = portfolio_segment.iloc[:, j].product()
    #    products.append(temp_prod)


        weekly_eval[str(portfolio_segment.index[cutoff_date.weekday()]).replace("00:00:00", '')] = products
    
    weekly_eval = weekly_eval.T
    weekly_eval.columns = labels
    weekly_eval.index = pd.to_datetime(weekly_eval.index)
    weekly_eval.index.name = 'Date'
    weekly_eval.sort_index()

    return(weekly_eval)


In [5]:
# Fidelity US 500, 'FUSEX', was rolled into another index, 'FXAIX' in November 2018.  
# This cell joins the two records at the transition point for.... fidelity.

(FXAIX, FXAIX_final) = CSV_Formatter('Stock_Indices', 'FXAIX')
(FUSEX, FUSEX_final) = CSV_Formatter('Stock_Indices', 'FUSEX')

FUSEX = pd.merge( FUSEX_final, FXAIX_final, how = 'outer', on = 'Date')

FUSEX = pd.DataFrame(FUSEX['FUSEX'].fillna(FUSEX['FXAIX']))



In [6]:
# This Cell reads all necessary source files for STOCK INDICES #

(PREIX, PREIX_final) = CSV_Formatter('Stock_Indices', 'PREIX')
(SWPPX, SWPPX_final) = CSV_Formatter('Stock_Indices', 'SWPPX')
#(FUSEX, FUSEX_final) = CSV_Formatter('Stock_Indices', 'FUSEX_plus')
(VFINX, VFINX_final) = CSV_Formatter('Stock_Indices', 'VFINX')
(VIGRX, VIGRX_final) = CSV_Formatter('Stock_Indices', 'VIGRX')

# Creates dataframes of their daily values and daily percent changes , aka increments.
dfstock_values = [PREIX[['PREIX']], SWPPX[['SWPPX']], FUSEX[['FUSEX']], VFINX[['VFINX']], VIGRX[['VIGRX']]]
dfstock_finals = [PREIX_final, SWPPX_final, FUSEX, VFINX_final, VIGRX_final]

#stock_daily_values_df = reduce(lambda left, right: pd.merge(left, right, on = 'Date'), dfstock_values)
stock_increments_df = reduce(lambda left, right: pd.merge(left, right, on = 'Date'), dfstock_finals)

pd.merge(stock_increments_df, FUSEX_final, how = 'outer')
stock_weekly = weekly_master(stock_increments_df, '2019-02-01')

In [7]:
# This Cell reads all necessary source files for Intermediate Term Bonds #

(BIV, BIV_final) = CSV_Formatter('Intermediate_Bonds', 'BIV')
(HYG, HYG_final) = CSV_Formatter('Intermediate_Bonds', 'HYG')
(IEF, IEF_final) = CSV_Formatter('Intermediate_Bonds', 'IEF')
(IEI, IEI_final) = CSV_Formatter('Intermediate_Bonds', 'IEI')
(IGIB, IGIB_final) = CSV_Formatter('Intermediate_Bonds', 'IGIB')
(IPE, IPE_final) = CSV_Formatter('Intermediate_Bonds', 'IPE')
(ITE, ITE_final) = CSV_Formatter('Intermediate_Bonds', 'ITE')
(TIP, TIP_final) = CSV_Formatter('Intermediate_Bonds', 'TIP')

# Creates dataframes of their daily values and daily percent changes , aka increments.
df_itb_values = [BIV[['BIV']], HYG[['HYG']], IEF[['IEF']], IEI[['IEI']], IGIB[['IGIB']], IPE[['IPE']], ITE[['ITE']], TIP[['TIP']]]
df_itb_finals = [BIV_final, HYG_final, IEF_final, IEI_final, IGIB_final, IPE_final, ITE_final, TIP_final]
itb_daily_values_df = reduce(lambda left, right: pd.merge(left, right, on = 'Date'), df_itb_values)
itb_increments_df = reduce(lambda left, right: pd.merge(left, right, on = 'Date'), df_itb_finals)

itb_weekly = weekly_master(itb_increments_df, '2019-02-01')

#itb_weekly.to_csv('itb_increments_training.csv')




In [8]:
# This Cell reads all necessary source files for Long Term Bonds #

(PRULX, PRULX_final) = CSV_Formatter('Long_Term_Bonds', 'PRULX')
(VUSTX, VUSTX_final) = CSV_Formatter('Long_Term_Bonds', 'VUSTX')
(WHOSX, WHOSX_final) = CSV_Formatter('Long_Term_Bonds', 'WHOSX')

# Creates dataframes of their daily values and daily percent changes , aka increments.
df_ltb_values = [PRULX[['PRULX']], VUSTX[['VUSTX']], WHOSX[['WHOSX']]]
df_ltb_finals = [PRULX_final, VUSTX_final, WHOSX_final]
ltb_daily_values_df = reduce(lambda left, right: pd.merge(left, right, on = 'Date'), df_ltb_values)
ltb_increments_df = reduce(lambda left, right: pd.merge(left, right, on = 'Date'), df_ltb_finals)

ltb_weekly = weekly_master(ltb_increments_df, '2019-02-01')

#ltb_weekly.to_csv('ltb_increments_training.csv')



In [9]:
# This Cell reads all necessary source files for Gold 

(INIVX, INIVX_final) = CSV_Formatter('Gold', 'INIVX')
(OPGSX, OPGSX_final) = CSV_Formatter('Gold', 'OPGSX')
(SGGDX, SGGDX_final) = CSV_Formatter('Gold', 'SGGDX')
(USERX, USERX_final) = CSV_Formatter('Gold', 'USERX')
(VGPMX, VGPMX_final) = CSV_Formatter('Gold', 'VGPMX')

# Creates dataframes of their daily values and daily percent changes , aka increments.
dfgold_values = [INIVX[['INIVX']], OPGSX[['OPGSX']], SGGDX[['SGGDX']], USERX[['USERX']], VGPMX[['VGPMX']]]
dfgold_finals = [INIVX_final, OPGSX_final, SGGDX_final, USERX_final, VGPMX_final]
gold_daily_values_df = reduce(lambda left, right: pd.merge(left, right, on = 'Date'), dfgold_values)
gold_increments_df = reduce(lambda left, right: pd.merge(left, right, on = 'Date'), dfgold_finals)

gold_weekly = weekly_master(gold_increments_df, '2019-02-01')

#gold_weekly.to_csv('gold_increments_training.csv')


In [10]:
# This Cell reads all necessary source files from their respective download folders for Broad Basket Commodities #

(DBC, DBC_final) = CSV_Formatter('Broad_Commodities', 'DBC')
(DJP, DJP_final) = CSV_Formatter('Broad_Commodities', 'DJP')
(GSG, GSG_final) = CSV_Formatter('Broad_Commodities', 'GSG')
(GSP, GSP_final) = CSV_Formatter('Broad_Commodities', 'GSP')

# Creates dataframes of their daily values and daily percent changes , aka increments.
df_commod_values = [DBC[['DBC']], DJP[['DJP']], GSG[['GSG']], GSP[['GSP']]]
df_commod_finals = [DBC_final, DJP_final, GSG_final, GSP_final]
commod_daily_values_df = reduce(lambda left, right: pd.merge(left, right, on = 'Date'), df_commod_values)
commod_increments_df = reduce(lambda left, right: pd.merge(left, right, on = 'Date'), df_commod_finals)

commod_weekly = weekly_master(commod_increments_df, '2019-02-01')

#commod_weekly.to_csv('commod_weekly_training.csv')



In [11]:

# Time weighted return captures an accurate total return, accounting for time of accrual.

def time_weighted_return (df):
    sorted_df = df.sort_index(ascending=True).dropna()
    sorted_df['after_cash_flows'] = sorted_df.ending_value + sorted_df.cash_flows
    sorted_df['prev_after_cash_flows'] = sorted_df.after_cash_flows.shift(1)
    sorted_df['HPR'] = (sorted_df.ending_value / sorted_df.prev_after_cash_flows) - 1
    sorted_df['HPR_plus_one'] = sorted_df.HPR + 1
    sorted_df = sorted_df.dropna()

    rate = sorted_df.HPR_plus_one.product()-1
    
    return (rate)
 
# This function is used to compute the Compounded Annual Growth Rate and return the identities and length of investment

def p_summary(performance, fund_identities, term_years):
    twr = time_weighted_return(performance)
    CAGR = (float((1+twr))**(float(1/term_years)) -1)
    return(CAGR, fund_identities, term_years)


In [12]:
def ML_tester(y_predict, funds_moved_ratio):

    stock_name, itb_name, ltb_name, gold_name, commod_name = 'VIGRX', 'IEF', 'WHOSX', 'SGGDX', 'DBC'

    contribution = 500

    # Extract the columns of percent changes 
    rand_stock_inc = stock_weekly[[stock_name]]
    rand_inter_bond_inc = itb_weekly[[itb_name]]
    rand_long_bond_inc = ltb_weekly[[ltb_name]]
    rand_gold_inc = gold_weekly[[gold_name]]
    rand_commod_inc = commod_weekly[[commod_name]]

    random_inc = rand_stock_inc.merge(rand_inter_bond_inc, on = 'Date').merge(
        rand_long_bond_inc, on = 'Date').merge(
        rand_gold_inc, on = 'Date').merge(
        rand_commod_inc, on = 'Date')

    test = random_inc.dropna()    
    test = test.sort_index(ascending = True)
    test_col = test.iloc[1:, 0:5]
    num_weeks = test_col.shape[0]

    # This block sets markers in the test dataframe for when to add contributions, and ML flags for redistro
    a = np.empty((num_weeks,))
    a[::2] = 0
    a[1::2] = 1
    a = a*contribution
    ML = y_predict[::-1]
    test_col['contributions'] = a
    test_col['ML_flag'] = ML

    # This block determines the portions of the contributions to allocate to each fund.
    invesment_stock = contribution*0.3
    invesment_itb = contribution*0.4
    invesment_ltb = contribution*0.15
    invesment_gold = contribution*0.075
    invesment_commod = contribution*0.075

    values = test_col.values

    # Progress lists are used to accumulate the values of the incremented investments.  Fund Identities are formatted
    progress_stock = []
    progress_itb = []
    progress_ltb = []
    progress_gold = []
    progress_commod = []
    labels = str(test.columns)
    fund_identities = str(labels).strip('Index').replace('_x', '').replace('([', '').replace("], dtype='object')", '')

    # This loop applies the increments to the investment amounts, and appends the results to the progress lists

    counter = 0

    for i,j,k, l, m, contribution, flag in values:

        if flag == 0:
            funds_moved = invesment_stock * funds_moved_ratio
        
            invesment_stock = invesment_stock - funds_moved
            invesment_itb = invesment_itb + funds_moved/4
            invesment_ltb = invesment_ltb + funds_moved/4
            invesment_gold = invesment_gold + funds_moved/4
            invesment_commod = invesment_commod + funds_moved/4
    
        if flag == 1:
            subtotal = invesment_stock + invesment_itb + invesment_ltb + invesment_gold + invesment_commod
            stock_prop = invesment_stock/subtotal
            if stock_prop < 0.30:
                invesment_stock = subtotal*0.3
                invesment_itb = subtotal*0.4
                invesment_ltb = subtotal*0.15
                invesment_gold = subtotal*0.075
                invesment_commod = subtotal*0.075
    
        invesment_stock = invesment_stock*i + contribution*0.3
        invesment_itb = invesment_itb*j + contribution*0.4
        invesment_ltb = invesment_ltb*k + contribution*0.15
        invesment_gold = invesment_gold*l + contribution*0.075
        invesment_commod = invesment_commod*m + contribution*0.075
    
        if counter % 52 == 0:
            invesment_stock = subtotal*0.3
            invesment_itb = subtotal*0.4
            invesment_ltb = subtotal*0.15
            invesment_gold = subtotal*0.075
            invesment_commod = subtotal*0.075
        
        progress_stock.append(invesment_stock)
        progress_itb.append(invesment_itb)
        progress_ltb.append(invesment_ltb)
        progress_gold.append(invesment_gold)
        progress_commod.append(invesment_commod)
    
        counter = counter + 1
    
    # Lists are converted to arrays, and placed into a performace dataframe, which is evaluated for annualized return %
    result_stock = np.array(progress_stock)
    result_itb = np.array(progress_itb)
    result_ltb = np.array(progress_ltb)
    result_gold = np.array(progress_gold)
    result_commod = np.array(progress_commod)

    test_col['Stock_Eval'] = result_stock
    test_col['ITB_Eval']= result_itb
    test_col['LTB_Eval'] = result_ltb
    test_col['Gold_Eval'] = result_gold
    test_col['Commod_Eval']= result_commod

    performance = test_col.iloc[:, 7:12]
    performance['ending_value'] = performance.sum(axis=1)
    performance['cash_flows'] = a
    performance['ML_flags'] = ML
    performance = performance.sort_index(ascending = False)
    term_years = float(len(performance.index)/52.17857)

    CAGR, ident, years = p_summary(performance, fund_identities, term_years)

    return(round(CAGR*100,2), performance)


In [16]:
processed = pd.read_csv('csv_files/historical_record.csv')
processed.index = pd.to_datetime(processed.index)
processed = processed.iloc[35:, :]

processed.head()

processed.reset_index()
processed.index = processed['Date']
processed = processed.drop('Date', axis = 1)

processed = processed[['VIGRX', 'Stocks 2_Week_Avg', 'Stocks 3_Week_Avg',
       'Stocks 6_Week_Avg', 'Stocks 9_Week_Avg', 'Stocks 12_Week_Avg',
       'Stocks 15_Week_Avg',  'ITB 2_Week_Avg',
       'ITB 3_Week_Avg', 'ITB 6_Week_Avg', 'ITB 9_Week_Avg', 'ITB 12_Week_Avg',
       'ITB 15_Week_Avg', 'LTB 2_Week_Avg', 'LTB 3_Week_Avg',
       'LTB 6_Week_Avg', 'LTB 9_Week_Avg', 'LTB 12_Week_Avg',
       'LTB 15_Week_Avg',  'Gold 2_Week_Avg',
       'Gold 3_Week_Avg', 'Gold 6_Week_Avg', 'Gold 9_Week_Avg',
       'Gold 12_Week_Avg', 'Gold 15_Week_Avg', 
       'Commod 2_Week_Avg', 'Commod 3_Week_Avg', 'Commod 6_Week_Avg',
       'Commod 9_Week_Avg', 'Commod 12_Week_Avg', 'Commod 15_Week_Avg',
        'VIGRX 2_Week_Avg', 'VIGRX 3_Week_Avg', 'VIGRX 6_Week_Avg',
       'VIGRX 9_Week_Avg', 'VIGRX 12_Week_Avg', 'VIGRX 15_Week_Avg']]

full_features = processed

for_testing = pd.read_csv('csv_files/historical_record.csv')
for_testing.index = pd.to_datetime(for_testing.index)

for_testing.reset_index()
for_testing.index = for_testing['Date']
for_testing = for_testing.drop('Date', axis = 1)

for_testing = for_testing[['VIGRX', 'Stocks 2_Week_Avg', 'Stocks 3_Week_Avg',
       'Stocks 6_Week_Avg', 'Stocks 9_Week_Avg', 'Stocks 12_Week_Avg',
       'Stocks 15_Week_Avg',  'ITB 2_Week_Avg',
       'ITB 3_Week_Avg', 'ITB 6_Week_Avg', 'ITB 9_Week_Avg', 'ITB 12_Week_Avg',
       'ITB 15_Week_Avg', 'LTB 2_Week_Avg', 'LTB 3_Week_Avg',
       'LTB 6_Week_Avg', 'LTB 9_Week_Avg', 'LTB 12_Week_Avg',
       'LTB 15_Week_Avg',  'Gold 2_Week_Avg',
       'Gold 3_Week_Avg', 'Gold 6_Week_Avg', 'Gold 9_Week_Avg',
       'Gold 12_Week_Avg', 'Gold 15_Week_Avg', 
       'Commod 2_Week_Avg', 'Commod 3_Week_Avg', 'Commod 6_Week_Avg',
       'Commod 9_Week_Avg', 'Commod 12_Week_Avg', 'Commod 15_Week_Avg',
        'VIGRX 2_Week_Avg', 'VIGRX 3_Week_Avg', 'VIGRX 6_Week_Avg',
       'VIGRX 9_Week_Avg', 'VIGRX 12_Week_Avg', 'VIGRX 15_Week_Avg']]

test_features = for_testing

test_obs = test_features.iloc[:, 1:37]
t_obs = test_obs.transpose()

X_testing = []

for i in range(t_obs.shape[1]):
    x_testing = t_obs.iloc[:,i].values
    X_testing.append(x_testing)

In [17]:
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE

# Identify the training data and targets

training_df = full_features
observations = training_df.iloc[:, 1:37]
obs = observations.transpose()

X = []

for i in range(obs.shape[1]):
    x = obs.iloc[:,i].values
    X.append(x)

y = training_df.iloc[:, 0].values

#Resolve the target column as binary
for i in range(len(y)):
    if y[i] <= 0.98:
        y[i] = 0
    if y[i] > 0.98:
        y[i] = 1

X_ADASYN, y_ADASYN = ADASYN().fit_resample(X, y)
X_SMOTE, y_SMOTE = SMOTE().fit_resample(X, y)



  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [18]:
from sklearn.ensemble import RandomForestClassifier

rfca = RandomForestClassifier(n_estimators = 25, max_depth = 14, min_samples_leaf = 0.25, 
                             min_samples_split = 0.1, max_features=4)

rfca.fit(X_ADASYN, y_ADASYN)
y_predict = rfca.predict(X_testing)

for i in [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30]:
    CAGR, performance = ML_tester(y_predict, i)
    print('Random Forest - ADASYN analysis')
    print('For funds_moved_ratio, ', i, 'CAGR = ', CAGR)
    print('')

  return f(*args, **kwds)


Random Forest - ADASYN analysis
For funds_moved_ratio,  0.0 CAGR =  9.88

Random Forest - ADASYN analysis
For funds_moved_ratio,  0.05 CAGR =  9.29

Random Forest - ADASYN analysis
For funds_moved_ratio,  0.1 CAGR =  9.14

Random Forest - ADASYN analysis
For funds_moved_ratio,  0.15 CAGR =  9.04

Random Forest - ADASYN analysis
For funds_moved_ratio,  0.2 CAGR =  8.91

Random Forest - ADASYN analysis
For funds_moved_ratio,  0.25 CAGR =  8.78

Random Forest - ADASYN analysis
For funds_moved_ratio,  0.3 CAGR =  8.64



In [19]:
from sklearn.linear_model import LogisticRegression

logrega = LogisticRegression()

logrega.fit(X_ADASYN, y_ADASYN)

y_predict = logrega.predict(X_testing)
    
for i in [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30]:
    CAGR, performance = ML_tester(y_predict, i)
    print('Log Reg Classifier - ADASYN analysis')
    print('For funds_moved_ratio, ', i, 'CAGR = ', CAGR)
    print('')



Log Reg Classifier - ADASYN analysis
For funds_moved_ratio,  0.0 CAGR =  10.13

Log Reg Classifier - ADASYN analysis
For funds_moved_ratio,  0.05 CAGR =  9.68

Log Reg Classifier - ADASYN analysis
For funds_moved_ratio,  0.1 CAGR =  9.5

Log Reg Classifier - ADASYN analysis
For funds_moved_ratio,  0.15 CAGR =  9.37

Log Reg Classifier - ADASYN analysis
For funds_moved_ratio,  0.2 CAGR =  9.24

Log Reg Classifier - ADASYN analysis
For funds_moved_ratio,  0.25 CAGR =  9.13

Log Reg Classifier - ADASYN analysis
For funds_moved_ratio,  0.3 CAGR =  9.02



In [20]:
logregs = LogisticRegression()

logregs.fit(X_SMOTE, y_SMOTE)

y_predict = logregs.predict(X_testing)

for i in [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30]:
    CAGR, performance = ML_tester(y_predict, i)
    print('Log Reg Classifier - SMOTE analysis')
    print('For funds_moved_ratio, ', i, 'CAGR = ', CAGR)
    print('')



Log Reg Classifier - SMOTE analysis
For funds_moved_ratio,  0.0 CAGR =  10.18

Log Reg Classifier - SMOTE analysis
For funds_moved_ratio,  0.05 CAGR =  9.59

Log Reg Classifier - SMOTE analysis
For funds_moved_ratio,  0.1 CAGR =  9.37

Log Reg Classifier - SMOTE analysis
For funds_moved_ratio,  0.15 CAGR =  9.26

Log Reg Classifier - SMOTE analysis
For funds_moved_ratio,  0.2 CAGR =  9.17

Log Reg Classifier - SMOTE analysis
For funds_moved_ratio,  0.25 CAGR =  9.09

Log Reg Classifier - SMOTE analysis
For funds_moved_ratio,  0.3 CAGR =  9.02



In [21]:
from sklearn.ensemble import RandomForestClassifier

rfcs_u = RandomForestClassifier()

rfcs_u.fit(X_SMOTE, y_SMOTE)

y_predict = rfcs_u.predict(X_testing)

for i in [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45]:
    CAGR, performance = ML_tester(y_predict, i)
    print('Random Forest Untuned Classifier - SMOTE analysis')
    print('For funds_moved_ratio, ', i, 'CAGR= ', CAGR)
    print('')



Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.0 CAGR=  10.43

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.05 CAGR=  10.21

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.1 CAGR=  10.13

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.15 CAGR=  10.06

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.2 CAGR=  9.99

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.25 CAGR=  9.94

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.3 CAGR=  9.91

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.35 CAGR=  9.88

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.4 CAGR=  9.86

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.45 CAGR=  9.85



In [22]:
rfcs_t = RandomForestClassifier(n_estimators = 9, max_depth = 11, min_samples_leaf = 0.2, 
                                 min_samples_split = 0.3, max_features=18)

rfcs_t.fit(X_SMOTE, y_SMOTE)

y_predict = rfcs_t.predict(X_testing)

for i in [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45]:
    CAGR, performance = ML_tester(y_predict, i)
    print('Random Forest Untuned Classifier - SMOTE analysis')
    print('For funds_moved_ratio, ', i, 'CAGR= ', CAGR)
    print('')

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.0 CAGR=  10.0

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.05 CAGR=  9.42

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.1 CAGR=  9.27

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.15 CAGR=  9.15

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.2 CAGR=  9.03

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.25 CAGR=  8.91

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.3 CAGR=  8.79

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.35 CAGR=  8.67

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.4 CAGR=  8.57

Random Forest Untuned Classifier - SMOTE analysis
For funds_moved_ratio,  0.45 CAGR=  8.47



In [23]:
# This Cell develops an equally weighted voting system for all classifiers

sum_of_predictors = rfca.predict(X_testing) + logrega.predict(X_testing) + logregs.predict(X_testing) + rfcs_u.predict(X_testing) + rfcs_t.predict(X_testing)

list_version = list(sum_of_predictors)

unanimous_1 = np.floor((sum_of_predictors)/5), '5 votes for 1'
vote_four_1 = np.floor((sum_of_predictors)/4), '4 votes for 1'
vote_three_1 = np.floor((sum_of_predictors)/3), '3 votes for 1'
vote_two_1 = np.floor(np.array([2 if x in [3, 4, 5] else x for x in list_version])/2), '2 votes for 1'
vote_one_1 = np.floor(np.array([1 if x in [2, 3, 4, 5] else x for x in list_version])), '1 vote for 1'

all_methods = [unanimous_1, vote_four_1, vote_three_1, vote_two_1, vote_one_1]

In [24]:
for votes, name in all_methods:
    for i in [0.00, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45]:
        CAGR, performance = ML_tester(votes, i)
        print('Using ', name, ' method...')
        print('Funds_moved_ratio = ', i, ', CAGR = ', CAGR)
        print('')

Using  5 votes for 1  method...
Funds_moved_ratio =  0.0 , CAGR =  9.77

Using  5 votes for 1  method...
Funds_moved_ratio =  0.05 , CAGR =  9.02

Using  5 votes for 1  method...
Funds_moved_ratio =  0.1 , CAGR =  8.76

Using  5 votes for 1  method...
Funds_moved_ratio =  0.15 , CAGR =  8.61

Using  5 votes for 1  method...
Funds_moved_ratio =  0.2 , CAGR =  8.44

Using  5 votes for 1  method...
Funds_moved_ratio =  0.25 , CAGR =  8.3

Using  5 votes for 1  method...
Funds_moved_ratio =  0.3 , CAGR =  8.16

Using  5 votes for 1  method...
Funds_moved_ratio =  0.35 , CAGR =  8.05

Using  5 votes for 1  method...
Funds_moved_ratio =  0.4 , CAGR =  7.95

Using  5 votes for 1  method...
Funds_moved_ratio =  0.45 , CAGR =  7.87

Using  4 votes for 1  method...
Funds_moved_ratio =  0.0 , CAGR =  9.9

Using  4 votes for 1  method...
Funds_moved_ratio =  0.05 , CAGR =  9.25

Using  4 votes for 1  method...
Funds_moved_ratio =  0.1 , CAGR =  9.08

Using  4 votes for 1  method...
Funds_moved_rat