## Codebase 4: Machine Learning Implementation

Implementation is structured in the following parts where sections 1 to 6 comprise cross-validation.

1. Helper functions
2. Undersampling
3. Oversampling
4. Weight Class
5. Threshold Testing
6. Ensembles
7. Create Financial Ratio Features Target Matrix
8. Generate Holdout Testing Sets
9. Test Holdout Sets

### 1. Helper Functions

There are four key helper functions in the implementation: (i) time equalization, which preprocesses the sample so that the ratio of majority-class to minority-class events is the same in every year, (ii) undersampling, (iii) oversampling and (iv) a function to convert sparse dataframes to sparse matrices in chunks.

First up is time equalization:

In [None]:
def equalize_year_ratio(df_train, ratio):
    '''Downsample negative events so every year holds `ratio` negatives per positive.

    A positive (minority-class) event is a row with ``max_dd_1yr < -0.8``; a
    negative (majority-class) event is a row with ``max_dd_1yr >= -0.8``.
    For each calendar year of ``Filed_Date`` that contains at least one
    positive, ``int(ratio * positives_in_year)`` negatives are drawn at random
    without replacement. Years without any positive contribute no rows.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Needs a datetime ``Filed_Date`` column and a numeric ``max_dd_1yr``
        column. The frame itself is left untouched.
    ratio : int or float
        Negatives to keep per positive within each year.

    Returns
    -------
    pandas.DataFrame
        Sampled negatives plus all positives, sorted by ``Filed_Date``, with
        the same columns as ``df_train``. Empty when there are no positives.

    Notes
    -----
    Draws come from the global ``random`` module, so seed it beforehand for
    reproducible output. When a year has fewer negatives than requested, all
    of that year's negatives are kept instead of raising ``ValueError``.
    '''
    # NOTE(review): the cross-validation cells in this file label positives
    # with `<= -0.8`, so a row at exactly -0.8 counts as a negative here but
    # as a positive there -- confirm which boundary is intended.
    mask_pos = df_train['max_dd_1yr'] < -0.8
    mask_neg = df_train['max_dd_1yr'] >= -0.8

    # Keep the filing year in a standalone Series so the caller's frame never
    # gains a temporary 'year' column.
    year_of_row = df_train['Filed_Date'].dt.year
    pos_per_year = year_of_row[mask_pos].value_counts().sort_index()

    pieces = []
    for year, n_pos_year in pos_per_year.items():
        df_year_neg = df_train[mask_neg & (year_of_row == year)]
        available = df_year_neg.shape[0]
        # random.sample refuses to draw more items than exist, so cap the request.
        wanted = min(int(ratio * n_pos_year), available)
        picked = random.sample(range(0, available), wanted)
        pieces.append(df_year_neg.iloc[picked])

    # Appending the positives last also guarantees pd.concat receives at
    # least one frame when no year qualified.
    pieces.append(df_train[mask_pos])

    df_balanced = pd.concat(pieces)
    return df_balanced.sort_values(by='Filed_Date')

Undersampling

In [None]:
def undersample_random(X_train, y_train, seed = 41):
    '''Balance a binary training set by undersampling to the smaller class size.

    Both classes are reduced to ``min(n_pos, n_neg)`` rows, drawn at random
    without replacement, so the majority class is cut down to the size of the
    minority class.

    Parameters
    ----------
    X_train : scipy sparse matrix or numpy.ndarray
        Feature rows; must support row selection with an integer index array
        (e.g. CSR for sparse input).
    y_train : numpy.ndarray
        Labels aligned with ``X_train``: 1 for positive events, 0 for negative.
    seed : int, optional
        Seed passed to the global ``random`` module before sampling.

    Returns
    -------
    dict
        ``{'X_balanced': ..., 'y_balanced': ...}`` with the positive rows
        stacked above the negative rows.
    '''
    from scipy.sparse import issparse   # local import keeps the block self-contained

    random.seed(seed)

    # argsort on 0/1 labels puts every negative index before every positive one
    order = np.argsort(y_train)
    # plain ints so range()/random.sample work whatever the label dtype is
    n_pos = int(y_train.sum())
    n_neg = int(y_train.shape[0]) - n_pos
    n_min = min(n_pos, n_neg)

    # positives: slice from n_neg onward; a [-n_pos:] slice would select the
    # whole array when n_pos is 0
    pos_pool = order[n_neg:]
    pos_pick = np.asarray(random.sample(range(0, n_pos), n_min), dtype=int)
    idx_pos = pos_pool[pos_pick]

    # negatives: drawn without replacement
    neg_pool = order[:n_neg]
    neg_pick = np.asarray(random.sample(range(0, n_neg), n_min), dtype=int)
    idx_neg = neg_pool[neg_pick]

    X_train_pos, y_train_pos = X_train[idx_pos], y_train[idx_pos]
    X_train_neg, y_train_neg = X_train[idx_neg], y_train[idx_neg]

    # join classes; dense input is accepted in the same way as oversample_random
    if issparse(X_train):
        X_train_balanced = vstack((X_train_pos, X_train_neg))
    else:
        X_train_balanced = np.concatenate((X_train_pos, X_train_neg), axis=0)
    y_train_balanced = np.concatenate((y_train_pos, y_train_neg), axis=0)

    return {'X_balanced': X_train_balanced, 'y_balanced': y_train_balanced}

and Oversampling:

In [None]:
def oversample_random(X_train, y_train, seed = 41, flag=False, n='define'):
    '''Balance a binary training set by oversampling positive events.

    Positive (minority-class) rows are drawn with replacement and negative
    (majority-class) rows without replacement, both to the same target size.

    Parameters
    ----------
    X_train : scipy sparse matrix or numpy.ndarray
        Feature rows; must support row selection with an integer index array.
    y_train : numpy.ndarray
        Labels aligned with ``X_train``: 1 for positive events, 0 for negative.
    seed : int, optional
        Seeds both the global ``random`` module and a private NumPy generator,
        so the same seed always yields the same sample.
    flag : bool, optional
        When true, ``n`` is the per-class target size; otherwise the size of
        the larger class is used.
    n : int, optional
        Per-class target size, only read when ``flag`` is true.

    Returns
    -------
    dict
        ``{'X_balanced': ..., 'y_balanced': ...}`` with the positive rows
        stacked above the negative rows.

    Raises
    ------
    TypeError
        If ``flag`` is true but ``n`` was left at its placeholder default.
    ValueError
        If rows are requested from a class that has none.
    '''
    from scipy.sparse import issparse   # local import keeps the block self-contained

    random.seed(seed)
    # np.random is NOT affected by random.seed, so the draws made with
    # replacement get their own seeded generator.
    rng = np.random.RandomState(seed)

    # argsort on 0/1 labels puts every negative index before every positive one
    order = np.argsort(y_train)
    n_pos = int(y_train.sum())
    n_neg = int(y_train.shape[0]) - n_pos

    # default to the size of the larger class unless set in the arguments
    if flag:
        if isinstance(n, str):
            raise TypeError("n must be an integer sample size when flag=True")
        n_max = int(n)
    else:
        n_max = max(n_pos, n_neg)

    # oversample minority class with replacement
    pos_pool = order[n_neg:]
    idx_pos = pos_pool[rng.choice(n_pos, n_max)]

    # sample majority class without replacement; if more rows are requested
    # than exist, fall back to sampling with replacement instead of failing
    neg_pool = order[:n_neg]
    if n_max <= n_neg:
        neg_pick = np.asarray(random.sample(range(0, n_neg), n_max), dtype=int)
    else:
        neg_pick = rng.choice(n_neg, n_max)
    idx_neg = neg_pool[neg_pick]

    X_train_pos, y_train_pos = X_train[idx_pos], y_train[idx_pos]
    X_train_neg, y_train_neg = X_train[idx_neg], y_train[idx_neg]

    # join classes: pick the stacking routine by input type rather than
    # catching every exception
    if issparse(X_train):
        X_train_balanced = vstack((X_train_pos, X_train_neg))
    else:
        X_train_balanced = np.concatenate((X_train_pos, X_train_neg), axis=0)
    y_train_balanced = np.concatenate((y_train_pos, y_train_neg), axis=0)

    return {'X_balanced': X_train_balanced, 'y_balanced': y_train_balanced}

Converting sparse dataframe to sparse matrix in chunks:

In [None]:
def convert_df_values_csc_chunk(df, chunk_size=3000):
    '''Convert a dataframe to a scipy sparse matrix, `chunk_size` rows at a time.

    Each slice of rows is turned into a dense array, passed through
    ``np.nan_to_num`` (which replaces NaN with 0 and infinities with large
    finite numbers) and converted to sparse form; the pieces are then stacked
    vertically. Working in chunks means only ``chunk_size`` rows are ever
    dense at once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric values (may use sparse columns).
    chunk_size : int, optional
        Number of rows per chunk; defaults to 3000.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``df.shape``. Despite the "csc" in the function name,
        the result is in CSR format.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    '''
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    n_rows, n_cols = df.shape
    if n_rows == 0:
        # nothing to stack: return an empty matrix with the right width
        return csr_matrix((0, n_cols))

    # collect every chunk first and stack once, instead of re-copying the
    # accumulated matrix on each iteration
    chunks = []
    for start in range(0, n_rows, chunk_size):
        arr = np.nan_to_num(df.iloc[start:start + chunk_size].values)
        chunks.append(csr_matrix(arr))

    return vstack(chunks, format='csr')

### 2. Undersampling

As the method with the least computation, undersampling is where we do the heavy lifting to find the core model. This is where we choose min_df, decide on random or time-equalized sampling and investigate whether sector dummy variables improve the model.

We start with choosing min_df by looking at performance over a random selection of samples for various min_df values:  

In [None]:
import pickle
import numpy as np
import pandas as pd
import random
from scipy.sparse import vstack
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import normalize
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from capstone_10k_functions import convert_df_values_csc_chunk

#---- inputs ----
label_cv = ['cv1', 'cv2', 'cv3', 'cv4']   #cross-validation folds to run
vector_func = 'TfidfVectorizer'
ngram = 'unigram'
min_df = 25
#columns that are not model features
x_drop_columns = ['Filed_Date', 'ticker_', 'sector_', 'sic_sector',
                  'max_dd_1yr', 'max_dd_2yr', 'year_dd_flag',
                  'cum_year_dd_flag', 'custom_sector']
models = [
    GradientBoostingClassifier(random_state=41),
    RandomForestClassifier(n_estimators=100, bootstrap=True,
                           max_features='sqrt'),
    LogisticRegression(random_state=41),
]
method = 'undersample_random'     #tag embedded in the output file name
model_names = ['grad_boost', 'random_forest', 'log_reg']
min_df_key = 'min_df_' + str(min_df)
output_filename = ('dict_cv_' + method + '_' + vector_func + '_'
                   + min_df_key + '_' + ngram + '.pickle')

#---- calculations ----
dict_cv = {}

for label in label_cv:

    print(label)
    dict_cv[label] = {}

    #load the train/test frames pickled for this fold
    filename = label + '_' + vector_func + '_' + min_df_key + '_' + ngram + '.pickle'
    d_cv = pd.read_pickle(filename)
    fold_frames = d_cv[min_df_key]
    df_train = fold_frames['df_train_master']
    df_test = fold_frames['df_test_master']

    #training side: feature frame and 0/1 target (1 when max_dd_1yr <= -0.8)
    df_x_train = df_train.drop(x_drop_columns, axis=1)
    df_y_train = (df_train['max_dd_1yr'] <= -0.8) * 1
    X_train = convert_df_values_csc_chunk(df_x_train)
    y_train = df_y_train.values

    #testing side, built the same way
    df_x_test = df_test.drop(x_drop_columns, axis=1)
    df_y_test = (df_test['max_dd_1yr'] <= -0.8) * 1
    X_test = convert_df_values_csc_chunk(df_x_test)
    y_test = df_y_test.values

    #row-wise normalization; the test rows are normalized as the tail of the
    #stacked train+test matrix so the layout also suits other preprocessing
    n_test = y_test.shape[0]
    X_train_test = vstack((X_train, X_test))
    X_train = normalize(X_train)
    X_test = normalize(X_train_test)[-n_test:]

    #balance the training set by undersampling
    dict_bal = undersample_random(X_train, y_train, seed=41)
    X_train_balanced = dict_bal['X_balanced']
    y_train_balanced = dict_bal['y_balanced']

    #fit every model on the balanced sample and score it on the full test set
    for model_name, model in zip(model_names, models):
        model.fit(X_train_balanced, y_train_balanced)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)

        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        report = classification_report(y_test, y_pred, output_dict=True)
        recall = recall_score(y_test, y_pred, average='macro')

        dict_cv[label][model_name] = {
            'y_test': y_test,
            'y_pred': y_pred,
            'y_proba': y_proba,
            'conf_matrix': cm,
            'class_report': report,
            'macro_recall': recall,
        }

        print(model_name, ' : ', "{:.2f}".format(recall))

    #rewrite the results file after each fold
    with open(output_filename, 'wb') as handle:
        pickle.dump(dict_cv, handle, protocol=pickle.HIGHEST_PROTOCOL)

Before considering time-equalization:

In [None]:
import pickle
import numpy as np
import pandas as pd
import random
from scipy.sparse import vstack
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import normalize
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from capstone_10k_functions import convert_df_values_csc_chunk

#---- inputs ----
ratio = 5                                  #passed to equalize_year_ratio
label_cv = ['cv1', 'cv2', 'cv3', 'cv4']    #cross-validation folds to run
vector_func = 'TfidfVectorizer'
ngram = 'unigram'
min_df = 25
#columns that are not model features
x_drop_columns = ['Filed_Date', 'ticker_', 'sector_', 'sic_sector',
                  'max_dd_1yr', 'max_dd_2yr', 'year_dd_flag',
                  'cum_year_dd_flag', 'custom_sector']

models = [
    GradientBoostingClassifier(random_state=41),
    RandomForestClassifier(n_estimators=100, bootstrap=True,
                           max_features='sqrt'),
    LogisticRegression(random_state=41),
]
method = 'undersample_equal_num'           #tag embedded in the output file name
model_names = ['grad_boost', 'random_forest', 'log_reg']
min_df_key = 'min_df_' + str(min_df)
output_filename = ('dict_cv_' + method + '_' + vector_func + '_'
                   + min_df_key + '_' + ngram + '.pickle')

#---- calculations ----
dict_cv = {}

for label in label_cv:

    print(label)
    dict_cv[label] = {}

    #load the train/test frames pickled for this fold
    filename = label + '_' + vector_func + '_' + min_df_key + '_' + ngram + '.pickle'
    d_cv = pd.read_pickle(filename)
    fold_frames = d_cv[min_df_key]
    df_train = fold_frames['df_train_master']
    df_test = fold_frames['df_test_master']

    #time equalization is applied to the training frame only
    df_train = equalize_year_ratio(df_train, ratio)

    #training side: feature frame and 0/1 target (1 when max_dd_1yr <= -0.8)
    df_x_train = df_train.drop(x_drop_columns, axis=1)
    df_y_train = (df_train['max_dd_1yr'] <= -0.8) * 1
    X_train = convert_df_values_csc_chunk(df_x_train)
    y_train = df_y_train.values

    #testing side, built the same way
    df_x_test = df_test.drop(x_drop_columns, axis=1)
    df_y_test = (df_test['max_dd_1yr'] <= -0.8) * 1
    X_test = convert_df_values_csc_chunk(df_x_test)
    y_test = df_y_test.values

    #row-wise normalization; the test rows are normalized as the tail of the
    #stacked train+test matrix so the layout also suits other preprocessing
    n_test = y_test.shape[0]
    X_train_test = vstack((X_train, X_test))
    X_train = normalize(X_train)
    X_test = normalize(X_train_test)[-n_test:]

    #balance the training set by undersampling
    dict_bal = undersample_random(X_train, y_train, seed=41)
    X_train_balanced = dict_bal['X_balanced']
    y_train_balanced = dict_bal['y_balanced']

    #fit every model on the balanced sample and score it on the full test set
    for model_name, model in zip(model_names, models):
        model.fit(X_train_balanced, y_train_balanced)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)

        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        report = classification_report(y_test, y_pred, output_dict=True)
        recall = recall_score(y_test, y_pred, average='macro')

        dict_cv[label][model_name] = {
            'y_test': y_test,
            'y_pred': y_pred,
            'y_proba': y_proba,
            'conf_matrix': cm,
            'class_report': report,
            'macro_recall': recall,
        }

        print(model_name, ' : ', "{:.2f}".format(recall))

    #rewrite the results file after each fold
    with open(output_filename, 'wb') as handle:
        pickle.dump(dict_cv, handle, protocol=pickle.HIGHEST_PROTOCOL)

And sector dummy variables:

In [None]:
import pickle
import numpy as np
import pandas as pd
import random
from scipy.sparse import csr_matrix, vstack, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import normalize
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from capstone_10k_functions import convert_df_values_csc_chunk

#inputs
ratio = 5                                  #passed to equalize_year_ratio
label_cv = ['cv1', 'cv2', 'cv3', 'cv4']    #cross-validation folds to run
vector_func = 'TfidfVectorizer'
ngram = 'unigram'
min_df = 25
#columns that are not model features
x_drop_columns = ['Filed_Date', 'ticker_','sector_', 'sic_sector', 'max_dd_1yr','max_dd_2yr', 
                  'year_dd_flag', 'cum_year_dd_flag', 'custom_sector']
models = [GradientBoostingClassifier(random_state=41),
          RandomForestClassifier(n_estimators=100, bootstrap = True, 
                                 max_features = 'sqrt'),
          LogisticRegression(random_state=41)]
method = 'undersample_equal_num_DUMMIES'   #tag embedded in the output file name
model_names = ['grad_boost', 'random_forest', 'log_reg' ]

output_filename= 'dict_cv_' + method +'_' + vector_func + '_' + 'min_df_' + str(min_df) + '_' + ngram + '.pickle'

#calculations
dict_cv= {}

for label in label_cv:
    
    print(label)
    #open dataframe files for cv set
    dict_cv[label] = {}
    filename = label + '_' + vector_func +  '_' +'min_df_' + str(min_df) + '_' + ngram + '.pickle'
    d_cv = pd.read_pickle(filename)
    df_train = d_cv['min_df_' + str(min_df)]['df_train_master']
    df_test = d_cv['min_df_' + str(min_df)]['df_test_master']
   
    #equal number algo (training frame only)
    df_train = equalize_year_ratio(df_train, ratio)
    
    #one-hot encode the sector of every row
    df_dummies_train = pd.get_dummies(df_train['sector_'], prefix='_').astype(int)
    df_dummies_test = pd.get_dummies(df_test['sector_'], prefix='_').astype(int)
    #align both frames on the SAME ordered column list: a sector seen in only
    #one set gets an all-zero column in the other, and column j refers to the
    #same sector in train and test (appending missing columns at the end
    #would leave the two sets in different column orders)
    union_dummies = sorted(set(df_dummies_train.columns) | set(df_dummies_test.columns))
    df_dummies_train = df_dummies_train.reindex(columns=union_dummies, fill_value=0)
    df_dummies_test = df_dummies_test.reindex(columns=union_dummies, fill_value=0)
    #convert to sparse matrices
    csr_dummies_train = csr_matrix(df_dummies_train.values)
    csr_dummies_test = csr_matrix(df_dummies_test.values)
    
    #define X and y dataframes (target is 1 when max_dd_1yr <= -0.8)
    df_x_train = df_train.drop(x_drop_columns, axis=1)
    df_y_train= (df_train['max_dd_1yr'] <= -0.8)*1
    df_x_test = df_test.drop(x_drop_columns, axis=1)
    df_y_test= (df_test['max_dd_1yr'] <= -0.8)*1
    
    #convert to X and Y arrays, appending the sector dummies as extra columns
    X_train = convert_df_values_csc_chunk(df_x_train)
    X_train = hstack((X_train, csr_dummies_train))
    y_train = df_y_train.values
    X_test = convert_df_values_csc_chunk(df_x_test)
    X_test = hstack((X_test, csr_dummies_test))
    y_test = df_y_test.values

    #normalize (row wise so not strictly necessary to process this way
    #but kept in format for generalization to other preprocessing)
    n_test = y_test.shape[0]
    X_train_test = vstack((X_train, X_test ))
    X_train = normalize(X_train)
    X_test = normalize(X_train_test)[-n_test:]    
    
    #undersample
    dict_bal = undersample_random(X_train, y_train, seed = 41)
    X_train_balanced = dict_bal['X_balanced']
    y_train_balanced = dict_bal['y_balanced'] 
    
    #train model
    for idx, model_func in enumerate(models):
        model_name = model_names[idx]
        model =model_func
        model.fit(X_train_balanced, y_train_balanced)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)
    
        cm = confusion_matrix(y_test, y_pred, labels=[0,1])
        report = classification_report(y_test, y_pred,output_dict=True)
        recall = recall_score(y_test, y_pred, average='macro')
        
        #store the fold results under the same keys as the other CV cells
        dict_cv[label][model_name] = {'y_test': y_test, 'y_pred': y_pred,
                                      'y_proba': y_proba, 'conf_matrix':cm,
                                      'class_report': report, 
                                      'macro_recall':recall}
        
        print(model_name, ' : ', "{:.2f}".format(recall))
    
    #rewrite the results file after each fold
    with open(output_filename, 'wb') as handle:                                     
        pickle.dump(dict_cv, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 3. Oversampling

We apply the core model (min_df=25, time equalization and no sector dummy variables) to oversampling. Given the 1:20 data imbalance, we test the more computationally practical values n = 2, 3 and 5, where n x (minority class size for unique events) sets the size of the samples:

In [None]:


import pickle
import numpy as np
import pandas as pd
import random
from scipy.sparse import vstack
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import normalize
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from capstone_10k_functions import convert_df_values_csc_chunk

#inputs
pd.set_option('mode.chained_assignment', None)  
ratio = 5                                  #passed to equalize_year_ratio
label_cv = ['cv1', 'cv2', 'cv3', 'cv4']    #cross-validation folds to run
vector_func = 'TfidfVectorizer'
ngram = 'unigram'
min_df = 25
over_sample_amount = [2, 3, 5]             #multiples of the positive-event count
#columns that are not model features
x_drop_columns = ['Filed_Date', 'ticker_','sector_', 'sic_sector', 'max_dd_1yr','max_dd_2yr', 
                  'year_dd_flag', 'cum_year_dd_flag', 'custom_sector']
models = [GradientBoostingClassifier(random_state=41),
          RandomForestClassifier(n_estimators=100, bootstrap = True, 
                                 max_features = 'sqrt'),
          LogisticRegression(random_state=41), MultinomialNB()]
method = 'oversample_equal_num'            #tag embedded in the output file name
model_names = ['grad_boost', 'random_forest', 'log_reg', 'NBayes' ]


#create dictionary structure: fold -> oversampling multiple -> model
dict_cv = {}
for label in label_cv:
    dict_cv.update({label: {}})
    for number in over_sample_amount:
        dict_cv[label].update({'number'+ str(number):{}})
        for model in model_names:
            dict_cv[label]['number'+ str(number)].update({model: {}})


for label in label_cv:
    
    print(label)
    #open dataframe files for cv set
    filename = label + '_' + vector_func +  '_' +'min_df_' + str(min_df) + '_' + ngram + '.pickle'
    d_cv = pd.read_pickle(filename)
    df_train = d_cv['min_df_' + str(min_df)]['df_train_master']
    df_test = d_cv['min_df_' + str(min_df)]['df_test_master']
    
    #Equal number algo (training frame only)
    df_train = equalize_year_ratio(df_train, ratio)
    
    #define X and y dataframes
    #NOTE(review): the target here is built from max_dd_2yr, whereas the other
    #CV cells and equalize_year_ratio use max_dd_1yr -- confirm this is intended
    df_x_train = df_train.drop(x_drop_columns, axis=1)
    df_y_train= (df_train['max_dd_2yr'] <= -0.8)*1
    df_x_test = df_test.drop(x_drop_columns, axis=1)
    df_y_test= (df_test['max_dd_2yr'] <= -0.8)*1

    #convert to X and Y arrays
    X_train = convert_df_values_csc_chunk(df_x_train)
    y_train = df_y_train.values
    X_test = convert_df_values_csc_chunk(df_x_test)
    y_test = df_y_test.values  

    #normalize (row wise so not strictly necessary to process this way
    #but kept in format for generalization to other preprocessing)
    n_test = y_test.shape[0]
    X_train_test = vstack((X_train, X_test ))
    X_train = normalize(X_train)
    X_test = normalize(X_train_test)[-n_test:]    
    
    #number of positive events in this fold's training set; it scales the
    #per-class sample size below (it was previously used without being defined)
    n_pos = int(y_train.sum())
    
    #oversample
    for number in over_sample_amount:
        
        key_number = 'number'+ str(number)  
        n = int(number*n_pos)
        
        dict_bal = oversample_random(X_train, y_train, seed = 41, flag=True, n=n)
        X_train_balanced = dict_bal['X_balanced']
        y_train_balanced = dict_bal['y_balanced'] 

        #train model
        for idx, model_func in enumerate(models):
            model_name = model_names[idx]
            model =model_func
            model.fit(X_train_balanced, y_train_balanced)
            y_pred = model.predict(X_test)
            y_proba = model.predict_proba(X_test)
    
            cm = confusion_matrix(y_test, y_pred, labels=[0,1])
            report = classification_report(y_test, y_pred,output_dict=True)
            recall = recall_score(y_test, y_pred, average='macro')
        
            dict_cv[label][key_number][model_name] = {'y_test': y_test, 'y_pred': y_pred,
                                      'y_proba': y_proba, 'conf_matrix':cm,
                                      'class_report': report, 
                                      'macro_recall':recall}
        
            print(model_name, ' : ', "{:.2f}".format(recall))
            
           
#results are written once, after all folds have been processed
output_filename= 'dict_cv_' + method +'_' + vector_func + '_' + 'min_df_' + str(min_df) + '_number_ratios_' + ngram + '.pickle'
with open(output_filename, 'wb') as handle:                                     
    pickle.dump(dict_cv, handle, protocol=pickle.HIGHEST_PROTOCOL)



### 4. Weight Classes

We use SK-Learn's Random Forest to see how balanced weight class performs for the problem:

In [None]:
import pickle
import pandas as pd
import random
from scipy.sparse import csr_matrix, vstack, hstack
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import normalize
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from capstone_10k_functions import convert_df_values_csc_chunk

pd.set_option('mode.chained_assignment', None)

#---- inputs ----
ratio = 5                                  #passed to equalize_year_ratio
label_cv = ['cv1', 'cv2', 'cv3', 'cv4']    #cross-validation folds to run
vector_func = 'TfidfVectorizer'
ngram = 'unigram'
min_df = 25
#columns that are not model features
x_drop_columns = ['Filed_Date', 'ticker_', 'sector_', 'sic_sector',
                  'max_dd_1yr', 'max_dd_2yr', 'year_dd_flag',
                  'cum_year_dd_flag', 'custom_sector']
#the two random forests differ only in the class_weight argument
models = [
    GradientBoostingClassifier(random_state=41),
    RandomForestClassifier(n_estimators=100, bootstrap=True,
                           max_features='sqrt'),
    RandomForestClassifier(n_estimators=100, bootstrap=True,
                           max_features='sqrt', class_weight='balanced'),
]
method = 'class_weights_equal_num'         #tag embedded in the output file name
model_names = ['grad_boost',
               'random_forest_balanced_cw_none',
               'random_forest_cw_balanced']
min_df_key = 'min_df_' + str(min_df)
output_filename = ('dict_cv_' + method + '_' + vector_func + '_'
                   + min_df_key + '_' + ngram + '.pickle')

#---- calculations ----
dict_cv = {}

for label in label_cv:

    print(label)
    dict_cv[label] = {}

    #load the train/test frames pickled for this fold
    filename = label + '_' + vector_func + '_' + min_df_key + '_' + ngram + '.pickle'
    d_cv = pd.read_pickle(filename)
    fold_frames = d_cv[min_df_key]
    df_train = fold_frames['df_train_master']
    df_test = fold_frames['df_test_master']

    #time equalization is applied to the training frame only
    df_train = equalize_year_ratio(df_train, ratio)

    #training side: feature frame and 0/1 target (1 when max_dd_1yr <= -0.8)
    df_x_train = df_train.drop(x_drop_columns, axis=1)
    df_y_train = (df_train['max_dd_1yr'] <= -0.8) * 1
    X_train = convert_df_values_csc_chunk(df_x_train)
    y_train = df_y_train.values

    #testing side, built the same way
    df_x_test = df_test.drop(x_drop_columns, axis=1)
    df_y_test = (df_test['max_dd_1yr'] <= -0.8) * 1
    X_test = convert_df_values_csc_chunk(df_x_test)
    y_test = df_y_test.values

    #row-wise normalization; the test rows are normalized as the tail of the
    #stacked train+test matrix so the layout also suits other preprocessing
    n_test = y_test.shape[0]
    X_train_test = vstack((X_train, X_test))
    X_train = normalize(X_train)
    X_test = normalize(X_train_test)[-n_test:]

    #no resampling in this cell: models are fit on the time-equalized set as is
    for model_name, model in zip(model_names, models):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)

        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        report = classification_report(y_test, y_pred, output_dict=True)
        recall = recall_score(y_test, y_pred, average='macro')

        dict_cv[label][model_name] = {
            'y_test': y_test,
            'y_pred': y_pred,
            'y_proba': y_proba,
            'conf_matrix': cm,
            'class_report': report,
            'macro_recall': recall,
        }

        print(model_name, ' : ', "{:.2f}".format(recall))

    #rewrite the results file after each fold
    with open(output_filename, 'wb') as handle:
        pickle.dump(dict_cv, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 5. Threshold Testing

This section investigates how performance changes as the probability threshold is moved away from 50%.

This analysis can be found in Code Base 4: Interactive Analysis and Interpretation where it uses the results saved in preceding sections.

### 6. Ensemble

We investigate ensemble methods by (i) combining learning algorithms for undersampling, (ii) combining learning algorithms for oversampling and (iii) combining the best models for vanilla oversampling and undersampling.

This analysis can be found in Code Base 4: Interactive Analysis and Interpretation where it uses the results saved in preceding sections.

### 7. Create Financial Ratios Target-Features Matrix

The below code matches tickers and filing dates of the NLP dataset and calculates the relevant annual financial ratios where there is data.    

In [None]:
import pickle
import numpy as np
import pandas as pd

#---- inputs ----
input_fin = 'fundamentals_df_all_db_ary.pickle'
input_all_10ks = '10k_clean_df.pickle'
input_master = 'dict_10k_matched_dd.pickle'
output_file = "df_fund_ratios_matched.pickle"


#---- alignment table: report period -> filing date, per ticker ----
df_align = pd.read_pickle(input_all_10ks)[['ticker', 'Period', 'Filed_Date']]
for date_col in ('Period', 'Filed_Date'):
    #values that cannot be parsed become NaT
    df_align[date_col] = pd.to_datetime(df_align[date_col], errors='coerce').values
df_align.columns = ['ticker_', 'reportperiod', 'Filed_Date']

#---- ratios ----
df_fin = pd.read_pickle(input_fin)

#pull the raw columns out once as arrays; all ratios are element-wise
netinc = df_fin['netinc'].values
assets = df_fin['assets'].values
liabilities = df_fin['liabilities'].values
ncfo = df_fin['ncfo'].values
equity = (df_fin['assets'] - df_fin['liabilities']).values

df_fin_ratios = pd.DataFrame(df_fin['reportperiod'])
df_fin_ratios['ticker_'] = df_fin['ticker']
df_fin_ratios['netinc_assets'] = netinc / assets
df_fin_ratios['leverage_'] = assets / liabilities
df_fin_ratios['accruals_'] = ncfo / netinc
df_fin_ratios['cash_debt'] = ncfo / liabilities
df_fin_ratios['coe_'] = ncfo / equity
df_fin_ratios['roe_'] = netinc / equity

#memory management: drop the source frame and the temporary arrays
del df_fin, netinc, assets, liabilities, ncfo, equity

#attach the filing date, then discard rows with missing or infinite ratios
df_fin_ratios = (
    df_fin_ratios.dropna()
    .merge(df_align, on=['ticker_', 'reportperiod'], how='inner')
    .drop('reportperiod', axis=1)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

#---- merge onto the NLP matched master tickers / dates ----
dict_input = pd.read_pickle(input_master)
df = dict_input['matched_df_10k_dd']
df = df[['ticker_', 'Filed_Date', 'sector', 'max_dd_1yr']].sort_values('Filed_Date')
df.columns = ['ticker_', 'Filed_Date', 'sector_', 'max_dd_1yr']

#inner join keeps only filings that have a complete set of ratios
df = df.merge(df_fin_ratios, on=['ticker_', 'Filed_Date'], how='inner')

with open(output_file, 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 8. Generate Holdout Testing Sets

This section starts by generating the holdout set before testing it with the optimal model found in CV (ensemble of oversampling and undersampling). An expanding annual window method is used to update the model and test on an annual basis. The aggregation of these results then forms the full set of holdout results.

Generate holdout sets for NLP model:

In [None]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from capstone_10k_functions import vectorize_corpus
from datetime import datetime as dt

t1 = dt.now()
print(t1)

#inputs
input_file = 'dict_10k_matched_dd.pickle'
vector_func = TfidfVectorizer    
func_name = 'TfidfVectorizer'   #['TfidfVectorizer', 'CountVectorizer']
hold_out_set_start = [2015, 2016, 2017, 2018, 2019]   #one holdout test year each
min_df_grid = [25]
max_df = 0.5
ngram = (1,1)
ngram_name = 'unigram'
label = ['hold_out_2015', 'hold_out_2016', 'hold_out_2017', 'hold_out_2018', 
         'hold_out_2019']


#read data into memory
with open(input_file , 'rb') as f:
    d_data = pickle.load(f)
df = d_data['matched_df_10k_dd']
df = df.sort_values("Filed_Date")

for idx_ho, year in enumerate(hold_out_set_start):
    
    print(label[idx_ho])
    
    ##Define hold out set: test on filings of `year`, train on all earlier
    ##years (expanding window)
    mask_train = df['Filed_Date'].dt.year < year
    mask_test = df['Filed_Date'].dt.year == year
    df_test = df[mask_test]
    df_train = df[mask_train]

    #Generate df master (word vector / vectorizer) sets for this holdout year
    dict_cv = {}
    #format training data
    df_train_text = df_train[['ticker_','Filed_Date', 'Text']]
    df_train_other = df_train.drop('Text', axis=1)
    #positional rename: assumes exactly these eight columns in this order
    df_train_other.columns = ['ticker_', 'Filed_Date', 'sector_', 'sic_sector', 
                        'max_dd_1yr', 'max_dd_2yr', 'year_dd_flag', 
                        'cum_year_dd_flag']
    #row-wise "sector : sic_sector" label; str(Series) would instead write the
    #printed form of the whole column into every row
    df_train_other['custom_sector'] = (df_train_other['sector_'].astype(str) + ' : '
                                       + df_train_other['sic_sector'].astype(str))

    #format testing data
    df_test_text = df_test[['ticker_','Filed_Date', 'Text']]
    df_test_other = df_test.drop('Text', axis=1)
    df_test_other.columns = ['ticker_', 'Filed_Date', 'sector_', 'sic_sector', 
                        'max_dd_1yr', 'max_dd_2yr', 'year_dd_flag', 
                        'cum_year_dd_flag']
    df_test_other['custom_sector'] = (df_test_other['sector_'].astype(str) + ' : '
                                      + df_test_other['sic_sector'].astype(str))
        

    for min_df in min_df_grid: 
        print(min_df)
        
        #name for cv dictionary specified by min_df value
        key_name = 'min_df_' + str(min_df)
        
        #vectorize the training corpus only; the result is read here as a dict
        #holding the word-vector dataframe ('df_wv') and the fitted vectorizer
        function = vectorize_corpus(df_train_text['Text'], vector_func, min_df, 
                                            max_df,ngram)
        X = function['df_wv']
        vectorizer = function['vectorizer']
        
        #Transform training data into df_master format
        vocab = X.columns.tolist()      #captured before the key columns are added
        X['Filed_Date'] = df_train_text['Filed_Date'].values
        X['ticker_'] = df_train_text['ticker_'].values
                        
        df_train_master = df_train_other.merge(X, on=['ticker_','Filed_Date'], how='inner')
        
        #Transform test data into df master format with the vectorizer fitted
        #on the training text
        arr_test_transform = vectorizer.transform(df_test_text['Text'])
        df_test_transform = pd.DataFrame.sparse.from_spmatrix(arr_test_transform,
                                                           columns = vocab)
        df_test_transform['Filed_Date'] = df_test_text['Filed_Date'].values
        df_test_transform['ticker_'] = df_test_text['ticker_'].values
        
        df_test_master = df_test_other.merge(df_test_transform, 
                                             on=['ticker_','Filed_Date'], 
                                                                 how='inner')
            
        dict_final = {'df_test_master': df_test_master, 'df_train_master': df_train_master}
                
        dict_cv[key_name] = dict_final
        
        output_filename = label[idx_ho] + '_' + func_name + '_' + key_name + '_' + ngram_name + '.pickle'
           
        with open(output_filename, 'wb') as handle:                                     
            pickle.dump(dict_cv, handle, protocol=pickle.HIGHEST_PROTOCOL)
                    
t2 = dt.now()
print(t2)
print(t2-t1)

#runtime 2hrs11mins

Generate holdout sets for the financial ratio (FIN) model:

In [None]:
import pandas as pd
import pickle

#inputs
input_file = "df_fund_ratios_matched.pickle"
output_filename = 'hold_out_fund_ratio_sets.pickle'
years = [2015, 2016, 2017, 2018, 2019]
labels = ['fund_h_out_' + str(yr) for yr in years]

#load the matched fundamental-ratio data and tag every row with its filing year
df_all = pd.read_pickle(input_file)
df_all['year_'] = (df_all['Filed_Date'].dt.year).values

#one entry per hold-out year: train on all earlier years, test on the year itself
dict_sets = {}
for year, label in zip(years, labels):
    df_train = df_all[df_all['year_'] < year]
    df_test = df_all[df_all['year_'] == year]
    dict_sets[label] = {'df_train_master': df_train,
                        'df_test_master': df_test}

#persist all hold-out sets in a single pickle
with open(output_filename, 'wb') as handle:
    pickle.dump(dict_sets, handle, protocol=pickle.HIGHEST_PROTOCOL)

The holdout sets for the market (MKT) model are set equal to the FIN holdout sets.

### 9. Test Holdout Sets 

Test and store results for the NLP model: 

In [None]:
import pickle
import numpy as np
import pandas as pd
import random
from scipy.sparse import csr_matrix, vstack, hstack, identity
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import normalize
from sklearn.metrics import classification_report, confusion_matrix
from capstone_10k_functions import convert_df_values_csc_chunk
from datetime import datetime as dt

#start the wall-clock timer for the whole cell
t1 = dt.now()
print(t1)
pd.set_option('mode.chained_assignment', None)

#inputs
ratio = 5
vector_func = 'TfidfVectorizer'
ngram = 'unigram'
years = [2015, 2016, 2017, 2018, 2019]
label_years = ['hold_out_' + str(yr) for yr in years]
number = 3
#columns that are identifiers, targets or sector labels rather than word features
x_drop_columns = ['Filed_Date', 'ticker_','sector_', 'sic_sector', 'max_dd_1yr','max_dd_2yr', 
                  'year_dd_flag', 'cum_year_dd_flag', 'custom_sector']
#ensemble weights applied to the log-probabilities of the two models
weight_over =0.25
weight_under = 0.75
model_over = GradientBoostingClassifier(random_state=41)
model_under = GradientBoostingClassifier(random_state=41)
method = 'over_under_ensemble_words_only_hold_out'

#result container: one (initially empty) dictionary per hold-out label
dict_years = {lbl: {} for lbl in label_years}

#seed Python's RNG so the random.sample draws made inside equalize_year_ratio
#are repeatable, as in the FIN and MKT hold-out cells
random.seed(41)

#loop through holdout sets and populate dictionary
for label in label_years:

    print(label)

    #load the vectorized training and test frames stored for this hold-out year
    filename = label + '_' + vector_func + '_min_df_25_' + ngram + '.pickle'
    d_years = pd.read_pickle(filename)
    df_train = d_years['min_df_25']['df_train_master']
    df_test = d_years['min_df_25']['df_test_master']

    #Equal number algo (fixes the yearly negative:positive ratio in training)
    df_train = equalize_year_ratio(df_train, ratio)

    #define X and y dataframes (positive class: max_dd_1yr <= -0.8)
    df_x_train = df_train.drop(x_drop_columns, axis=1)
    df_y_train = (df_train['max_dd_1yr'] <= -0.8)*1
    df_x_test = df_test.drop(x_drop_columns, axis=1)
    df_y_test = (df_test['max_dd_1yr'] <= -0.8)*1

    #convert X and y arrays
    X_train = convert_df_values_csc_chunk(df_x_train)
    y_train = df_y_train.values
    X_test = convert_df_values_csc_chunk(df_x_test)
    y_test = df_y_test.values

    #normalize (row wise so not strictly necessary to process this way
    #but kept in format for generalization to other preprocessing)
    n_train = X_train.shape[0]
    X_train_test = vstack((X_train, X_test))
    X_train = normalize(X_train)
    #slice from n_train rather than [-n_test:], which would return every row
    #if the test set were empty
    X_test = normalize(X_train_test)[n_train:]

    #oversample: request `number` times the count of positive training rows,
    #the same sizing rule the FIN and MKT cells use (n = number * n_pos)
    #NOTE(review): n_pos_over was previously undefined in this cell; confirm
    #this matches what oversample_random expects for n when flag=True
    n_pos_over = int(y_train.sum())
    n_over = int(number*n_pos_over)
    dict_bal_over = oversample_random(X_train, y_train, seed = 41, flag=True, n=n_over)
    X_train_balanced_over = dict_bal_over['X_balanced']
    y_train_balanced_over = dict_bal_over['y_balanced']

    #undersample
    dict_bal_under = undersample_random(X_train, y_train, seed = 41)
    X_train_balanced_under = dict_bal_under['X_balanced']
    y_train_balanced_under = dict_bal_under['y_balanced']

    #train both models and keep the log-probability of the positive class
    model_1 = model_over
    model_1.fit(X_train_balanced_over, y_train_balanced_over)
    y_1_log_proba = model_1.predict_log_proba(X_test)[:,1]

    model_2 = model_under
    model_2.fit(X_train_balanced_under, y_train_balanced_under)
    y_2_log_proba = model_2.predict_log_proba(X_test)[:,1]

    #ensemble: weighted sum of log-probabilities, thresholded at log(0.5)
    y_log_proba = weight_over*y_1_log_proba + weight_under*y_2_log_proba
    y_pred = (y_log_proba > np.log(0.5))*1

    #calculate model metrics (mh_recall is the harmonic mean of both recalls)
    cm = confusion_matrix(y_test, y_pred, labels=[0,1])
    pos_recall = cm[1,1] / (cm[1,0] + cm[1,1])
    neg_recall = cm[0,0] / (cm[0,0] + cm[0,1])
    mh_recall = 2*neg_recall*pos_recall / (neg_recall + pos_recall)
    report = classification_report(y_test, y_pred,output_dict=True)

    print('mh_recall = ', mh_recall)
    print('pos_recall = ', pos_recall)
    print('neg_recall = ', neg_recall)

    #per-filing results; copy so the new columns are added to an independent frame
    df_y_result = df_test[['ticker_', 'Filed_Date']].copy()
    df_y_result['true_'] = y_test
    df_y_result['pred_'] = y_pred
    df_y_result['log_proba'] = y_log_proba

    #preliminary entry, replaced by the full result set at the end of the iteration
    dict_years[label] = {'df_y_result': df_y_result, 'class_report': report}


    #find words in testing set only and count doc number
    #drop null columns
    null_columns = df_x_test.columns[df_x_test.isnull().any()]
    df_x_test_notna = df_x_test.drop(null_columns, axis=1)
    #count how many docs each word appears in (words become rows after transpose)
    df_x_test_notna = df_x_test_notna.transpose()
    s_word_in_test_doc_count = df_x_test_notna.apply(lambda x: (x != 0).sum(), axis=1)
    mask_keep = s_word_in_test_doc_count != 0
    s_word_in_test_doc_count = s_word_in_test_doc_count[mask_keep]
    s_word_in_test_doc_count = s_word_in_test_doc_count.sort_values(ascending=False)

    #find words in predicted positive documents only
    mask_pred_pos = (y_pred == 1)
    df_x_pred_pos = df_x_test[mask_pred_pos]
    #drop null columns
    null_columns = df_x_pred_pos.columns[df_x_pred_pos.isnull().any()]
    df_x_pred_pos_notna = df_x_pred_pos.drop(null_columns, axis=1)
    #count how many docs each word appears in
    df_x_pred_pos_notna = df_x_pred_pos_notna.transpose()
    s_word_in_pred_pos_doc_count = df_x_pred_pos_notna.apply(lambda x: (x != 0).sum(), axis=1)
    mask_keep = s_word_in_pred_pos_doc_count != 0
    s_word_in_pred_pos_doc_count = s_word_in_pred_pos_doc_count[mask_keep]
    s_word_in_pred_pos_doc_count = s_word_in_pred_pos_doc_count.sort_values(ascending=False)


    #Generate prob word matrix: each row of the identity matrix is a document
    #containing exactly one word, so its score is the ensemble output for that word
    words_list = list(df_x_test.columns)
    n_id = len(words_list)

    word_arr = identity(n_id).tolil()
    word_arr_q = csr_matrix(word_arr)

    word_1_log_proba = model_1.predict_log_proba(word_arr_q)[:,1]
    word_2_log_proba = model_2.predict_log_proba(word_arr_q)[:,1]
    word_log_proba = weight_over*word_1_log_proba + weight_under*word_2_log_proba
    word_proba = np.exp(word_log_proba)

    df_words_prob = pd.DataFrame(word_proba, index = words_list, columns=['prob'])
    df_words_prob = df_words_prob.sort_values('prob', ascending=False)

    #final entry for this hold-out year
    dict_years[label] = {'df_y_result': df_y_result,
                         'conf_mat': cm,
                         'class_report': report,
                         'df_words_proba': df_words_prob,
                         'words_test_doc_count': s_word_in_test_doc_count,
                         'words_pred_pos_doc_count': s_word_in_pred_pos_doc_count}


#store the results of all hold-out years in one pickle
output_filename= 'hold_out_results_under_over_ensemble_words_only.pickle'
with open(output_filename, 'wb') as handle:
    pickle.dump(dict_years, handle, protocol=pickle.HIGHEST_PROTOCOL)

t2= dt.now()
print(t2)
print(t2-t1)
               
#runtime 1hr56mins
            

Test and store results for the baseline financial ratio (FIN) model: 

In [None]:
import pickle
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import scale
from sklearn.metrics import classification_report, confusion_matrix

pd.set_option('mode.chained_assignment', None)
#not refactored

#inputs
random.seed(41)
input_file = 'hold_out_fund_ratio_sets.pickle'
output_filename = 'fund_ratio_hold_out_results.pickle'
ratio = 5
years = [2015, 2016, 2017, 2018, 2019]
label_years = ['fund_h_out_' + str(yr) for yr in years]
over_sample_amount = [3]
#identifier, target and quartile columns that are not used as FIN features
x_drop_columns = ['Filed_Date', 'ticker_','sector_',  'max_dd_1yr', 'ann_qcut']
models = [GradientBoostingClassifier(random_state=41)]
model_names = ['grad_boost']

#result container: label -> model name -> (initially empty) dictionary
dict_results = {lbl: {name: {} for name in model_names} for lbl in label_years}

#load the stored hold-out sets
d_h_out = pd.read_pickle(input_file)

#random.seed(41) in the inputs only seeds Python's random module; the
#oversampling below draws with np.random.choice, so seed NumPy's RNG as well
np.random.seed(41)

#test every hold-out year: rebuild the sample with the previous-year quartile
#rank, equalize the yearly class ratio, oversample, fit and score
for holdout_year, label in zip(years, label_years):

    df_train = d_h_out[label]['df_train_master']
    df_test = d_h_out[label]['df_test_master']

    #calculate previous year quartile rank
    #join training and test to calculate annual quartile ranks
    df_all = pd.concat([df_train, df_test])
    df_all['year_'] = df_all['Filed_Date'].dt.year.values

    data_years = sorted(set(df_all['year_'].tolist()))

    yearly_frames = []
    for prev_year, current_year in zip(data_years, data_years[1:]):
        #quartile of max_dd_1yr within the previous year, labelled 4 (lowest
        #values) to 1 (highest values)
        df_prev = df_all.loc[df_all['year_'] == prev_year, ['ticker_', 'max_dd_1yr']].copy()
        df_prev['ann_qcut'] = pd.qcut(df_prev['max_dd_1yr'], q=[0,0.25,0.5,0.75,1],
                                      labels=[4,3,2,1]).astype(int)
        df_prev = df_prev[['ticker_','ann_qcut']]

        #inner merge: rows whose ticker has no entry in the previous year are dropped
        df_current = df_all[df_all['year_'] == current_year]
        yearly_frames.append(df_current.merge(df_prev, on=['ticker_'], how='inner'))

    #sort ascending date
    df_final = pd.concat(yearly_frames).sort_values(by='Filed_Date')

    #split data bank into training and test sets on the hold-out year itself:
    #the inner merge drops rows (including the whole first year), so a split by
    #the original train/test row proportion would not fall on the year boundary
    is_holdout = df_final['year_'] == holdout_year
    df_train = df_final[~is_holdout].drop('year_', axis=1).reset_index(drop=True)
    df_test = df_final[is_holdout].drop('year_', axis=1).reset_index(drop=True)


    #Equal number algo (see helper function for comments)
    #NOTE(review): positives are selected here with < -0.8 while the target
    #below uses <= -0.8; the two differ only for values of exactly -0.8
    mask_pos = df_train['max_dd_1yr'] < -0.8
    train_year = df_train['Filed_Date'].dt.year
    pos_per_year = train_year[mask_pos].value_counts()

    neg_samples = []
    for year in sorted(pos_per_year.index.tolist()):
        #negatives of this year, randomly reduced to ratio * (positives of this year)
        df_neg_year = df_train[(train_year == year) & (df_train['max_dd_1yr'] >= -0.8)]
        n = int(ratio*pos_per_year[year])
        random_idx = random.sample(range(0, df_neg_year.shape[0]), n)
        neg_samples.append(df_neg_year.iloc[random_idx])

    df_train = pd.concat(neg_samples + [df_train[mask_pos]])
    df_train = df_train.sort_values(by='Filed_Date')


    #define X and y dataframes
    df_x_train = df_train.drop(x_drop_columns, axis=1)
    df_y_train = (df_train['max_dd_1yr'] <= -0.8)*1
    df_x_test = df_test.drop(x_drop_columns, axis=1)
    df_y_test = (df_test['max_dd_1yr'] <= -0.8)*1

    #convert X and y arrays
    X_train = df_x_train.values
    y_train = df_y_train.values
    X_test = df_x_test.values
    y_test = df_y_test.values


    #oversample (see helper function for comments)
    #row positions of each class; flatnonzero stays correct when a class is empty
    idx_pos_prelim = np.flatnonzero(y_train == 1)
    idx_neg_prelim = np.flatnonzero(y_train == 0)
    n_pos = idx_pos_prelim.size
    n_neg = idx_neg_prelim.size

    for number in over_sample_amount:

        n = int(number*n_pos)

        #positives: n draws with replacement
        idx_pos = np.random.choice(idx_pos_prelim, n)
        X_train_pos = X_train[idx_pos]
        y_train_pos = y_train[idx_pos]

        #negatives: n draws without replacement
        random_idx = random.sample(range(0, n_neg), n)
        idx_neg = idx_neg_prelim[random_idx]
        X_train_neg = X_train[idx_neg]
        y_train_neg = y_train[idx_neg]

        X_train_balanced = np.concatenate((X_train_pos,X_train_neg), axis=0)
        y_train_balanced = np.concatenate((y_train_pos,y_train_neg),axis=0)


        #train model
        for model_name, model in zip(model_names, models):
            model.fit(X_train_balanced, y_train_balanced)
            y_pred = model.predict(X_test)

            #mh_recall is the harmonic mean of positive and negative recall
            cm = confusion_matrix(y_test, y_pred, labels=[0,1])
            pos_recall = cm[1,1] / (cm[1,0] + cm[1,1])
            neg_recall = cm[0,0] / (cm[0,0] + cm[0,1])
            mh_recall = 2*neg_recall*pos_recall / (neg_recall + pos_recall)
            report = classification_report(y_test, y_pred,output_dict=True)

            print('mh_recall = ', mh_recall)
            print('pos_recall = ', pos_recall)
            print('neg_recall = ', neg_recall)

            #copy so the new columns are added to an independent frame
            df_y_result = df_test[['ticker_', 'Filed_Date']].copy()
            df_y_result['true_'] = y_test
            df_y_result['pred_'] = y_pred

            dict_results[label][model_name] = {'df_y_result': df_y_result, 'class_report': report}


#store the results of all hold-out years in one pickle
with open(output_filename, 'wb') as handle:
    pickle.dump(dict_results, handle, protocol=pickle.HIGHEST_PROTOCOL)



Test and store results for the market (MKT) model: 

In [None]:
import pickle
import numpy as np
import pandas as pd
import random
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import scale
from sklearn.metrics import classification_report, confusion_matrix

pd.set_option('mode.chained_assignment', None)
#not refactored

#inputs
random.seed(41)
input_file = 'hold_out_fund_ratio_sets.pickle'
output_filename = 'prev_quartile_only_hold_out_results.pickle'
ratio = 5
years = [2015, 2016, 2017, 2018, 2019]
label_years = ['fund_h_out_' + str(yr) for yr in years]
over_sample_amount = [3]
x_drop_columns = ['Filed_Date', 'ticker_','sector_',  'max_dd_1yr']
models = [GradientBoostingClassifier(random_state=41)]
model_names = ['grad_boost']

#result container: label -> model name -> (initially empty) dictionary
dict_results = {lbl: {name: {} for name in model_names} for lbl in label_years}

#read data into memory
d_h_out = pd.read_pickle(input_file)

#random.seed(41) in the inputs only seeds Python's random module; the
#oversampling below draws with np.random.choice, so seed NumPy's RNG as well
np.random.seed(41)

#loop through holdout sets and populate dictionary
for holdout_year, label in zip(years, label_years):

    df_train = d_h_out[label]['df_train_master']
    df_test = d_h_out[label]['df_test_master']

    #calculate previous year quartile rank
    #join training and test to calculate annual quartile ranks
    df_all = pd.concat([df_train, df_test])
    df_all['year_'] = df_all['Filed_Date'].dt.year.values

    data_years = sorted(set(df_all['year_'].tolist()))

    yearly_frames = []
    for prev_year, current_year in zip(data_years, data_years[1:]):
        #quartile of max_dd_1yr within the previous year, labelled 4 (lowest
        #values) to 1 (highest values)
        df_prev = df_all.loc[df_all['year_'] == prev_year, ['ticker_', 'max_dd_1yr']].copy()
        df_prev['ann_qcut'] = pd.qcut(df_prev['max_dd_1yr'], q=[0,0.25,0.5,0.75,1],
                                      labels=[4,3,2,1]).astype(int)
        df_prev = df_prev[['ticker_','ann_qcut']]

        #inner merge: rows whose ticker has no entry in the previous year are dropped
        df_current = df_all[df_all['year_'] == current_year]
        yearly_frames.append(df_current.merge(df_prev, on=['ticker_'], how='inner'))

    #sort ascending date
    df_final = pd.concat(yearly_frames).sort_values(by='Filed_Date')

    #split data bank into training and test sets on the hold-out year itself:
    #the inner merge drops rows (including the whole first year), so a split by
    #the original train/test row proportion would not fall on the year boundary
    is_holdout = df_final['year_'] == holdout_year
    df_train = df_final[~is_holdout].drop('year_', axis=1).reset_index(drop=True)
    df_test = df_final[is_holdout].drop('year_', axis=1).reset_index(drop=True)

    #Equal number algo (see helper function for comments)
    #NOTE(review): positives are selected here with < -0.8 while the target
    #below uses <= -0.8; the two differ only for values of exactly -0.8
    mask_pos = df_train['max_dd_1yr'] < -0.8
    train_year = df_train['Filed_Date'].dt.year
    pos_per_year = train_year[mask_pos].value_counts()

    neg_samples = []
    for year in sorted(pos_per_year.index.tolist()):
        #negatives of this year, randomly reduced to ratio * (positives of this year)
        df_neg_year = df_train[(train_year == year) & (df_train['max_dd_1yr'] >= -0.8)]
        n = int(ratio*pos_per_year[year])
        random_idx = random.sample(range(0, df_neg_year.shape[0]), n)
        neg_samples.append(df_neg_year.iloc[random_idx])

    df_train = pd.concat(neg_samples + [df_train[mask_pos]])
    df_train = df_train.sort_values(by='Filed_Date')


    #define X and y dataframes (the previous-year quartile is the only feature)
    df_x_train = df_train['ann_qcut']
    df_y_train = (df_train['max_dd_1yr'] <= -0.8)*1
    df_x_test = df_test['ann_qcut']
    df_y_test = (df_test['max_dd_1yr'] <= -0.8)*1

    #convert X and y arrays (single feature reshaped into one column)
    X_train = df_x_train.values.reshape(-1, 1)
    y_train = df_y_train.values
    X_test = df_x_test.values.reshape(-1, 1)
    y_test = df_y_test.values

    #oversample (see helper function for comments)
    #row positions of each class; flatnonzero stays correct when a class is empty
    idx_pos_prelim = np.flatnonzero(y_train == 1)
    idx_neg_prelim = np.flatnonzero(y_train == 0)
    n_pos = idx_pos_prelim.size
    n_neg = idx_neg_prelim.size

    for number in over_sample_amount:

        n = int(number*n_pos)

        #positives: n draws with replacement
        idx_pos = np.random.choice(idx_pos_prelim, n)
        X_train_pos = X_train[idx_pos]
        y_train_pos = y_train[idx_pos]

        #negatives: n draws without replacement
        random_idx = random.sample(range(0, n_neg), n)
        idx_neg = idx_neg_prelim[random_idx]
        X_train_neg = X_train[idx_neg]
        y_train_neg = y_train[idx_neg]

        X_train_balanced = np.concatenate((X_train_pos,X_train_neg), axis=0)
        y_train_balanced = np.concatenate((y_train_pos,y_train_neg),axis=0)


        #train model
        for model_name, model in zip(model_names, models):
            model.fit(X_train_balanced, y_train_balanced)
            y_pred = model.predict(X_test)

            #mh_recall is the harmonic mean of positive and negative recall
            cm = confusion_matrix(y_test, y_pred, labels=[0,1])
            pos_recall = cm[1,1] / (cm[1,0] + cm[1,1])
            neg_recall = cm[0,0] / (cm[0,0] + cm[0,1])
            mh_recall = 2*neg_recall*pos_recall / (neg_recall + pos_recall)
            report = classification_report(y_test, y_pred,output_dict=True)

            print('mh_recall = ', mh_recall)
            print('pos_recall = ', pos_recall)
            print('neg_recall = ', neg_recall)

            #copy so the new columns are added to an independent frame
            df_y_result = df_test[['ticker_', 'Filed_Date']].copy()
            df_y_result['true_'] = y_test
            df_y_result['pred_'] = y_pred

            dict_results[label][model_name] = {'df_y_result': df_y_result, 'class_report': report}


#store the results of all hold-out years in one pickle
with open(output_filename, 'wb') as handle:
    pickle.dump(dict_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

