# Functions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, cross_validate, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
import statsmodels.api as sm
from statsmodels.imputation.mice import MICE, MICEData
# import fancyimpute
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.impute import SimpleImputer
# from imblearn.over_sampling import SMOTE
import time
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.neural_network import MLPClassifier
#import all the functions we wrote ourselves
%pip install import_ipynb
import import_ipynb
#import Functions as functions

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
def combine_keyfigures_and_Companies(arr1,arr3):
    '''
    Join the key-figure table with the S&P company list on the PERMNO identifier.

    The company list's ``PERMNO`` column is renamed to ``permno`` and the two frames
    are combined with ``pd.merge_asof``. Every row of ``arr1`` is kept exactly once
    and in its original order. ``merge_asof`` requires both frames to be sorted by
    ``permno``.

    :param arr1: Pandas-Dataframe of keyfigures (must contain a numeric column ``permno``)
    :param arr3: Pandas-Dataframe of S&PCompanylist (must contain a numeric column ``PERMNO``)
    :return: joint Dataframe on permno; company columns are NaN for rows of ``arr1``
             whose permno does not occur in ``arr3``
    '''
    assert isinstance(arr1, pd.DataFrame)
    assert isinstance(arr3, pd.DataFrame)

    # rename() returns a new frame, so the caller's company list is not modified
    companies = arr3.rename(columns={'PERMNO': 'permno'})

    # permno is an identifier, not an ordered quantity: without a tolerance merge_asof
    # would silently attach the company with the nearest *smaller* permno to any
    # key-figure row that has no exact partner. tolerance=0 only accepts exact matches.
    return pd.merge_asof(arr1, companies, on='permno', tolerance=0)

In [3]:
def add_key(arr):
    '''
    Attach a row identifier column ``key`` to a dataframe.

    The key is the text form of ``public_date`` immediately followed by the text
    form of ``TICKER`` (no separator).
    NOTE(review): the frame is re-wrapped, not copied, so depending on the pandas
    version the caller's frame may receive the new column as well - confirm before
    relying on either behaviour.

    :param arr: pandas dataframe with the columns ``public_date`` and ``TICKER``
    :return: dataframe with the additional string column ``key``
    '''
    assert type(arr)==pd.DataFrame
    frame = pd.DataFrame(arr)

    # Both parts are converted to text first so that dates/numbers concatenate cleanly
    date_part = frame['public_date'].astype(str)
    ticker_part = frame['TICKER'].astype(str)
    frame['key'] = (date_part + ticker_part).astype(str)
    return frame

In [4]:
def combine_Ratings_and_Rest(rest,arr2):
    '''
    Left-join the ratings onto the already combined keyfigure/company frame.

    Both frames get a ``public_date`` column parsed as ``%Y%m%d`` dates (for the ratings
    it is derived from ``datadate``; for ``rest`` any ``/`` is stripped first), then a
    ``key`` column is built with ``add_key`` and the frames are merged on that key only.
    Because only ``key`` is a join column, other columns present in both frames
    (e.g. ``public_date``, ``TICKER``) appear twice with the pandas ``_x``/``_y`` suffixes.

    :param rest: Pandas-Dataframe of keyfigures and S&PCompanylist
                 (needs the columns ``public_date`` and ``TICKER``)
    :param arr2: Pandas-Dataframe of Ratings (needs the columns ``tic`` and ``datadate``)
    :return: joint Pandas-Dataframe of all. Uncleaned
    '''
    assert isinstance(rest, pd.DataFrame)
    assert isinstance(arr2, pd.DataFrame)

    # Work on private frames so the date reformatting below never leaks into the
    # caller's data (rename() already hands back a new frame for the ratings).
    rest = rest.copy()
    arr2 = arr2.rename(columns={'tic': 'TICKER'})

    # Since the format of the date does not match between the dataframes, both are
    # brought to real datetimes first. The conversion is done on the whole column at
    # once instead of calling pd.to_datetime row by row.
    rest_dates = rest['public_date'].astype(str).str.replace('/', '', regex=False)
    rest['public_date'] = pd.to_datetime(rest_dates, format='%Y%m%d')
    arr2['public_date'] = pd.to_datetime(arr2['datadate'].astype(str), format='%Y%m%d')

    # Now that the formats are matching, add_key(DATAFRAME) provides the common join column
    arr2 = add_key(arr2)
    rest = add_key(rest)
    output = rest.merge(arr2, how='left', on=['key'])
    return output

In [5]:
def create_lagging_Ratings_1M(path):
    '''
    Build a nested list of keys and the rating of the neighbouring month, ready to be
    merged into a pandas dataframe on ``key``.

    The file is parsed by plain comma splitting (quoted fields are not supported) and
    columns are addressed by position: column 4 is the date (``YYYY-MM-...``), column 74
    the ticker, column 79 the row key (date followed by ticker) and column 81 the rating.

    For every data row the month of its date is moved forward by one (December rolls
    over to January of the next year) and the rating stored for that month and the same
    ticker is looked up.
    NOTE(review): the function name speaks of a "lagging"/previous-month rating, but the
    code has always added one month - confirm which direction is intended before
    changing it.

    :param path: path to csv-file with data
    :return: list of two-element lists. The first entry is the header
             ``['key', 'Lagging-Rating_1M']``; each following entry is ``[key, rating]``
             with ``rating`` set to None when no rating exists for the shifted month.
    '''
    # Read the file once; completely blank lines (e.g. at the end of the file) are ignored
    with open(path, 'r') as f:
        lines = [line for line in f.readlines() if line.strip()]

    # Lookup table "YYYY-MM" + ticker -> rating, derived from the key column
    # (characters 7-9 of the key are the "-DD" part of the date and are cut out)
    rating_by_month = {}
    for line in lines:
        cells = line.split(',')
        rating_by_month[cells[79][:7] + cells[79][10:]] = cells[81]

    output = [['key', 'Lagging-Rating_1M']]
    # The first line is the csv header and produces no output row
    for line in lines[1:]:
        cells = line.split(',')
        date, ticker = cells[4], cells[74]
        month = int(date[5:7])

        if 0 <= month <= 11:
            # same year, next month (zero padded)
            shifted = date[:5] + '{:02d}'.format(month + 1)
        elif month == 12:
            # December: roll over to January of the following year
            shifted = str(int(date[:4]) + 1) + '-01'
        else:
            # unexpected month value: keep the row's own "YYYY-MM"
            shifted = date[:7]

        # .get() yields None when there is no rating for the shifted month
        output.append([date + ticker, rating_by_month.get(shifted + ticker)])
    return output

In [1]:
def my_iterative_imputer(df, num_cols=None, max_iter=20):
    """ 
    Impute the missing values (NaN) with the IterativeImputer
    :param df: feature matrix with NaN values to be imputed
    :param num_cols: optional list of the numeric columns to impute; defaults to the
                     project's list of key-figure columns
    :param max_iter: maximum number of imputation rounds passed to IterativeImputer
    :return: imputed Pandas-Dataframe containing only ``num_cols``, with the same index as ``df``
    """
    # IterativeImputer is experimental: the enable_* module has to be imported first
    from sklearn.experimental import enable_iterative_imputer 
    from sklearn.impute import IterativeImputer

    if num_cols is None:
        #Define all column with numeric values (the features)
        num_cols = ['CAPEI', 'bm', 'evm', 'pe_op_basic', 'pe_op_dil', 'pe_exi', 'pe_inc', 'ps', 'pcf', 
                    'dpr', 'npm', 'opmbd', 'opmad', 'gpm', 'ptpm', 'cfm', 'roa', 'roe', 'roce', 'efftax', 'aftret_eq',
                    'aftret_invcapx', 'aftret_equity', 'pretret_noa', 'pretret_earnat', 'GProf', 'equity_invcap',
                    'debt_invcap', 'totdebt_invcap', 'capital_ratio', 'int_debt', 'int_totdebt', 'cash_lt', 'invt_act',
                    'rect_act', 'debt_at', 'debt_ebitda', 'short_debt', 'curr_debt', 'lt_debt', 'profit_lct', 'ocf_lct',
                    'cash_debt', 'fcf_ocf', 'lt_ppent', 'dltt_be', 'debt_assets', 'debt_capital', 'de_ratio', 'intcov',
                    'intcov_ratio', 'cash_ratio', 'quick_ratio', 'curr_ratio', 'cash_conversion', 'inv_turn', 'at_turn',
                    'rect_turn', 'pay_turn', 'sale_invcap', 'sale_equity', 'sale_nwc', 'accrual', 'ptb',
                    'DIVYIELD', 'PEG_1yrforward', 'PEG_ltgforward']
    else:
        num_cols = list(num_cols)

    # Initialize IterativeImputer
    mice_imputer = IterativeImputer(max_iter=max_iter)

    # Impute using fit_transform on the selected columns; df itself is not modified
    imputed_values = mice_imputer.fit_transform(df[num_cols])

    # Wrap the result in a fresh frame that keeps the column names and the row index
    return pd.DataFrame(imputed_values, columns=num_cols, index=df.index)

In [7]:
from sklearn.ensemble import RandomForestClassifier

def feature_selection(x, y, thres):
    """ 
    Find out, which the most important features are. Return a list of the most important features
    which will be used for the algorithms.

    A balanced random forest (random_state=1) is fitted on the data; its
    ``feature_importances_`` are compared directly against ``thres``. The selected
    features are printed and plotted (individual and cumulative importance).

    :param x: feature input (DataFrame) without NaN values
    :param y: classification input
    :param thres: importance threshold as a fraction, e.g. 0.02 for 2 %; features with
                  relative importance over this value will be in the output
    :return: the list of important features, most important first
    """
    feat_labels = x.columns

    # Create Random Forest object, fit data and
    # extract feature importance attributes
    forest = RandomForestClassifier(random_state=1, class_weight='balanced')
    forest.fit(x, y)
    importances = forest.feature_importances_

    # Define n as number of importances over the value thres
    n = int((importances > thres).sum())

    # Feature positions ordered by importance (descending); the first n are selected
    indices = np.argsort(importances)[::-1]
    top = indices[:n]

    # Cumulative importance of the n most important features
    sum_feat_imp = np.cumsum(importances[indices])[:n]

    # Print top n features; str() keeps the format working for non-string column labels
    for rank, idx in enumerate(top, start=1):
        print('{0:2d}) {1:7s} {2:6.4f}'.format(rank,
                                           str(feat_labels[idx]),
                                           importances[idx]))

    # Plot Feature Importance (both cumul., individual)
    plt.figure(figsize=(12, 8))
    plt.bar(range(n), importances[top], align='center')
    plt.xticks(range(n), feat_labels[top], rotation=90)
    plt.xlim([-1, n])
    plt.xlabel('Feature')
    plt.ylabel('Rel. Feature Importance')
    plt.step(range(n), sum_feat_imp, where='mid',
         label='Cumulative importance')
    # Show the label of the cumulative line
    plt.legend()
    plt.tight_layout()

    # return the list of important features for the ML algorithms
    return [feat_labels[idx] for idx in top]

In [8]:
def LogReg(X_train, Y_train):
    '''
    Grid-search an elastic-net Logistic Regression on the training data.

    The features are standardised inside a pipeline and every parameter combination is
    evaluated with 5-fold cross validation.
    :param X_train: Training Set of X values
    :param Y_train: Training Set of Y values(factorized)
    :return: the fitted GridSearchCV object (Cross-Validation Hyperparameter grid).
    '''
    # elastic net combines l1 & l2: an l1_ratio of 0 or 1 means pure l2 or pure l1 would
    # be best, anything in between is a mix of both
    search_space = {
        'logreg__penalty': ['elasticnet'],
        'logreg__C': [6, 6.5, 7, 7.5, 8],
        'logreg__l1_ratio': [0, 0.05, 0.1, 0.15, 0.2, 1],
    }

    # increasing iterations to 1000 increases score by only 1% -> it is not worth the additional time
    classifier = LogisticRegression(
        max_iter=100,
        tol=0.001,
        random_state=1,
        n_jobs=-1,
        solver='saga',
        warm_start=True,
    )

    steps = [('scaler', StandardScaler()), ('logreg', classifier)]
    search = GridSearchCV(Pipeline(steps), param_grid=search_space, cv=5, n_jobs=-1)
    return search.fit(X_train, Y_train)

In [9]:
def SVM_poly(X_train,Y_train):
    '''
    Grid-search a Support Vector Machine with a Polynomial Kernel Function.

    The features are standardised inside a pipeline and every parameter combination is
    evaluated with 5-fold cross validation.
    :param X_train: Training Set of X values
    :param Y_train: Training Set of Y values(factorized)
    :return: the fitted GridSearchCV object (Cross-Validation Hyperparameter grid)
    '''
    # Search space. A larger grid search yielded 0.6 to be the best coef0 with this
    # combination; as it does not greatly change the cv accuracy (<1%) it is fixed here
    # to lower the computing time.
    # NOTE(review): gamma=0 looks like it removes the influence of the features from the
    # polynomial kernel - confirm this grid point is wanted.
    search_space = {
        'svm_poly__C': [900, 1000, 1100],
        'svm_poly__degree': [3, 4, 5],
        'svm_poly__gamma': [0, 0.05, 0.1],
        'svm_poly__coef0': [0.6],
    }

    # Standard scaler followed by the SVC estimator
    classifier = SVC(kernel='poly', random_state=0, max_iter=100000)
    model = Pipeline([('scaler', StandardScaler()), ('svm_poly', classifier)])

    # Run grid search
    search = GridSearchCV(model, param_grid=search_space, cv=5, n_jobs=-1)
    return search.fit(X_train, Y_train)

In [10]:
def SVM_rbf(X_train,Y_train):
    '''
    Grid-search a Support Vector Machine with a Radial Basis Kernel Function(rbf).

    The features are standardised inside a pipeline and every parameter combination is
    evaluated with 10-fold cross validation.
    :param X_train: Training Set of X values
    :param Y_train: Training Set of Y values(factorized)
    :return: the fitted GridSearchCV object (Cross-Validation Hyperparameter grid)
    '''
    # Search space for the rbf kernel
    search_space = {
        'svm_rbf__C': [100, 150, 200],
        'svm_rbf__gamma': [0.25, 0.3, 0.35],
    }

    # Standard scaler followed by the SVC estimator
    classifier = SVC(kernel='rbf', random_state=0, max_iter=100000)
    model = Pipeline([('scaler', StandardScaler()), ('svm_rbf', classifier)])

    # Run grid search (cv=5 yields same accuracy)
    search = GridSearchCV(model, param_grid=search_space, cv=10, n_jobs=-1)
    return search.fit(X_train, Y_train)

In [11]:
def SVM_rbf_bal(X_train,Y_train):
    '''
    Grid-search a class-balanced Support Vector Machine with a Radial Basis Kernel Function(rbf).

    Same setup as the plain rbf search, but the classifier uses balanced class weights.
    The features are standardised inside a pipeline and every parameter combination is
    evaluated with 10-fold cross validation.
    :param X_train: Training Set of X values
    :param Y_train: Training Set of Y values(factorized)
    :return: the fitted GridSearchCV object (Cross-Validation Hyperparameter grid)
    '''
    # Search space for the rbf kernel
    search_space = {
        'svm_rbf__C': [100, 200, 300],
        'svm_rbf__gamma': [0.25, 0.3, 0.35],
    }

    # Standard scaler followed by the SVC estimator with balanced class weights
    classifier = SVC(kernel='rbf', random_state=0, max_iter=100000, class_weight='balanced')
    model = Pipeline([('scaler', StandardScaler()), ('svm_rbf', classifier)])

    # Run grid search (cv=5 yields same accuracy)
    search = GridSearchCV(model, param_grid=search_space, cv=10, n_jobs=-1)
    return search.fit(X_train, Y_train)

In [1]:
def random_forest(X_train, Y_train, n_estimators, maxDepth, minSamplesNode, minSamplesLeaf):
    """ 
    This function applies Random Forest Classifier on X_train and Y_train and uses grid-cross validation on the datasets.
    Every parameter combination is scored by accuracy with 5-fold cross validation.
    :param X_train: Training Set of X values
    :param Y_train: Training Set of Y values(factorized)
    :param n_estimators: array of values which will be tested for variable n_estimators
    :param maxDepth: array of values which will be tested for variable max_depth
    :param minSamplesNode: array of values which will be tested for variable min_samples_split
    :param minSamplesLeaf: array of values which will be tested for variable min_samples_leaf
    :return: the fitted GridSearchCV object (Cross-Validation Hyperparameter grid)
    """
    # Define the hyperparameter values to be tested.
    # This has to be a plain dict: a trailing comma after the closing brace would wrap
    # it in a tuple, which GridSearchCV does not accept as param_grid.
    param_grid = {"n_estimators": n_estimators,
                  'max_depth': maxDepth,
                  'min_samples_split': minSamplesNode,
                  'min_samples_leaf': minSamplesLeaf}

    # Run brute-force grid search
    grid = GridSearchCV(estimator=RandomForestClassifier(random_state=0),
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv= 5, n_jobs=-1)
    grid = grid.fit(X_train, Y_train)
    return grid

In [2]:
def neural(X_train, Y_train, hidden_layer, maxIter):
    """ 
    This function applies MLP Classifier on X_train and Y_train and uses grid-cross validation on the datasets.
    The features are standardised inside a pipeline and every parameter combination is
    scored by accuracy with 5-fold cross validation.
    :param X_train: Training Set of X values
    :param Y_train: Training Set of Y values(factorized)
    :param hidden_layer: array of values which will be tested for variable hidden_layer_sizes
    :param maxIter: array of values which will be tested for variable max_iter
    :return: the fitted GridSearchCV object (Cross-Validation Hyperparameter grid)
    """

    # solver "lbfgs" has proven to be the best; the fixed random_state makes runs repeatable
    mlp = MLPClassifier(random_state=0, solver= "lbfgs", warm_start= True)

    # The configured classifier (not a default MLPClassifier()) must be the pipeline step,
    # otherwise the solver and seed chosen above are silently ignored.
    pipe = Pipeline([('scaler', StandardScaler()), 
                     ('neural', mlp)])

    # Define the hyperparameter values to be tested.
    # This has to be a plain dict: a trailing comma after the closing brace would wrap
    # it in a tuple, which GridSearchCV does not accept as param_grid.
    param_grid = {"neural__hidden_layer_sizes" : hidden_layer,
                  'neural__max_iter': maxIter}

    # Run brute-force grid search
    grid = GridSearchCV(pipe,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv= 5, n_jobs=-1)
    grid = grid.fit(X_train, Y_train)
    return grid