### 1) Importing required modules

In [1]:
import  pandas                   as      pd
import  numpy                    as      np
from    sklearn.model_selection  import  train_test_split
from    sklearn.linear_model     import  LogisticRegression
from    sklearn.metrics          import  accuracy_score, roc_auc_score
from    scipy.stats              import  mode
from    matplotlib               import  pyplot   as  plt

In [2]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation or convergence warnings
# raised by the libraries used below will not be shown either - consider
# narrowing it to specific categories/modules.
warnings.filterwarnings('ignore')

In [3]:
import time
from    datetime   import datetime
from    datetime   import timedelta

### 2) Read data

In [4]:
# Load the dataset from disk.
# NOTE(review): absolute, machine-specific path - the cell only runs where this
# exact directory layout exists.
file_name = r'C:\DrPKV\DrPKV-main\Data\imputed_compact_dataset_ten_20220707110612AM.csv'
df1 = pd.read_csv(file_name)

# Report the frame's dimensions followed by its column names.
print('\nDataset dimension {}'.format(df1.shape), df1.columns, sep = '\n')


Dataset dimension (9000, 11)
Index(['Target', 'Cash_ratio', 'Changeinsales_Industry', 'debt_equity',
       'debt_income', 'Interest_coverage', 'Quick_ratio', 'ROE(new)',
       'ROS(new)', 'Sales_CE', 'Total shareholders' funds'],
      dtype='object')


In [5]:
# Predictors: every column except the label; target: the 'Target' column.
X = df1.drop(columns = ['Target'])
y = df1['Target']

### 3) Implementing cross validation

In [6]:
"""
Function Name: compare_models

   Description: This **function** compares various algorithms on 
                 1) AUROC 2) Precision, 3) Recall
   
   Input: 1) splits for k fold 
          2) random seed number
          3) Training data for predictor variables
          4) Training data for target variable



   Output: Model comparison on these metrics 1) AUROC 2) Metrics - Precision, Recall
   
"""
def compare_models(random_state, X, y):  

    ### To compare algorithms
    
    from    matplotlib                    import   pyplot                 as       plt
    from    sklearn.model_selection       import   KFold
    from    sklearn.metrics               import   roc_auc_score
    from    sklearn.linear_model          import   LogisticRegression
    from    sklearn.tree                  import   DecisionTreeClassifier
    from    sklearn.svm                   import   SVC
    from    sklearn.discriminant_analysis import   LinearDiscriminantAnalysis
    from    sklearn.neighbors             import   KNeighborsClassifier
    from    sklearn.ensemble              import   RandomForestClassifier
    from    sklearn.model_selection       import   StratifiedKFold
    from    xgboost                       import   XGBClassifier
    ### ----------------------------------------------------------------------------------------------    
    ### Prepare models
    ### ----------------------------------------------------------------------------------------------
    
    X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.30, stratify = y,  random_state = random_state)  
        
    results_df   =   pd.DataFrame()  
    models       =   []
    models.append(('CART', DecisionTreeClassifier().fit(X_train, y_train)))
    models.append(('RF', RandomForestClassifier().fit(X_train, y_train)))
    models.append(('XGBoost', XGBClassifier().fit(X_train, y_train)))
    ### ----------------------------------------------------------------------------------------------   
    ### Evaluate model in turn
      
    scores_req      =  ['roc_auc', 'precision', 'recall']

    res_df_tr       =  pd.DataFrame() 
    res_df_test     =  pd.DataFrame() 
    
    for name, clf in models:
                 
            y_train_pred                     =  clf.predict(X_train); print(y_train_pred.shape)
            y_test_pred                      =  clf.predict(X_test); print(y_test_pred.shape)

            res_train_df                     =  pd.DataFrame({'Train_y_true' :  y_train, 'Train_y_pred' : y_train_pred})
            res_test_df                      =  pd.DataFrame({'Test_y_true' :   y_test, 'Test_y_pred' : y_test_pred})
            res_train_df['Model']            =  name
            res_test_df['Model']             =  name
            res_df_tr                        =  pd.concat([res_df_tr, res_train_df], axis = 0)
            res_df_test                      =  pd.concat([res_df_test, res_test_df], axis = 0)
           
    result                           =  [res_df_tr, res_df_test]
    
    return result
### ------------------------------------------------------------------------------------------

In [7]:
# Seed handed to compare_models.
random_state         =   12345
# compare_models returns a two-element list; unpack it into the training-set
# and test-set prediction frames (per-model rows stacked, tagged by 'Model').
tr, te               =  compare_models(random_state, X, y)

(6300,)
(2700,)
(6300,)
(2700,)
(6300,)
(2700,)


In [8]:
# Check which columns the training-prediction frame exposes.
tr.columns

Index(['Train_y_true', 'Train_y_pred', 'Model'], dtype='object')

In [9]:
# Columns picked from each per-model slice of `tr` in the next cell.
reqd_cols                  =    ['Train_y_true', 'Train_y_pred']
# Work on a copy so `tr` itself stays untouched.
# NOTE(review): `tr` stacks one block of rows per model, so this copy holds every
# training observation once per model (3x); the per-model columns added later are
# therefore repeated on each of those rows - consider keeping one row per observation.
new_training_df            =    tr.copy()

In [10]:
### Add one prediction column per model: <Model>_Train_y_pred.
### Each per-model slice of `tr` carries one row per training observation, so the
### assignment aligns on the row labels it shares with new_training_df.
### The column name is spelled out here instead of read from `reqd_cols`, because
### that name is rebound to the test-set columns further down the notebook.
for model_name in ['CART', 'RF', 'XGBoost']:
    model_preds = tr.loc[tr['Model'] == model_name, 'Train_y_pred']
    new_training_df[model_name + '_Train_y_pred'] = model_preds

new_training_df

Unnamed: 0,Train_y_true,Train_y_pred,Model,CART_Train_y_pred,RF_Train_y_pred,XGBoost_Train_y_pred
3654,0,0,CART,0,0,0
3171,0,0,CART,0,0,0
4669,0,0,CART,0,0,0
8936,1,1,CART,1,1,1
447,0,0,CART,0,0,0
...,...,...,...,...,...,...
6222,0,0,XGBoost,0,0,0
3508,0,0,XGBoost,0,0,0
126,0,0,XGBoost,0,0,0
8073,1,1,XGBoost,1,1,1


In [11]:
# Count missing values per column; a non-zero count in a <Model>_Train_y_pred
# column would mean the index-aligned assignment above left rows unmatched.
new_training_df.isnull().sum()

Train_y_true            0
Train_y_pred            0
Model                   0
CART_Train_y_pred       0
RF_Train_y_pred         0
XGBoost_Train_y_pred    0
dtype: int64

In [12]:
### Majority vote across the three models for every row.
list_tr     =  ['CART_Train_y_pred', 'RF_Train_y_pred', 'XGBoost_Train_y_pred']

### DataFrame.mode(axis = 1) returns a DataFrame with one column per modal value
### (extra columns appear only when a row has tied modes). Take the first column
### explicitly so a single Series is assigned even if ties ever occur.
new_training_df['Consensus'] = new_training_df[list_tr].mode(axis = 1).iloc[:, 0]

### Drop the stacked per-row prediction now that the per-model columns exist.
### Rebinding with errors = 'ignore' (instead of inplace = True) keeps the cell
### re-runnable: a second run no longer raises KeyError for the missing column.
new_training_df = new_training_df.drop(columns = ['Train_y_pred'], errors = 'ignore')

In [13]:
# Transposed preview of the first 100 rows: one column per row label, so the
# true label, the per-model predictions and the consensus can be compared vertically.
new_training_df.head(100).T

Unnamed: 0,3654,3171,4669,8936,447,948,3047,4104,8963,4972,...,3186,3850,3586,5456,7909,4637,8731,7138,4429,1691
Train_y_true,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,1,0,1,0,0,0
Model,CART,CART,CART,CART,CART,CART,CART,CART,CART,CART,...,CART,CART,CART,CART,CART,CART,CART,CART,CART,CART
CART_Train_y_pred,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,1,0,1,0,0,0
RF_Train_y_pred,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,1,0,1,0,0,0
XGBoost_Train_y_pred,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,1,0,1,0,0,0
Consensus,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,1,0,1,0,0,0


# Hand-made ten-row frame of 0/1 predictions for the three models.
# In the row at position 7, CART predicts 1 while RF and XGBoost predict 0.
# The name `mdf` is not used anywhere else in this notebook.
mdf = pd.DataFrame(
    {
        'CART_Train_y_pred':    [0, 0, 0, 1, 0, 0, 0, 1, 1, 1],
        'RF_Train_y_pred':      [0, 0, 0, 1, 0, 0, 0, 0, 1, 1],
        'XGBoost_Train_y_pred': [0, 0, 0, 1, 0, 0, 0, 0, 1, 1],
    }
)



In [14]:
# Save the actual label, each model's prediction and the consensus for the
# training rows to a CSV whose name carries the current timestamp.
# NOTE(review): index = False omits the row labels, so the rows in the file can
# no longer be matched back to rows of the source dataset.
out_tr_filename = ''.join([
    'C:/DrPKV/DrPKV-main/Output/Predicted_Values_train_data_',
    datetime.now().strftime('%Y_%m_%d_%H_%M_%S'),
    '.csv',
])
new_training_df.to_csv(out_tr_filename, index = False)

In [15]:
# Check which columns the test-prediction frame exposes.
te.columns

Index(['Test_y_true', 'Test_y_pred', 'Model'], dtype='object')

In [16]:
# Rebinds `reqd_cols` (previously the training column names) to the test columns.
reqd_cols                  =    ['Test_y_true', 'Test_y_pred']
# Work on a copy so `te` itself stays untouched.
# NOTE(review): `te` stacks one block of rows per model, so this copy holds every
# test observation once per model (3x) - consider keeping one row per observation.
new_test_df                =    te.copy()

In [17]:
### Add one prediction column per model: <Model>_Test_y_pred.
### Each per-model slice of `te` carries one row per test observation, so the
### assignment aligns on the row labels it shares with new_test_df.
### The column name is spelled out here rather than read from the shared
### `reqd_cols` variable, so the cell does not depend on which cell set it last.
for model_name in ['CART', 'RF', 'XGBoost']:
    model_preds = te.loc[te['Model'] == model_name, 'Test_y_pred']
    new_test_df[model_name + '_Test_y_pred'] = model_preds

new_test_df

Unnamed: 0,Test_y_true,Test_y_pred,Model,CART_Test_y_pred,RF_Test_y_pred,XGBoost_Test_y_pred
7989,1,1,CART,1,0,0
8659,1,1,CART,1,1,1
3809,0,0,CART,0,0,0
2250,0,0,CART,0,0,0
5337,0,0,CART,0,0,0
...,...,...,...,...,...,...
4045,0,0,XGBoost,0,0,0
3993,0,0,XGBoost,0,0,0
5728,0,0,XGBoost,0,0,0
8057,1,1,XGBoost,1,1,1


In [18]:
### Majority vote across the three models for every row.
list_te                  =  ['CART_Test_y_pred', 'RF_Test_y_pred', 'XGBoost_Test_y_pred']

### DataFrame.mode(axis = 1) returns a DataFrame with one column per modal value
### (extra columns appear only when a row has tied modes). Take the first column
### explicitly so a single Series is assigned even if ties ever occur.
new_test_df['Consensus'] =  new_test_df[list_te].mode(axis = 1).iloc[:, 0]

### Drop the stacked per-row prediction now that the per-model columns exist.
### Rebinding with errors = 'ignore' (instead of inplace = True) keeps the cell
### re-runnable: a second run no longer raises KeyError for the missing column.
new_test_df = new_test_df.drop(columns = ['Test_y_pred'], errors = 'ignore')


In [19]:
# Save the actual label, each model's prediction and the consensus for the
# test rows to a CSV whose name carries the current timestamp.
# NOTE(review): index = False omits the row labels, so the rows in the file can
# no longer be matched back to rows of the source dataset.
out_test_filename = ''.join([
    'C:/DrPKV/DrPKV-main/Output/Predicted_Values_test_data_',
    datetime.now().strftime('%Y_%m_%d_%H_%M_%S'),
    '.csv',
])
new_test_df.to_csv(out_test_filename, index = False)

### END