### 1) Importing required modules

In [18]:
import  pandas                   as      pd
import  numpy                    as      np
from    sklearn.model_selection  import  train_test_split
from    sklearn.linear_model     import  LogisticRegression
from    sklearn.metrics          import  accuracy_score, roc_auc_score
from    scipy.stats              import  mode
from    matplotlib               import  pyplot   as  plt

In [19]:
import warnings
# NOTE(review): with no category or module argument this silences every warning
# for the rest of the session, including any raised by pandas or scikit-learn
# in the cells below. Consider narrowing the filter once the analysis is stable.
warnings.filterwarnings('ignore')

In [20]:
import time
from    datetime   import datetime
from    datetime   import timedelta

### 2) Read data

In [21]:
# Load the prepared dataset and report its size and column names.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is where this exact location exists.
file_name     =   r'D:\DrPKV\20220618\Output\Important_9_columns_Data.csv'
df1           =   pd.read_csv(file_name)

print(f'\nDataset dimension {df1.shape}')
print(df1.columns)


Dataset dimension (9000, 10)
Index(['Target', 'Cash_ratio', 'Interest_coverage', 'Inventory_turnover',
       'Operating Cash Flow/Total Debt', 'Operating Cash Flow/Total Sales',
       'Shareholderquity_code', 'Total shareholders' funds',
       'YOY EBIT Growth Rate', 'debt_equity'],
      dtype='object')


In [22]:
# Separate the predictors (every column except 'Target') from the label column.
X             =    df1.drop(columns = ['Target'])
y             =    df1['Target']

### 3) Fitting the candidate models on a stratified train/test split

In [23]:
"""
Function Name: compare_models

   Description: This **function** compares various algorithms on 
                 1) AUROC 2) Precision, 3) Recall
   
   Input: 1) splits for k fold 
          2) random seed number
          3) Training data for predictor variables
          4) Training data for target variable



   Output: Model comparison on these metrics 1) AUROC 2) Metrics - Precision, Recall
   
"""
def compare_models(random_state, X, y):  

    ### To compare algorithms
    
    from    matplotlib                    import   pyplot                 as       plt
    from    sklearn.model_selection       import   KFold
    from    sklearn.metrics               import   roc_auc_score
    from    sklearn.linear_model          import   LogisticRegression
    from    sklearn.tree                  import   DecisionTreeClassifier
    from    sklearn.svm                   import   SVC
    from    sklearn.discriminant_analysis import   LinearDiscriminantAnalysis
    from    sklearn.neighbors             import   KNeighborsClassifier
    from    sklearn.ensemble              import   RandomForestClassifier
    from    sklearn.model_selection       import   StratifiedKFold
    ### ----------------------------------------------------------------------------------------------    
    ### Prepare models
    ### ----------------------------------------------------------------------------------------------
    
    X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.30, stratify = y,  random_state = random_state)  
        
    results_df   =   pd.DataFrame()  
    models       =   []
    models.append(('LR', LogisticRegression().fit(X_train, y_train)))
    models.append(('CART', DecisionTreeClassifier().fit(X_train, y_train)))
    models.append(('RF',RandomForestClassifier().fit(X_train, y_train)))
    models.append(('SVC', SVC().fit(X_train, y_train)))
    models.append(('LDA', LinearDiscriminantAnalysis().fit(X_train, y_train)))
    models.append(('KNN', KNeighborsClassifier().fit(X_train, y_train)))
    ### ----------------------------------------------------------------------------------------------   
    ### Evaluate model in turn
      
    scores_req      =  ['roc_auc', 'precision', 'recall']

    res_df_tr       =  pd.DataFrame() 
    res_df_test     =  pd.DataFrame() 
    
    for name, clf in models:
                 
            y_train_pred                     =  clf.predict(X_train); print(y_train_pred.shape)
            y_test_pred                      =  clf.predict(X_test); print(y_test_pred.shape)

            res_train_df                     =  pd.DataFrame({'Train_y_true' :  y_train, 'Train_y_pred' : y_train_pred})
            res_test_df                      =  pd.DataFrame({'Test_y_true' :   y_test, 'Test_y_pred' : y_test_pred})
            res_train_df['Model']            =  name
            res_test_df['Model']             =  name
            res_df_tr                        =  pd.concat([res_df_tr, res_train_df], axis = 0)
            res_df_test                      =  pd.concat([res_df_test, res_test_df], axis = 0)
           
    result                           =  [res_df_tr, res_df_test]
    
    return result
### ------------------------------------------------------------------------------------------

In [24]:
# Fixed seed passed to compare_models; the two returned frames hold the stacked
# per-model predictions for the training rows (tr) and the test rows (te).
random_state         =   12345
tr, te               =  compare_models(random_state = random_state, X = X, y = y)

(6300,)
(2700,)
(6300,)
(2700,)
(6300,)
(2700,)
(6300,)
(2700,)
(6300,)
(2700,)
(6300,)
(2700,)


In [25]:
# Show the column names of the stacked training-prediction frame.
tr.columns

Index(['Train_y_true', 'Train_y_pred', 'Model'], dtype='object')

In [26]:
# Start the wide training table from the Logistic Regression rows: keep the
# true label and rename the prediction column so it carries the model name.
reqd_cols                  =    ['Train_y_true', 'Train_y_pred']
new_training_df            =    (
    tr.loc[tr.Model == 'LR', reqd_cols]
      .rename(columns = {'Train_y_pred': 'LR_Train_y_pred'})
)

In [27]:
# Preview the first five rows, transposed so each observation is a column.
new_training_df.head().T

Unnamed: 0,4148,2070,5234,4151,2115
Train_y_true,1,1,1,1,1
LR_Train_y_pred,1,1,1,1,0


In [28]:
# Add one '<MODEL>_Train_y_pred' column per remaining model.
# Each model's predictions are pulled from the stacked frame `tr`; assigning a
# Series to a DataFrame column matches rows on the index, so every prediction
# lands on the observation it belongs to.
for model_name in ['CART', 'RF', 'SVC', 'LDA', 'KNN']:
    new_training_df[model_name + '_Train_y_pred'] = tr.loc[tr.Model == model_name, 'Train_y_pred']

new_training_df

Unnamed: 0,Train_y_true,LR_Train_y_pred,CART_Train_y_pred,RF_Train_y_pred,SVC_Train_y_pred,LDA_Train_y_pred,KNN_Train_y_pred
4148,1,1,1,1,1,1,1
2070,1,1,1,1,1,1,1
5234,1,1,1,1,1,1,1
4151,1,1,1,1,1,1,1
2115,1,0,1,1,1,1,1
...,...,...,...,...,...,...,...
2098,1,1,1,1,1,1,1
7745,0,1,0,0,1,1,0
5170,1,1,1,1,1,1,1
8686,0,1,0,0,1,1,0


In [29]:
# Count missing values in each column of the wide training table.
new_training_df.isnull().sum()

Train_y_true         0
LR_Train_y_pred      0
CART_Train_y_pred    0
RF_Train_y_pred      0
SVC_Train_y_pred     0
LDA_Train_y_pred     0
KNN_Train_y_pred     0
dtype: int64

In [30]:
# Preview the first twenty rows, transposed so each observation is a column.
new_training_df.head(20).T

Unnamed: 0,4148,2070,5234,4151,2115,6259,6590,780,2264,5276,1140,8264,7358,5083,5833,3642,2878,4115,2436,7330
Train_y_true,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1
LR_Train_y_pred,1,1,1,1,0,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1
CART_Train_y_pred,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1
RF_Train_y_pred,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1
SVC_Train_y_pred,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
LDA_Train_y_pred,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
KNN_Train_y_pred,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1


In [35]:
# Save the actual label and every model's predicted label for the training
# observations to a time-stamped CSV file.
# NOTE(review): index = False leaves the row labels out of the file; confirm
# that the original observation ids are not needed downstream.

out_tr_filename      =  './../Output/Predicted_Values_train_data_{}.csv'.format(
    datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))
new_training_df.to_csv(out_tr_filename, index = False)

In [32]:
# Show the column names of the stacked test-prediction frame.
te.columns

Index(['Test_y_true', 'Test_y_pred', 'Model'], dtype='object')

In [33]:
reqd_cols                  =    ['Test_y_true', 'Test_y_pred']

# Start the wide test table from the Logistic Regression rows: keep the true
# label and rename the prediction column so it carries the model name.
new_test_df                =    (
    te.loc[te.Model == 'LR', reqd_cols]
      .rename(columns = {'Test_y_pred': 'LR_Test_y_pred'})
)

# Add one '<MODEL>_Test_y_pred' column per remaining model.
# Assigning a Series to a DataFrame column matches rows on the index, so every
# prediction lands on the observation it belongs to.
for model_name in ['CART', 'RF', 'SVC', 'LDA', 'KNN']:
    new_test_df[model_name + '_Test_y_pred'] = te.loc[te.Model == model_name, 'Test_y_pred']

new_test_df

Unnamed: 0,Test_y_true,LR_Test_y_pred,CART_Test_y_pred,RF_Test_y_pred,SVC_Test_y_pred,LDA_Test_y_pred,KNN_Test_y_pred
1781,1,1,1,1,1,1,1
6471,1,1,1,1,1,1,1
8612,0,0,0,0,1,1,1
3002,1,1,1,1,1,1,1
3513,1,0,1,1,1,1,0
...,...,...,...,...,...,...,...
6901,1,1,1,1,1,1,1
4580,1,1,1,1,1,1,1
5355,1,1,1,1,1,1,1
3879,1,1,1,1,1,1,1


In [36]:
# Save the actual label and every model's predicted label for the test
# observations to a time-stamped CSV file.
# NOTE(review): index = False leaves the row labels out of the file; confirm
# that the original observation ids are not needed downstream.

out_test_filename      =  './../Output/Predicted_Values_test_data_{}.csv'.format(
    datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))
new_test_df.to_csv(out_test_filename, index = False)

### END