### 1) Importing required modules

In [1]:
import  pandas                   as      pd
import  numpy                    as      np
from    sklearn.model_selection  import  train_test_split
from    sklearn.linear_model     import  LogisticRegression
from    sklearn.metrics          import  accuracy_score, roc_auc_score

from    matplotlib               import  pyplot   as  plt

In [2]:
import time
from    datetime   import datetime
from    datetime   import timedelta

### 2) Read data

In [3]:
# Load the input CSV and report its dimensions and column names.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
file_name = r'D:\DrPKV\20220618\Output\Important_9_columns_Data.csv'
df1 = pd.read_csv(filepath_or_buffer=file_name)

print('\nDataset dimension {}'.format(df1.shape))
print(df1.columns)


Dataset dimension (9000, 10)
Index(['Target', 'Cash_ratio', 'Interest_coverage', 'Inventory_turnover',
       'Operating Cash Flow/Total Debt', 'Operating Cash Flow/Total Sales',
       'Shareholderquity_code', 'Total shareholders' funds',
       'YOY EBIT Growth Rate', 'debt_equity'],
      dtype='object')


In [4]:
# Separate the predictors (every column except 'Target') from the target column.
X = df1.drop(columns=['Target'])
y = df1['Target']

### 3) Comparing models on a stratified hold-out split

In [5]:
def compare_models(random_state, X, y):
    """
    Fit several classifiers on one stratified 70/30 split and collect their predictions.

    The data are split once (30% held out, stratified on ``y``). Each model
    (LR, CART, RF, SVC, LDA, KNN) is fitted with default hyper-parameters on the
    training part and then used to predict both the training and the held-out part.
    No metric is computed here; the per-observation predictions are returned so
    that metrics can be derived downstream.

    Parameters
    ----------
    random_state : int
        Seed used for the train/test split and for the randomised learners
        (decision tree and random forest), so repeated calls give the same result.
    X : pandas.DataFrame
        Predictor variables.
    y : pandas.Series
        Target variable; also used to stratify the split.

    Returns
    -------
    list
        ``[train_results, test_results]``: two DataFrames that stack, model after
        model, the columns ``Train_y_true`` / ``Train_y_pred`` (respectively
        ``Test_y_true`` / ``Test_y_pred``) and ``Model``. Row labels are those of
        the split ``y`` and therefore repeat once per model.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.svm import SVC
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import RandomForestClassifier

    ### ----------------------------------------------------------------------------------------------
    ### Split the data once; every model sees exactly the same partition
    ### ----------------------------------------------------------------------------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, stratify=y, random_state=random_state
    )

    ### ----------------------------------------------------------------------------------------------
    ### Prepare models
    ### ----------------------------------------------------------------------------------------------
    # The tree-based learners are randomised; seed them so results are reproducible.
    models = [
        ('LR', LogisticRegression()),
        ('CART', DecisionTreeClassifier(random_state=random_state)),
        ('RF', RandomForestClassifier(random_state=random_state)),
        ('SVC', SVC()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
    ]

    ### ----------------------------------------------------------------------------------------------
    ### Fit each model in turn and record its predictions
    ### ----------------------------------------------------------------------------------------------
    train_frames = []
    test_frames = []

    for name, clf in models:
        clf.fit(X_train, y_train)

        y_train_pred = clf.predict(X_train)
        print(y_train_pred.shape)
        y_test_pred = clf.predict(X_test)
        print(y_test_pred.shape)

        res_train_df = pd.DataFrame({'Train_y_true': y_train, 'Train_y_pred': y_train_pred})
        res_test_df = pd.DataFrame({'Test_y_true': y_test, 'Test_y_pred': y_test_pred})
        res_train_df['Model'] = name
        res_test_df['Model'] = name

        train_frames.append(res_train_df)
        test_frames.append(res_test_df)

    # Concatenate once at the end instead of growing a DataFrame inside the loop.
    res_df_tr = pd.concat(train_frames, axis=0)
    res_df_test = pd.concat(test_frames, axis=0)

    return [res_df_tr, res_df_test]
### ------------------------------------------------------------------------------------------

In [6]:
# Fixed seed so the split is the same on every run.
random_state = 12345

# Unpack the [train, test] prediction frames returned by compare_models.
tr, te = compare_models(random_state=random_state, X=X, y=y)

(6300,)
(2700,)
(6300,)
(2700,)
(6300,)
(2700,)
(6300,)
(2700,)
(6300,)
(2700,)
(6300,)
(2700,)


In [7]:
# Save, for every model, the actual and predicted value of each training observation.
# The file name carries a timestamp so earlier exports are not overwritten.
out_tr_filename = ''.join((
    './../Output/Predicted_Values_train_data_',
    datetime.now().strftime('%Y_%m_%d_%H_%M_%S'),
    '.csv',
))
tr.to_csv(out_tr_filename, index=False)

In [8]:
# Save, for every model, the actual and predicted value of each test observation.
# The file name carries a timestamp so earlier exports are not overwritten.
out_test_filename = ''.join((
    './../Output/Predicted_Values_test_data_',
    datetime.now().strftime('%Y_%m_%d_%H_%M_%S'),
    '.csv',
))
te.to_csv(out_test_filename, index=False)