In [30]:
import  os

folder           =  r'D:\DrPKV\20220713'
os.chdir(folder)
filename         =  './Output/imputed_compact_dataset_ten_20220713084509AM.csv'

In [31]:
import time
from    datetime   import datetime
from    datetime   import timedelta

In [32]:
def esc(code):
    '''
    f-strings are string literals that have an f at the beginning and curly braces containing expressions
    that will be replaced with their values.
    
    Most common ANSI escape sequences take the following form:
    \033[{}m or 0

    We use the Code for non-printable escape character is \033
    m is the character code in either upper case or lower case character
    0 is one or more numbers separated with ;
    Explanation for the expression that will be replaced by their values
    
    Select Graphic Rendition (SGR) parameters sets display attributes arranged in the same sequence separated by semicolons
    to establish GRAPHIC RENDITION COMBINATION MODE (GRCM) .
    
     Code  Meaning
     ----  --------
     30    Black color
     31    Red color
     32    Green color

      1    Bold or increased intensity
      2    Decreased intensity
      4    Underline
      
    https://en.wikipedia.org/wiki/ANSI_escape_code  
    http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-048.pdf

    '''
    return f'\033[{code}m'
    
def  bold_text(text, color = 'R'):
      '''
      This function converts the given text into bold and underlined text in the given color
      '''
      if color == 'B':
         cd = '30;1;4'
      elif color == 'R':
         cd = '31;1;4'
      elif  color == 'G':
         cd =  '32;1;4'
         
      return esc(cd)+ text + esc(0)


In [33]:
"""
   Function Name: prepare_train_test_data

   Description: This **function** predicts the target variable for the CART, RF and XGB models and
                 split the data into 70:30 ratio for train and test data
   
   Input: 1) random seed number
          2) X, the set of independent variables
          3) y, the dependent variable

   Output: DataFrames for training and test data
   
"""
def prepare_train_test_data(random_state, X, y):  

    from    sklearn.model_selection       import   StratifiedKFold
    from    sklearn.tree                  import   DecisionTreeClassifier
    from    sklearn.ensemble              import   RandomForestClassifier
    from    sklearn.model_selection       import   StratifiedKFold
    from    xgboost                       import   XGBClassifier
    ### ----------------------------------------------------------------------------------------------    
    ### Prepare models
    ### ----------------------------------------------------------------------------------------------
    X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size = 0.30, stratify = y,  random_state = random_state)  

    print("\nNull values in X_train"); print(X_train.isnull().sum())
    print("\nNull values in y_train"); print(y_train.isnull().sum())
    print("\nNull values in X_test"); print(X_test.isnull().sum())
    print("\nNull values in y_test"); print(y_test.isnull().sum())
    
    result   =  pd.DataFrame()   

    models       =   []
    models.append(('CART', DecisionTreeClassifier().fit(X_train, y_train)))
    models.append(('RF', RandomForestClassifier().fit(X_train, y_train)))
    models.append(('XGBoost', XGBClassifier().fit(X_train, y_train)))
    
    ### ----------------------------------------------------------------------------------------------   
    ### Predict using each model in turn
    ### ----------------------------------------------------------------------------------------------   
      
    res_df_tr       =  pd.DataFrame() 
    res_df_test     =  pd.DataFrame() 
    
    for name, clf in models:
                
         y_train_pred                   =    clf.predict(X_train); print(y_train_pred.shape)
         y_test_pred                    =    clf.predict(X_test); print(y_test_pred.shape)

         res_train_df                   =    pd.DataFrame() 
         res_test_df                    =    pd.DataFrame() 
        
         ### ----------------------------------------------
         ### Training data
         ### ----------------------------------------------
         res_train_df                   =    X_train.copy()
         res_train_df['Train_y_pred']   =    y_train_pred
         res_train_df['Model']          =    name
         ### ----------------------------------------------
         ### Test data
         ### ----------------------------------------------
         res_test_df                   =    X_test.copy()
         res_test_df['Test_y_pred']    =    y_test_pred
         res_test_df['Model']          =    name
         ###
         ### Append each dataframe to res_df_tr
         ###
         res_df_tr                        =  pd.concat([res_df_tr, res_train_df], axis = 0)
         ###
         ### Append each dataframe to res_df_test
         ###
         res_df_test                      =  pd.concat([res_df_test, res_test_df], axis = 0)
    ###
    ### Return both datasets
    ###
    result                                       =  [res_df_tr, res_df_test]
    return result
### ------------------------------------------------------------------------------------------

In [34]:
import   pandas                   as      pd
import   numpy                    as      np
from     sklearn                  import  metrics
from     sklearn.metrics          import  classification_report
from     sklearn.model_selection  import  train_test_split
import   seaborn                  as      sns
import   matplotlib.pyplot        as      plt

In [35]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [36]:
def measures_cm(model, datatype, act, pred):
   
    c_matrix = pd.crosstab(act,pred, rownames=['Actual'], colnames=['Predicted'])

    print("\nModel performance of %s" %model)
    print("\nDataset used %s" %datatype)

    sns.heatmap(c_matrix, annot = True, fmt = '.6g')
    plt.show()
    target_names = [ 'Healthy', 'NPA']
    print(classification_report(act, pred, target_names = target_names))

In [37]:
def draw_roc( actual, probs ):

    import pandas            as   pd
    import numpy             as   np
    import seaborn           as   sns
    import matplotlib.pyplot as   plt
    import sklearn.metrics   as   metrics
    import seaborn           as   sns
    
    fpr, tpr, thresholds = metrics.roc_curve( actual, probs,
    drop_intermediate = False )
    auc_score = metrics.roc_auc_score( actual, probs )
    plt.figure(figsize=(6, 4))
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curve')
    plt.legend(loc="lower right")
    plt.show()

In [38]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

In [39]:
df             =    pd.read_csv(filename)
print('Data shape {}'.format(df.shape))

Data shape (9000, 13)


In [40]:
df.columns

Index(['Target', 'Cash_ratio', 'Changeinsales_Industry', 'debt_equity',
       'debt_income', 'Interest_coverage', 'Quick_ratio', 'ROE(new)',
       'ROS(new)', 'Sales_CE', 'Total shareholders' funds', 'Slno',
       'Shareholderquity_code'],
      dtype='object')

In [41]:
df.isnull().sum()

Target                          0
Cash_ratio                      0
Changeinsales_Industry          0
debt_equity                     0
debt_income                     0
Interest_coverage               0
Quick_ratio                     0
ROE(new)                        0
ROS(new)                        0
Sales_CE                        0
Total shareholders' funds       0
Slno                            0
Shareholderquity_code        3790
dtype: int64

In [42]:
# replacing na values in college with No college
df["Shareholderquity_code"].fillna(9999, inplace = True)

In [43]:
clean_dataset(df)

Unnamed: 0,Target,Cash_ratio,Changeinsales_Industry,debt_equity,debt_income,Interest_coverage,Quick_ratio,ROE(new),ROS(new),Sales_CE,Total shareholders' funds,Slno,Shareholderquity_code
0,0.00000,2.01764,0.11152,0.01399,0.03539,0.00000,1.55202,0.39534,5.74985,0.00000,1439044420.00000,0.00000,0.00000
1,0.00000,1.55843,0.11152,0.01663,0.04271,12559.25001,1.08407,0.38944,0.28722,1.00223,1012901796.00000,1.00000,0.00000
2,0.00000,0.85337,0.11152,0.01936,0.03716,10471.05499,0.38471,0.52113,0.28324,1.43067,754886314.00000,2.00000,0.00000
3,0.00000,0.96842,0.11152,0.02084,0.04207,15731.10741,0.37537,0.49546,0.29300,1.50804,770367465.00000,3.00000,0.00000
4,0.00000,1.21196,0.11152,0.03462,0.06253,23204.77952,0.61504,0.55367,0.30306,1.82694,790918031.00000,4.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,1.00000,0.00040,0.22944,-14.89225,-108.14208,-0.09498,0.00040,0.13771,-18.71199,-0.15401,-19210382.00000,8995.00000,3.00000
8996,1.00000,0.00025,0.22944,-17.24193,-95.97450,-70.95195,0.00025,0.17965,-0.46940,-3.70561,-16539466.00000,8996.00000,3.00000
8997,1.00000,0.00005,0.22944,-20.85594,-84.20796,-14.46334,0.00005,0.24767,-362.05500,-0.00074,-13511431.00000,8997.00000,3.00000
8998,1.00000,0.00012,0.22944,-28.39428,-10.48174,10698.04671,0.00012,2.70893,-49.24356,-0.00622,-9890881.00000,8998.00000,3.00000


In [44]:
y                    =    df['Target']
X                    =    df.drop(['Target'], axis = 1)
random_state         =    12345

In [45]:
X.isnull().sum()

Cash_ratio                   0
Changeinsales_Industry       0
debt_equity                  0
debt_income                  0
Interest_coverage            0
Quick_ratio                  0
ROE(new)                     0
ROS(new)                     0
Sales_CE                     0
Total shareholders' funds    0
Slno                         0
Shareholderquity_code        0
dtype: int64

In [46]:
y.isnull().sum()

0

In [47]:
random_state         =  12345
df_train, df_test    =   prepare_train_test_data(random_state, X, y)


Null values in X_train
Cash_ratio                   0
Changeinsales_Industry       0
debt_equity                  0
debt_income                  0
Interest_coverage            0
Quick_ratio                  0
ROE(new)                     0
ROS(new)                     0
Sales_CE                     0
Total shareholders' funds    0
Slno                         0
Shareholderquity_code        0
dtype: int64

Null values in y_train
0

Null values in X_test
Cash_ratio                   0
Changeinsales_Industry       0
debt_equity                  0
debt_income                  0
Interest_coverage            0
Quick_ratio                  0
ROE(new)                     0
ROS(new)                     0
Sales_CE                     0
Total shareholders' funds    0
Slno                         0
Shareholderquity_code        0
dtype: int64

Null values in y_test
0
(6300,)
(2700,)
(6300,)
(2700,)
(6300,)
(2700,)


In [48]:
train_slno                             =    df_train['Slno'].tolist()
df_train['Target']                     =    df.loc[df.Slno.isin(train_slno), 'Target']

In [49]:
df_train.head().T

Unnamed: 0,3654,3171,4669,8936,447
Cash_ratio,0.50100,0.01410,0.05606,0.05984,0.04183
Changeinsales_Industry,13.35479,1.23813,0.23064,0.10694,0.11152
debt_equity,4.74254,1.14272,1.88845,4.07424,0.14959
debt_income,3.27540,-6.21445,6.96032,-25.71285,0.75675
Interest_coverage,1.26915,0.56057,0.00000,10.01845,3.72220
Quick_ratio,0.50100,-0.71591,0.05606,-0.07351,-0.51106
ROE(new),1.44793,-0.18388,0.27132,-0.15845,0.19768
ROS(new),0.10267,-0.00668,5.06671,0.09141,0.07968
Sales_CE,3.69942,73.35916,0.00000,0.05292,0.69746
Total shareholders' funds,552096914.00000,24271212.00000,15267330000.00000,4223691760.00000,8362257000.00000


In [50]:
tr                                      =    df_train.copy()
new_training_df                         =    tr.copy()

In [51]:
tr.columns

Index(['Cash_ratio', 'Changeinsales_Industry', 'debt_equity', 'debt_income',
       'Interest_coverage', 'Quick_ratio', 'ROE(new)', 'ROS(new)', 'Sales_CE',
       'Total shareholders' funds', 'Slno', 'Shareholderquity_code',
       'Train_y_pred', 'Model', 'Target'],
      dtype='object')

In [52]:
test_slno                              =    df_test['Slno'].tolist()
df_test['Target']                      =    df.loc[df.Slno.isin(test_slno), 'Target']

In [53]:
te                                      =    df_test.copy()
new_test_df                             =    te.copy()

In [54]:
reqd_cols                               =    ['Cash_ratio', 'Changeinsales_Industry', 'debt_equity',\
                                              'debt_income','Interest_coverage', 'Quick_ratio', 'ROE(new)',\
                                              'ROS(new)', 'Sales_CE',"Total shareholders' funds", 'Slno',\
                                              'Shareholderquity_code','Train_y_pred', 'Model','Target']
### 1) MODEL CART
CART_new_training_df                    =    tr.loc[tr.Model == 'CART', reqd_cols ]
new_training_df['CART_Train_y_pred']    =    CART_new_training_df['Train_y_pred'] 

### 2) MODEL RF
RF_new_training_df                      =    tr.loc[tr.Model == 'RF', reqd_cols ]
new_training_df['RF_Train_y_pred']      =    RF_new_training_df['Train_y_pred'] 

### 3) MODEL XGBoost
XGB_new_training_df                     =    tr.loc[tr.Model == 'XGBoost', reqd_cols ]
new_training_df['XGBoost_Train_y_pred'] =    XGB_new_training_df['Train_y_pred'] 

new_training_df.tail(100).T

Unnamed: 0,4147,4120,3985,7126,5817,7912,6606,4877,6329,2644,...,8746,4300,2561,603,7325,6222,3508,126,8073,4032
Cash_ratio,0.07825,0.02639,0.00896,0.78246,0.06662,0.03302,0.03524,0.17487,0.02944,5.41532,...,0.23607,0.00854,7.71750,1.14116,0.83291,0.00228,0.37036,2.07661,0.06835,0.79790
Changeinsales_Industry,0.32090,0.32090,5.31897,0.49242,0.56960,14.90132,0.52980,2324.79471,0.26580,13.35479,...,14.90132,0.00000,13.35479,0.31412,0.10694,14.98953,2.08974,0.19960,0.11152,0.49242
debt_equity,0.46642,0.26481,3.47005,0.00000,0.30151,0.58114,0.00703,0.22520,2.25334,0.00208,...,0.00000,-0.00200,0.00000,0.02886,0.61332,0.02269,0.01923,5.22050,1.23392,0.00618
debt_income,1.46290,10.41457,3.00870,0.00000,1.30391,1.32435,0.06634,1.58213,3.91925,-0.05038,...,0.00000,-0.01578,0.00000,-0.54627,1.44158,0.03910,0.06209,0.00000,3.52090,-0.00678
Interest_coverage,0.78023,1.60082,1.79678,-216.10312,146.35945,1.83337,39.61235,1.09537,1.23460,31.49847,...,11.12485,39501.48694,0.64945,5041.16400,10.20654,8.91892,33680.50999,1.49644,1.59811,-192.32434
Quick_ratio,-0.00251,-0.35797,-0.44556,0.78246,-0.01592,-0.49992,-0.25743,0.07626,0.02944,5.37655,...,-1.61265,0.00854,7.71750,1.14116,0.20323,-0.82163,0.37036,1.46709,-0.15638,0.79790
ROE(new),0.31883,0.02543,1.15334,0.57612,0.23124,0.43881,0.10592,0.14234,0.57494,-0.04120,...,0.14211,0.12697,-0.00030,-0.05283,0.42545,0.58019,0.30974,0.49307,0.35046,-0.91158
ROS(new),-0.06597,-0.00500,0.04184,-1.06934,0.10384,-31.55770,0.13381,-0.00040,-0.21476,0.15777,...,0.02067,0.80809,-0.20051,-0.01209,0.10154,0.04213,0.58176,0.00447,0.00329,-102.22494
Sales_CE,2.69230,4.56710,8.34606,0.96734,1.29531,0.00000,0.76897,8.01434,0.40331,1.58830,...,0.02559,0.83179,21.26907,0.01470,1.16831,1.87960,0.74207,1635.26781,6.13892,0.00235
Total shareholders' funds,7194519000.00000,489561000.00000,395159877.00000,17185550.00000,2112660000.00000,203080000.00000,6011092637.00000,911207665.00000,2928958769.00000,1454645793.00000,...,216502901.00000,4368154000.00000,31022935.00000,6340740000.00000,3521270000.00000,700800000.00000,4238965992.00000,0.00000,1967701000.00000,113081740.00000


In [55]:
te.columns

Index(['Cash_ratio', 'Changeinsales_Industry', 'debt_equity', 'debt_income',
       'Interest_coverage', 'Quick_ratio', 'ROE(new)', 'ROS(new)', 'Sales_CE',
       'Total shareholders' funds', 'Slno', 'Shareholderquity_code',
       'Test_y_pred', 'Model', 'Target'],
      dtype='object')

In [56]:
reqd_cols                               =    ['Cash_ratio', 'Changeinsales_Industry', 'debt_equity', 'debt_income',\
                                              'Interest_coverage', 'Quick_ratio', 'ROE(new)', 'ROS(new)', 'Sales_CE',\
                                              "Total shareholders' funds", 'Slno', 'Shareholderquity_code',\
                                              'Test_y_pred', 'Model', 'Target']

### 1) MODEL CART
CART_new_test_df                        =    te.loc[te.Model == 'CART', reqd_cols ]
new_test_df['CART_Test_y_pred']         =    CART_new_test_df['Test_y_pred'] 

### 2) MODEL RF
RF_new_test_df                          =    te.loc[te.Model == 'RF', reqd_cols ]
new_test_df['RF_Test_y_pred']           =    RF_new_test_df['Test_y_pred'] 

### 3) MODEL XGBoost
XGB_new_test_df                         =    te.loc[te.Model == 'XGBoost', reqd_cols ]
new_test_df['XGBoost_Test_y_pred']      =    XGB_new_test_df['Test_y_pred'] 

new_test_df.tail(200).T

Unnamed: 0,7536,609,7605,5189,2194,1140,6082,2805,7099,8404,...,6714,1468,3321,2743,8654,4045,3993,5728,8057,4442
Cash_ratio,0.30308,2.04569,0.19391,3.27344,0.04849,0.07454,0.05531,2.07656,0.99256,0.14113,...,9.13922,0.00022,0.64199,0.03888,0.06038,2.07658,0.20200,0.47510,1.84988,0.03037
Changeinsales_Industry,0.48051,0.31412,2.08974,0.46981,0.10491,0.48051,0.49242,0.48051,0.49242,14.90132,...,0.49242,5.31897,0.49242,14.90132,0.58867,0.32090,0.64317,2.08974,0.22160,-0.11134
debt_equity,0.83023,0.00044,0.02570,0.00000,0.90702,1.47363,0.07519,5.23955,0.00000,3.54157,...,0.08131,-1.17653,0.10031,0.36573,4.93613,5.22445,1.07546,0.85162,-15.38742,0.00132
debt_income,1.42565,-0.07347,-0.73108,0.00000,8.41334,5.55325,0.21254,0.00000,0.00000,-34.81901,...,0.45102,-0.93552,2.78412,-20.46538,28.42140,0.00000,-7.22208,0.89918,0.00000,-0.01100
Interest_coverage,26.31699,-18773.91479,8092.61220,43.43547,-3.21717,4.22834,0.00000,10.37257,-2057.04470,5.19927,...,7017.81484,-3.76898,-2.39120,2.44855,1.73229,4.93758,6.34368,0.06004,2.55169,-13.80066
Quick_ratio,-1.76719,2.04569,0.19391,3.27344,0.03623,-0.19828,0.05252,1.46706,0.99256,-0.00284,...,9.13922,-0.21590,0.63967,-0.15108,-0.14074,1.46708,-0.23031,0.18275,1.26504,-2.09235
ROE(new),0.58235,-0.00594,-0.03515,0.22492,0.10781,0.26536,0.35379,0.49308,1.10299,-0.10171,...,0.18029,1.25763,0.03603,-0.01787,0.17368,0.49308,-0.14891,0.94711,0.38658,-0.11995
ROS(new),0.12661,-0.01895,0.25531,0.15409,-0.02034,0.05903,-1.73930,0.02745,0.02076,0.04793,...,0.12424,-0.31910,-23.61129,0.00257,-0.03154,0.05870,0.03462,-0.20866,0.02017,-1.62666
Sales_CE,7.68220,109.58355,5.27274,1.23961,0.09171,0.65215,0.00000,1635.27213,6.01992,3.88633,...,0.84922,-3.97498,0.00000,6.82451,2.74858,1635.26866,9.82371,11.31321,1541.42667,0.09189
Total shareholders' funds,109730951.00000,189714497.00000,277321478.00000,29461027.00000,601638000.00000,9715252885.00000,529663102.00000,0.00000,579072.00000,6673593.00000,...,2485731119.00000,-111365375.00000,498432.00000,625451000.00000,82940318.00000,0.00000,10273685.00000,3818926332.00000,0.00000,2872183000.00000


In [57]:
# Write the output of Predicted values and actual values for each observation in training set for each model

out_tr_filename      =  r'D:\DrPKV\20220713\Output\df_train_predicted_' +  str(datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.csv'
new_training_df.to_csv(out_tr_filename, index = False)

In [58]:
file_out_te   =  r'D:\DrPKV\20220713\Output\df_test_predicted_'+ str(datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.csv'
new_test_df.to_csv(file_out_te, index = False)