# Importing all the required libraries

In [220]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings
from sklearn.impute import KNNImputer #for imputing the outlier values
filterwarnings('ignore')
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from sklearn.preprocessing import RobustScaler # for scaling 

In [241]:
#reading the raw data set into df called 'data'
data = pd.read_csv('data.csv')
#data.head(1).transpose()

In [242]:
#describing the data
#data.describe()

In [243]:
# we can use min max scaler to scale this, as most of the columns are ratios which values will lie in between 0 and 1
#col_needs_scaling

In [244]:
#making the copy of original data
data_copy = data.copy()
data_copy1 = data_copy.copy()

# Treatment of Outliers

In [245]:
# Custom function for calculating the number of outliers for each column 
#which are outside of standard IQR range (between 25th & 75th percentie)
def cal_num_outliers(data,col,q1,q3):
    Q1 = data[col].quantile(q1)
    Q3 = data[col].quantile(q3)
    IQR = Q3 - Q1
    data_outliers = data[(data[col] < Q1 - 1.5*IQR) | (data[col] > Q3 + 1.5*IQR)]
    return data_outliers.shape[0]
    

In [246]:
#Custom function for calculating the number of outliers for each column which are outside of q1 & q3 range
def cal_num_outliers1(data,col,q1,q3):
    Q_low = data[col].quantile(q1)
    Q_upper = data[col].quantile(q3)
    #IQR = Q3 - Q1
    data_outliers = data[(data[col] < Q_low) | (data[col] > Q_upper)][col]
    return data_outliers.shape[0]
    

In [247]:
#defining three empty lists(for calculating total number of outliers in each column for three Quartile ranges) 
#and appending the count of outliers retrieved from custom functions
lst_25_75 = []
lst_05_95 = []
lst_01_99 = []
for i in data.columns:
    lst_25_75.append(cal_num_outliers(data,i,0.25,0.75))
    lst_05_95.append(cal_num_outliers1(data,i,0.05,0.95))
    lst_01_99.append(cal_num_outliers1(data,i,0.01,0.99))

In [248]:
#defining a comparision data frame and placing the the total num of outlier values for each 'Feature'
df_comp = pd.DataFrame()
df_comp['No of Outliers for 0.25 & 0.75'] = lst_25_75
df_comp['No of Outliers for 0.05 & 0.95'] = lst_05_95
df_comp['No of Outliers for 0.01 & 0.99'] = lst_01_99
df_comp['Features'] = data.columns
df_comp.set_index('Features',inplace= True)

In [249]:
#displaying the comparision data frame
df_comp

Unnamed: 0_level_0,No of Outliers for 0.25 & 0.75,No of Outliers for 0.05 & 0.95,No of Outliers for 0.01 & 0.99
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bankrupt?,220,220,0
ROA(C) before interest and depreciation before interest,391,682,138
ROA(A) before interest and % after tax,561,680,138
ROA(B) before interest and depreciation after tax,432,682,138
Operating Gross Margin,320,682,138
Realized Sales Gross Margin,318,682,138
Operating Profit Rate,716,680,138
Pre-tax net Interest Rate,773,682,138
After-tax net Interest Rate,867,681,138
Non-industry income and expenditure/revenue,1094,682,138


In [250]:
#to detect the outliers, we will get the index of all the outlier values and then remove these index from df
#finding all the outlier indexes in data and storing them in set to avoid duplicates(as same index can have multiple outliers)
#Method1
idx_set = set()
def remove_outliers(data,col,q1,q3):
    Q_low = data[col].quantile(q1)
    Q_upper = data[col].quantile(q3)
    #IQR = Q3 - Q1
    data_outliers = data[(data[col] < Q_low) | (data[col] > Q_upper)][col]
    idx = data_outliers.index
    idx_set.update(idx)
for i in data.columns:
    remove_outliers(data,i,0.01,0.99)
print('Total number of index values to be removed:',len(idx_set))
print('Total no of rows in the data after removing the outliers:',data.shape[0] - len(idx_set))

Total number of index values to be removed: 3227
Total no of rows in the data after removing the outliers: 3592


In [251]:
#Method 2:
#Replacing outliers with NaN values
for col in data_copy1.columns:
    Q_low = data_copy1[col].quantile(0.01)
    Q_upper = data_copy1[col].quantile(0.99)
    data_outliers = data_copy1[(data_copy1[col] < Q_low) | (data_copy1[col] > Q_upper)][col]
    for idx in data_outliers.index:
        data_copy1.loc[idx,col] = np.NaN

In [252]:
#dropping the NaN values results in 3592 rows (which are left after removing the outliers)
print('Total no of rows in the data after removing the outliers using Method2:',data_copy1.dropna().shape[0])

Total no of rows in the data after removing the outliers using Method2: 3592


In [253]:
#Method 3:
#print(data.quantile(0.01))
#print(data.quantile(0.99))
#print(data.shape[0])
print('Total no of rows in the data after removing the outliers using Method3:',
      data[~((data < data.quantile(0.01)) | (data > data.quantile(0.99)))].dropna().shape[0])

Total no of rows in the data after removing the outliers using Method3: 3592


In [254]:
#removing the outliers
#while removing the outliers missing values get introduced
#so these missing values are actually the outlier values

data_without_outliers = data[~((data < data.quantile(0.01)) | (data > data.quantile(0.99)))]

In [255]:
#we are considering to drop the row(index), if it has 11 outier values (here it is missing values) 
#we can use thresh parameter in drop na
#thresh states, min number of values in the row without missing values to be considered not to drop the row

data_thresh_10 = data_without_outliers.dropna(thresh = data_without_outliers.shape[1] - 10)
data_thresh_10.shape

(6525, 96)

In [256]:
imputer = KNNImputer()
x = imputer.fit_transform(data_thresh_10)
x.shape

(6525, 96)

In [257]:
data_cleaned = pd.DataFrame(data = x,columns=data_thresh_10.columns)

In [258]:
data_cleaned.shape

(6525, 96)

In [259]:
data_cleaned.isna().sum()

Bankrupt?                                                   0
 ROA(C) before interest and depreciation before interest    0
 ROA(A) before interest and % after tax                     0
 ROA(B) before interest and depreciation after tax          0
 Operating Gross Margin                                     0
 Realized Sales Gross Margin                                0
 Operating Profit Rate                                      0
 Pre-tax net Interest Rate                                  0
 After-tax net Interest Rate                                0
 Non-industry income and expenditure/revenue                0
 Continuous interest rate (after tax)                       0
 Operating Expense Rate                                     0
 Research and development expense rate                      0
 Cash flow rate                                             0
 Interest-bearing debt interest rate                        0
 Tax rate (A)                                               0
 Net Val

In [260]:
#taking a different data (small data set ) from seaborn
#df_small1 = sns.load_dataset('tips')
#df_small1.head()
#df_small1.shape[0]
#df_small = df_small1.select_dtypes(np.number)
#considering all the values which are greater than 1% and less than 99%
#df_small[~((df_small < df_small.quantile(0.01)) | (df_small > df_small.quantile(0.99)))].isna().sum()
#we have considered the range of values in between 1 and 99 percentile

#python compares all the values with this range, if the condition is satisfied it will pick the value else it will consider
#as missing value

#so this is why missing values are introduced in the data set

#lets drop NAN values and see the shape
#we have 5 outliers in total_bill and 3 outliers in tip 
#so in actual 8 values needs to be dropped

#df_small[~((df_small < df_small.quantile(0.01)) | (df_small > df_small.quantile(0.99)))].dropna().shape[0]

#calculating the difference between len of original data and len of data after outliers are dropped
#df_small1.shape[0] - df_small[~((df_small < df_small.quantile(0.01)) | (df_small > df_small.quantile(0.99)))].dropna().shape[0]

#so we had 8 outliers, so it should be 244 - 8 which is 236 (theory)
#but there is chance of outliers present in the same index(row) for the different columns
#which is why the actual len of data set with outliers removed is 238

#Instead of dropping these Outliers we can impute them using mean or trimmed mean

#data_fill_na = data_copy1.copy()
#for col in data_copy1.columns:
    #data_fill_na[col] = data_copy1[col].fillna(data_copy1[col].mean())

# Row wise outliers checking

In [261]:
#data_rows = data.transpose()

In [262]:
#lst3 = []
#lst4 = []
#lst5 = []
#for i in data_rows.columns:
    #lst.append(cal_num_outliers(data_rows,i,0.25,0.75))
    #lst1.append(cal_num_outliers(data_rows,i,0.05,0.95))
    #lst2.append(cal_num_outliers(data_rows,i,0.01,0.99))

In [263]:
#df_comp_row = pd.DataFrame()
#df_comp_row['No of Outliers for 0.25 & 0.75'] = lst3
#df_comp_row['No of Outliers for 0.05 & 0.95'] = lst4
#df_comp_row['No of Outliers for 0.01 & 0.99'] = lst5

In [264]:
#df_comp_row

In [265]:
#data.info()

In [266]:
#((data_rows < Q1 - 1.5*IQR) | (data_rows > Q3 + 1.5*IQR)).sum()

# Checking for Derived Variables

# Checking Correlation

In [267]:
#checking for co-relation
df_corr = data_cleaned.corr()
df_corr

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),Operating Expense Rate,Research and development expense rate,Cash flow rate,Interest-bearing debt interest rate,Tax rate (A),Net Value Per Share (B),Net Value Per Share (A),Net Value Per Share (C),Persistent EPS in the Last Four Seasons,Cash Flow Per Share,Revenue Per Share (Yuan ¥),Operating Profit Per Share (Yuan ¥),Per Share Net profit before tax (Yuan ¥),Realized Sales Gross Profit Growth Rate,Operating Profit Growth Rate,After-tax Net Profit Growth Rate,Regular Net Profit Growth Rate,Continuous Net Profit Growth Rate,Total Asset Growth Rate,Net Value Growth Rate,Total Asset Return Growth Rate Ratio,Cash Reinvestment %,Current Ratio,Quick Ratio,Interest Expense Ratio,Total debt/Total net worth,Debt ratio %,Net worth/Assets,Long-term fund suitability ratio (A),Borrowing dependency,Contingent liabilities/Net worth,Operating profit/Paid-in capital,Net profit before tax/Paid-in capital,Inventory and accounts receivable/Net value,Total Asset Turnover,Accounts Receivable Turnover,Average Collection Days,Inventory Turnover Rate (times),Fixed Assets Turnover Frequency,Net Worth Turnover Rate (times),Revenue per person,Operating profit per person,Allocation rate per person,Working Capital to Total Assets,Quick Assets/Total Assets,Current Assets/Total Assets,Cash/Total Assets,Quick Assets/Current Liability,Cash/Current Liability,Current Liability to Assets,Operating Funds to Liability,Inventory/Working Capital,Inventory/Current Liability,Current Liabilities/Liability,Working Capital/Equity,Current Liabilities/Equity,Long-term Liability to Current Assets,Retained Earnings to Total Assets,Total income/Total expense,Total expense/Assets,Current Asset Turnover Rate,Quick Asset Turnover Rate,Working capitcal Turnover Rate,Cash Turnover Rate,Cash Flow to Sales,Fixed Assets to Assets,Current Liability to Liability,Current Liability to Equity,Equity to Long-term Liability,Cash Flow to Total Assets,Cash Flow to Liability,CFO to Assets,Cash Flow to Equity,Current Liability to Current Assets,Liability-Assets Flag,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
Bankrupt?,1.0,-0.191894,-0.198699,-0.19996,-0.110607,-0.110101,-0.130956,-0.201959,-0.206882,-0.177211,-0.224278,-0.001178,-0.017912,-0.102205,-0.013611,-0.103269,-0.14497,-0.148789,-0.147658,-0.19417,-0.095704,-0.053211,-0.133837,-0.188473,-0.029441,-0.055531,-0.109973,-0.09967,-0.092974,-0.030133,-0.116448,-0.067768,-0.073654,-0.111728,-0.116272,-0.061269,0.259166,0.216862,-0.216862,-0.031194,0.286711,0.024351,-0.133089,-0.18688,0.093571,-0.073298,-0.018873,0.039495,-0.001739,0.086309,0.00894,0.012475,-0.103896,0.118814,-0.156043,-0.073977,-0.045282,-0.086894,-0.108815,-0.067505,0.164549,-0.097906,-0.009945,-0.008811,-0.028741,-0.150304,0.222638,-0.009534,-0.209001,-0.167911,0.053873,0.010076,0.034275,-0.089764,-0.019765,-0.056381,0.066988,-0.028741,0.222638,0.14377,-0.056071,-0.038188,-0.108484,-0.068241,0.211545,,-0.223674,0.048055,-0.072201,-0.110606,-0.288289,0.253555,0.017764,-0.068787,,-0.121866
ROA(C) before interest and depreciation before interest,-0.191894,1.0,0.905785,0.963984,0.440476,0.438122,0.694309,0.725664,0.704194,0.373224,0.7264,0.048136,0.105785,0.430008,0.029166,0.316477,0.503003,0.504581,0.50427,0.825221,0.455531,0.227277,0.753031,0.808776,0.173895,0.278346,0.346772,0.345174,0.351351,0.016902,0.430133,0.466289,0.311,0.164266,0.207429,0.015578,-0.24935,-0.239178,0.239178,0.031605,-0.29196,-0.104461,0.752125,0.809912,-0.114989,0.233968,0.016505,-0.121082,-0.081827,-0.073029,0.086196,0.008782,0.481953,-0.094686,0.263149,0.210315,0.117033,0.253369,0.192735,0.156924,-0.18386,0.441491,-0.074525,-0.010905,0.05445,0.21397,-0.219483,-0.012323,0.707124,0.684824,-0.169041,0.005619,-0.03716,0.033451,-0.040831,0.191633,0.000238,0.05445,-0.219483,-0.101986,0.234247,0.201883,0.465663,0.223629,-0.263196,,0.894106,-0.007028,0.101532,0.440476,0.833148,-0.244337,-0.045315,0.051441,,0.12829
ROA(A) before interest and % after tax,-0.198699,0.905785,1.0,0.930685,0.427045,0.424893,0.703042,0.784427,0.774164,0.475681,0.763239,0.052967,0.074613,0.366468,0.041598,0.296252,0.544431,0.5447,0.544631,0.848741,0.403418,0.268401,0.749031,0.844839,0.169173,0.284156,0.3855,0.384073,0.359583,0.0247,0.448901,0.500369,0.265541,0.180296,0.20735,0.050131,-0.243759,-0.234297,0.234297,0.084403,-0.304769,-0.06518,0.747719,0.843866,-0.050347,0.265169,0.030472,-0.119726,-0.07059,-0.169387,0.11746,0.080585,0.512624,-0.125823,0.318337,0.245115,0.193145,0.243633,0.193281,0.140977,-0.150403,0.388438,-0.046283,0.001593,0.105646,0.283802,-0.196267,-0.006054,0.778932,0.727675,-0.191156,0.003254,-0.033149,0.08053,-0.038656,0.197756,-0.149534,0.105646,-0.196267,-0.155132,0.237911,0.205099,0.401035,0.232001,-0.301513,,0.972035,-0.013173,0.096054,0.427046,0.903653,-0.238361,-0.020889,0.082519,,0.127116
ROA(B) before interest and depreciation after tax,-0.19996,0.963984,0.930685,1.0,0.454309,0.452233,0.698989,0.736659,0.724028,0.381366,0.747445,0.04569,0.10262,0.434102,0.034625,0.255643,0.507611,0.507868,0.50759,0.833402,0.45472,0.214018,0.741746,0.802399,0.18466,0.279743,0.353441,0.351603,0.35763,0.022324,0.429913,0.479618,0.310621,0.172058,0.210215,0.028285,-0.257489,-0.246387,0.246387,0.030242,-0.288403,-0.107834,0.740719,0.802785,-0.116872,0.220057,0.022394,-0.126816,-0.07008,-0.072705,0.0714,0.003364,0.479602,-0.090814,0.271328,0.200997,0.117731,0.250955,0.195595,0.158591,-0.195537,0.44151,-0.065266,-0.013223,0.046319,0.221461,-0.230052,-0.012945,0.722423,0.692336,-0.17005,-0.001125,-0.04061,0.048178,-0.044121,0.195221,-0.00157,0.046319,-0.230052,-0.093918,0.235582,0.204456,0.46609,0.22524,-0.271127,,0.907269,0.001994,0.095918,0.454309,0.844618,-0.252753,-0.034138,0.063753,,0.132887
Operating Gross Margin,-0.110607,0.440476,0.427045,0.454309,1.0,0.997467,0.469439,0.402861,0.386902,0.062415,0.399554,-0.346611,-0.02336,0.379498,0.025087,0.092031,0.165968,0.163553,0.163602,0.355289,0.273557,-0.213139,0.392247,0.337468,0.11242,0.1496,0.143557,0.146517,0.154555,0.010295,0.192621,0.216207,0.215618,0.352746,0.366644,-0.015801,-0.299496,-0.355493,0.355493,0.038257,-0.282516,-0.203865,0.391712,0.34013,-0.23318,-0.182502,0.028043,0.005814,0.103563,0.00106,-0.26919,-0.252803,0.255878,-0.115327,0.319331,0.150382,0.102376,0.303232,0.363666,0.328558,-0.312926,0.388617,-0.082918,0.00729,0.072075,0.213851,-0.283879,0.009819,0.31081,0.256216,0.38003,-0.202122,-0.211651,0.346589,-0.098566,0.132017,-0.009565,0.072075,-0.283879,-0.150389,0.140772,0.158417,0.320396,0.11209,-0.304622,,0.417164,-0.124209,0.030628,1.0,0.363303,-0.300294,-0.067645,0.003133,,0.327428
Realized Sales Gross Margin,-0.110101,0.438122,0.424893,0.452233,0.997467,1.0,0.472525,0.403065,0.38456,0.058865,0.397186,-0.346944,-0.023943,0.381655,0.024515,0.092117,0.163239,0.160831,0.160881,0.352472,0.274203,-0.214008,0.391546,0.334462,0.112878,0.149418,0.141789,0.144814,0.152972,0.010347,0.191603,0.214462,0.217158,0.35299,0.367193,-0.016331,-0.299291,-0.355081,0.355081,0.039061,-0.281639,-0.205558,0.391005,0.337231,-0.233719,-0.182989,0.027967,0.005063,0.102793,0.001465,-0.269659,-0.252726,0.256091,-0.114604,0.319714,0.151275,0.102527,0.30524,0.36417,0.330307,-0.312666,0.391271,-0.082547,0.007292,0.072615,0.213866,-0.283754,0.009671,0.308988,0.253877,0.381794,-0.202268,-0.21172,0.346606,-0.098359,0.133201,-0.007788,0.072615,-0.283754,-0.149581,0.141184,0.159141,0.321766,0.112681,-0.30445,,0.41538,-0.123825,0.030251,0.997467,0.361335,-0.300088,-0.067313,0.002273,,0.326984
Operating Profit Rate,-0.130956,0.694309,0.703042,0.698989,0.469439,0.472525,1.0,0.78909,0.774287,0.177533,0.805154,0.039895,0.096504,0.406163,0.010344,0.279534,0.389174,0.392385,0.392419,0.606324,0.372842,0.118183,0.641782,0.592244,0.117562,0.261566,0.258796,0.25253,0.26303,0.068992,0.243911,0.283051,0.266452,0.141021,0.163134,0.084216,-0.145568,-0.152915,0.152915,0.00015,-0.16469,-0.066437,0.640795,0.591687,-0.011757,0.115407,0.042129,-0.0814,-0.02414,-0.083114,0.021008,0.050427,0.512969,-0.028716,0.231382,0.145614,0.141518,0.157006,0.14893,0.11233,-0.105022,0.404806,0.003901,0.000704,0.057702,0.211269,-0.117677,-0.014598,0.613183,0.559279,-0.148176,-0.043945,-0.027628,0.062215,-0.003873,0.135881,-0.028707,0.057702,-0.117677,-0.0823,0.152781,0.12916,0.401245,0.141082,-0.239194,,0.709813,-0.007879,0.035904,0.469441,0.657793,-0.141333,-0.000592,0.090887,,0.089622
Pre-tax net Interest Rate,-0.201959,0.725664,0.784427,0.736659,0.402861,0.403065,0.78909,1.0,0.977125,0.595039,0.940239,0.049258,0.085525,0.367225,0.036777,0.302996,0.456739,0.460641,0.460331,0.66435,0.337364,0.123631,0.552586,0.667105,0.109685,0.235577,0.352249,0.347152,0.324015,0.070354,0.291905,0.356547,0.217718,0.165037,0.199612,0.100857,-0.209301,-0.216245,0.216245,0.044752,-0.244215,-0.030465,0.551717,0.66536,-0.064616,0.110193,0.0237,-0.069194,-0.044899,-0.132359,0.009333,0.040961,0.440543,-0.072039,0.224442,0.14527,0.098129,0.161333,0.182266,0.139649,-0.159736,0.359378,-0.016898,0.001679,0.046752,0.197499,-0.172812,-0.011216,0.690646,0.67192,-0.255894,-0.027008,-0.032148,0.08602,-0.010692,0.157253,-0.096953,0.046752,-0.172812,-0.12939,0.158515,0.137033,0.353617,0.156911,-0.262211,,0.798042,0.009019,0.068582,0.402862,0.76049,-0.20648,0.012082,0.118566,,0.141999
After-tax net Interest Rate,-0.206882,0.704194,0.774164,0.724028,0.386902,0.38456,0.774287,0.977125,1.0,0.586408,0.956438,0.050336,0.081485,0.356003,0.036468,0.253364,0.445904,0.44999,0.449689,0.647184,0.323415,0.125642,0.530048,0.641529,0.115073,0.239043,0.356886,0.349536,0.324651,0.069306,0.294465,0.357748,0.211584,0.158241,0.190692,0.11591,-0.199868,-0.205604,0.205604,0.047411,-0.232923,-0.029184,0.528957,0.641302,-0.053368,0.115001,0.020643,-0.071581,-0.040364,-0.128628,0.017301,0.045031,0.431277,-0.067744,0.223544,0.143928,0.102924,0.156444,0.173937,0.131184,-0.151146,0.347982,-0.007848,0.000807,0.046933,0.199329,-0.164805,-0.008975,0.685008,0.658238,-0.257464,-0.023898,-0.028272,0.084601,-0.006839,0.155165,-0.096647,0.046933,-0.164805,-0.119776,0.157688,0.13753,0.34284,0.15514,-0.257426,,0.79166,0.018329,0.064575,0.386903,0.760682,-0.197122,0.024008,0.129306,,0.134464
Non-industry income and expenditure/revenue,-0.177211,0.373224,0.475681,0.381366,0.062415,0.058865,0.177533,0.595039,0.586408,1.0,0.518876,0.032769,0.024821,0.100808,0.064769,0.17673,0.31929,0.322699,0.322721,0.389851,0.094029,0.0654,0.110928,0.419502,0.016949,0.067357,0.268496,0.266946,0.217503,0.046678,0.192303,0.250273,0.026279,0.07785,0.115655,0.072375,-0.201723,-0.198954,0.198954,0.076277,-0.244502,0.053586,0.110169,0.413341,-0.11621,0.033329,-0.017653,0.002564,-0.061948,-0.128487,-0.019447,0.012033,0.094788,-0.09509,0.082686,0.041092,-0.03994,0.06517,0.104652,0.063882,-0.170959,0.090339,-0.044434,-0.004178,-0.022267,0.067491,-0.182089,-0.001282,0.426845,0.4886,-0.272316,0.011337,-0.02199,0.067204,-0.012904,0.075088,-0.160859,-0.022267,-0.182089,-0.128382,0.072491,0.046639,0.083209,0.074024,-0.150178,,0.499916,0.038518,0.082357,0.062415,0.480718,-0.202023,0.016105,0.099085,,0.124325


In [268]:
data_cleaned['Bankrupt?'].corr(data_cleaned[' ROA(C) before interest and depreciation before interest'])

-0.1918940864675842

In [269]:
#finding the names of highly correlated columns using custom function correlation

col_set = set()#filtered columns
def correlation(data_set,threshold):
    corr_matrix = data_set.corr()
    for i in range(len(corr_matrix)):
        for j in range(i):
            if corr_matrix.iloc[i,j] > threshold:
                col_set.add(corr_matrix.columns[i])
correlation(data_cleaned,0.8)

In [270]:
print('total number of col which are highly correlated are:',len(col_set))

total number of col which are highly correlated are: 34


In [271]:
data_no_correlation = data_cleaned.drop(col_set, axis = 1)

In [272]:
data_no_correlation.shape

(6525, 62)

In [273]:
data_no_correlation.var() == 0

Bankrupt?                                                   False
 ROA(C) before interest and depreciation before interest    False
 Operating Gross Margin                                     False
 Operating Profit Rate                                      False
 Pre-tax net Interest Rate                                  False
 Non-industry income and expenditure/revenue                False
 Operating Expense Rate                                     False
 Research and development expense rate                      False
 Cash flow rate                                             False
 Interest-bearing debt interest rate                        False
 Tax rate (A)                                               False
 Net Value Per Share (B)                                    False
 Cash Flow Per Share                                        False
 Revenue Per Share (Yuan ¥)                                 False
 Realized Sales Gross Profit Growth Rate                    False
 Operating

In [274]:
#we can remove net income flag liability assets flag as var is 0

In [275]:
data_no_correlation.columns

Index(['Bankrupt?', ' ROA(C) before interest and depreciation before interest',
       ' Operating Gross Margin', ' Operating Profit Rate',
       ' Pre-tax net Interest Rate',
       ' Non-industry income and expenditure/revenue',
       ' Operating Expense Rate', ' Research and development expense rate',
       ' Cash flow rate', ' Interest-bearing debt interest rate',
       ' Tax rate (A)', ' Net Value Per Share (B)', ' Cash Flow Per Share',
       ' Revenue Per Share (Yuan ¥)',
       ' Realized Sales Gross Profit Growth Rate',
       ' Operating Profit Growth Rate', ' After-tax Net Profit Growth Rate',
       ' Continuous Net Profit Growth Rate', ' Total Asset Growth Rate',
       ' Net Value Growth Rate', ' Total Asset Return Growth Rate Ratio',
       ' Cash Reinvestment %', ' Current Ratio', ' Interest Expense Ratio',
       ' Total debt/Total net worth', ' Net worth/Assets',
       ' Long-term fund suitability ratio (A)',
       ' Contingent liabilities/Net worth',
       ' I

In [276]:
data_no_correlation.drop([' Net Income Flag',' Liability-Assets Flag'],axis = 1,inplace = True)

# Multicollinearity using VIF

In [277]:
#checking for multicollinearity with VIF
vif = pd.DataFrame()
vif_list = []
for idx in range(data_no_correlation.shape[1]):
    vif_list.append(VIF(data_no_correlation.values,idx))
vif['features'] = data_no_correlation.columns
vif['VIF'] = vif_list

In [278]:
columns_with_vif_greater_than_5 = vif[vif['VIF'] > 5]['features'].values

In [279]:
data_cleaned_vif = data_no_correlation.drop(columns_with_vif_greater_than_5,axis = 1)

In [280]:
data_cleaned_vif.shape

(6525, 52)

# Scaling

In [281]:
data_scaled = data_cleaned_vif.copy()

In [282]:
#finding all the columns which needs scaling based on the column max value
col_needs_scaling = []
for i in data_scaled.columns:
    maximum = data[i].max()
    if maximum > 1:
        col_needs_scaling.append(i)
        

In [291]:
len(col_needs_scaling)

20

In [292]:
data_cleaned_vif.shape[1]

52

In [284]:
RS = RobustScaler()
data_req_scaling = data_scaled[col_needs_scaling]
scaled = RS.fit_transform(data_req_scaling)

In [285]:
data_scaled_columns = pd.DataFrame(data=scaled,columns=col_needs_scaling)

In [286]:
data_scaled_columns.shape

(6525, 20)

In [287]:
data_scaled[col_needs_scaling] = data_scaled_columns

In [290]:
#data_cleaned.describe()
data_scaled.describe()

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,Operating Gross Margin,Operating Profit Rate,Non-industry income and expenditure/revenue,Operating Expense Rate,Research and development expense rate,Cash flow rate,Interest-bearing debt interest rate,Tax rate (A),Net Value Per Share (B),Cash Flow Per Share,Revenue Per Share (Yuan ¥),Realized Sales Gross Profit Growth Rate,Operating Profit Growth Rate,After-tax Net Profit Growth Rate,Continuous Net Profit Growth Rate,Total Asset Growth Rate,Net Value Growth Rate,Total Asset Return Growth Rate Ratio,Cash Reinvestment %,Current Ratio,Interest Expense Ratio,Long-term fund suitability ratio (A),Contingent liabilities/Net worth,Accounts Receivable Turnover,Average Collection Days,Inventory Turnover Rate (times),Fixed Assets Turnover Frequency,Revenue per person,Operating profit per person,Allocation rate per person,Cash/Total Assets,Inventory/Working Capital,Inventory/Current Liability,Current Liabilities/Liability,Long-term Liability to Current Assets,Retained Earnings to Total Assets,Total income/Total expense,Total expense/Assets,Current Asset Turnover Rate,Quick Asset Turnover Rate,Working capitcal Turnover Rate,Cash Turnover Rate,Cash Flow to Sales,Fixed Assets to Assets,Equity to Long-term Liability,Cash Flow to Total Assets,Total assets to GNP price,No-credit Interval,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT)
count,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0,6525.0
mean,0.026054,0.506798,0.607982,0.999022,0.303525,0.485586,0.38794,0.467151,24695690000.0,0.110733,0.189583,0.3235,0.311256,0.022141,0.848082,0.68943,0.217599,-0.365102,0.399937,0.264125,0.380072,0.369678,0.630963,0.007046,0.005724,0.622029,0.077815,0.4649485,302970800000.0,0.536691,0.399241,0.501903,0.119301,0.277326,248256400.0,0.766212,300387000.0,0.936473,0.002384,0.027653,4363279000000.0,0.4481514,0.593977,0.279129,0.671577,0.168498,0.114859,0.650092,0.991549,0.623849,0.026972,0.565493
std,0.159307,0.049518,0.010431,0.00015,0.000206,0.7770086,0.727781,0.00927,201502000000.0,0.119523,0.025011,0.010471,1.066779,0.000208,0.000322,0.001002,0.000108,1.223598,1.577865,0.000693,0.012267,1.175341,0.001198,0.004367,0.000712,2.530709,0.776496,0.6928698,761452600000.0,1.525841,0.014318,1.632466,0.122065,0.000763,4608578000.0,0.196077,5153356000.0,0.014698,0.000253,0.018736,10272260000000.0,0.6933157,7.3e-05,0.644469,3.5e-05,0.681266,0.005453,0.036671,3.441242,0.00119,0.000771,0.001409
min,0.0,0.338956,0.581213,0.997638,0.3022,-4.258946e-14,-0.173178,0.437687,-1.00627,0.0,0.140618,0.282061,-0.871601,0.021676,0.846406,0.678531,0.216619,-2.832599,-2.428825,0.261932,0.321253,-0.98327,0.624254,0.004911,0.005366,-0.88102,-1.402252,-1.499445e-13,-0.1570862,-0.667658,0.357387,-0.745152,0.002658,0.273272,-0.8502951,0.20591,-0.2376767,0.854337,0.001837,0.004296,-0.3574058,-2.486251e-14,0.593795,-0.276018,0.671379,-0.70929,0.110933,0.524336,-0.454693,0.615681,0.025333,0.555637
25%,0.0,0.478487,0.600672,0.998973,0.30347,-2.906539e-14,-0.173178,0.461783,-0.3573668,0.0,0.174287,0.318101,-0.381672,0.022066,0.847988,0.689279,0.217581,-0.581498,-0.359202,0.263782,0.375135,-0.351476,0.630612,0.005252,0.005366,-0.351275,-0.476606,-1.341258e-13,-0.1134787,-0.326159,0.392687,-0.347182,0.034479,0.277046,-0.4250165,0.633721,-0.2376767,0.931754,0.002243,0.014779,-0.1923444,-1.643254e-14,0.593936,-0.276018,0.671566,-0.394137,0.110933,0.634118,-0.276215,0.623647,0.026791,0.565158
50%,0.0,0.503924,0.606048,0.999024,0.303526,0.0,0.0,0.465216,0.0,0.07936,0.184695,0.322664,0.0,0.022102,0.848044,0.68944,0.217598,0.0,0.0,0.264054,0.380537,0.0,0.630715,0.005664,0.005366,0.0,0.0,0.0,0.0,0.0,0.396045,0.0,0.076414,0.277188,0.0,0.806421,0.0,0.937874,0.002341,0.02266,0.0,0.0,0.593963,0.0,0.671574,0.0,0.112434,0.645533,0.0,0.623881,0.026812,0.565271
75%,0.0,0.535173,0.613572,0.999092,0.303583,1.0,0.826822,0.470857,0.6426332,0.204778,0.199275,0.32857,0.618328,0.022151,0.848121,0.689643,0.217621,0.418502,0.640798,0.264382,0.386526,0.648524,0.631148,0.006784,0.005765,0.648725,0.523394,1.0,0.8865213,0.673841,0.401777,0.652818,0.16035,0.277433,0.5749835,0.939612,0.7623233,0.944751,0.002488,0.035324,0.8076556,1.0,0.594,0.723982,0.671586,0.605863,0.117054,0.663116,0.723785,0.624152,0.026919,0.565741
max,1.0,0.664262,0.651919,0.999401,0.304819,2.405405,2.579009,0.516026,2350862000000.0,0.64298,0.309216,0.367723,6.232125,0.02448,0.850845,0.696948,0.218548,1.462555,12.957871,0.267758,0.420998,7.533869,0.638056,0.048182,0.010021,33.84136,3.577752,2.119826,3098363000000.0,10.098811,0.503709,15.014895,0.675504,0.283636,176522300000.0,1.0,162589500000.0,0.970437,0.003718,0.125107,36963550000000.0,2.056485,0.594533,1.900452,0.671774,2.113114,0.141748,0.804182,37.356815,0.6333,0.036329,0.572881


# Anomalies

In [None]:
#Check Anomalies

# Check for Imbalance in Target Variable

In [None]:
#check proportion

# Univariate Analysis

In [None]:
#check univariate Analysis

# BiVariate Analysis

In [None]:
#check Bi Variate Analysis