## Predict Bankruptcy

### 1) Load required modules

In [1]:
import pandas                   as     pd
import numpy                    as     np
import seaborn                  as     sns
import matplotlib.pyplot        as     plt
import statsmodels.api          as     sm
from   sklearn.preprocessing    import OrdinalEncoder
from   sklearn                  import metrics
from   sklearn                  import tree
import sys, os
import os
# Working directory for the relative './Data' and './Output' paths used below.
# Defaults to the author's original location, but can be overridden through the
# BANKRUPTCY_PROJECT_DIR environment variable so the notebook runs on other
# machines without editing the code.
os.chdir(os.environ.get('BANKRUPTCY_PROJECT_DIR', r'D:\DrPKV\20220618'))

In [2]:
import time
from    datetime   import datetime
from    datetime   import timedelta

In [3]:
from sklearn.experimental  import   enable_iterative_imputer
from sklearn.impute        import   IterativeImputer
from sklearn.linear_model   import   LinearRegression


In [4]:
# Render every float shown by pandas with exactly five decimal places.
def _five_decimals(value):
    return '%.5f' % value

pd.set_option('display.float_format', _five_decimals)

### Define required functions

####  Function to detect zero & null values and report column-wise count & percentage of zero & missing values

In [5]:
"""
Function name : missing_zero_values_table
Arguments:
Input: Dataset name
Output : Report on column-wise count & percentage of zero & missing values

"""
def missing_zero_values_table(df):
        zero_val = (df == 0.00).astype(int).sum(axis=0)
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mz_table = pd.concat([zero_val, mis_val, mis_val_percent], axis=1)
        mz_table = mz_table.rename(
        columns = {0 : 'Zero Values', 1 : 'Missing Values', 2 : '% of Total Values'})
        mz_table['Total Zero & Missing Values'] = mz_table['Zero Values'] + mz_table['Missing Values']
        mz_table['% Total Zero & Missing Values'] = 100 * mz_table['Total Zero & Missing Values'] / len(df)
        mz_table['Data Type'] = df.dtypes
        mz_table = mz_table[
            mz_table.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns and " + str(df.shape[0]) + " Rows.\n"      
            "There are " + str(mz_table.shape[0]) +
              " columns that have missing values.")
#         mz_table.to_excel('D:/sampledata/missing_and_zero_values.xlsx', freeze_panes=(1,0), index = False)
        return mz_table

### 2) Read dataset from an Excel file

In [6]:
# Path of the input workbook, relative to the current working directory.
file_name = './Data/BankruptcyData.xlsx'

In [7]:
# Load the worksheet named 'Full data' from the workbook into a DataFrame.
df     =   pd.read_excel(file_name, sheet_name = 'Full data')

### 3) Data Pre-processing

#### 3.1)  To know the data type of all the variables

In [8]:
df.info() # show column names, non-null counts and data types of the raw dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 34 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Row                                       9000 non-null   int64  
 1   Year                                      9000 non-null   int64  
 2   Company_name                              9000 non-null   object 
 3   Year Encoded                              9000 non-null   int64  
 4    Dummy Coded :Healthy=1; NPA= 0           9000 non-null   int64  
 5   Asset_turnover                            8299 non-null   float64
 6   Receivable_turnover(new)                  7390 non-null   float64
 7   Inventory_turnover                        5702 non-null   float64
 8   Cash_ratio                                8278 non-null   float64
 9   Quick_ratio                               8278 non-null   float64
 10  Current_ratio                       

In [9]:
# Column labels as they are at this point, i.e. before any column is
# added, dropped or renamed in the cells that follow.
df_collist = df.columns # to get the list of column names

#### 3.2) Convert categorical column to numeric using OrdinalEncoder

Some algorithms, such as Logistic Regression, cannot handle categorical variables directly, so we need to convert such columns into a numeric format.

Ref: https://pbpython.com/categorical-encoding.html

In [10]:
# Encode each distinct company name as a number and store the result in a
# new 'Company Code' column; the fitted encoder is kept in `ord_enc`.
ord_enc = OrdinalEncoder()
company_codes = ord_enc.fit_transform(df[["Company_name"]])
df["Company Code"] = company_codes

#### Retain the numeric column, Company Code and drop the column Company_name

In [11]:
# The text column is no longer needed once 'Company Code' holds its encoding.
df.drop(columns=['Company_name'], inplace=True)

##### Rename columns for easy access

In [12]:
# Show the current column labels ('Company_name' dropped, 'Company Code' appended last).
df.columns

Index(['Row', 'Year', 'Year Encoded', ' Dummy Coded :Healthy=1; NPA= 0',
       'Asset_turnover', 'Receivable_turnover(new)', 'Inventory_turnover',
       'Cash_ratio', 'Quick_ratio', 'Current_ratio', 'ROA(new)', 'ROE(new)',
       'ROS(new)', 'ROI(new)', 'debt_asset', 'debt_equity', 'debt_income',
       'Interest_coverage', 'Asset_coverage', 'EBIT_Sales', 'Sales_CE',
       'ROCE_CE', 'Changeinsales_Industry', 'Grossvaluedadded', 'Ln_GVA',
       'Operating Cash Flow/Total Sales', 'Operating Cash Flow/Total Debt',
       'Operating Cash Flow/Shareholder's Equity',
       'Fixed Asset Turnover Ratio', 'YOY Sales Growth Rate',
       'YOY EBIT Growth Rate', 'Total shareholders' funds',
       'Shareholderquity_code', 'Company Code'],
      dtype='object')

In [13]:
# New column labels, listed in the same order as the current columns of df.
# Only two labels differ from the existing ones: the dummy-coded
# healthy/NPA flag becomes "Target", and the last column (the encoded
# 'Company Code') is labelled 'Company_name'.
collist = [
    # identifiers and target
    "Row",
    "Year",
    "Year Encoded",
    "Target",
    # turnover and liquidity ratios
    "Asset_turnover",
    "Receivable_turnover(new)",
    "Inventory_turnover",
    "Cash_ratio",
    "Quick_ratio",
    "Current_ratio",
    # return ratios
    "ROA(new)",
    "ROE(new)",
    "ROS(new)",
    "ROI(new)",
    # debt and coverage ratios
    "debt_asset",
    "debt_equity",
    "debt_income",
    "Interest_coverage",
    "Asset_coverage",
    # sales, capital employed and value added
    "EBIT_Sales",
    "Sales_CE",
    "ROCE_CE",
    "Changeinsales_Industry",
    "Grossvaluedadded",
    "Ln_GVA",
    # cash-flow ratios
    "Operating Cash Flow/Total Sales",
    "Operating Cash Flow/Total Debt",
    "Operating Cash Flow/Shareholder's Equity",
    # remaining ratios and growth rates
    "Fixed Asset Turnover Ratio",
    "YOY Sales Growth Rate",
    "YOY EBIT Growth Rate",
    "Total shareholders' funds",
    "Shareholderquity_code",
    "Company_name",
]


In [14]:
# Positional rename: the i-th label in collist replaces the i-th column label,
# so collist must have exactly as many entries as df has columns.
df.columns = collist

#### 3.3) Missing values treatment

##### Get the report on missing values with column name, count & percentage of zeros & missing values

In [15]:
# Build the zero/missing-value report; it contains one row per column of df
# that has at least one missing value.
mdf =  missing_zero_values_table(df)
mdf.shape

Your selected dataframe has 34 columns and 9000 Rows.
There are 29 columns that have missing values.


(29, 6)

In [16]:
# Display the report, sorted by percentage of missing values (highest first).
mdf

Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero & Missing Values,% Total Zero & Missing Values,Data Type
ROI(new),25,4132,45.9,4157,46.2,float64
Shareholderquity_code,3305,3790,42.1,7095,78.8,float64
Inventory_turnover,4364,3298,36.6,7662,85.1,float64
Receivable_turnover(new),401,1610,17.9,2011,22.3,float64
Interest_coverage,218,1569,17.4,1787,19.9,float64
Operating Cash Flow/Total Debt,15,1306,14.5,1321,14.7,float64
Asset_coverage,0,1306,14.5,1306,14.5,float64
ROCE_CE,70,1258,14.0,1328,14.8,float64
Fixed Asset Turnover Ratio,476,1007,11.2,1483,16.5,float64
Cash_ratio,21,722,8.0,743,8.3,float64


### Observation

We observe that there are missing values in 29 columns, and the percentage of missing values ranges from 0.10% to 45.9% of the 9000 observations. Removing every observation that has a missing value would result in data loss and inaccurate analysis.
Instead, we shall remove only the columns having more than 40% of missing values:


1. ROI(new)
2. Shareholderquity_code


##### Drop variables having more than 40% of missing values

In [17]:
# Columns with more than 40% missing values according to the report in `mdf`.
high_missing_cols = ['ROI(new)', 'Shareholderquity_code']

print("\nBefore removing null values \n Rows %d Columns %d" % (df.shape[0], df.shape[1]))
# Non in-place drop with errors='ignore' keeps this cell safe to re-run:
# a second execution is a no-op instead of raising KeyError because the
# columns are already gone.
df = df.drop(columns=high_missing_cols, errors='ignore')
print("\nAfter removing null values \n Rows %d Columns %d" % (df.shape[0], df.shape[1]))


Before removing null values 
 Rows 9000 Columns 34

After removing null values 
 Rows 9000 Columns 32


##### Imputation of missing values through Multiple Imputation by Chained Equation

* Detecting and handling missing values in the correct way is important, as they can impact the results of the analysis. Simple imputation with the mean, mode, or median ignores the inherent relationships among the variables and can also pollute the data.

* In some cases, the values missing from one feature are related to the other features and can therefore be predicted from them. Imputing missing values by prediction is superior to the simpler techniques because the inherent relationships among the variables are not ignored.

* We are imputing missing numerical values using the IterativeImputer class in sklearn. 


Ref: https://www.numpyninja.com/post/mice-and-knn-missing-value-imputations-through-python



In [18]:
# Chained-equation imputation: every column with missing values is modelled
# with a linear regression on the other columns, for at most 10 rounds.
# NOTE(review): df still contains 'Row', 'Year', 'Year Encoded', 'Target' and
# the encoded company column at this point, so they all take part in the
# imputation models - confirm that using the target here is intended.
lreg = LinearRegression()
imp = IterativeImputer(
    estimator=lreg,
    missing_values=np.nan,
    max_iter=10,
    verbose=2,
    imputation_order='roman',
    random_state=0,
)
X = imp.fit_transform(df)

[IterativeImputer] Completing matrix with shape (9000, 32)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.29
[IterativeImputer] Change: 4503748469.25826, scaled tolerance: 350030860.0 
[IterativeImputer] Ending imputation round 2/10, elapsed time 0.52
[IterativeImputer] Change: 2713006560.8395057, scaled tolerance: 350030860.0 
[IterativeImputer] Ending imputation round 3/10, elapsed time 0.84
[IterativeImputer] Change: 103631161.88895419, scaled tolerance: 350030860.0 
[IterativeImputer] Early stopping criterion reached.


In [19]:
# Put the imputer output X back into a DataFrame, reusing df's column labels.
X_df    =    pd.DataFrame(X, columns = df.columns)

In [20]:
# Check non-null counts and data types of the imputed dataset.
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 32 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Row                                       9000 non-null   float64
 1   Year                                      9000 non-null   float64
 2   Year Encoded                              9000 non-null   float64
 3   Target                                    9000 non-null   float64
 4   Asset_turnover                            9000 non-null   float64
 5   Receivable_turnover(new)                  9000 non-null   float64
 6   Inventory_turnover                        9000 non-null   float64
 7   Cash_ratio                                9000 non-null   float64
 8   Quick_ratio                               9000 non-null   float64
 9   Current_ratio                             9000 non-null   float64
 10  ROA(new)                            

### Observations

All the variables are of float data type. We shall convert the following variables to integer data type:
* 1) Row
* 2) Year
* 3) Year Encoded
* 4) Target
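* 5) Company_name (after the renaming step, this column holds the ordinal-encoded company code, not the company name text)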

In [21]:
# Identifier-like and target columns come back from the imputer as floats;
# cast each of them back to integer.
for int_col in ('Row', 'Year', 'Year Encoded', 'Target', 'Company_name'):
    X_df[int_col] = X_df[int_col].astype(int)

In [22]:
# Preview the first five rows, transposed so that each column is shown as a row.
X_df.head().T

Unnamed: 0,0,1,2,3,4
Row,1.0,1.0,1.0,1.0,1.0
Year,2010.0,2011.0,2012.0,2013.0,2014.0
Year Encoded,-8.0,-7.0,-6.0,-5.0,-4.0
Target,1.0,1.0,1.0,1.0,1.0
Asset_turnover,0.0,0.75666,1.07063,1.18632,1.42893
Receivable_turnover(new),0.0,3.46077,3.95393,4.54417,6.10799
Inventory_turnover,0.0,0.0,0.0,0.0,0.0
Cash_ratio,2.01764,1.55843,0.85337,0.96842,1.21196
Quick_ratio,1.55202,1.08407,0.38471,0.37537,0.61504
Current_ratio,3.73717,3.03393,2.56104,2.98033,3.18905


In [23]:
# Re-check the data types after the integer casts.
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 32 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Row                                       9000 non-null   int32  
 1   Year                                      9000 non-null   int32  
 2   Year Encoded                              9000 non-null   int32  
 3   Target                                    9000 non-null   int32  
 4   Asset_turnover                            9000 non-null   float64
 5   Receivable_turnover(new)                  9000 non-null   float64
 6   Inventory_turnover                        9000 non-null   float64
 7   Cash_ratio                                9000 non-null   float64
 8   Quick_ratio                               9000 non-null   float64
 9   Current_ratio                             9000 non-null   float64
 10  ROA(new)                            

In [24]:
# Export the imputed dataset to a timestamped CSV so that successive runs
# never overwrite an earlier export.
output_dir = './Output'
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
output_file_name = output_dir + '/Bankruptcy-imputed-data-' + datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '.csv'
X_df.to_csv(output_file_name, index = False)

In [25]:
# Report the current working directory and the (relative) name of the CSV just written.
print('Location of the file name created %s' %os.getcwd())
print('File name %s is created' % output_file_name)

Location of the file name created D:\DrPKV\20220618
File name ./Output/Bankruptcy-imputed-data-2022_06_18_18_10_31.csv is created


### END