In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 1. Importing the Dataset

In [None]:
# Load the bankruptcy dataset. The relative path follows the `../input/<dataset>/` layout
# (presumably the Kaggle input mount — adjust it when running elsewhere).
df = pd.read_csv('../input/company-bankruptcy-prediction/data.csv')

In [None]:
df

In [None]:
# Splitting in X (features) and y (target).
# The target is selected by name instead of by position, so the split stays
# correct even if the column order of the CSV changes.
TARGET_COL = 'Bankrupt?'

X_df = df.drop(columns = [TARGET_COL])

# Double brackets keep y as a one-column DataFrame named after the target
# (later cells rely on `y.columns` and `y.values.ravel()`).
y_df = df[[TARGET_COL]].copy()

In [None]:
X_df

In [None]:
y_df

# 2. Data Preprocessing

## 2.1 Check missing Values and categorical values

In [None]:
# Report whether any NaN is present in the features and in the target.

x_has_missing = X_df.isna().to_numpy().any()
y_has_missing = y_df.isna().to_numpy().any()

print('Missing values in X_df: {}'.format(x_has_missing))
print('Missing values in y_df: {}'.format(y_has_missing))

In [None]:
# Count numeric vs. object-typed (categorical) feature columns.

numCols = list(set(X_df.select_dtypes(include = 'number').columns))
catCols = list(set(X_df.select_dtypes(include = 'object').columns))

n_numerical, n_categorical = len(numCols), len(catCols)

print('Number of numerical features: {}'.format(n_numerical))
print('Number of categorical features: {}'.format(n_categorical))


## 2.2 Feature Scaling

In [None]:
# features scaling preserving the flag features

from sklearn.preprocessing import StandardScaler

def get_scaling(X_df, flag_features):
    
    '''
    Standardize the continuous features of X_df and leave the flag features untouched.

    Parameters:
        X_df: original features dataframe.
        flag_features: list of positional column indexes of the flag features
            (i.e., features with values 0,1,2 etc., for example obtained
            with One Hot Encoder if categorical in the original dataframe).
            May be empty, in which case every column is scaled.

    Returns:
        A dataframe with a fresh 0..n-1 index whose first columns are the
        scaled non-flag features (in their original order) and whose last
        columns are the flag features with their original values, in the
        order given by flag_features.

    The name and index of every flag feature is printed.
    '''
    
    flag_names = [X_df.columns[flg_ftrs] for flg_ftrs in flag_features]
    
    for flg_ftrs, name in zip(flag_features, flag_names):
        print('Flag feature with index {}: {}'.format(flg_ftrs, name))
    
    # StandardScaler works column by column, so scaling only the non-flag
    # columns gives the same numbers as scaling everything and dropping flags
    continuous = X_df.drop(columns = flag_names)
    
    if continuous.shape[1] > 0:
        sc = StandardScaler()
        X_scaled = pd.DataFrame(sc.fit_transform(continuous), columns = continuous.columns)
    else:
        # every column is a flag: nothing to scale
        X_scaled = pd.DataFrame(index = range(X_df.shape[0]))
    
    if not flag_names:
        # pd.concat raises on an empty list, so return the scaled frame directly
        return X_scaled
    
    # taken from X_df.values so the flags get the same dtype as the full value matrix
    X_flag = pd.DataFrame(X_df.values[:, list(flag_features)], columns = flag_names)
    
    return pd.concat([X_scaled, X_flag], axis = 1)

In [None]:
# flag features 84: Liability-Assets Flag
# flag features 93: Net Income Flag

X_scaled = get_scaling(X_df, flag_features = [84,93])

## 2.3 Visualize data distribution

In [None]:
# Feature distributions after scaling (left) and class counts of the target (right).

import seaborn as sns

fig, (ax_box, ax_hist) = plt.subplots(1, 2, figsize = (20,5))

X_scaled.boxplot(ax = ax_box)

sns.histplot(y_df, ax = ax_hist)

We can notice two aspects:

1) There are **many outliers** in the dataset

2) The dataset is **clearly imbalanced** because the majority of the instances belong to the negative class 0 (Not Bankrupted). We can visualize the imbalance also in a scatter plot.

Let's visualize the data in a scatterplot:

In [None]:
# 3x3 grid of pairwise scatter plots of the scaled features, coloured by class.
fig = plt.figure(figsize = (20,20))

# Boolean row masks per class. Indexing with the raw tuple returned by
# np.where(y_df == 0) also used the column indices (all zeros) as row indices,
# so row 0 was drawn once per sample in both colours.
labels = y_df.values.ravel()
is_negative = labels == 0
is_positive = labels == 1

for i in range(9):
    
    fig.add_subplot(3,3,i+1)
    
    # feature pair shown in panel i
    ax_1 = 3*i
    ax_2 = 2*i+1

    plt.scatter(X_scaled.values[is_negative,ax_1],X_scaled.values[is_negative,ax_2], c = 'b')
    plt.scatter(X_scaled.values[is_positive,ax_1],X_scaled.values[is_positive,ax_2], c = 'r')
    plt.xlabel(X_scaled.columns[ax_1])
    plt.ylabel(X_scaled.columns[ax_2])
    plt.legend(['Not Bankrupted','Bankrupted'])


    



## 2.4 Dealing with Imbalanced Dataset

There are different techniques to deal with **imbalanced datasets** (https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/). In particular one can:

**1) Resampling the dataset**: Adding copies of instances from the under-represented class, called over-sampling (or more formally sampling with replacement), or deleting instances from the over-represented class, called under-sampling.

**2) Generating synthetic samples from the minority class.** The most popular of such algorithms is called **SMOTE, or the Synthetic Minority Over-sampling Technique.** SMOTE is an oversampling method. It works by creating synthetic samples from the minority class instead of creating copies. The algorithm selects two or more similar instances (using a distance measure) and perturbs an instance one attribute at a time by a random amount within the difference to the neighboring instances.


In this example, we use the **SMOTE algorithm** (generation of synthetic samples from the minority class).

In [None]:
from imblearn.over_sampling import SMOTE

# Fixed seed: SMOTE picks neighbours at random, so without it every run
# produces a different synthetic set and all downstream numbers change.
smote = SMOTE(random_state = 42)

X_smote, y_smote = smote.fit_resample(X_scaled, y_df)

In [None]:
# Class counts of the target before (left) and after (right) SMOTE.

fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize = (20,5))

sns.histplot(y_df, ax = ax_before)
ax_before.set_title('Imbalanced Dataset',fontsize  = 20)

sns.histplot(y_smote, ax = ax_after)
ax_after.set_title('SMOTE Dataset',fontsize  = 20)


## 2.5 Removing Outliers

In [None]:
# function to remove the outliers from the dataset (data which fall outside of the whiskers in the box plot)
# the outliers are removed for each feature separately. It can happen that data that are outliers
# for a certain feature are not outliers for another feature. 
# The threshold to consider data outliers or not is defined by q. 

# upper quartile or 75th percentile (Q3): value for which 75% of the data are less than this value (upper threshold of the box)
# lower quartile or 25th percentile (Q1): value for which 25% of the data are less than this value (lower threshold of the box)
# iqr: distance between the upper and lower quartile: Q3 - Q1

# upper whisker: Q3 + q*iqr
# lower whisker: Q1 - q*iqr

# data > upper whisker or < lower whisker : outliers

# q controls the amount of data to consider as outliers

def get_remove_outliers(X,y,q):
    
    '''
    Drop the rows of (X, y) that are outliers for at least one feature of X.

    Parameters:
        X: features dataframe.
        y: one-column target dataframe with the same number of rows as X.
        q: whisker multiplier; for each feature, rows outside
           [Q1 - q*iqr, Q3 + q*iqr] are removed.

    Returns:
        (X_clean, y_clean): dataframes with the surviving rows, a fresh
        0..n-1 index and the column names of X and y.

    The features are processed one after the other, and the quartiles of each
    feature are computed on the rows that survived the previous features.
    Only the feature columns are filtered: the target column is never used as
    an outlier criterion (for an imbalanced target the iqr is 0, which would
    otherwise delete every row of the minority class).
    '''
    
    n_features = X.shape[1]
    
    # features and target are stacked so that rows are removed from both at once
    data = np.column_stack((X.values,y.values))
    
    for j in range(n_features):
        
        if data.shape[0] == 0:
            # nothing left to filter; percentiles of an empty column are undefined
            break
        
        column = data[:,j]
        
        lower_quartile, upper_quartile = np.percentile(column, [25, 75])
        iqr = upper_quartile - lower_quartile
        
        inside_whiskers = (column >= lower_quartile - q * iqr) & (column <= upper_quartile + q * iqr)
        
        data = data[inside_whiskers]
        
    X_clean = pd.DataFrame(data[:,:n_features])
    X_clean.columns = X.columns
    
    y_clean = pd.DataFrame(data[:,n_features:])
    y_clean.columns = y.columns
    
    return X_clean,y_clean

In [None]:
# whisker multiplier: rows farther than q * iqr from the quartiles are dropped
q = 11

X_clean, y_clean = get_remove_outliers(X_smote,y_smote, q)

# The filter ran on the SMOTE-resampled data, so the removed share is measured
# against X_smote (X_scaled is the pre-resampling frame with a different row count).
fraction_removed_outliers =  1 - X_clean.shape[0] / X_smote.shape[0]

print('Removed {:.1f} % data'.format(fraction_removed_outliers * 100))

In [None]:
# Features (left column) and target (right column) before (top row) and
# after (bottom row) removing the outliers.

fig, axes = plt.subplots(2, 2, figsize = (20,10), dpi = 200)

X_smote.boxplot(ax = axes[0, 0])
axes[0, 0].set_title('X before removing outliers', fontsize = 20)

sns.histplot(y_smote, ax = axes[0, 1])
axes[0, 1].set_title('y before removing outliers', fontsize = 20)

X_clean.boxplot(ax = axes[1, 0])
axes[1, 0].set_title('X after removing outliers', fontsize = 20)

sns.histplot(y_clean, ax = axes[1, 1])
axes[1, 1].set_title('y after removing outliers', fontsize = 20)

By removing 13.2% of the data as outliers, we obtain feature distributions which are closer to the mean. Before removing the outliers, we had a huge variance: for example, some features ranged from -80 to +80. After removing the outliers, the feature range is concentrated between -9 and +6. Removing the outliers has introduced a slight imbalance with respect to the negative class, but compared to the original imbalance rate, this scenario is more than acceptable.

Let's visualize the boxplots and the histogram of a smaller set of features for a more clear representation.

**Boxplots**

In [None]:
# Three-feature slice used for the close-up plots.
features = X_smote.columns[29: 32]

fig, ax = plt.subplots(figsize = (10,5), dpi = 100)
X_smote[features].boxplot(ax = ax)
ax.set_title('X before removing outliers Boxplot', fontsize = 20)

fig, ax = plt.subplots(figsize = (10,5), dpi = 100)
X_clean[features].boxplot(ax = ax)
ax.set_title('X after removing outliers Boxplot', fontsize = 20)

**Histograms**

In [None]:
# Histograms of the `features` slice chosen in the boxplot cell, before and
# after the outlier removal. "Before" is X_smote, i.e. the data that was
# actually passed to get_remove_outliers (same frame as in the boxplots).
#
# DataFrame.hist opens its own figure (sized by `figsize`), so no separate
# plt.figure() call is made; an extra one only leaves an empty canvas behind.
# suptitle labels the whole grid instead of only the last panel.

X_smote[features].hist(bins = 100, figsize = (20,10), edgecolor='white')
plt.suptitle('X before removing outliers Histogram', fontsize = 20)

X_clean[features].hist(bins = 50, figsize = (20,10), edgecolor='white')
plt.suptitle('X after removing outliers Histogram', fontsize = 20)

We can visualize the statistical parameters (mean, std, min, lower quartile (25%), median (50%), upper quartile (75%) and max):

In [None]:
X_clean.describe()

Visualizing the data into a scatter plot we can realize that we have sufficient balance between the two classes now.

In [None]:
# 3x3 grid of pairwise scatter plots of the cleaned features, coloured by class.
fig = plt.figure(figsize = (20,20))

# Boolean row masks per class. Indexing with the raw tuple returned by
# np.where(y_clean == 0) also used the column indices (all zeros) as row
# indices, so row 0 was drawn once per sample in both colours.
labels = y_clean.values.ravel()
is_negative = labels == 0
is_positive = labels == 1

for i in range(9):
    
    fig.add_subplot(3,3,i+1)
    
    # feature pair shown in panel i
    ax_1 = 3*i
    ax_2 = 2*i+1

    plt.scatter(X_clean.values[is_negative,ax_1],X_clean.values[is_negative,ax_2], c = 'b')
    plt.scatter(X_clean.values[is_positive,ax_1],X_clean.values[is_positive,ax_2], c = 'r')
    plt.xlabel(X_clean.columns[ax_1])
    plt.ylabel(X_clean.columns[ax_2])
    plt.legend(['Not Bankrupted','Bankrupted'])

In [None]:
# let's define X,y as the dataset with removed outliers

X,y = X_clean, y_clean

# 3. Features Selection using Random Forest Classifier with number of features chosen via Cross-Validation

In [None]:
from sklearn.ensemble import RandomForestClassifier


def get_RF_features_importances(X,y,n_features, plot):
    
    '''
    Rank the columns of X with a Random Forest and keep the most important ones.

    A RandomForestClassifier with default settings is fitted on (X, y). The
    columns are ordered by decreasing feature importance and the first
    n_features of them are returned as a sub-dataframe of X.

    If plot is true, a bar chart with the importance of every feature
    (sorted) is drawn as well.
    '''
    
    forest = RandomForestClassifier()
    forest.fit(X,y)
    
    # one row per feature, most important first
    ranking = pd.DataFrame({
        'feature_name' : X.columns,
        'feature_importance' : forest.feature_importances_,
    }).sort_values(by = ['feature_importance'], ascending = False)
    
    top_names = ranking['feature_name'].values[:n_features]
    
    if plot:
        
        plt.figure(figsize = (20,16))
        sns.barplot(x = ranking['feature_importance'], y = ranking['feature_name'])
        plt.title('Random Forest Feature Importance')
        plt.xlabel('Feature Importance')
        plt.ylabel('Feature Name')
    
    return X[top_names]

In [None]:
n_features = 20

X_rf = get_RF_features_importances(X,y.values.ravel(),n_features, plot = True)


In [None]:
X_rf

In [None]:
from sklearn.model_selection import cross_val_score

# number of cross-validation folds
cv = 10

# candidate sizes of the feature subset
n_features = [1, 5, 10, 15, 20, 25,30,35, 40]

cv_score = []

target = y.values.ravel()

for n_ftrs in n_features:
    
    # keep the n_ftrs most important features, then score a fresh forest on them
    X_rf = get_RF_features_importances(X, target, n_ftrs, plot = False)
    
    fold_scores = cross_val_score(estimator = RandomForestClassifier(),
                                  X = X_rf.values,
                                  y = target,
                                  cv = cv)
    
    cv_score.append(np.average(fold_scores))
    
    
plt.plot(n_features, cv_score,'bo-')
plt.xlabel('N. features')
plt.ylabel('Cross-Val Accuracy')

With Random Forest, we have identified 35 features which are important to predict the target. Using this smaller subset of features, it is possible to achieve a validation accuracy of ~98%.

# 4. Data Analysis on the Preprocessed Dataset

In [None]:
# Select the final feature subset with Random Forest on the whole preprocessed
# dataset (X, y), not on training/test separately.

n_features = 35

# pass the variable (not a repeated literal) so changing n_features takes effect
X_rf = get_RF_features_importances(X,y.values.ravel(), n_features = n_features, plot = False)

## 4.1 Features Correlations

In [None]:
fig, ax = plt.subplots(figsize=(28,24))

# correlation matrix of the target followed by the selected features (computed once)
corr_matrix = pd.concat([y,X_rf],axis = 1).corr()

# hide the upper triangle (and diagonal): the matrix is symmetric.
# The `np.bool` alias no longer exists in recent NumPy; the builtin `bool` is the dtype to use.
mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(corr_matrix, 
            mask=mask,
            vmin=-1, vmax=1, cmap=sns.diverging_palette(20, 220, as_cmap=True), annot=True)

As expected, all the features extracted by Random Forest are correlated to the target. With the heatmap, we can identify some patterns in the data and identify the features with the strongest positive and negative correlations with the target. The barplot below sorts the features by strongest positive/negative correlations.

In [None]:
features_names = X_rf.columns

# first row of the correlation matrix: target vs. each selected feature
corr = pd.concat([y,X_rf],axis = 1).corr().values[0,1:]

# one row per feature, sorted from the most positive to the most negative correlation
corr_df = pd.DataFrame({'feature_name' : features_names, 'corr' : corr}).sort_values(
    by = ['corr'], ascending = False)

plt.figure(figsize = (20,16))
sns.barplot(x = corr_df['corr'], y = corr_df['feature_name'])
plt.xlabel('Feature Correlation')
plt.ylabel('Feature Name')
    

In [None]:
print('Top 4 features with strongest positive correlation with the target:')
print('')

# corr_df is sorted in decreasing order, so the first rows are the most positive
for rank in range(4):
    name, value = corr_df.values[rank]
    print('{} : corr = {:.2f}'.format(name, value))

In [None]:
print('Top 4 features with strongest negative correlation with the target:')
print('')

# reversed view of the sorted table: the most negative correlations come first
most_negative = corr_df.values[::-1]

for rank in range(4):
    name, value = most_negative[rank]
    print('{} : corr = {:.2f}'.format(name, value))

**Scatterplots of the top 4 features with strongest positive correlation with the target**

In [None]:
# scatterplot of features with strongest positive correlation with the target
# (corr_df is sorted by decreasing correlation, so its first rows are the most positive)

fig = plt.figure(figsize = (15,15))


for i in range(4):
    
    fig.add_subplot(2,2,i+1)
    
    # panel i pairs the i-th ranked feature (x axis) with the next one in the ranking (y axis)
    ax_1 = corr_df.values[i,0]
    ax_2 = corr_df.values[i + 1,0]

    # class 0 in blue
    plt.scatter(X_rf[ax_1].values[np.where(y == 0)[0]],
            X_rf[ax_2].values[np.where(y == 0)[0]],
           c = 'b')

    # class 1 in red
    plt.scatter(X_rf[ax_1].values[np.where(y == 1)[0]],
            X_rf[ax_2].values[np.where(y == 1)[0]],
           c = 'r')
    
    plt.xlabel(ax_1)
    plt.ylabel(ax_2)
    # the title reports the correlation of the x-axis feature only
    plt.title('Corr = {:.3f}'.format(corr_df.values[i,1]))
    plt.legend(['Not Bankrupted','Bankrupted'])
    
    # reference lines through the origin
    plt.axhline(y = 0, c = 'r')
    plt.axvline(x = 0, c = 'r')

The companies labeled as 'Bankrupted' are all in the top-right side of the scatterplots, given the positive correlations.

**Scatterplots of the top 4 features with strongest negative correlation with the target**

In [None]:
# scatterplot of features with strongest negative correlation with the target
# (corr_df is sorted by decreasing correlation, so the reversed table starts with the most negative)

fig = plt.figure(figsize = (15,15))


for i in range(4):
    
    fig.add_subplot(2,2,i+1)
    
    # panel i pairs the i-th most negative feature (x axis) with the next one (y axis)
    ax_1 = corr_df.values[::-1][i,0]
    ax_2 = corr_df.values[::-1][i + 1,0]

    # class 0 in blue
    plt.scatter(X_rf[ax_1].values[np.where(y == 0)[0]],
            X_rf[ax_2].values[np.where(y == 0)[0]],
           c = 'b')

    # class 1 in red
    plt.scatter(X_rf[ax_1].values[np.where(y == 1)[0]],
                X_rf[ax_2].values[np.where(y == 1)[0]],
               c = 'r')
    
    
    plt.xlabel(ax_1)
    plt.ylabel(ax_2)
    # the title reports the correlation of the x-axis feature only
    plt.title('Corr = {:.3f}'.format(corr_df.values[::-1][i,1]))
    plt.legend(['Not Bankrupted','Bankrupted'])
    
    # reference lines through the origin
    plt.axhline(y = 0, c = 'r')
    plt.axvline(x = 0, c = 'r')

The companies labeled as 'Bankrupted' are all in the bottom-left side of the scatterplots, given the negative correlations.

We can also plot the boxplots of the features of interest, split into the two classes. For the features with strong negative correlation, we expect the boxplots to show that the main distribution (inside the box, 50% of the data) of the 'Bankrupt' class (1) has negative values, lower than the values in the boxplots of the 'Not Bankrupt' class (0).

On the contrary, for the features with positive correlation, we expect the boxplots to show that main distribution  of the 'Bankrupt' class (1), to have positive values, and higher than the values in the boxplots of the 'Not Bankrupt' class (0).


**Boxplots of the top 4 features with strongest positive correlation with the target**

In [None]:
fig = plt.figure(figsize = (15,15))

# features and target side by side; the same frame is used by every panel
plot_data = pd.concat([X_rf,y_clean],axis = 1)
target_name = y_clean.columns[0]

for i in range(4):
    
    fig.add_subplot(2,2,i+1)
    
    # i-th most positively correlated feature and its correlation
    feature_name = corr_df.values[i,0]
    feature_corr = corr_df.values[i,1]
    
    plt.title('Corr = {:.3f}'.format(feature_corr))

    sns.boxplot(x = target_name, y = X_rf[feature_name], data = plot_data)
            

**Boxplots of the top 4 features with strongest negative correlation with the target**

In [None]:
fig = plt.figure(figsize = (15,15))

# features and target side by side; the same frame is used by every panel
plot_data = pd.concat([X_rf,y_clean],axis = 1)
target_name = y_clean.columns[0]

# reversed ranking: the most negative correlations come first
most_negative = corr_df[::-1].values

for i in range(4):
    
    fig.add_subplot(2,2,i+1)
    
    feature_name = most_negative[i,0]
    feature_corr = most_negative[i,1]
    
    plt.title('Corr = {:.3f}'.format(feature_corr))

    sns.boxplot(x = target_name, y = X_rf[feature_name], data = plot_data)
            

## 4.2 PCA

Performing PCA, we can identify reduced dimension which explain most of the variance in the dataset.

In [None]:
from sklearn.decomposition import PCA

def get_pca(X, n_components):
    
    '''
    Fit a PCA on X and project X onto its principal components.

    Parameters:
        X: features dataframe (or array) to decompose.
        n_components: number of components, passed to PCA.

    Returns:
        X_pca: dataframe of the projected data, with columns
               'component 1', 'component 2', ...
        pca: the fitted PCA object.
        pve: proportion of variance explained by each component
             (pca.explained_variance_ratio_).

    The cumulative PVE is printed.
    '''

    pca = PCA(n_components = n_components)

    # fit_transform already fits the model, so no separate pca.fit(X) is needed
    X_pca = pd.DataFrame(pca.fit_transform(X))
    
    # name the columns after the components that were actually produced
    X_pca.columns = ['component ' + str(i + 1) for i in range(X_pca.shape[1])]
    
    pve = pca.explained_variance_ratio_
    
    print('Cumulative PVE with {} components: {:.1f}%'.format(n_components,sum(pve)*100))
    
    return X_pca, pca, pve

def get_scree_plot(X,n_components):
    
    '''
    Plot the cumulative PVE (in %) against the number of principal components.

    n_components is a list of component counts; one PCA is fitted on X for
    each entry through get_pca (which prints the cumulative PVE each time).
    '''
    
    # get_pca returns (X_pca, pca, pve); only the PVE vector is needed here
    cumulative_pve = np.array([100 * sum(get_pca(X, n)[2]) for n in n_components],
                              dtype = float)
        
    fig = plt.figure(figsize = (5,5))
        
    plt.plot(n_components, cumulative_pve,'bo-')
    plt.xlabel('N. components')
    plt.ylabel('Cumulative PVE')
        
        
# the biplot illustrates the loading vectors of each features and 
# the data plotted in the principal component space

# if an element phi_jm of the loading vector phi_m of the component m (the entry associated with the feature j)
# has a high positive value, it means that the feature j has a high contribution to the component m


def get_biplot(X_pca,y,comp_ax_hor,comp_ax_ver,features_names, pca_model = None):
    
    '''
    Draw a biplot: the data projected on two principal components, coloured
    by class, together with the loading vector of every original feature.

    Parameters:
        X_pca: dataframe of the data in the principal component space.
        y: target (0 = Not Bankrupted, 1 = Bankrupted), row-aligned with X_pca.
        comp_ax_hor: 1-based index of the component on the horizontal axis.
        comp_ax_ver: 1-based index of the component on the vertical axis.
        features_names: labels of the loading arrows; if None the arrows are
            labelled 'Var1', 'Var2', ...
        pca_model: fitted PCA whose components_ provide the loadings. If None,
            the notebook-level variable `pca` is used (previous behaviour).
    '''
    
    if pca_model is None:
        # backward compatible fallback on the global `pca` object
        pca_model = pca
    
    hor = comp_ax_hor - 1
    ver = comp_ax_ver - 1
    
    score = X_pca.values
    
    # one row per original feature: (loading on hor component, loading on ver component)
    coeff = np.transpose(pca_model.components_[[hor, ver], :])
    
    # scores of the requested components, so that the points, the axis labels
    # and the loading arrows all refer to the same two components
    ax_1 = score[:,hor]
    ax_2 = score[:,ver]
    n = coeff.shape[0]
    
    # rescale the scores to a unit range so they are comparable with the loadings
    scale_ax_1 = 1.0/(ax_1.max() - ax_1.min())
    scale_ax_2 = 1.0/(ax_2.max() - ax_2.min())
    
    negative_rows = np.where(y == 0)[0]
    positive_rows = np.where(y == 1)[0]
    
    fig = plt.figure(figsize = (10,10))
    plt.scatter(ax_1[negative_rows] * scale_ax_1,
                ax_2[negative_rows] * scale_ax_2,
                c = 'b',
                s=5)
    
    plt.scatter(ax_1[positive_rows] * scale_ax_1,
                ax_2[positive_rows] * scale_ax_2,
                c = 'r',
                s=5)
    
    plt.xlabel('Component {}'.format(comp_ax_hor))
    plt.ylabel('Component {}'.format(comp_ax_ver))
    plt.legend(['Not Bankrupted','Bankrupted'])
    
    for i in range(n):
        plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = 'r',alpha = 0.5)
        if features_names is None:
            plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, "Var"+str(i+1), color = 'green', ha = 'center', va = 'center')
        else:
            plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, features_names[i], color = 'g', ha = 'center', va = 'center')
        


In [None]:
get_scree_plot(X_rf, n_components = [1,5,10,15,20])

The **elbow** of the **scree plot** corresponds to **10 principal components (cumulative PVE: ~97%)**.

In [None]:
X_pca,pca,_ = get_pca(X_rf, n_components = 10)

comp_ax_hor = 1

comp_ax_ver = 2

features_names = X_rf.columns

get_biplot(X_pca,y,comp_ax_hor, comp_ax_ver, features_names )


The biplot illustrates that Bankrupted companies tend to have negative values of the first component, where Current Liability to Assets, Debt ratio % and Borrowing dependency (in particular) have positive values: it means that high values of these features tend to represent Bankrupted companies.

On the other hand, Not Bankrupted companies have positive values of the first component, which are representative (in particular) of Net Income to Total Assets, ROA(A), ROA(B) and ROA(C). High values of these features tend to represent Not Bankrupted companies.

This analysis of the biplot is in line with the analysis of the heatmap regarding the positive/negative correlations between the features and the target.


Since a dataset reduced to 10 components represents 97% of the variance of the original dataset, we can use the reduced dataset to fit classifiers, being sure that the most significant statistical information is retained in this dataset.

The performed data analysis involving features scaling, balancing classes, outliers removal, features selection and dimensionality reduction has provided a suitable dataset to fit ML models.

In [None]:
X_pca

# 5. Training-Test Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows as test set (fixed seed for a repeatable split).
# NOTE(review): X_pca / y descend from the SMOTE-resampled, outlier-filtered data, and the
# scaler, the feature selection and the PCA above were all fitted on the full dataset before
# this split, so the test rows are not independent of the preprocessing — confirm whether
# the reported test scores should be read with that caveat.
X_train, X_test, y_train, y_test = train_test_split(X_pca,y, test_size = 0.33, random_state = 42)

# 6. Fitting Classifiers

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve,auc,precision_recall_curve
from sklearn.metrics import confusion_matrix

try:
    from sklearn.metrics import plot_roc_curve,plot_precision_recall_curve,plot_confusion_matrix
except ImportError:
    # Recent scikit-learn releases no longer ship the plot_* helpers. The
    # Display.from_estimator class methods take the same (estimator, X, y)
    # arguments, so they are bound to the old names.
    from sklearn.metrics import RocCurveDisplay,PrecisionRecallDisplay,ConfusionMatrixDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator
    plot_precision_recall_curve = PrecisionRecallDisplay.from_estimator
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator


def plot_classification_performance(clf,X,y_true, step, binary):
   
    '''
    Print the classification report of a fitted classifier and plot its
    ROC curve, precision-recall curve and confusion matrix.

    Parameters:
        clf: fitted classifier. With binary = False it must also expose
             classes_ and decision_function (one score column per class).
        X: features to evaluate on.
        y_true: true labels of X.
        step: text used as prefix of the printed header and of the plot
              titles (e.g. 'Train', 'Test').
        binary: True for a two-class problem (plots drawn by the scikit-learn
                helpers), False for a one-vs-rest plot with one curve per class.

    Metrics:
    
    1) accuracy: (TP + TN)/(TP + TN + FP + FN)
    
    2) precision: TP / (TP + FP)
    
    3) recall (sensitivity, true positive rate): TP / (TP + FN)
    
    4) f_score: 2 * precision * recall / (precision + recall)
    
    5) precision_recall_curve: precision against recall at different thresholds
       (in the multi-class branch: x-axis: precision, y-axis: recall)
    
    6) roc_curve: x-axis: false positive rate, y-axis: true positive rate
                
                true positive rate (recall, sensitivity): TP / (TP + FN)
                false positive rate (fall out, 1 - specificity): FP / (FP + TN) = 1 - specificity = 1 - TN / (TN + FP)
                
                specificity (or true negative rate): TN / (TN + FP) 
                
    ROC Curves summarize the trade-off between the true positive rate and false positive rate for a predictive model using different probability thresholds.
    
    Precision-Recall curves summarize the trade-off between the true positive rate and the positive predictive value for a predictive model using different probability thresholds.
    
    ROC curves are appropriate when the observations are balanced between each class, whereas precision-recall curves are appropriate for imbalanced datasets.
    
    7) confusion_matrix (scikit-learn layout: rows = true label, columns = predicted label)
    
                         = [TN  FP
                            FN  TP]  
    
    '''
    
    if binary:
    
        y_pred = clf.predict(X)
        
        report = classification_report(y_true,y_pred)
        print('{} Classification Report'.format(step))
        print(report)
    
        plot_roc_curve(clf, X, y_true)
        plt.title('{} ROC curve'.format(step))
    
        plot_precision_recall_curve(clf, X, y_true)
        plt.title('{} Precision Recall curve'.format(step))
        
        plot_confusion_matrix(clf,X, y_true)
        plt.title('{} Confusion Matrix'.format(step))
        
    else:
        
        n_classes = len(clf.classes_)
        y_score = clf.decision_function(X)
        y_pred = clf.predict(X)

        fpr = dict()
        tpr = dict()
        roc_auc = dict()
    
        precision = dict()
        recall = dict()
    
        # one indicator column per class, for the one-vs-rest curves
        y_true_dummies = pd.get_dummies(y_true, drop_first = False).values
    
        for i in range(n_classes):
        
            fpr[i], tpr[i], _ = roc_curve(y_true_dummies[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
        
            precision[i], recall[i],_ =  precision_recall_curve(y_true_dummies[:, i], y_score[:, i])
            
        report = classification_report(y_true,y_pred)
        print('{} Classification Report'.format(step))
        print(report)
    
        # ROC curves, one per class, with the chance diagonal as reference
        figsize=(8, 5)
        fig, ax = plt.subplots(figsize = figsize)
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('{} ROC curve'.format(step))
        
        for i in range(n_classes):
            ax.plot(fpr[i], tpr[i], label = 'Class {} (area = {:.2f})'.format(i, roc_auc[i]))
    
        ax.legend(loc="best")
        ax.grid(alpha=.4)
        sns.despine()
        plt.show()
    
        # precision-recall curves, one per class
        fig, ax = plt.subplots(figsize = figsize)

        ax.set_xlabel('Precision')
        ax.set_ylabel('Recall')
        ax.set_title('{} Precision Recall curve'.format(step))
        
        for i in range(n_classes):
            ax.plot(precision[i], recall[i], label = 'Class {}'.format(i))
        
        ax.legend(loc="best")
        ax.grid(alpha=.4)
        sns.despine()
        plt.show()
    
        plot_confusion_matrix(clf,X,y_true)
        
        
    

## 6.1 Random Forest Classifier

In [None]:
rf = RandomForestClassifier()

# y_train is a one-column DataFrame; ravel() hands the classifier a 1-D label
# vector, as already done for the cross-validation calls in this notebook
rf.fit(X_train,y_train.values.ravel())

plot_classification_performance(rf, X_train,y_train, step ='Train', binary = True)

In [None]:
plot_classification_performance(rf, X_test,y_test, step ='Test', binary = True)

## 6.2 Support Vector Classifier with C chosen via Cross-Validation

In [None]:
from sklearn.svm import SVC

# number of cross-validation folds
cv = 10

# candidate values of the regularization parameter C
C_values = [1e2,1e3, 1e4, 1e5]

cv_score = []

train_features = X_train.values
train_labels = y_train.values.ravel()

for C in C_values:

    svc = SVC(C = C, kernel = 'rbf')
    
    fold_scores = cross_val_score(estimator = svc, X = train_features, y = train_labels, cv = cv)
    
    cv_score.append(np.average(fold_scores))
    
    
plt.plot(C_values, cv_score,'bo-')
plt.xscale('log')
plt.xlabel('C')
plt.ylabel('Cross-Val Accuracy')

In [None]:
svc = SVC(C = 1e3)

# y_train is a one-column DataFrame; ravel() hands the classifier a 1-D label
# vector, consistent with the evaluation call below
svc.fit(X_train,y_train.values.ravel())

plot_classification_performance(svc, X_train,y_train.values.ravel(), step ='Train', binary = True)

In [None]:
plot_classification_performance(svc, X_test,y_test, step ='Test', binary = True)

## 6.3 K-Nearest Neighbors with K chosen via Cross-Validation

In [None]:
from sklearn.neighbors import KNeighborsClassifier


# number of cross-validation folds
cv = 10

# candidate numbers of neighbors
k_values = [1,10, 100, 1000]

cv_score = []

train_features = X_train.values
train_labels = y_train.values.ravel()

for k in k_values:

    knn  = KNeighborsClassifier(n_neighbors = k)
    
    fold_scores = cross_val_score(estimator = knn, X = train_features, y = train_labels, cv = cv)
    
    cv_score.append(np.average(fold_scores))
    
    
plt.plot(k_values, cv_score,'bo-')

plt.xlabel('N. Neighbors')
plt.ylabel('Cross-Val Accuracy')

In [None]:
knn = KNeighborsClassifier(n_neighbors = 1)

# y_train is a one-column DataFrame; ravel() hands the classifier a 1-D label
# vector, as already done for the cross-validation calls in this notebook
knn.fit(X_train,y_train.values.ravel())

plot_classification_performance(knn, X_train,y_train, step ='Train', binary = True)

In [None]:
plot_classification_performance(knn, X_test,y_test, step ='Test', binary = True)

## 6.4 Neural Network

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import InputLayer,Dense, Dropout

In [None]:
# Small fully connected network: 10 -> 5 -> 2 hidden units, sigmoid output.
model = Sequential()

# The hidden layers get a non-linear activation: Dense layers without one are
# purely linear, and a stack of linear layers is equivalent to a single one.
model.add(Dense(units = 10, activation = 'relu'))

model.add(Dropout(0.1))

model.add(Dense(units = 5, activation = 'relu'))

model.add(Dropout(0.1))

model.add(Dense(units = 2, activation = 'relu'))

model.add(Dropout(0.1))

# single sigmoid unit: predicted probability of the positive class
model.add(Dense(units = 1, activation = 'sigmoid'))

# binary cross-entropy is the loss matching a sigmoid output on 0/1 labels
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
history = model.fit(X_train,y_train, epochs = 200, validation_split = 0.1)

In [None]:
model.evaluate(X_test, y_test)

In [None]:
# Training vs. validation accuracy over the epochs.
fig, ax = plt.subplots(figsize = (10,5), dpi = 200)

for curve in ('accuracy', 'val_accuracy'):
    ax.plot(history.history[curve])

ax.set_xlabel('Epochs',fontsize = 20)
ax.set_ylabel('Accuracy',fontsize = 20)
ax.legend(['Training','Validation'])

# 7. Conclusion

- **Most relevant features: 35 out of 95 from the initial dataset**

- **Features (among relevant) which most characterize tendency of bankrupting:**

1) Borrowing dependency (high)

2) Debt ratio % (high)

3) Liability to Equity (high)

4) Current Liability to Equity (high)

5) Persistent EPS in the Last Four Seasons (low)

6) Net worth/Assets (low)

7) ROA(C) before interest and depreciation before interest (low)

8) Net profit before tax/Paid-in capital (low)


- **A dataset reduced to 10 components retain ~97% of the variance**

- **Most performant classifiers fitted on reduced dataset: Random Forest, SVC**

**Test results**

- precision: 97%

- recall: 98%

- f1: 98% 

- accuracy: 98%


