In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the raw dataset from the CSV file into a DataFrame.
data = pd.read_csv('../input/financial-distress/Financial Distress.csv')

In [None]:
# Print a summary of the raw frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Preview the first rows of the raw data.
data.head()

Let's make a copy of our main dataframe.
Always follow this step, so that if something gets messed up, you have a back-up.

In [None]:
# Work on a copy so that `data` keeps the untouched original.
df = data.copy()

Let's check if there are any missing values.

In [None]:
def missing_values_table(df):
    """Summarise missing data per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: ``'missing values'``
        (count of nulls) and ``'% missing'`` (nulls as a percentage of the
        number of rows), each sorted from most to least missing.
    """
    # Count the nulls once and derive the percentage from that count.
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)
    return pd.DataFrame({
        'missing values': null_counts.sort_values(ascending=False),
        '% missing': null_share.sort_values(ascending=False),
    })

In [None]:
# Show the per-column missing-value summary for the working frame.
missing_values_table(df)

It looks like there aren't any missing values.

In [None]:
# (rows, columns) before the correlated features are dropped.
df.shape

Checking for highly correlated features.

In [None]:
# Absolute pairwise correlation between all columns.
corr_matrix = df.corr().abs()

# Keep only the strict upper triangle (k=1 skips the diagonal) so every pair of
# columns is inspected exactly once. The mask uses the builtin `bool`: the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# A column is dropped when it correlates above 0.95 with any earlier column.
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

# Reassign instead of mutating in place.
# NOTE(review): later cells select the target and features by position
# (iloc[:, 2] and iloc[:, 3:]); this presumes none of the first three columns
# ends up in `to_drop` — confirm.
df = df.drop(columns=to_drop)

Now let's look at our dataframe's shape.

In [None]:
# (rows, columns) after dropping the highly correlated features.
df.shape

It looks like about 8 features were highly correlated, and were removed.

Now let's convert the target variable into binary form, i.e., 0 and 1, using a threshold of -0.5.

1 - Company is bankrupt (target value of -0.5 or lower).
0 - Company is healthy (target value greater than -0.5).

Also removing ***Company*** and ***Time*** features.

In [None]:
# Binarise the target (third column): values above -0.5 become 0 (healthy),
# everything else becomes 1 (distressed). The labels stay floating point (0.0 / 1.0).
#
# This is vectorised with np.where rather than overwriting the array returned by
# `.values` element by element. That array may alias df's own storage, so df used
# to change only as a hidden side effect, and it is read-only under pandas
# copy-on-write, where the in-place loop fails.
Y = np.where(df.iloc[:, 2].values > -0.5, 0.0, 1.0)

# Write the labels back explicitly so that cells reading the target column from
# df see the binary values regardless of pandas' view/copy behaviour.
df.iloc[:, 2] = Y

# Every column after the target is a model feature; the first two columns are
# left out of X.
X = df.iloc[:, 3:].values

Let's take a look at the distribution of our target variable.

In [None]:
# Count each target class once, then show the counts as text and as a bar chart.
target_counts = df['Financial Distress'].value_counts()
print(target_counts)
target_counts.plot(kind='bar')

In [None]:
# Wrap the feature matrix and the target array in DataFrames (default integer column labels).
X, Y = pd.DataFrame(X), pd.DataFrame(Y)

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Preview the binary target.
Y.head()

Splitting the data into train and test set.

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split with a fixed seed for reproducibility. The split is
# stratified on the target so the rare positive class keeps the same proportion
# in the train and test partitions; an unstratified split of a heavily
# imbalanced target can leave the test set with very few positive samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=0, stratify=Y
)

In [None]:
# Report the dimensions of each partition produced by the split.
shape_report = (
    f"Shape of X_train is :{X_train.shape},\n"
    f"Shape of X_test is :{X_test.shape},\n"
    f"Shape of y_train is :{y_train.shape},\n"
    f"Shape of y_test is :{y_test.shape}"
)
print(shape_report)

Now that everything looks good, let's get to training.

In [None]:
#Importing Evaluation metrics.
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

## Bernoulli Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB

In [None]:
# Bernoulli naive Bayes with default settings.
# NOTE(review): by default BernoulliNB binarises every feature at 0.0, so the
# continuous inputs are reduced to "above zero / not above zero" — confirm this is intended.
BNB = BernoulliNB()

In [None]:
# Pass the target as a 1-D array: y_train is a single-column frame, and
# scikit-learn emits a DataConversionWarning when fit() receives a column vector for y.
BNB.fit(X_train, np.ravel(y_train))

In [None]:
# Predict class labels for the held-out test set.
BNB_pred = BNB.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the ground truth goes first for consistency with the API.
accuracy_score(y_test, BNB_pred)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted classes, which is what the index/column labels state.
# With the arguments reversed the table is transposed relative to its labels.
BNB_CM = pd.DataFrame(
    confusion_matrix(y_test, BNB_pred),
    index=['Actual No', 'Actual Yes'],
    columns=['Predicted No', 'Predicted Yes'],
)

In [None]:
# Display the Bernoulli naive Bayes confusion-matrix table.
BNB_CM

In [None]:
# Ground truth goes first (y_true, y_pred); reversing the arguments swaps the
# reported precision and recall for every class.
print(classification_report(y_test, BNB_pred))

## LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Linear discriminant analysis classifier with default settings.
LDA = LinearDiscriminantAnalysis()

In [None]:
# Pass the target as a 1-D array: y_train is a single-column frame, and
# scikit-learn emits a DataConversionWarning when fit() receives a column vector for y.
LDA.fit(X_train, np.ravel(y_train))

In [None]:
# Predict class labels for the held-out test set.
LDA_pred = LDA.predict(X_test)

In [None]:
# Wrap the predictions in a DataFrame so they can be browsed with head()/tail() further down.
LDA_pred = pd.DataFrame(LDA_pred)

In [None]:
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the ground truth goes first for consistency with the API.
accuracy_score(y_test, LDA_pred)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted classes, which is what the index/column labels state.
# With the arguments reversed the table is transposed relative to its labels.
LDA_CM = pd.DataFrame(
    confusion_matrix(y_test, LDA_pred),
    index=['Actual No', 'Actual Yes'],
    columns=['Predicted No', 'Predicted Yes'],
)
LDA_CM

In [None]:
# Inspect the first 60 predictions.
LDA_pred.head(60)

In [None]:
# Inspect the last 60 predictions.
LDA_pred.tail(60)

We can see that the number of **1**'s in the predictions is much lower than the number of **0**'s, because there are very few **1**'s in our training data.
Since the target variable is highly imbalanced, we see such problems.

After trying different models, I felt that LDA worked better than the others: its confusion matrix looks more balanced than those of the other models, hence it looks like a more robust model.

Feel free to try out with other models as well.