In [None]:
#import library
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error

## Data Extraction

In [None]:
# Read the raw CSV from the Kaggle input directory into a DataFrame.
financial = pd.read_csv(
    filepath_or_buffer='/kaggle/input/financial-distress/Financial Distress.csv'
)

In [None]:
# Size of the raw table as (number of rows, number of columns).
financial.shape

In [None]:
# Preview the first rows to see the column layout and value ranges.
financial.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
financial.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the raw table.
financial.info()

In [None]:
# Count missing values per column. This cell only inspects the data:
# nothing is imputed or dropped here.
financial.isnull().sum()

## Data Transformation

In [None]:
# Turn the continuous 'Financial Distress' score into a binary class label.
#
# A plain `.astype(int)` truncates toward zero, so scores such as -0.7 and 0.9
# both collapse to 0 while larger scores become many unrelated integer
# "classes". The dataset's published description treats a company as
# distressed (1) when its score is <= -0.50 and healthy (0) otherwise
# -- NOTE(review): confirm the threshold against the dataset card.
DISTRESS_THRESHOLD = -0.5
distress_score = financial['Financial Distress']

# Only threshold while the column still holds the raw float score, so that
# re-running this cell does not re-threshold labels that are already 0/1.
if pd.api.types.is_float_dtype(distress_score):
    financial['Financial Distress'] = (distress_score <= DISTRESS_THRESHOLD).astype(int)

## Data Selection

In [None]:
# Work on a three-column subset of the raw table; every other column of
# `financial` is left out of `df`.
selected_columns = ['Company', 'Time', 'Financial Distress']
df = financial[selected_columns]
df.head()

In [None]:
# Give the target column a space-free name so it can also be reached with
# attribute access (df.Financial_Distress).
df = df.rename({'Financial Distress': 'Financial_Distress'}, axis='columns')

## K-Means Clustering Model

In [None]:
# Separate the target from the remaining columns, then hold out 40% of the
# rows for testing. The fixed random_state keeps the split reproducible.
X = df.drop(columns='Financial_Distress')
y = df['Financial_Distress']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1
)

# Shapes in the order: train features, train target, test features, test target.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

In [None]:
# Fit K-Means on the training features.
#
# - K-Means is unsupervised: `fit` ignores any `y` argument, so the labels
#   are no longer passed in.
# - n_clusters=8 spells out the library default that was used implicitly.
# - random_state fixes the centroid initialisation so that Restart & Run All
#   reproduces the same clusters; n_init is pinned because its default
#   differs between scikit-learn releases.
kmc = KMeans(n_clusters=8, n_init=10, random_state=1)
kmc.fit(X_train)
print(kmc)

In [None]:
# Prediction.
#
# `kmc.predict` returns cluster indices, which are arbitrary IDs and not
# class labels, so comparing them directly with `y_test` is meaningless.
# Each cluster is therefore translated into the most frequent training label
# among the training rows assigned to it.
train_clusters = kmc.predict(X_train)
cluster_to_label = (
    pd.Series(y_train.to_numpy())
    .groupby(train_clusters)
    .agg(lambda labels: labels.mode().iloc[0])
)

test_clusters = kmc.predict(X_test)
# Fallback for a cluster that received no training rows: overall majority label.
majority_label = y_train.mode().iloc[0]
y_pred = (
    pd.Series(test_clusters)
    .map(cluster_to_label)
    .fillna(majority_label)
    .astype(y_train.dtype)
    .to_numpy()
)
print(y_pred)

In [None]:
# Fraction of test rows whose predicted value equals the true label.
# NOTE(review): this is only meaningful when y_pred holds class labels
# rather than raw cluster indices.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
percentage = int(accuracy * 100)  # truncated toward zero, not rounded
print('Accuracy Score : ', accuracy)
print('Percentage : ', percentage, '%')

In [None]:
# Root mean squared error between the true and the predicted values:
# the square root of the mean squared difference.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = math.sqrt(mse)
print('Root Mean Squared Error : ', rmse)

In [None]:
# Confusion matrix as a cross-tabulation: one row per actual value,
# one column per predicted value, cells hold row counts.
matrix = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(matrix)

In [None]:
# Text summary of per-class precision, recall, F1 score and support.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
# Correlation heatmap of the analysed columns.
#
# The heatmap previously used `matrix.corr()`, i.e. the correlation between
# the columns of the confusion matrix, which says nothing about how the
# data columns relate to each other. The pairwise correlations of `df` are
# plotted instead.
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(), annot=True, cmap='inferno')
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Distribution of the target.
#
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay and
# density normalisation draws the equivalent figure without the deprecation
# warning.
plt.figure(figsize=(10, 6))
sns.histplot(df['Financial_Distress'], kde=True, stat='density')
plt.title("Distribution of Financial Distress")
plt.xlabel("Financial Distress")
plt.show()

In [None]:
# Cluster every column of `df` (target included) into 12 groups and inspect
# the centroids. Each centroid row follows the column order of `df`.
#
# random_state makes the centroids reproducible across runs; n_init is pinned
# because its default differs between scikit-learn releases.
# Note: this rebinds `kmc`, replacing the model fitted on X_train earlier.
kmc = KMeans(n_clusters=12, n_init=10, random_state=1).fit(df)
cluster = kmc.cluster_centers_
print(cluster)

In [None]:
# Scatter of Company vs. Financial Distress, coloured by cluster, with the
# cluster centroids overlaid in red.
#
# Centroid columns follow the column order of `df`, so the coordinates are
# looked up by column name. The previous hard-coded index 1 plotted the
# centroid's 'Time' coordinate on the Financial Distress axis.
x_idx = df.columns.get_loc('Company')
y_idx = df.columns.get_loc('Financial_Distress')

plt.figure(figsize=(10, 6))
plt.scatter(df['Company'], df['Financial_Distress'], c=kmc.labels_.astype(float), s=50, alpha=0.5)
plt.scatter(cluster[:, x_idx], cluster[:, y_idx], c='red', s=50)
plt.title("Company ~ Financial Distress")
plt.xlabel("Company")
plt.ylabel("Financial Distress")
plt.show()

In [None]:
# Line plot of the target against Time; seaborn aggregates rows that share
# the same Time value into a single line with an uncertainty band.
plt.figure(figsize=(10, 6))
sns.lineplot(x='Time', y='Financial_Distress', data=df)