In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
INPUT_ROOT = '/kaggle/input'
for folder, _, entries in os.walk(INPUT_ROOT):
    for path in (os.path.join(folder, entry) for entry in entries):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the company-bankruptcy CSV from the Kaggle input directory and show
# the resulting frame as the cell output.
DATA_PATH = '/kaggle/input/company-bankruptcy-prediction/data.csv'
df = pd.read_csv(DATA_PATH)
df

In [None]:
# Show the dtype of every column of the loaded frame.
df.dtypes

In [None]:
# Count missing values per column. `isnull()` has to come first: calling
# `sum()` first skips NaNs by default, so the column totals are never null
# and every column would be reported as False no matter how much is missing.
df.isnull().sum()

In [None]:
# Class balance of the target: number of rows for each value of 'Bankrupt?'.
df['Bankrupt?'].value_counts()

In [None]:
from imblearn.over_sampling import SMOTE
def balancing_dataset(X, y, random_state=42):
  """Oversample the minority class with SMOTE.

  Args:
    X: feature matrix accepted by ``SMOTE.fit_resample``.
    y: target labels aligned with ``X``.
    random_state: seed forwarded to SMOTE so that reruns generate the same
      synthetic rows. Pass ``None`` for the previous unseeded behaviour.

  Returns:
    Tuple ``(X_sm, y_sm)`` holding the resampled features and labels.
  """
  smote = SMOTE(sampling_strategy='minority', random_state=random_state)
  # `fit_sample` was only a deprecated alias and has been removed from
  # imbalanced-learn; `fit_resample` is the supported method name.
  X_sm, y_sm = smote.fit_resample(X, y)
  return X_sm, y_sm

In [None]:
# Separate the target column from the features, then pass both through
# balancing_dataset to obtain the oversampled X / y used downstream.
# NOTE(review): resampling happens before any CV / train-test split, so
# synthetic rows can land in evaluation folds — confirm this is intended.
features = df.drop(columns='Bankrupt?')
target = df['Bankrupt?']
X, y = balancing_dataset(features, target)

In [None]:
# Class counts of the resampled target returned by balancing_dataset.
y.value_counts()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score, plot_roc_curve, accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from lightgbm import LGBMClassifier

# Candidate classifiers to compare, keyed by a short display name.
# Every estimator is instantiated with its library-default arguments.
_candidates = [
  ("svc", SVC),
  ("gradBoost", GradientBoostingClassifier),
  ("AdaBoost", AdaBoostClassifier),
  ("RandomForest", RandomForestClassifier),
  ("XGB", XGBClassifier),
  ("XGBRF", XGBRFClassifier),
  ("LGBM", LGBMClassifier),
  ("logReg", LogisticRegression),
  ("NB_gauss", GaussianNB),
  ("KNN", KNeighborsClassifier),
]
models = {label: estimator_cls() for label, estimator_cls in _candidates}

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate

def cross_val_scoring(models, X, y, cv, scoring):
  """Cross-validate each model inside an impute -> scale -> model pipeline.

  Args:
    models: mapping of display name -> unfitted estimator.
    X: feature matrix.
    y: target labels aligned with ``X``.
    cv: cross-validation splitter (anything ``cross_validate`` accepts as
      its ``cv`` argument).
    scoring: scoring specification forwarded to ``cross_validate``.

  Returns:
    Dict mapping every model name to the result dict that
    ``cross_validate`` returned for it.
  """
  # Seed NumPy's global RNG so anything drawing from it is repeatable.
  np.random.seed(42)
  model_scoring = {}
  for name, model in models.items():
    pipe = Pipeline(steps=[
                      ('imputer', SimpleImputer()),
                      ('scaler', StandardScaler()),
                      ('model', model),
    ])
    # Forward `cv`: it used to be accepted but never used, so the caller's
    # splitter was silently replaced by cross_validate's default.
    model_scoring[name] = cross_validate(pipe, X, y, scoring=scoring, cv=cv)
  return model_scoring

In [None]:
from sklearn.model_selection import KFold

# Shuffle before splitting (with a fixed seed for repeatability). An
# unshuffled KFold cuts contiguous row ranges, which is unsafe on
# oversampled data if the rows are grouped by class.
# NOTE(review): SMOTE appears to append the synthetic minority rows after
# the original rows, which would leave some folds with a single class — confirm.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Metrics for cross_validate; results are read back under 'test_<key>'.
scoring = {
    'accuracy': 'accuracy',
}

In [None]:
# Cross-validate every candidate model on X / y and keep the per-model
# result dicts for the summary printed afterwards.
cv_model_scores = cross_val_scoring(models=models, X=X, y=y, cv=kf, scoring=scoring)

In [None]:
# Print, for each model, the mean and standard deviation of its per-fold
# test accuracy.
for model_name, fold_results in cv_model_scores.items():
  test_accuracy = fold_results['test_accuracy']
  header = "=== " + model_name + " (test)==="
  print(header)
  print("mean accuracy: {} (+/- {})".format(test_accuracy.mean(), test_accuracy.std()))

In [None]:
from sklearn.model_selection import train_test_split
# Hold out the test set from the ORIGINAL data and oversample only the
# training part. Splitting the already-resampled X / y would put synthetic
# SMOTE rows into the test set and inflate every score computed on it.
# `stratify` keeps the rare positive class present in both parts.
X_raw, y_raw = df.drop(columns='Bankrupt?'), df['Bankrupt?']
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=10, stratify=y_raw)
X_train, y_train = balancing_dataset(X_train_raw, y_train_raw)

In [None]:
# Fit an XGBoost classifier with default arguments on the training split.
# NOTE(review): a GridSearchCV import used to sit here commented out, so
# hyper-parameter tuning was presumably planned; none is performed.
model = XGBClassifier()
model.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out split (shown as the cell output).
model.score(X_test, y_test)

In [None]:
from sklearn.metrics import f1_score
# Predict the held-out split and report the F1 score of those predictions.
y_pred = model.predict(X_test)
f1_score(y_test, y_pred)

In [None]:
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so importing it fails on current releases. Build the display from an
# explicit confusion matrix instead, which works on older and newer versions.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

cm = confusion_matrix(y_test, model.predict(X_test), labels=model.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_).plot(cmap='Blues');