<font size="+3"><strong>5.5. Bankruptcy in Taiwan 🇹🇼</strong></font>

In [None]:
# Import libraries here
import gzip
import json
import pickle

import ipywidgets as widgets
import matplotlib.pyplot as plt
import pandas as pd
import wqet_grader
from imblearn.over_sampling import RandomOverSampler
from IPython.display import VimeoVideo
from ipywidgets import interact
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from teaching_tools.widgets import ConfusionMatrixWidget

# Call the grader's init with this assessment's name
# (presumably required before any grading call — confirm against wqet_grader docs)
wqet_grader.init("Project 5 Assessment")

# Prepare Data

## Import

In [None]:
# Read the gzip-compressed JSON file into a Python object and report its type
# NOTE(review): the later `wrangle(...)` call reads
# "data/taiwan-bankruptcy-data.json.gz" — confirm which location is correct.
with gzip.open("taiwan-bankruptcy-data.json.gz", "r") as f:
    taiwan_data = json.load(f)
print(type(taiwan_data))

In [None]:
# Top-level keys of the loaded JSON document
taiwan_data_keys = taiwan_data.keys()
print(taiwan_data_keys)

In [None]:
# Number of entries stored under the "observations" key
n_companies = len(taiwan_data["observations"])
print(n_companies)

In [None]:
# Number of field definitions listed in the schema
n_features = len(taiwan_data["schema"]["fields"])
print(n_features)

In [None]:
# Create wrangle function
def wrangle(filename):
    """Load a gzip-compressed JSON data file into a DataFrame.

    Parameters
    ----------
    filename : str
        Path to a ``.json.gz`` file whose top-level object has an
        ``"observations"`` key; each observation must carry an ``"id"``.

    Returns
    -------
    pd.DataFrame
        The observations, indexed by ``"id"``.
    """
    # Open compressed file, load into dictionary
    with gzip.open(filename, "r") as f:
        data = json.load(f)

    # Build the frame straight from the observations (no throwaway empty
    # DataFrame instance), then use the "id" column as the index
    df = pd.DataFrame(data["observations"]).set_index("id")

    return df

In [None]:
# Build the working DataFrame and preview it
df = wrangle("data/taiwan-bankruptcy-data.json.gz")
print(f"df shape: {df.shape}")
df.head()

In [None]:
# Shape of the per-column "contains any NaN" flags (one flag per column of df)
df.isna().any().shape

In [None]:
# One boolean per column: True when that column holds at least one NaN
nans_by_col = df.isna().any()
print(f"nans_by_col shape: {nans_by_col.shape}")
nans_by_col.head()

In [None]:
# Plot class balance: relative frequency of each value of the target column
ax = df["bankrupt"].value_counts(normalize=True).plot(kind="bar")
ax.set(xlabel="Bankrupt", ylabel="Frequency", title="Class Balance")
# Don't delete the code below 👇
plt.savefig("images/5-5-7.png", dpi=150)

In [None]:
# Separate the label column from the feature columns
target = "bankrupt"
y = df[target]
X = df.drop(columns=[target])
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# Resample the training set only, using a seeded random over-sampler
over_sampler = RandomOverSampler(random_state=42)
X_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train)
print(f"X_train_over shape: {X_train_over.shape}")
X_train_over.head()

# Build Model

## Iterate

In [None]:
# Seed the classifier with the same random_state=42 used for the train/test
# split and the over-sampler, so repeated runs are reproducible
clf = GradientBoostingClassifier(random_state=42)

In [None]:
# 5-fold cross-validation of the untuned classifier on the over-sampled training data
cv_scores = cross_val_score(
    estimator=clf,
    X=X_train_over,
    y=y_train_over,
    cv=5,
    n_jobs=-1,
)
print(cv_scores)

In [None]:
# Hyperparameter grid for the grid search.
# `clf` is a bare GradientBoostingClassifier (not a pipeline), so the keys are
# the estimator's own parameter names; a "gradientboostingclassifier__" step
# prefix would be rejected as an invalid parameter when the search is fitted.
params = {
    "n_estimators": range(20, 31, 5),  # 20, 25, 30
    "max_depth": range(2, 5),  # 2, 3, 4
}

In [None]:
# Grid search over `params` with 5-fold CV, using all available cores
model = GridSearchCV(estimator=clf, param_grid=params, cv=5, n_jobs=-1, verbose=1)

In [None]:
# Run the grid search on the over-sampled training data
model.fit(X=X_train_over, y=y_train_over)

In [None]:
# Tabulate the cross-validation results and preview the first five rows
cv_results = pd.DataFrame(data=model.cv_results_)
cv_results.head()

In [None]:
# Hyperparameter combination selected by the grid search
best_params = model.best_params_
print(best_params)

## Evaluate

In [None]:
# Score the fitted search on the original (not over-sampled) training set
# and on the held-out test set
acc_train = model.score(X_train, y_train)
acc_test = model.score(X_test, y_test)

print(f"Model Training Accuracy: {round(acc_train, 4)}")
print(f"Model Test Accuracy: {round(acc_test, 4)}")

In [None]:
# Draw the confusion matrix for the held-out test set
ConfusionMatrixDisplay.from_estimator(estimator=model, X=X_test, y=y_test)
# Don't delete the code below 👇
plt.savefig("images/5-5-16.png", dpi=150)

In [None]:
# Text report comparing test-set labels with the model's predictions
class_report = classification_report(
    y_true=y_test,
    y_pred=model.predict(X_test),
)
print(class_report)

# Communicate

In [None]:
# Pair each importance score of the best estimator with its column name,
# sorted ascending so the largest values sit at the tail
features = X_train_over.columns
importances = model.best_estimator_.feature_importances_
feat_imp = pd.Series(data=importances, index=features).sort_values()

# Horizontal bar chart of the ten largest importances
ax = feat_imp.tail(10).plot(kind="barh")
ax.set(xlabel="Gini Importance", ylabel="Feature", title="Feature Importance")
# Don't delete the code below 👇
plt.savefig("images/5-5-17.png", dpi=150)

In [None]:
# Persist the fitted grid-search object to disk as a pickle
with open("model-5-5.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
def wrangle(filename):
    """Load a gzip-compressed JSON data file into a DataFrame.

    NOTE(review): this repeats the `wrangle` defined earlier in the notebook
    and shadows it; consider keeping a single definition.

    Parameters
    ----------
    filename : str
        Path to a ``.json.gz`` file whose top-level object has an
        ``"observations"`` key; each observation must carry an ``"id"``.

    Returns
    -------
    pd.DataFrame
        The observations, indexed by ``"id"``.
    """
    # Open compressed file, load into dictionary
    with gzip.open(filename, "r") as f:
        data = json.load(f)

    # Build the frame straight from the observations (no throwaway empty
    # DataFrame instance), then use the "id" column as the index
    df = pd.DataFrame(data["observations"]).set_index("id")

    return df

In [None]:
# Add make_predictions function from lesson 5.3
def make_predictions(data_filepath, model_filepath):
    """Generate bankruptcy predictions for the companies in a data file.

    Parameters
    ----------
    data_filepath : str
        Path handed to ``wrangle`` to build the feature DataFrame.
    model_filepath : str
        Path of the pickled model to load.

    Returns
    -------
    pd.Series
        Predictions named ``"bankrupt"``, sharing the index of the wrangled
        features.
    """
    # Build the feature matrix from the compressed JSON file
    feature_frame = wrangle(data_filepath)

    # Restore the trained model from disk.
    # NOTE(review): pickle.load can execute arbitrary code; only load model
    # files from a trusted source.
    with open(model_filepath, "rb") as model_file:
        estimator = pickle.load(model_file)

    # Predict, then label the result and align it with the feature index
    return pd.Series(
        estimator.predict(feature_frame),
        index=feature_frame.index,
        name="bankrupt",
    )

In [None]:
# Import your module
#from my_predictor_assignment import make_predictions

# Generate predictions from the saved model and the test-features file
# NOTE(review): the training data is read from a "data/" directory earlier in
# the notebook — confirm this file's location.
y_test_pred = make_predictions(
    model_filepath="model-5-5.pkl",
    data_filepath="taiwan-bankruptcy-data-test-features.json.gz",
)

print(f"predictions shape: {y_test_pred.shape}")
y_test_pred.head()