In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

import xgboost as xgb

import sys
from importlib import import_module, reload


def _importFresh(moduleName):
    """Import a project module, reloading it if it was already loaded.

    Re-running this cell picks up edits made to the helper modules on disk.
    The module object is always returned, so the caller can bind the name
    even when the module is present in ``sys.modules`` but not in the
    notebook namespace (e.g. after ``%reset`` or a transitive import).

    Parameters
    ----------
    moduleName : str
        Importable module name.

    Returns
    -------
    module
        The freshly imported or reloaded module object.
    """
    wasLoaded = moduleName in sys.modules
    module = import_module(moduleName)
    # Only reload when the module pre-existed; a first import is already fresh.
    return reload(module) if wasLoaded else module


MultiCollinearityEliminator = _importFresh("multiCollinearityEliminator").MultiCollinearityEliminator
visualizations = _importFresh("visualizations")
modelling = _importFresh("modelling")

# Helper instances shared by the rest of the notebook.
vis = visualizations.Visualizations()
mod = modelling.Modelling(vis=vis)

# Single seed reused by every stochastic step below.
RANDOM_SEED = 1243

In [None]:
def getRemovedColumns(oldDF, newDF):
    """Return the column labels present in ``oldDF`` but absent from ``newDF``.

    The result is a list that follows the column order of ``oldDF``.
    """
    removed = []
    for label in oldDF.columns:
        if label in newDF.columns:
            continue
        removed.append(label)
    return removed

In [None]:
# Read the dataset from disk, report its (rows, columns) shape and preview it.
rawData = pd.read_csv("data/data.csv")
print("Size of the raw data: %s" % (rawData.shape,))
rawData.head()

In [None]:
# Boolean frame marking missing cells; used for the total and the per-column counts.
nullMask = rawData.isnull()
print("There are %d null values in the raw data." % nullMask.values.sum())
# Displayed as the cell output: number of nulls in each column.
nullMask.sum(axis=0)

In [None]:
# Visualise how the rows are distributed over the values of the "Bankrupt?" column
# (drawing is delegated to the project's Visualizations helper).
vis.plotCounts(
    xData=rawData["Bankrupt?"],
    figSize=(6, 5),
    plotTitle="Class Distributions \n (0: Stable || 1: Bankrupt)",
)

In [None]:
# Histograms of the raw data, 50 bins each, on one large figure.
vis.plotHistograms(
    data=rawData,
    bins=50,
    figSize=(35, 30),
)

In [None]:
# Boxplots of the untouched data, as a baseline before any cleaning.
vis.plotBoxPlots(
    data=rawData,
    figSize=(35, 30),
    plotTitle="Raw Data Boxplots",
)

## Steps for data cleaning

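1. Split the data into training/test sets
2. Remove highly correlated columns
3. Remove row-wise outliers
4. Normalize the data column-wise (center/scale)
5. Remove zero/low variance columns
6. Replicate the steps on the test set by reusing what was fitted on the training set: drop the same columns and apply the same scaler (test rows are not filtered for outliers)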


In [None]:
# The first column of the file is the label; every remaining column is a feature.
TARGET_COL = rawData.columns[0]
y_raw = rawData.iloc[:, 0]
X_raw = rawData.iloc[:, 1:]

In [None]:
# Stratified 80/20 split; each piece gets a fresh 0..n-1 index so that later
# positional and label-based row selection coincide.
X_raw_train, X_raw_test, y_raw_train, y_raw_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        X_raw, y_raw, test_size=0.2, stratify=y_raw, random_state=RANDOM_SEED
    )
)

In [None]:
# Correlation structure of the training features only (the test split stays unseen).
vis.plotCorrelationMatrix(
    data=X_raw_train,
    figSize=(25, 20),
    plotTitle="Correlation Heatmap",
)

In [None]:
# Threshold handed to the eliminator as its correlation cut-off.
CORRELATION_THRESH = 0.95

# The eliminator receives the target next to the features, so the target column
# comes back in its output and has to be removed again afterwards.
mce = MultiCollinearityEliminator(pd.concat([y_raw_train, X_raw_train], axis=1), TARGET_COL, CORRELATION_THRESH)

# Non-mutating drop: the frame returned by the eliminator is left untouched,
# so it cannot alias or alter state held inside `mce`.
X_no_corrs_train = mce.autoEliminateMulticollinearity().drop(columns=TARGET_COL)

# Remember which columns were eliminated so the test set can be treated identically.
highCorrColumns = getRemovedColumns(oldDF=X_raw_train, newDF=X_no_corrs_train)
print("%d columns were dropped due to high correlation." % len(highCorrColumns))
print(highCorrColumns)

In [None]:
# Flag row-wise outliers in the training features with Local Outlier Factor.
outlierModel = LocalOutlierFactor(n_neighbors=100, metric="manhattan", contamination=0.05)
outlierPreds = outlierModel.fit_predict(X_no_corrs_train)

# Rows predicted as -1 are treated as outliers. Filter with a positional boolean
# mask rather than passing positions to the label-based `drop`, which is only
# correct when the index happens to be a default 0..n-1 range.
isInlier = outlierPreds != -1
outlierTrainIndex = np.flatnonzero(~isInlier)

# Re-number the survivors so features and labels share the same gap-free index.
X_no_outliers_train = X_no_corrs_train.loc[isInlier].reset_index(drop=True)
y_train = y_raw_train.loc[isInlier].reset_index(drop=True)

print("%d rows were removed due to outlier values." % len(outlierTrainIndex))

In [None]:
# Boxplots of the training features after the outlier rows were removed.
vis.plotBoxPlots(
    data=X_no_outliers_train,
    figSize=(20, 20),
    plotTitle="Preprocessed Boxplots - No Outliers",
)

In [None]:
# Fit the scaler on the training rows only; the fitted object is reused for the test set.
robustScalerModel = preprocessing.RobustScaler()

# The scaler returns a bare array, so re-attach both the column labels and the
# row index. Keeping the index ensures the scaled features stay label-aligned
# with `y_train` instead of silently switching to a new 0..n-1 range.
X_scaled_train = pd.DataFrame(
    robustScalerModel.fit_transform(X_no_outliers_train),
    columns=X_no_outliers_train.columns,
    index=X_no_outliers_train.index,
)

In [None]:
# Boxplots after scaling. The title differs from the pre-scaling plot so the
# two figures can be told apart.
vis.plotBoxPlots(
    data=X_scaled_train,
    figSize=(20, 20),
    plotTitle="Preprocessed Boxplots - No Outliers, Scaled",
)

In [None]:
def lowVarianceFeatureRemover(data, thresh):
    """Return ``data`` restricted to the columns kept by a ``VarianceThreshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame on which the selector is fitted.
    thresh : float
        Value passed as ``threshold`` to ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``data`` that the fitted selector supports.
    """
    selector = VarianceThreshold(threshold=thresh).fit(data)
    keptPositions = selector.get_support(indices=True)
    keptLabels = data.columns[keptPositions]
    return data[keptLabels]

In [None]:
# Variance cut-off passed to the selector; applied to the already-scaled features.
VARIANCE_THRESH = 1e-3

X_train = lowVarianceFeatureRemover(
    data=X_scaled_train,
    thresh=VARIANCE_THRESH,
)

# Keep the names of the discarded columns so the test set can drop the same ones.
lowVarCols = getRemovedColumns(oldDF=X_scaled_train, newDF=X_train)
print(
    "%d columns are removed due to low variance <= %s"
    % (len(lowVarCols), VARIANCE_THRESH)
)
print(lowVarCols)

In [None]:
# Final training matrix: report its (rows, columns) shape and preview the first rows.
print("Shape of the training data: %s" % (X_train.shape,))
X_train.head()

In [None]:
# Replay the training-time preprocessing on the held-out split without refitting.
# Columns are selected by name in the order the training frames use, rather than
# by dropping names from the raw order, so the scaler and the models always see
# the features in the order they were fitted on.
scaledFeatureOrder = list(X_no_outliers_train.columns)
X_no_corrs_test = X_raw_test[scaledFeatureOrder]

X_scaled_test = pd.DataFrame(
    robustScalerModel.transform(X_no_corrs_test),
    columns=scaledFeatureOrder,
    index=X_no_corrs_test.index,
)

# Same effect as dropping the low-variance columns, with the order pinned to X_train.
X_test = X_scaled_test[list(X_train.columns)]
y_test = y_raw_test

assert list(X_test.columns) == list(X_train.columns), "train/test feature mismatch"

print("Shape of the test data: %s" % str(X_test.shape))

In [None]:
# Oversample the training split with SMOTE (seeded for repeatability);
# the test split is never resampled.
smoteModel = SMOTE(random_state=RANDOM_SEED)
X_train_sm, y_train_sm = smoteModel.fit_resample(X=X_train, y=y_train)

In [None]:
# Random forest trained on the resampled training data, then scored on the
# untouched test split.
rfModel = RandomForestClassifier(
    n_estimators=200,
    random_state=RANDOM_SEED,
).fit(X_train_sm, y_train_sm)
rfPreds = rfModel.predict(X_test)

In [None]:
# Report the random forest's test-set performance via the project's Modelling helper.
mod.getModelPerformance(
    trueVals=y_test,
    preds=rfPreds,
    figSize=(5, 5),
    plotTitle="Random Forests Performance",
    targetNames=["Stable", "Bankrupt"],
)

In [None]:
# Author's list of XGBoost settings (presumably candidates for later tuning):
#   thread, eta, min_child_weight, max_depth, max_leaf_nodes, gamma,
#   subsample, colsample_bytree

# The number of boosting rounds in the scikit-learn wrapper is `n_estimators`.
# The previous `nrounds` keyword is not a wrapper parameter, so the requested
# 1000 rounds never took effect and the default number of trees was used.
xgbModel = xgb.XGBClassifier(
    n_estimators=1000,
    max_depth=3,
    eta=0.1,
    objective="binary:logistic",
    eval_metric="logloss",
    verbosity=0,
    use_label_encoder=False,
    random_state=RANDOM_SEED,
)
xgbModel.fit(X_train_sm, y_train_sm)
xgbPreds = xgbModel.predict(X_test)


In [None]:
# Report the XGBoost model's test-set performance via the project's Modelling helper.
mod.getModelPerformance(
    trueVals=y_test,
    preds=xgbPreds,
    figSize=(5, 5),
    plotTitle="XGBoost Performance",
    targetNames=["Stable", "Bankrupt"],
)

In [None]:
# Cross-validate on the un-resampled training data and let the pipeline apply
# SMOTE inside each training fold only. Scoring on data that was oversampled
# beforehand puts synthetic neighbours of training points into the validation
# folds and inflates the score.
xgbCvPipeline = Pipeline(steps=[
    ("smote", SMOTE(random_state=RANDOM_SEED)),
    ("xgb", xgbModel),
])
# NOTE(review): scoring is left at the estimator default (accuracy for classifiers),
# which is dominated by the majority class on un-resampled validation folds;
# consider scoring="roc_auc" or "f1".
scores = cross_val_score(xgbCvPipeline, X_train, y_train, cv=5)
print("Mean cross-validation score: %.2f" % scores.mean())

In [None]:
# Seed the shuffled splitter so the reported score is reproducible, like every
# other stochastic step in this notebook.
kfold = KFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)

# Resample inside each training fold only; validating on pre-oversampled data
# leaks synthetic neighbours of training points into the validation folds.
xgbKfPipeline = Pipeline(steps=[
    ("smote", SMOTE(random_state=RANDOM_SEED)),
    ("xgb", xgbModel),
])
# NOTE(review): KFold is not stratified, so the class ratio can vary between
# folds of the un-resampled data; StratifiedKFold may be preferable here.
kfScores = cross_val_score(xgbKfPipeline, X_train, y_train, cv=kfold)
print("K-fold CV average score: %.2f" % kfScores.mean())