### Import Basic Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt   #visualization tools
%matplotlib inline

import warnings
# Silence every warning for the rest of the session to keep cell output tidy.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

from IPython.display import display

### Import Data

In [None]:
# Location of the credit-card default spreadsheet, relative to the notebook.
path = "default of credit card clients.xls"

# Load the sheet into a DataFrame and preview its first rows.
data = pd.read_excel(io=path)
data.head(n=5)

### Preprocessing

In [None]:
# The real column names sit in row 0 of the frame (the sheet's first row was
# already consumed as the header by read_excel), so promote that row.
data.columns = data.iloc[0]
data.columns.name = None  # the promoted row would otherwise name the column axis "0"
data.drop(columns=['ID'], inplace=True)  # row identifier, not a feature
data.drop(0, inplace=True)  # remove the promoted header row from the data

# Mixing the header strings with the values leaves the columns as object dtype.
# Convert them so describe() reports statistics and the estimators get numeric
# features and labels. Assumes every remaining column is numeric.
data = data.apply(pd.to_numeric)
data.head(3)

In [None]:
# Show every column when a wide frame is displayed.
pd.options.display.max_columns = None

# Summary of the whole dataset.
summary = data.describe()
display(summary)

In [None]:
# Frequency of each EDUCATION code before any recoding.
data["EDUCATION"].value_counts()

In [None]:
# Fold the EDUCATION codes 0, 5 and 6 into category 4.
education = data["EDUCATION"]
others = (education == 0) | (education == 5) | (education == 6)
data.loc[others, "EDUCATION"] = 4

# Frequencies after recoding.
data["EDUCATION"].value_counts()

In [None]:
# Frequency of each MARRIAGE code before any recoding.
data["MARRIAGE"].value_counts()

In [None]:
# Fold the MARRIAGE code 0 into category 3.
marriage_is_zero = data["MARRIAGE"] == 0
data.loc[marriage_is_zero, "MARRIAGE"] = 3

# Frequencies after recoding.
data["MARRIAGE"].value_counts()

### Train/Test Split

In [None]:
# Separate the label column from the feature columns.
target_column = 'default payment next month'
y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, keeping the class ratio of y in both
# parts. A fixed random_state makes the split, and therefore every score
# reported below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
# Print the shape of each piece of the split, one per line.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
# Preview the first rows of the training features.
X_train.head(n=5)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library-default settings, fitted on the training split.
log_reg = LogisticRegression()
log_reg.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the logistic regression on the held-out test split.
y_pred = log_reg.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy is:", accuracy)

### Using Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the logistic regression on the training split.
scores = cross_val_score(
    estimator=log_reg,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=5,
)
print(scores)         # per-fold accuracy
print(scores.mean())  # average over the folds

In [None]:
# List every name accepted by the `scoring=` parameter.
# `SCORERS` is no longer exported by current scikit-learn releases, so prefer
# `get_scorer_names()` and fall back to the old dictionary only where the
# function is unavailable.
try:
    from sklearn.metrics import get_scorer_names
    scorer_names = get_scorer_names()
except ImportError:
    from sklearn.metrics import SCORERS
    scorer_names = SCORERS.keys()

sorted(scorer_names)

### Using Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Every penalty/solver pairing is evaluated with 4-fold cross-validation.
param_grid = [
    {
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
    },
]

grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=4,
    scoring='accuracy',
    return_train_score=True,
)

grid_search.fit(X=X_train, y=y_train)

In [None]:
# Parameter combination selected by the grid search above.
grid_search.best_params_

In [None]:
# Print the mean cross-validated test score next to each parameter combination.
final = grid_search.cv_results_
for params, mean_score in zip(final["params"], final["mean_test_score"]):
    print(mean_score, params)

### Decision Tree

![link](tree_limited.png)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth is capped at 3, which also keeps the exported diagram small.
tree_clf = DecisionTreeClassifier(max_depth=3)
tree_clf.fit(X=X_train, y=y_train)

In [None]:
# Accuracy of the depth-limited tree on the held-out test split.
y_pred = tree_clf.predict(X=X_test)
accuracy = tree_clf.score(X=X_test, y=y_test)
print("Accuracy is:", accuracy)

In [None]:
# 5-fold cross-validated accuracy of the decision tree on the training split.
scores_dt = cross_val_score(
    estimator=tree_clf,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=5,
)
print(scores_dt)         # per-fold accuracy
print(scores_dt.mean())  # average over the folds

### Visualize

In [None]:
from sklearn.tree import export_graphviz

# Write the fitted tree to a Graphviz .dot file; the first 23 columns of
# `data` are used as the feature labels.
tree_feature_names = data.columns[:23]

export_graphviz(
    decision_tree=tree_clf,
    out_file="tree_limited.dot",
    feature_names=tree_feature_names,
    class_names=['0', '1'],
    rounded=True,
    filled=True,
)

In [None]:
# Shell out to Graphviz's `dot` to render tree_limited.dot as a 600-dpi PNG.
!dot -Tpng tree_limited.dot -o tree_limited.png -Gdpi=600

In [None]:
from IPython.display import Image

# Show the rendered tree inline.
tree_png = 'tree_limited.png'
Image(filename=tree_png)

### Voting Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# Three different model families, all with default settings, to be combined by voting.
knn_clf = KNeighborsClassifier()
dt_clf = DecisionTreeClassifier()
svc_clf = SVC()

In [None]:
# VC manual

In [None]:
# Hard voting: the ensemble predicts the class chosen by most of its members.
voting_members = [
    ('svc', svc_clf),
    ('dt', dt_clf),
    ('knn', knn_clf),
]
voting_clf = VotingClassifier(estimators=voting_members, voting='hard')

In [None]:
%%time
# Fit each individual model and the voting ensemble, then print its test accuracy.
for clf in (svc_clf, dt_clf, knn_clf, voting_clf):
    clf.fit(X=X_train, y=y_train)
    y_pred = clf.predict(X=X_test)
    acc = clf.score(X=X_test, y=y_test)
    print(type(clf).__name__, acc)

### Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

# 500 decision trees, each trained on 100 rows drawn with replacement,
# fitted using all available cores.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
)

bag_clf.fit(X=X_train, y=y_train)
y_pred = bag_clf.predict(X=X_test)

In [None]:
# Test accuracy of the predictions currently stored in y_pred.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy is:", accuracy)
## Random Patches vs Random Subspaces

### Random Forest

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

# 500 trees, each limited to 16 leaf nodes, fitted using all available cores.
rf_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
)
rf_clf.fit(X=X_train, y=y_train)

# Predictions of the forest on the held-out test split.
y_pred_rf = rf_clf.predict(X=X_test)

In [None]:
# Score the random forest's own predictions. `y_pred` holds the output of a
# previously evaluated model, so using it here would report the wrong accuracy.
accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy is:", accuracy)

In [None]:
# Pair each importance with the feature the forest was trained on.
# `data.columns` also contains the label column, so zipping against it only
# lines up while the label happens to be the last column.
for name, score in zip(X_train.columns, rf_clf.feature_importances_):
    print(name, score)

### Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Boost 200 decision stumps (depth-1 trees) with a learning rate of 0.5.
# `algorithm` is deliberately left at the library default: "SAMME.R" was the
# default wherever it existed, and recent scikit-learn releases reject it
# when it is passed explicitly.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5,
)

ada_clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the AdaBoost ensemble on the held-out test split.
y_pred = ada_clf.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy is:", accuracy)

### XGBoost

In [None]:
# Let's make one

### Stacking

In [None]:
# Split the training data in half: one half for the base learners, the other
# (x_val / y_val) for the meta learner.
# NOTE: y_train is rebound to the first half's labels here, so it no longer
# lines up with X_train once this cell has run.
x_train, x_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.5,
    shuffle=True,
)

In [None]:
# Base learners for the stack: a decision tree and a k-nearest-neighbours model.
base_clf1, base_clf2 = DecisionTreeClassifier(), KNeighborsClassifier()

In [None]:
base_clf1.fit(x_train, y_train)
base_clf1.fit(x_train, y_train)

In [None]:
pred_1 = base_clf1.predict(x_val)
pred_2 = base_clf1.predict(x_val)

In [None]:
test_pred_1 = base_clf1.predict(X_test)
test_pred_2 = base_clf1.predict(X_test)

In [None]:
# One column per base learner, one row per sample.
stacked_pred = np.column_stack([pred_1, pred_2])
stacked_test_pred = np.column_stack([test_pred_1, test_pred_2])

In [None]:
from sklearn.linear_model import LogisticRegression

# Meta learner that combines the base learners' predictions.
meta_clf = LogisticRegression()

In [None]:
# Train the meta learner on the stacked validation predictions, then predict
# from the stacked test predictions.
meta_clf.fit(X=stacked_pred, y=y_val)
final_pred = meta_clf.predict(X=stacked_test_pred)

In [None]:
# Accuracy of the stacked model on the held-out test split.
accuracy = accuracy_score(y_true=y_test, y_pred=final_pred)
print("Accuracy is:", accuracy)

### AutoML