In [1]:
# Import necessary libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree

In [2]:
# Load the training and test datasets from the 'training_clean' and
# 'test_clean' CSV files (paths are relative to the working directory).
df_training = pd.read_csv('training_clean')
df_test = pd.read_csv('test_clean')

In [3]:
# Columns removed from the training frame only; df_test keeps them if present.
ls_drop = ['addr_state_IA', 'home_ownership_OTHER', 'purpose_educational']
# Drop 'Unnamed: 0' (presumably a saved index column -- TODO confirm) together
# with ls_drop. The `columns=` keyword is used because passing the axis
# positionally (`drop(labels, 1)`) was deprecated in pandas 1.3 and raises
# TypeError from pandas 2.0.
df_training = df_training.drop(columns=['Unnamed: 0'] + ls_drop)

In [4]:
# Drop 'Unnamed: 0' from the test frame. `columns=` replaces the positional
# axis argument, which pandas 2.0 no longer accepts.
df_test = df_test.drop(columns='Unnamed: 0')

In [None]:
## Decision Tree code from lab

In [None]:
# Working names for the decision-tree section. These are plain references to
# df_training / df_test, not copies.
dataset = df_training
dataset_test = df_test

In [None]:
# Remove the target column from the feature frames. drop() returns new frames,
# so df_training / df_test still carry 'loan_status_current'.
# `columns=` replaces the positional axis argument removed in pandas 2.0.
dataset = dataset.drop(columns='loan_status_current')
dataset_test = dataset_test.drop(columns='loan_status_current')

In [None]:
# The decision tree is trained on four selected predictors only.
dt_feature_cols = ['purpose_house', 'purpose_small_business', 'emp_length', 'annual_inc']

# Targets come from the original frames, which still hold the label column.
y_train = df_training['loan_status_current']
y_test = df_test['loan_status_current']

# Feature matrices share the same column selection for train and test.
X_train = dataset[dt_feature_cols]
X_test = dataset_test[dt_feature_cols]

In [None]:
# X / y are consumed later by the confusion-matrix and tree-plot cells, which
# pass them to models fitted on X_train. Bind them to the training features and
# target so the column set matches what those models were fitted on (the full
# df_training frame also contains the target column and unused features).
X = X_train
y = y_train

In [None]:
# Train a decision-tree classifier on the four selected features.
# class_weight='balanced' reweights classes inversely to their frequency;
# random_state is fixed so the fitted tree is reproducible.
classifier = DecisionTreeClassifier(class_weight = 'balanced', random_state=10)  
classifier.fit(X_train, y_train) 

In [None]:
# Predict hard class labels for the test features.
y_pred = classifier.predict(X_test)

In [None]:
# roc_curve and auc are not imported by the notebook's import cell, so this
# cell raised NameError; import them here.
from sklearn.metrics import auc, roc_curve

# Calculate accuracy (as a percentage)
acc = accuracy_score(y_test, y_pred) * 100
print('Accuracy is :{0}'.format(acc))

# Check the AUC for predictions
# NOTE(review): y_pred holds hard class labels, so the ROC curve has a single
# operating point. Consider classifier.predict_proba(X_test)[:, 1] if a
# score-based AUC is wanted -- confirm the positive class first.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print('\nAUC is :{0}'.format(round(roc_auc, 2)))

# Create and print a confusion matrix (rows: true labels, columns: predictions)
print('\nConfusion Matrix')
print('----------------')
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
# Alternative confusion matrix
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# confusion_matrix + ConfusionMatrixDisplay exist in every release that shipped
# plot_confusion_matrix, so this form runs on both old and new versions.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Score the model on the held-out test split, whose columns are exactly the
# four features the classifier was fitted on.
cm = confusion_matrix(y_test, classifier.predict(X_test), labels=classifier.classes_)
ConfusionMatrixDisplay(confusion_matrix=cm,
                       display_labels=classifier.classes_).plot(values_format='.3g')
plt.show()

In [None]:
# Instantiate and fit a second DecisionTreeClassifier on the same training
# data, using entropy as the split criterion and no class weighting.
classifier_2 = DecisionTreeClassifier(random_state=10, criterion='entropy')  
classifier_2.fit(X_train, y_train)

In [None]:
# Plot and show decision tree
plt.figure(figsize=(16,16), dpi=500)
# Label nodes with the columns classifier_2 was actually fitted on (X_train)
# and with the classes it learned. Both are passed as plain lists, which every
# scikit-learn version accepts for these parameters.
tree.plot_tree(classifier_2,
               feature_names=list(X_train.columns),
               class_names=[str(label) for label in classifier_2.classes_],
               filled=True, rounded=True)
plt.show()

In [None]:
## XGBoost code from lab


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
#from sklearn.grid_search import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Working name for the XGBoost section; refers to the same object as df_training.
df = df_training

In [None]:
# Separate the target from the predictors.
labels = df['loan_status_current']
labels_removed_df = df.drop(columns=['loan_status_current'])

# Standardise every predictor column; the result is a NumPy array.
scaler = StandardScaler()
scaled_df = scaler.fit(labels_removed_df).transform(labels_removed_df)

# Random split of the scaled data with default proportions and no fixed seed.
# NOTE(review): these four names are reassigned from df_training / df_test in
# a later cell, so this split is not what the models below are fitted on.
X_train, X_test, y_train, y_test = train_test_split(scaled_df, labels)

In [None]:
# Working names for the XGBoost train/test frames (references, not copies).
xboost_train = df_training
xboost_test = df_test

In [None]:
# Build XGBoost features/targets from the prepared train and test frames.
# This rebinds X_train, X_test, y_train and y_test for the cells that follow.
# `columns=` replaces the positional axis argument removed in pandas 2.0.
X_train = xboost_train.drop(columns='loan_status_current')
X_test = xboost_test.drop(columns='loan_status_current')
y_train = xboost_train['loan_status_current']
y_test = xboost_test['loan_status_current']

In [None]:
# Baseline XGBoost classifier with library-default hyper-parameters.
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train)



In [None]:
# Accuracy of the baseline model on the data it was fitted on.
training_preds = clf.predict(X_train)
training_accuracy = accuracy_score(y_true=y_train, y_pred=training_preds)

# Accuracy on the test frame (reported as "validation" below).
val_preds = clf.predict(X_test)
val_accuracy = accuracy_score(y_true=y_test, y_pred=val_preds)

In [None]:
# Report both accuracies as percentages with four significant digits.
print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

In [None]:
## Tuning XGBoost with a grid search (GridSearchCV)

In [None]:
# Search space for the grid search: only n_estimators has several candidates;
# every other hyper-parameter is pinned to a single value.
param_grid = dict(
    learning_rate=[0.1],
    max_depth=[6],
    min_child_weight=[10],
    subsample=[0.7],
    n_estimators=[5, 30, 100, 250],
)

In [None]:
# Grid-search the XGBoost classifier over param_grid, scored by accuracy with
# scikit-learn's default cross-validation (cv=None).
grid_clf = GridSearchCV(clf, param_grid, scoring='accuracy', cv=None, n_jobs=1)
# Fit on the same unscaled features that are used for prediction below.
# Fitting on the standardised array `scaled_df` and then predicting on the raw
# X_train / X_test frames would apply split thresholds learned in scaled units
# to unscaled values.
grid_clf.fit(X_train, y_train)

best_parameters = grid_clf.best_params_

print("Grid Search found the following optimal parameters: ")
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

# GridSearchCV.predict delegates to the best estimator refitted on all of the
# data passed to fit().
training_preds = grid_clf.predict(X_train)
val_preds = grid_clf.predict(X_test)
training_accuracy = accuracy_score(y_train, training_preds)
val_accuracy = accuracy_score(y_test, val_preds)

print("")
print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))

In [None]:
## Random Forest Code from lab (dsc-tree-ensembles-random-forests-lab)

In [None]:
import sklearn

In [None]:
import pandas as pd
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

In [None]:
# Reuse the training frame for the random-forest section. Both names refer to
# the same object as df_training (no copy is made).
df = df_training
df_random_forest = df

In [None]:
# Count of rows per target value in the training data.
df.loan_status_current.value_counts()

In [None]:
# Test frame for the random-forest section (reference to df_test, not a copy).
df_random_forest_test_zero = df_test

In [None]:
# Training features (every column except the target) and training target.
# `columns=` replaces the positional axis argument removed in pandas 2.0.
df_random_forest_train = df_random_forest.drop(columns='loan_status_current')
loan_status_current_train = df_random_forest['loan_status_current']

In [None]:
# Test features (every column except the target) and test target.
# `columns=` replaces the positional axis argument removed in pandas 2.0.
df_random_forest_test = df_random_forest_test_zero.drop(columns='loan_status_current')
loan_status_current_test = df_random_forest_test_zero['loan_status_current']

In [None]:
# Single decision tree on all features: Gini criterion, depth capped at 5,
# classes reweighted inversely to their frequency. No random_state is passed.
tree_clf = DecisionTreeClassifier(class_weight = 'balanced', criterion='gini', max_depth=5) 
tree_clf.fit(df_random_forest_train, loan_status_current_train)

In [None]:
# Feature importances of the fitted tree, one value per training column.
tree_clf.feature_importances_

In [None]:
def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : estimator
        Fitted model exposing a ``feature_importances_`` array.
    feature_names : sequence of str, optional
        Tick labels, one per importance value. When omitted, the column names
        of the notebook-level ``df_random_forest_train`` frame are used, which
        is the frame the models in this section are fitted on.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of importances.
    """
    if feature_names is None:
        feature_names = df_random_forest_train.columns.values
    importances = model.feature_importances_
    n_features = len(feature_names)
    if len(importances) != n_features:
        raise ValueError(
            'got {0} feature names for {1} feature importances'.format(
                n_features, len(importances)))
    plt.figure(figsize=(18,16))
    plt.barh(range(n_features), importances, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

plot_feature_importances(tree_clf)

In [None]:
# Predict labels for the test features with the depth-limited tree.
pred = tree_clf.predict(df_random_forest_test)

# Summarise the test-set errors: raw confusion counts first, then the
# per-class precision / recall / F1 table.
print(confusion_matrix(y_true=loan_status_current_test, y_pred=pred))
print(classification_report(y_true=loan_status_current_test, y_pred=pred))

In [None]:
# Test accuracy of tree_clf as a percentage with four significant digits.
print("Testing Accuracy for Decision Tree Classifier: {:.4}%".format(accuracy_score(loan_status_current_test, pred) * 100))

In [None]:
# Bagging ensemble of 20 depth-5 Gini decision trees. No random_state is
# passed here.
bagged_tree =  BaggingClassifier(DecisionTreeClassifier(criterion='gini', max_depth=5), 
                                 n_estimators=20)

In [None]:
# Fit the bagging ensemble on the full training feature set.
bagged_tree.fit(df_random_forest_train, loan_status_current_train)

In [None]:
# Training accuracy score
bagged_tree.score(df_random_forest_train, loan_status_current_train)

In [None]:
# Test accuracy score (mean accuracy of the bagging ensemble on the test frame)
bagged_tree.score(df_random_forest_test, loan_status_current_test)

In [None]:
# Instantiate and fit a RandomForestClassifier: 100 trees, each limited to
# depth 5. No random_state is passed here.
forest = RandomForestClassifier(n_estimators=100, max_depth= 5)
forest.fit(df_random_forest_train, loan_status_current_train)

In [None]:
# Training accuracy score (mean accuracy of the forest on its training data)
forest.score(df_random_forest_train, loan_status_current_train)

In [None]:
# Test accuracy score (mean accuracy of the forest on the test frame)
forest.score(df_random_forest_test, loan_status_current_test)

In [None]:
# Feature importances of the 100-tree forest.
plot_feature_importances(forest)

In [None]:
# Instantiate and fit a deliberately small RandomForestClassifier: 5 trees of
# depth at most 2, with at most 10 features considered per split. Its
# individual trees are inspected in the cells below.
forest_2 = RandomForestClassifier(n_estimators = 5, max_features= 10, max_depth= 2)
forest_2.fit(df_random_forest_train, loan_status_current_train)

In [None]:
# First fitted tree from forest_2 (index 0 of estimators_)
rf_tree_1 = forest_2.estimators_[0]

In [None]:
# Feature importances of the first tree in forest_2
plot_feature_importances(rf_tree_1)

In [None]:
# Second fitted tree from forest_2 (index 1 of estimators_)
rf_tree_2 = forest_2.estimators_[1]

In [None]:
# Feature importances of the second tree in forest_2, for comparison with the first
plot_feature_importances(rf_tree_2)

In [None]:
#sklearn.metrics.classification_report(loan_status_current_test, pred)