# CS 5228


## Import Data

In [7]:
# All Imports
from utils import *
import pandas as pd
import locale
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# classifiers
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision Tree
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier

# Session-wide configuration.
import warnings

# Suppress every warning for the rest of the kernel session.
warnings.filterwarnings(action='ignore')

# Empty locale string: adopt the user's default locale settings.
locale.setlocale(locale.LC_ALL, '')

# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

In [2]:
# Load the training split.
# NOTE(review): generate_labels / get_data are not defined in this notebook;
# presumably they come from the wildcard `utils` import — confirm there for
# the exact meaning of each flag.
drop_columns = []  # empty list is forwarded to get_data as drop_columns
le = generate_labels()

base_dropna = get_data(
    le=le,
    type='train',
    dropna=True,
    get_dummy=True,
    feature_split=False,
    values_only=True,
    drop_columns=drop_columns,
)

In [3]:
# Load the test split, reusing the `le` and `drop_columns` objects built for
# the training split. Unlike the training call, dropna is False here.
base_test = get_data(
    le=le,
    type='test',
    dropna=False,
    get_dummy=True,
    feature_split=False,
    values_only=True,
    drop_columns=drop_columns,
)

In [None]:
# Separate the 'ChargeOff' target column from the remaining feature columns.
base_dropna_y = base_dropna['ChargeOff']
base_dropna_x = base_dropna.drop(columns=['ChargeOff'])

## Scale Only Selected Columns

In [9]:

# Create scaler
# Only the columns listed here are min-max scaled; all other feature columns
# are passed through unchanged.
scale_columns = ['DisbursementGross', 'GrAppv', 'SBA_Appv', 'Term', 'NoEmp']

minmax_transformer = Pipeline(steps=[
        ('minmax', preprocessing.MinMaxScaler())])

scaler = ColumnTransformer(
        remainder='passthrough', # pass through features not listed
        transformers=[
            ('mm', minmax_transformer , scale_columns)
        ])

# ColumnTransformer places the transformed columns first and appends the
# passthrough remainder to the right, so the output column order is NOT the
# input column order. Build the label list in the output order; reusing
# base_dropna_x.columns directly would attach names to the wrong data.
passthrough_columns = [col for col in base_dropna_x.columns if col not in scale_columns]
scaled_column_order = scale_columns + passthrough_columns

# Scale
scaler.fit(base_dropna_x)

base_dropna_x_scaled = scaler.transform(base_dropna_x)
base_dropna_x_normalized = pd.DataFrame(base_dropna_x_scaled, columns=scaled_column_order)

# Train Model
# The grid holds a single candidate, so GridSearchCV effectively performs a
# 5-fold cross-validation of this one configuration; best_score_ is its mean
# cross-validated accuracy.
clf = GradientBoostingClassifier()
param_grid = {'learning_rate': [0.5],
              'loss': ['exponential'],
              'max_depth':[8],
              'max_features':[None],
              'n_estimators':[310],
              'min_samples_split':[2],
             }
model = GridSearchCV(clf, param_grid,scoring = 'accuracy', cv=5)
model.fit(base_dropna_x_normalized, base_dropna_y)

print("Best Accuracy :",model.best_score_)


# Prediction
# The test features go through the scaler fitted on the training data, so the
# output columns follow the same order as the training output.
# NOTE(review): assumes base_test carries the same feature columns as
# base_dropna_x — confirm against get_data.
x_scaled = scaler.transform(base_test)
test_normalized = pd.DataFrame(x_scaled, columns=scaled_column_order)

test_pred = model.predict(test_normalized)
# NOTE: the other scaler cells in this notebook write to the same 'y_pred.csv',
# so the file reflects whichever cell ran last.
pd.DataFrame(test_pred).to_csv('y_pred.csv',header=['ChargeOff'],index_label="Id")

Best Accuracy : 0.9316777391271854


## Scale all columns

In [10]:
# Build the scaler: every feature column goes through MinMaxScaler.
scale_columns = base_dropna_x.columns

minmax_transformer = Pipeline([('minmax', preprocessing.MinMaxScaler())])

scaler = ColumnTransformer(
    transformers=[('mm', minmax_transformer, scale_columns)],
    remainder='passthrough',  # all columns are listed above, so nothing is left to pass through
)

# Fit on the training features, then transform those same features.
base_dropna_x_scaled = scaler.fit(base_dropna_x).transform(base_dropna_x)
base_dropna_x_normalized = pd.DataFrame(
    base_dropna_x_scaled,
    columns=base_dropna_x.columns,
)

# Train: the grid has one candidate, so this is a 5-fold CV of that setting.
param_grid = {
    'learning_rate': [0.5],
    'loss': ['exponential'],
    'max_depth': [8],
    'max_features': [None],
    'n_estimators': [310],
    'min_samples_split': [2],
}
clf = GradientBoostingClassifier()
model = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
model.fit(base_dropna_x_normalized, base_dropna_y)

print("Best Accuracy :",model.best_score_)

# Predict on the test set with the scaler fitted on the training data.
x_scaled = scaler.transform(base_test)
test_normalized = pd.DataFrame(x_scaled, columns=base_test.columns)

test_pred = model.predict(test_normalized)
pd.DataFrame(test_pred).to_csv(
    'y_pred.csv',
    index_label="Id",
    header=['ChargeOff'],
)

Best Accuracy : 0.932059196697274


## Robust Scaler

In [None]:
# Build the scaler: every feature column goes through RobustScaler.
scale_columns = base_dropna_x.columns

robust_transformer = Pipeline([('robust', preprocessing.RobustScaler())])

scaler = ColumnTransformer(
    transformers=[('rb', robust_transformer, scale_columns)],
    remainder='passthrough',  # all columns are listed above, so nothing is left to pass through
)

# Fit on the training features, then transform those same features.
base_dropna_x_scaled = scaler.fit(base_dropna_x).transform(base_dropna_x)
base_dropna_x_normalized = pd.DataFrame(
    base_dropna_x_scaled,
    columns=base_dropna_x.columns,
)

# Train: the grid has one candidate, so this is a 5-fold CV of that setting.
param_grid = {
    'learning_rate': [0.5],
    'loss': ['exponential'],
    'max_depth': [8],
    'max_features': [None],
    'n_estimators': [310],
    'min_samples_split': [2],
}
clf = GradientBoostingClassifier()
model = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
model.fit(base_dropna_x_normalized, base_dropna_y)

print("Best Accuracy :",model.best_score_)

# Predict on the test set with the scaler fitted on the training data.
x_scaled = scaler.transform(base_test)
test_normalized = pd.DataFrame(x_scaled, columns=base_test.columns)

test_pred = model.predict(test_normalized)
pd.DataFrame(test_pred).to_csv(
    'y_pred.csv',
    index_label="Id",
    header=['ChargeOff'],
)

# Best Accuracy : 0.9311757915698567

## Standard Scaler

In [19]:
# Build the scaler: every feature column goes through StandardScaler.
scale_columns = base_dropna_x.columns

standard_transformer = Pipeline([('standard', preprocessing.StandardScaler())])

scaler = ColumnTransformer(
    transformers=[('sd', standard_transformer, scale_columns)],
    remainder='passthrough',  # all columns are listed above, so nothing is left to pass through
)

# Fit on the training features, then transform those same features.
base_dropna_x_scaled = scaler.fit(base_dropna_x).transform(base_dropna_x)
base_dropna_x_normalized = pd.DataFrame(
    base_dropna_x_scaled,
    columns=base_dropna_x.columns,
)

# Train: the grid has one candidate, so this is a 5-fold CV of that setting.
param_grid = {
    'learning_rate': [0.5],
    'loss': ['exponential'],
    'max_depth': [8],
    'max_features': [None],
    'n_estimators': [310],
    'min_samples_split': [2],
}
clf = GradientBoostingClassifier()
model = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
model.fit(base_dropna_x_normalized, base_dropna_y)

print("Best Accuracy :",model.best_score_)

# Predict on the test set with the scaler fitted on the training data.
x_scaled = scaler.transform(base_test)
test_normalized = pd.DataFrame(x_scaled, columns=base_test.columns)

test_pred = model.predict(test_normalized)
pd.DataFrame(test_pred).to_csv(
    'y_pred.csv',
    index_label="Id",
    header=['ChargeOff'],
)


Best Accuracy : 0.9320391445932451


## PowerTransformer

In [21]:
# Build the scaler: every feature column goes through PowerTransformer
# (constructed with its default arguments).
scale_columns = base_dropna_x.columns

power_transformer = Pipeline([('power', preprocessing.PowerTransformer())])

scaler = ColumnTransformer(
    transformers=[('pt', power_transformer, scale_columns)],
    remainder='passthrough',  # all columns are listed above, so nothing is left to pass through
)

# Fit on the training features, then transform those same features.
base_dropna_x_scaled = scaler.fit(base_dropna_x).transform(base_dropna_x)
base_dropna_x_normalized = pd.DataFrame(
    base_dropna_x_scaled,
    columns=base_dropna_x.columns,
)

# Train: the grid has one candidate, so this is a 5-fold CV of that setting.
param_grid = {
    'learning_rate': [0.5],
    'loss': ['exponential'],
    'max_depth': [8],
    'max_features': [None],
    'n_estimators': [310],
    'min_samples_split': [2],
}
clf = GradientBoostingClassifier()
model = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
model.fit(base_dropna_x_normalized, base_dropna_y)

print("Best Accuracy :",model.best_score_)

# Predict on the test set with the scaler fitted on the training data.
x_scaled = scaler.transform(base_test)
test_normalized = pd.DataFrame(x_scaled, columns=base_test.columns)

test_pred = model.predict(test_normalized)
pd.DataFrame(test_pred).to_csv(
    'y_pred.csv',
    index_label="Id",
    header=['ChargeOff'],
)


Best Accuracy : 0.9317379458265069


## QuantileTransformer (Gaussian output)

In [22]:
# Build the scaler: every feature column goes through QuantileTransformer
# with output_distribution='normal'.
scale_columns = base_dropna_x.columns

quantile_gaussian_transformer = Pipeline([
    ('quantileg', preprocessing.QuantileTransformer(output_distribution='normal')),
])

scaler = ColumnTransformer(
    transformers=[('qg', quantile_gaussian_transformer, scale_columns)],
    remainder='passthrough',  # all columns are listed above, so nothing is left to pass through
)

# Fit on the training features, then transform those same features.
base_dropna_x_scaled = scaler.fit(base_dropna_x).transform(base_dropna_x)
base_dropna_x_normalized = pd.DataFrame(
    base_dropna_x_scaled,
    columns=base_dropna_x.columns,
)

# Train: the grid has one candidate, so this is a 5-fold CV of that setting.
param_grid = {
    'learning_rate': [0.5],
    'loss': ['exponential'],
    'max_depth': [8],
    'max_features': [None],
    'n_estimators': [310],
    'min_samples_split': [2],
}
clf = GradientBoostingClassifier()
model = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
model.fit(base_dropna_x_normalized, base_dropna_y)

print("Best Accuracy :",model.best_score_)

# Predict on the test set with the scaler fitted on the training data.
x_scaled = scaler.transform(base_test)
test_normalized = pd.DataFrame(x_scaled, columns=base_test.columns)

test_pred = model.predict(test_normalized)
pd.DataFrame(test_pred).to_csv(
    'y_pred.csv',
    index_label="Id",
    header=['ChargeOff'],
)


Best Accuracy : 0.9312360244705404


## QuantileTransformer (uniform output)

In [25]:
# Build the scaler: every feature column goes through QuantileTransformer
# (constructed with its default arguments).
scale_columns = base_dropna_x.columns

quantile_uniform_transformer = Pipeline([
    ('quantileu', preprocessing.QuantileTransformer()),
])

scaler = ColumnTransformer(
    transformers=[('qu', quantile_uniform_transformer, scale_columns)],
    remainder='passthrough',  # all columns are listed above, so nothing is left to pass through
)

# Fit on the training features, then transform those same features.
base_dropna_x_scaled = scaler.fit(base_dropna_x).transform(base_dropna_x)
base_dropna_x_normalized = pd.DataFrame(
    base_dropna_x_scaled,
    columns=base_dropna_x.columns,
)

# Train: the grid has one candidate, so this is a 5-fold CV of that setting.
param_grid = {
    'learning_rate': [0.5],
    'loss': ['exponential'],
    'max_depth': [8],
    'max_features': [None],
    'n_estimators': [310],
    'min_samples_split': [2],
}
clf = GradientBoostingClassifier()
model = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
model.fit(base_dropna_x_normalized, base_dropna_y)

print("Best Accuracy :",model.best_score_)

# Predict on the test set with the scaler fitted on the training data.
x_scaled = scaler.transform(base_test)
test_normalized = pd.DataFrame(x_scaled, columns=base_test.columns)

test_pred = model.predict(test_normalized)
pd.DataFrame(test_pred).to_csv(
    'y_pred.csv',
    index_label="Id",
    header=['ChargeOff'],
)


Best Accuracy : 0.9318182489706242
