# Py-2021-CreditOne-POA-Student

Last update: 2021.12.13



## Import all required modules

In [None]:
# core
import numpy as np
import pandas as pd
import scipy
from math import sqrt
import matplotlib.pyplot as plt
import seaborn as sns

# sql
import sqlalchemy
from sqlalchemy import create_engine
import pymysql

# SKLearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# models
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC




# Task 1 - Getting Started

In [None]:
# import dataset
import os

# NOTE(review): the default URL below embeds a username and password in the
# notebook. Prefer supplying the URL through the CREDIT_DB_URL environment
# variable so the secret does not have to live in source control.
db_connection_str = os.environ.get(
    'CREDIT_DB_URL',
    'mysql+pymysql://deepanalytics:Sqltask1234!@34.73.222.197/deepanalytics',
)
db_connection = create_engine(db_connection_str)
# Pull the whole `credit` table into a DataFrame.
df = pd.read_sql('SELECT * FROM credit', con=db_connection)


In [None]:
# Dimensions of the raw table as (rows, columns).
df.shape

In [None]:
# Preview the first rows of the raw table.
df.head()

In [None]:
# Per-column summary statistics of the raw table.
df.describe()

## Preprocess dataset
### Steps to preprocess dataset
* Change column labels
* Sort
* Remove columns
* Replace missing-value markers with NaN, then drop those rows
* Remove duplicates
* Use iloc to create a new dataset
* Write/read CSV


In [None]:
# rename col header (refer to article)
# Use the values of the first row as the column labels. The assignment only
# relabels the columns; that first row is still present in the data.
df.columns = df.iloc[0]
df.head()

In [None]:
# sort
# Order rows by the AGE column, largest value first.
# NOTE(review): presumably AGE still holds strings at this point, so any
# non-numeric label rows would sort to the top -- confirm against the data.
df = df.sort_values('AGE', ascending=False)
df.head()


In [None]:
# remove rows with labels and col ID
# Positional slice: keep everything from the 4th row and the 2nd column onward.
# NOTE(review): this assumes the preceding sort left exactly three label rows
# at the top and that ID is the first column -- confirm before reuse.
df = df.iloc[3:,1:]
df.head()


In [None]:
# replace missing values, other with NaN and then drop

# Treat empty strings and '?' placeholders as missing values.
df = df.replace(['', '?'], np.nan)
# Rows containing at least one missing value. A row-wise mask is used because
# the 2-D `.values` mask is not a proper row indexer (depending on the pandas
# version it repeats rows or is rejected).
df[df.isnull().any(axis=1)]
df = df.dropna()
# Sanity check: should be False once the incomplete rows are gone.
df.isnull().values.any()


In [None]:
# Drop repeated records, inspecting them before and re-checking afterwards.

dup_mask = df.duplicated()
dup_mask.any()
df[dup_mask].shape
df = df.drop_duplicates()
df.duplicated().any()


In [None]:
# Give the target column a short label.

df = df.rename(columns={'default payment next month': 'DEFAULT'})


In [None]:
# Column dtypes after cleaning, before the CSV round trip below.
df.dtypes

In [None]:
# change data types by write/read csv

# write csv
# index=False keeps the row index out of the file.
df.to_csv('dfOOB.csv', index=False)

In [None]:
# read csv
# Reading the file back lets pandas infer each column's dtype from its contents.
dfOOB = pd.read_csv('dfOOB.csv')

In [None]:
# Column dtypes as inferred by read_csv.
dfOOB.dtypes

# Task 2 - EDA

In [None]:
# Concise overview of the frame: columns, dtypes and non-null counts.
dfOOB.info()

In [None]:
# Preview the first rows of the re-read dataset.
dfOOB.head()

### Statistics

In [None]:
# Per-column summary statistics of the re-read dataset.
dfOOB.describe()

### Visualizations

#### Histograms

In [None]:
# Plot using object data type
# Histogram of EDUCATION exactly as it was read from the CSV.

plt.hist(dfOOB['EDUCATION'])
plt.show()

In [None]:
# Recast EDUCATION as a categorical column, store it back, and redraw the
# same histogram as in the previous cell.

education = dfOOB['EDUCATION'].astype('category')
dfOOB['EDUCATION'] = education
plt.hist(education)
plt.show()

In [None]:
# Histogram of LIMIT_BAL using matplotlib's default number of bins.
plt.hist(dfOOB['LIMIT_BAL'])
plt.show()

In [None]:
# Same histogram, coarsened to 4 bins.
plt.hist(dfOOB['LIMIT_BAL'], bins=4)
plt.show()

#### Line Plots

In [None]:
# Line plot of LIMIT_BAL in row order.
plt.plot(dfOOB['LIMIT_BAL'])
plt.show()

#### Scatter Plots

In [None]:
# Scatter PAY_0 (horizontal axis) against PAY_2 (vertical axis).
x, y = dfOOB['PAY_0'], dfOOB['PAY_2']
plt.scatter(x, y)
plt.show()

#### Box Plots

In [None]:
# dfOOB.dtypes.index   # use the code below instead of the version in the POA
# Column labels of the dataset, printed for reference.
header = dfOOB.columns
print(header)

In [None]:
# Box plot of BILL_AMT1; the second and third arguments are spelled out as
# keywords: no notch, outliers drawn as green diamonds.
A = dfOOB['BILL_AMT1']
plt.boxplot(A, notch=0, sym='gD')
plt.show()

#### Homework: Other visualizations based on Titanic tutorial

## Feature Selection

For this task, you will not be selecting features.

#### Correlation

In [None]:
# Pairwise correlation of the numeric columns.
# The numeric columns are selected explicitly because pandas >= 2.0 no longer
# drops non-numeric columns inside corr() and raises on string data instead.
corrMat = dfOOB.select_dtypes(include='number').corr()
print(corrMat)

#### Covariance

In [None]:
# Pairwise covariance of the numeric columns.
# The numeric columns are selected explicitly because pandas >= 2.0 no longer
# drops non-numeric columns inside cov() and raises on string data instead.
covMat = dfOOB.select_dtypes(include='number').cov()
print(covMat)

# Task 3 - Modeling

## Prepare data

#### OOB ds

In [None]:
# One-hot encode the independent variables: every column except the last one.
features = dfOOB.iloc[:, :-1]
oobX = pd.get_dummies(features)
oobX.dtypes

In [None]:
# Encode the dependent variable (DEFAULT) as integer class labels.
target = dfOOB['DEFAULT']
le = LabelEncoder()
oobY = le.fit_transform(target)
oobY

## Split/Train/Test

In [None]:
# OOB dataset

# A fixed seed makes the split, and every score computed from it, reproducible
# from one run to the next.
X_trainOOB, X_testOOB, y_trainOOB, y_testOOB = train_test_split(
    oobX, oobY, random_state=123
)
# Report the size of each partition (same order as before: train X/y, test X/y).
for part in (X_trainOOB, y_trainOOB, X_testOOB, y_testOOB):
    print(part.shape)

##  Create Models

In [None]:
# select 3 classification algorithms
# The algo1/algo2/algo3 placeholders were undefined names and raised NameError.
# The three classifiers imported at the top of the notebook are used as
# defaults; swap in other estimators here if you prefer.

algos = [
    ('RandomForest', RandomForestClassifier()),
    ('GradientBoosting', GradientBoostingClassifier()),
    ('SVC', SVC()),
]


In [None]:
# Cross-validate every candidate on the training split (3 folds each),
# keeping the labels and the per-fold score arrays in matching order.

names = [label for label, _ in algos]
results = [
    cross_val_score(estimator, X_trainOOB, y_trainOOB, cv=3)
    for _, estimator in algos
]

In [None]:
# evaluate results
# Print each model's label next to its mean cross-validation score.

for name, scores in zip(names, results):
    print(name, scores.mean())

## Validate

In [None]:
# select best model(s) to tune and validate with the test set
# NOTE: tuning is optional for this task

# The original `bestAlgo = bestAlgo(n_jobs=4)` called an undefined name and
# would then have overwritten the class with its own instance.
# RandomForestClassifier is the imported model that accepts n_jobs; replace it
# with whichever estimator scored best in cross-validation.
bestAlgo = RandomForestClassifier(n_jobs=4)
bestAlgo.fit(X_trainOOB, y_trainOOB)
bestAlgoOOBpred = bestAlgo.predict(X_testOOB)
# Per-class precision/recall/F1, followed by the raw confusion counts.
print(classification_report(y_testOOB, bestAlgoOOBpred))
print(confusion_matrix(y_testOOB, bestAlgoOOBpred))




## Predict
* Predict the unknown DV for a separate dataset, if provided. 