In [1]:
import pandas as pd
from numpy import mean
from numpy import std
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
import pickle

In [2]:
# Load the labelled training set, then separate the label column from the
# feature columns. "Id" is dropped from the features together with the label.
# NOTE(review): the CSV path is an absolute, machine-specific location; a
# configurable data directory would make this cell portable.
target_name = "Risk_Flag"
raw_data = pd.read_csv("/Users/stephaniekim/downloads/TrainingData.csv")
data = raw_data.drop(columns=["Id", target_name])
target = raw_data[target_name]

In [3]:
# Partition the feature columns by dtype so each group can be routed to its
# own preprocessing step: object-dtype columns are treated as categorical,
# everything else as numerical.
categorical_cs = selector(dtype_include=object)
numerical_cs = selector(dtype_exclude=object)
cat_col = categorical_cs(data)
num_col = numerical_cs(data)

In [4]:
# Preprocessing: one-hot encode the categorical columns and standardise the
# numerical ones. The encoder is configured with handle_unknown="ignore" so
# that categories not seen during fitting do not raise at transform time.
num_preproc = StandardScaler()
cat_preproc = OneHotEncoder(handle_unknown="ignore")
preprocessor = ColumnTransformer(
    transformers=[
        ("one-hot-encoder", cat_preproc, cat_col),
        ("standard-scaler", num_preproc, num_col),
    ]
)

# Unlabelled holdout set that is scored further down the notebook.
# TODO: output to file OR see if I can add to custom model package for pre-processing step
holdout_data = pd.read_csv("/Users/stephaniekim/downloads/TestData.csv")
# Disabled experiment, kept for reference.
# NOTE(review): if re-enabled, fit_transform would fit the preprocessor on the
# holdout data rather than reuse what was learned from the training data.
# holdout_data = holdout_data.drop(columns=["ID"])
# new_data = preprocessor.fit_transform(holdout_data)

In [5]:
# Oversampling variant: SMOTE followed by a gradient-boosting classifier.
# NOTE(review): nothing in the visible cells fits or evaluates `pipeline`, and
# it has no preprocessing step, so it cannot take the raw feature frame as-is.
# The name is kept in case later cells rely on it.
pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=42)],
                                ['classifier', GradientBoostingClassifier(n_estimators=1000,
                                                                          random_state=42)]])

# The estimator that is trained, pickled and cross-validated below:
# column preprocessing followed by gradient boosting.
# The classifier is seeded with the same seed used for SMOTE and the
# train/test split, so that repeated runs of the notebook build the same model.
model = make_pipeline(preprocessor,
                      GradientBoostingClassifier(n_estimators=1000, random_state=42))

# Render estimators as an HTML diagram, then show the pipeline as the cell output.
set_config(display='diagram')
model

In [8]:
# Hold out 20% of the rows for evaluation. The rows are shuffled with a fixed
# seed so the split is repeatable.
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    test_size=0.20,
    shuffle=True,
    random_state=42,
)

In [9]:

# Fit the preprocessing + classifier pipeline on the training portion only.
model.fit(X=data_train, y=target_train)

In [11]:
# Keep the test-set predictions under a name: only the last expression of a
# cell is echoed, so an unassigned predict() result would simply be discarded.
target_predicted = model.predict(data_test)

# Score of the fitted pipeline on the held-out test portion (cell output).
model.score(data_test, target_test)

0.8796626984126984

In [12]:
# Serialise the pipeline held in `model` to disk with pickle.
# NOTE(review): the destination is an absolute, machine-specific path.
pkl_filename = "/Users/stephaniekim/downloads/credit_risk.pkl"
with open(pkl_filename, mode='wb') as pkl_file:
    pickle.dump(model, pkl_file)

In [13]:
# 5-fold cross-validation of the pipeline over the full labelled data set.
# The returned dict (fit times, score times, per-fold test scores) is shown
# as the cell output.
cv_results = cross_validate(estimator=model, X=data, y=target, cv=5)
cv_results

{'fit_time': array([220.45415616, 209.9649682 , 194.72522593, 211.92368317,
        188.86522508]),
 'score_time': array([1.45894814, 1.19622874, 1.08824015, 1.13765812, 1.01507211]),
 'test_score': array([0.88136905, 0.88176587, 0.88246032, 0.88031746, 0.8703373 ])}

In [15]:
# Predict labels for the unlabelled holdout set; the array is the cell output.
model.predict(X=holdout_data)

array([0, 0, 0, ..., 0, 0, 0])

In [10]:
# Display the holdout frame loaded earlier from TestData.csv.
holdout_data

Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS
0,7393090,59,19,single,rented,no,Geologist,Malda,West Bengal,4,13
1,1215004,25,5,single,rented,no,Firefighter,Jalna,Maharashtra,5,10
2,8901342,50,12,single,rented,no,Lawyer,Thane,Maharashtra,9,14
3,1944421,49,9,married,rented,yes,Analyst,Latur,Maharashtra,3,12
4,13429,25,18,single,rented,yes,Comedian,Berhampore,West Bengal,13,11
...,...,...,...,...,...,...,...,...,...,...,...
27995,9955481,57,13,single,rented,no,Statistician,Eluru[25],Andhra Pradesh,5,10
27996,2917765,47,9,single,rented,no,Technical writer,Ratlam,Madhya Pradesh,9,14
27997,8082415,24,5,single,rented,no,Lawyer,Mira-Bhayandar,Maharashtra,4,13
27998,9474180,51,13,single,rented,yes,Chartered Accountant,Bhilai,Chhattisgarh,13,14
