# Main Goal
apply your machine learning skills to predict credit default, Training, validation, and testing datasets include time-series behavioral data and anonymized customer profile information.

# Step 1 - load the data and reading libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score

# load the train data, test data,  into DataFrame for further analysis
train_data = pd.read_feather('./train_data.ftr')
test_data = pd.read_feather('./test_data.ftr')
train_label = pd.read_csv('./train_labels.csv')

In [3]:
# Next, we can also explore how the Paid and Default ratio change over time
# to do this, we can first merge our train data with our train_label so that we can time info with target info
train_df = train_data.merge(train_label, left_on='customer_ID', right_on='customer_ID')
train_df.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938477,0.001734,0.008728,1.006836,0.009224,0.124023,0.008774,0.004707,...,,,0.002426,0.003706,0.003819,,0.000569,0.00061,0.002674,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936523,0.005775,0.004925,1.000977,0.006153,0.126709,0.000798,0.002714,...,,,0.003956,0.003166,0.005032,,0.009575,0.005493,0.009216,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.954102,0.091492,0.021652,1.009766,0.006817,0.123962,0.007599,0.009422,...,,,0.003269,0.007328,0.000427,,0.003429,0.006985,0.002604,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960449,0.002455,0.013687,1.00293,0.001372,0.117188,0.000685,0.005531,...,,,0.006119,0.004517,0.003201,,0.008423,0.006527,0.009598,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947266,0.002483,0.01519,1.000977,0.007607,0.11731,0.004654,0.009308,...,,,0.003672,0.004944,0.008888,,0.00167,0.008125,0.009827,0


In [4]:
# we can first extract the month and year information to better prepare ourselves for monthly analysis
train_df["month"] = train_df["S_2"].str.split("-").str[0] + "_" + train_df["S_2"].str.split("-").str[1]

# find the default customer ratio for eacc unique month
def default_ratio(df):   
    return sum(df == 1) / len(df)
monthly_ratio = train_df[["target", "S_2"]].groupby("S_2").agg(default_ratio)
monthly_ratio = monthly_ratio.reset_index()
monthly_ratio

Unnamed: 0,S_2,target
0,2017-03-01,0.243618
1,2017-03-02,0.247207
2,2017-03-03,0.214741
3,2017-03-04,0.244071
4,2017-03-05,0.236578
...,...,...
391,2018-03-27,0.252054
392,2018-03-28,0.260557
393,2018-03-29,0.229985
394,2018-03-30,0.275603


# Step 3 - Building our model

## use train data to train our model 

In [5]:
# first drop the columns that have too many NA values
df = train_df.dropna(axis = 1, thresh = 0.7 * train_data.shape[0])
df.columns

Index(['customer_ID', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41',
       'B_3',
       ...
       'D_133', 'R_28', 'D_139', 'D_140', 'D_141', 'D_143', 'D_144', 'D_145',
       'target', 'month'],
      dtype='object', length=161)

In [7]:
print(df.columns.tolist())

['customer_ID', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_43', 'D_44', 'B_4', 'D_45', 'B_5', 'R_2', 'D_46', 'D_47', 'D_48', 'B_6', 'B_7', 'B_8', 'D_51', 'B_9', 'R_3', 'D_52', 'P_3', 'B_10', 'S_5', 'B_11', 'S_6', 'D_54', 'R_4', 'S_7', 'B_12', 'S_8', 'D_55', 'B_13', 'R_5', 'D_58', 'B_14', 'D_59', 'D_60', 'D_61', 'B_15', 'S_11', 'D_62', 'D_63', 'D_64', 'D_65', 'B_16', 'B_18', 'B_19', 'B_20', 'D_68', 'S_12', 'R_6', 'S_13', 'B_21', 'D_69', 'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'B_23', 'P_4', 'D_74', 'D_75', 'B_24', 'R_7', 'B_25', 'B_26', 'D_78', 'D_79', 'R_8', 'S_16', 'D_80', 'R_10', 'R_11', 'B_27', 'D_81', 'S_17', 'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15', 'D_84', 'R_16', 'B_30', 'S_18', 'D_86', 'R_17', 'R_18', 'B_31', 'S_19', 'R_19', 'B_32', 'S_20', 'R_20', 'R_21', 'B_33', 'D_89', 'R_22', 'R_23', 'D_91', 'D_92', 'D_93', 'D_94', 'R_24', 'R_25', 'D_96', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'D_102', 'D_103', 'D_104', 'D_107', 'B_36', 'B_37', 'R_27', 'B_38'

In [9]:
columns = df.columns
for col in columns:
    print(col)

customer_ID
S_2
P_2
D_39
B_1
B_2
R_1
S_3
D_41
B_3
D_43
D_44
B_4
D_45
B_5
R_2
D_46
D_47
D_48
B_6
B_7
B_8
D_51
B_9
R_3
D_52
P_3
B_10
S_5
B_11
S_6
D_54
R_4
S_7
B_12
S_8
D_55
B_13
R_5
D_58
B_14
D_59
D_60
D_61
B_15
S_11
D_62
D_63
D_64
D_65
B_16
B_18
B_19
B_20
D_68
S_12
R_6
S_13
B_21
D_69
B_22
D_70
D_71
D_72
S_15
B_23
P_4
D_74
D_75
B_24
R_7
B_25
B_26
D_78
D_79
R_8
S_16
D_80
R_10
R_11
B_27
D_81
S_17
R_12
B_28
R_13
D_83
R_14
R_15
D_84
R_16
B_30
S_18
D_86
R_17
R_18
B_31
S_19
R_19
B_32
S_20
R_20
R_21
B_33
D_89
R_22
R_23
D_91
D_92
D_93
D_94
R_24
R_25
D_96
S_22
S_23
S_24
S_25
S_26
D_102
D_103
D_104
D_107
B_36
B_37
R_27
B_38
D_109
D_112
B_40
S_27
D_113
D_114
D_115
D_116
D_117
D_118
D_119
D_120
D_121
D_122
D_123
D_124
D_125
D_126
D_127
D_128
D_129
B_41
D_130
D_131
D_133
R_28
D_139
D_140
D_141
D_143
D_144
D_145
target
month


In [6]:
y = df[["target"]]
X = df.drop(["customer_ID", "S_2", "month", "target"], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y)

In [6]:
print("the shape of our training data is " + str(X_train.shape))
print("the shape of our test data is " + str(X_test.shape))

the shape of our training data is (4701733, 154)
the shape of our test data is (829718, 154)


In [7]:
LGBM = lgb.LGBMClassifier(boosting_type='goss', max_depth=20, random_state=0, n_estimators=200, learning_rate=0.09, num_leaves=500)

In [8]:
LGBM = LGBM.fit(X_train, y_train)

  return f(**kwargs)


In [9]:
train_score = LGBM.score(X_train, y_train)

print("the score of our LGBM model on the training data is " + str(train_score))

the score of our LGBM model on the training data is 0.8952690422871736


In [10]:
test_score = LGBM.score(X_test, y_test)

print("the score of our LGBM model on the test data is " + str(test_score))

the score of our LGBM model on the test data is 0.8868338399311574


# step 4 - use our model and test data to make prediction

In [11]:
columns = X.columns.tolist()

In [12]:
test_df = test_data[columns]

In [None]:
probs = LGBM.predict_proba(test_df)
probs

In [None]:
probs[:, 1]

In [None]:
pred = LGBM.predict(test_df)
pred

# Step 5 - save result 

In [None]:
result= pd.DataFrame()
result["customer_ID"] = test_data["customer_ID"]
result["prediction"] = probs[:, 1]
result.to_csv('result.csv', index=None)

In [None]:
result

In [None]:
# Importing the pickle library

import pickle

# Dumping the model object to save it as model.pkl file

pickle.dump(model,open('model.pkl','wb+'))