In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Exploratory Data Analysis

## 1.1 Mengimport module yang diperlukan

In [None]:
!pip install xlrd

In [None]:
import pandas as pd # module baca file csv
import matplotlib.pyplot as plt # module untuk visualisasi (EDA)

## 1.2 Membaca data
### 1.2.1 Menyimpan data ke-variable

In [None]:
# Load the training set and the competition test set; the first CSV column
# becomes the row index of each frame.
train_data, test_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "/kaggle/input/GiveMeSomeCredit/cs-training.csv",
        "/kaggle/input/GiveMeSomeCredit/cs-test.csv",
    )
)
# Load the data dictionary, taking the column names from the second sheet row.
data_description = pd.read_excel(
    "/kaggle/input/GiveMeSomeCredit/Data Dictionary.xls",
    header=1,
)

### 1.2.2 Pengecekan fitur dan column

Di sini kita akan memeriksa kelengkapan data, misalnya apakah terdapat missing value (null).

In [None]:
# Preview the first rows of the training data.
train_data.head() 

In [None]:
# Preview the first rows of the test data.
test_data.head()

In [None]:
# Print the (rows, columns) shape of the training frame, then of the test frame.
for frame in (train_data, test_data):
    print(frame.shape)
# Training and test data have the same features

In [None]:
# Column dtypes and non-null counts of the training data.
train_data.info()

In [None]:
# Column dtypes and non-null counts of the test data.
test_data.info()

In [None]:
# Number of missing values per column in the training data.
train_data.isnull().sum()
# Two features contain null values in the training data

In [None]:
# Number of missing values per column in the test data.
test_data.isnull().sum()
# Three columns contain null values in the test data
# The SeriousDlqin2yrs column is left null because it is the value we will predict

## 1.3 Explorasi fitur

### 1.3.1 SeriousDlqin2yrs

Data di kolom ini digunakan sebagai target prediksi. Nilainya berupa enum 0 atau 1, sehingga kita bisa menggunakan pie chart atau bar chart untuk menghitung banyaknya tiap nilai.

In [None]:
import seaborn as s

# Count how many borrowers had a delinquency of more than 90 days
# 1 means the borrower was delinquent
# 0 means the borrower was not

target_count = train_data["SeriousDlqin2yrs"].value_counts()
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Pass the column with x=: recent seaborn releases treat the first positional
# argument as `data`, so a bare column name is no longer accepted there.
s.countplot(x="SeriousDlqin2yrs", data=train_data, ax=axes[0])

axes[1].set_title("SeriousDlqin2yrs")
# explode offsets the second (minority) slice so it stands out in the pie chart
target_count.plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=axes[1])

Dari data tersebut kita bisa mengambil kesimpulan bahwa data yang kita miliki tidak seimbang, karena rasio dari 2 kelas tersebut adalah 14:1. Umumnya kriteria data yang baik adalah yang memiliki rasio kurang lebih 50:50. Jadi kita tidak bisa terlalu bergantung pada skor akurasi untuk memprediksi kesuksesan model.

### 1.3.2 Korelasi dengan Heatmap

Kita visualisasikan hubungan tiap fitur dengan heatmap

In [None]:
# Annotated heatmap of the pairwise correlations between the training columns.
correlation = train_data.corr()
fig, ax = plt.subplots(figsize=(10, 10))
s.heatmap(correlation, ax=ax, cmap="Blues", annot=True)

### 1.3.3 Age

In [None]:
# 5-year wide bins that reach past the oldest age in the data. With a fixed top
# edge of 95, plt.hist would leave every older applicant out of the histogram.
oldest_age = int(train_data["age"].max())
age_bins = list(range(0, oldest_age + 5, 5))

plt.hist(train_data["age"], bins=age_bins)

# From this plot we see that most credit applicants are in the 45 - 60 age range

# 2. Feature Engineering

## 2.1 Menghapus outliers

Dari heatmap korelasi di atas kita mengetahui bahwa ada 3 kolom yang memiliki korelasi tinggi, yaitu: NumberOfTimes90DaysLate, NumberOfTime30-59DaysPastDueNotWorse, dan NumberOfTime60-89DaysPastDueNotWorse. Setelah itu kita perlu mendeteksi outliers dari ketiga kolom tersebut, lalu menggantinya dengan nilai rata-rata kolom menggunakan fungsi berikut.

In [None]:
from scipy import stats
import numpy as np

def remove_outliers(df, feature_name, max_scale_from_std):
    """Replace the outliers of one column with that column's mean.

    Despite the name, no rows are dropped: every value whose absolute distance
    from the column mean is at least ``max_scale_from_std`` standard deviations
    is overwritten with the mean (mean and standard deviation are computed
    from the column before any replacement).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column. It is not modified.
    feature_name : str
        Name of the numeric column to clean.
    max_scale_from_std : float
        Distance from the mean, in standard deviations, at or beyond which a
        value is treated as an outlier.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the outliers of ``feature_name`` replaced.
    """
    values = df[feature_name]
    mean = values.mean()
    is_outlier = np.abs(values - mean) >= max_scale_from_std * values.std()

    # Work on a copy so the caller's frame is untouched and re-running the
    # calling cell always starts from the same input.
    result = df.copy()
    if is_outlier.any():
        # The mean is generally fractional: convert the column to float first
        # so it can be stored without an incompatible-dtype assignment.
        result[feature_name] = values.astype(float)
        result.loc[is_outlier, feature_name] = mean

    return result

In [None]:
# Scatter NumberOfTimes90DaysLate against itself before (left panel) and after
# (right panel) passing the column through remove_outliers with a scale of 3.
fig, axes = plt.subplots(1, 2, figsize=(13, 6))
before_ax, after_ax = axes

feature_one = train_data['NumberOfTimes90DaysLate']
before_ax.scatter(feature_one, feature_one)
before_ax.set_xlabel("Sebelum")

train_data = remove_outliers(train_data, "NumberOfTimes90DaysLate", 3)
feature_one = train_data["NumberOfTimes90DaysLate"]
after_ax.scatter(feature_one, feature_one)
after_ax.set_xlabel("Sesudah")

plt.show()

In [None]:
# Scatter NumberOfTime30-59DaysPastDueNotWorse against itself before (left panel)
# and after (right panel) passing the column through remove_outliers with a scale of 3.
fig, axes = plt.subplots(1, 2, figsize=(13, 6))
before_ax, after_ax = axes

feature_two = train_data['NumberOfTime30-59DaysPastDueNotWorse']
before_ax.scatter(feature_two, feature_two)
before_ax.set_xlabel("Sebelum")

train_data = remove_outliers(train_data, "NumberOfTime30-59DaysPastDueNotWorse", 3)
feature_two = train_data["NumberOfTime30-59DaysPastDueNotWorse"]
after_ax.scatter(feature_two, feature_two)
after_ax.set_xlabel("Sesudah")

plt.show()

In [None]:
# Scatter NumberOfTime60-89DaysPastDueNotWorse against itself before (left panel)
# and after (right panel) passing the column through remove_outliers with a scale of 3.
fig, axes = plt.subplots(1, 2, figsize=(13, 6))
before_ax, after_ax = axes

feature_three = train_data['NumberOfTime60-89DaysPastDueNotWorse']
before_ax.scatter(feature_three, feature_three)
before_ax.set_xlabel("Sebelum")

train_data = remove_outliers(train_data, "NumberOfTime60-89DaysPastDueNotWorse", 3)
feature_three = train_data["NumberOfTime60-89DaysPastDueNotWorse"]
after_ax.scatter(feature_three, feature_three)
after_ax.set_xlabel("Sesudah")

plt.show()

Bisa kita lihat 3 feature diatas memiliki nilai outliers dan kita sudah membersihkannya.

## 2.2 Pengecekan kemiringan data

Disini kita akan plot data berdasarkan median, mean, ataupun modus untuk cek kearah mana data itu miring.  
Data yang simetris biasanya ditandai dengan plot mediannya adalah data tertinggi, lalu kiri dan kanan memiliki tinggi yang kurang lebih sama.  

Kita mengisi nilai null berdasarkan kondisi berikut:  
Gunakan median jika:
> (mean < median < mode)

Gunakan modus jika:
> (mean > median > mode)


In [None]:
import seaborn as s

In [None]:
# Plot the training data

fig, ax = plt.subplots(figsize=(18, 6), ncols=2)
# histplot(kde=True, stat="density") is the documented replacement for the
# deprecated distplot. bins=50 keeps the bin count bounded, as distplot did,
# even though both columns have a very long right tail.
s.histplot(train_data["NumberOfDependents"], bins=50, kde=True, stat="density", kde_kws=dict(cut=3), ax=ax[0])
s.histplot(train_data["MonthlyIncome"], bins=50, kde=True, stat="density", kde_kws=dict(cut=3), ax=ax[1])

### 2.2.1 Mencari mean, median, dan modus

In [None]:
# Mean, median and mode of the two training columns whose missing values are
# filled later in the notebook. When a column has several modes, their average
# is used as the single mode value.
number_of_dependents_train = train_data["NumberOfDependents"]
monthly_income_train = train_data["MonthlyIncome"]

number_of_dependents_train_mean = number_of_dependents_train.mean()
monthly_income_train_mean = monthly_income_train.mean()

number_of_dependents_train_median = number_of_dependents_train.median()
monthly_income_train_median = monthly_income_train.median()

number_of_dependents_train_mode = number_of_dependents_train.mode().mean()
monthly_income_train_mode = monthly_income_train.mode().mean()

print(
    "Number Of Dependents:",
    number_of_dependents_train_mean,
    number_of_dependents_train_median,
    number_of_dependents_train_mode,
)
print(
    "Monthly Income:",
    monthly_income_train_mean,
    monthly_income_train_median,
    monthly_income_train_mode,
)

# Judging by the plot above and the mean-median-mode ordering, the data is skewed to the right

In [None]:
# Plot the test data

fig, ax = plt.subplots(figsize=(18, 6), ncols=2)
# histplot(kde=True, stat="density") is the documented replacement for the
# deprecated distplot. bins=50 keeps the bin count bounded, as distplot did,
# even though both columns have a very long right tail.
s.histplot(test_data["NumberOfDependents"], bins=50, kde=True, stat="density", kde_kws=dict(cut=3), ax=ax[0])
s.histplot(test_data["MonthlyIncome"], bins=50, kde=True, stat="density", kde_kws=dict(cut=3), ax=ax[1])

In [None]:
# Mean, median and mode of the two test columns whose missing values are
# filled later in the notebook. When a column has several modes, their average
# is used as the single mode value.
number_of_dependents_test = test_data["NumberOfDependents"]
monthly_income_test = test_data["MonthlyIncome"]

number_of_dependents_test_mean = number_of_dependents_test.mean()
monthly_income_test_mean = monthly_income_test.mean()

number_of_dependents_test_median = number_of_dependents_test.median()
monthly_income_test_median = monthly_income_test.median()

number_of_dependents_test_mode = number_of_dependents_test.mode().mean()
monthly_income_test_mode = monthly_income_test.mode().mean()

print(
    "Number Of Dependents:",
    number_of_dependents_test_mean,
    number_of_dependents_test_median,
    number_of_dependents_test_mode,
)
print(
    "Monthly Income:",
    monthly_income_test_mean,
    monthly_income_test_median,
    monthly_income_test_mode,
)

# Judging by the plot above and the mean-median-mode ordering, the data is skewed to the right

## 2.3 Mengisi nilai kosong

In [None]:
# With the mode and median available, fill the missing values directly,
# following the skew found above (the mode is used for both columns).
#
# The filled column is assigned back to the frame. Calling
# fillna(inplace=True) on train_data[...] operates on an intermediate Series,
# which pandas warns about and which does not update the frame under
# copy-on-write.

train_data['NumberOfDependents'] = train_data['NumberOfDependents'].fillna(number_of_dependents_train_mode)
train_data['MonthlyIncome'] = train_data['MonthlyIncome'].fillna(monthly_income_train_mode)

test_data['NumberOfDependents'] = test_data['NumberOfDependents'].fillna(number_of_dependents_test_mode)
test_data['MonthlyIncome'] = test_data['MonthlyIncome'].fillna(monthly_income_test_mode)

In [None]:
# Re-count the missing values per column in the training data after filling.
train_data.isnull().sum()

In [None]:
# Re-count the missing values per column in the test data after filling.
test_data.isnull().sum()

# 3. Modeling dan prediksi

## 3.1 Membuat data testing dari data training

In [None]:
# Separate the label from the predictors: SeriousDlqin2yrs is the value to be
# predicted, so it is kept out of the feature matrix.
y = train_data["SeriousDlqin2yrs"]
X = train_data.drop(columns="SeriousDlqin2yrs")

X.info()

In [None]:
from sklearn.model_selection import train_test_split

# Split the data 70/30 into training/validation parts.
# stratify=y keeps the share of the rare positive class the same in both parts,
# which matters because the two classes are heavily imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

## 3.2 Membuat dan mengetest model

Untuk setiap model yang kita test, kita akan menentukan possible parameter dan membuat random search untuk menemukan parameter terbaik.

### 3.2.1 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter candidates explored by the random forest search
parameter = {
    'n_estimators': [9,27,36],
    'max_depth': [3,7,9],
    'min_samples_leaf': [2, 4]
}

# A fixed seed makes the forest, and therefore its scores, reproducible between runs
rf = RandomForestClassifier(random_state=42)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# 5-fold randomized search over the random forest candidates; random_state
# fixes which parameter combinations are sampled so the search is repeatable.
random_forest_search = RandomizedSearchCV(rf, param_distributions=parameter, cv=5, random_state=42)

In [None]:
# Run the randomized search on the training split.
random_forest_search.fit(x_train, y_train)

In [None]:
# Retrieve the best estimator found by the random forest search

best_estimator_rf = random_forest_search.best_estimator_
best_estimator_rf

In [None]:
# Accuracy of the tuned random forest, in percent, on the training and validation splits
rf_train_accuracy = random_forest_search.score(x_train, y_train) * 100
rf_validation_accuracy = random_forest_search.score(x_test, y_test) * 100
print("training accuracy: {:.2f}".format(rf_train_accuracy))
print("validation accuracy: {:.2f}".format(rf_validation_accuracy))

### 3.2.2 LightGBM

In [None]:
from lightgbm import LGBMClassifier

# Candidate LightGBM settings explored by the grid search.
parameter = dict(
    n_estimators=[100, 250],
    max_depth=[8, 24],
    num_leaves=[25, 50],
    first_metric_only=[True],
)

# LightGBM classifier with library defaults; the grid above overrides the listed settings.
lgb = LGBMClassifier()

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold grid search over the LightGBM candidates.
gs = GridSearchCV(lgb, parameter, cv=5)

# Keep the value returned by fit() under its own name for the cells below.
lgbm_fit = gs.fit(x_train, y_train)

In [None]:
# Best cross-validated score of the LightGBM search and the parameters that achieved it.
print(lgbm_fit.best_score_)
print(lgbm_fit.best_params_)

In [None]:
# Accuracy of the tuned LightGBM model, in percent, on the training and validation splits
lgbm_train_accuracy = gs.score(x_train, y_train) * 100
lgbm_validation_accuracy = gs.score(x_test, y_test) * 100
print("training accuracy: {:.2f}".format(lgbm_train_accuracy))
print("validation accuracy: {:.2f}".format(lgbm_validation_accuracy))

### 3.2.3 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Candidate logistic regression settings explored by the grid search.
# Only the L2 penalty is listed: the 'newton-cg' and 'lbfgs' solvers do not
# support 'elasticnet', so every combination using it would fail to fit and
# only add failed candidates to the search.
parameter = {
    'solver': ['newton-cg', 'lbfgs'],
    'penalty': ['l2'],
    'C': [ 1e-1, 1],
}

lr = LogisticRegression()

In [None]:
%%capture --no-display

gs_lr = GridSearchCV(
    estimator=lr,
    param_grid=parameter,
    cv=5,
)

lr_fit = gs_lr.fit(x_train, y_train)

In [None]:
# Best cross-validated score of the logistic regression search and the parameters that achieved it.
print(lr_fit.best_score_)
print(lr_fit.best_params_)

In [None]:
# Accuracy of the tuned logistic regression, in percent, on the training and validation splits
lr_train_accuracy = gs_lr.score(x_train, y_train) * 100
lr_validation_accuracy = gs_lr.score(x_test, y_test) * 100
print("training accuracy: {:.2f}".format(lr_train_accuracy))
print("validation accuracy: {:.2f}".format(lr_validation_accuracy))

## 3.3 Kesimpulan

Dari 3 model diatas terlihat bahwa LightGBM mendapatkan score yang paling baik, jadi kita gunakan itu sebagai final model untuk melakukan prediksi pada submission.

### 3.3.1 Confusion matrix

In [None]:
# Get the predicted probabilities for targets 0 and 1,
# then convert the continuous class-1 probability into a binary label.

continous_proba = lgbm_fit.best_estimator_.predict_proba(x_test)

# Build the labels as a NEW array. continous_proba[:, 1] is a view into
# continous_proba, so assigning 0/1 into it would overwrite the probabilities
# that the ROC curve cell reads later.
proba = (continous_proba[:, 1] >= 0.5).astype(int)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the true labels against the predicted 0/1 labels
matrix = confusion_matrix(y_test, proba)

plt.figure(figsize=(12, 6))

# fmt="d" because the cells are integer counts. The gap between cells is set
# with `linewidths` (all lower-case); `lineWidths` is not a recognised keyword.
s.heatmap(matrix, annot=True, fmt="d", linewidths=5, square=True, cmap="PuBuGn_r")
plt.show()

### 3.3.2 Classification report

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the predicted labels on the validation split.
print(classification_report(y_test, proba))

### 3.3.3 AUC ROC Score

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC measures how well the scores rank the positives, so it is computed
# from the predicted class-1 probabilities rather than from the 0/1 labels,
# which would collapse the curve to a single operating point.
roc_auc_score(y_test, lgbm_fit.predict_proba(x_test)[:, 1])

In [None]:
from sklearn.metrics import roc_curve

# Recompute the class-1 probabilities here so the curve is always drawn from
# continuous scores, independent of any in-place change made to
# continous_proba elsewhere in the notebook.
positive_scores = lgbm_fit.predict_proba(x_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
plt.plot(fpr, tpr)
# Diagonal reference line (true positive rate equal to false positive rate)
plt.plot(fpr, fpr, linestyle = '--', color = 'b')
plt.xlabel('Rate of false positive')
plt.ylabel('Rate of true positive')
plt.title('ROC')
plt.show()

## 3.4 Submission

In [None]:
# Predict the class-1 probability for the competition test set and write the
# submission file. Dedicated names are used so the training X / y defined
# earlier are not overwritten, which would corrupt any earlier cell that is
# re-run afterwards.
submission_features = test_data.drop(["SeriousDlqin2yrs"], axis=1)
submission_proba = lgbm_fit.predict_proba(submission_features)[:, 1]
ids = submission_features.index.values
predicted = pd.DataFrame({'Id': ids, 'Probability': submission_proba})
predicted.to_csv("submission.csv", index=False)