# Load Data

In [None]:
import pandas as pd

# Read the training set and discard the 'Unnamed: 0' column
# (presumably the row index saved into the CSV - verify against the file).
train_data = (
    pd.read_csv('../input/GiveMeSomeCredit/cs-training.csv')
    .drop(columns='Unnamed: 0')
)
train_data



# Data Processing

In [None]:
# Remove exact duplicate rows and renumber the index 0..n-1.
train_data = train_data.drop_duplicates().reset_index(drop=True)

# Treat a missing dependents count as zero. Plain assignment is used instead of
# `train_data[col].fillna(..., inplace=True)`: that chained in-place form acts on
# a temporary under pandas Copy-on-Write and would leave the frame unchanged.
train_data['NumberOfDependents'] = train_data['NumberOfDependents'].fillna(0)
train_data.info()

In [None]:
# Keep only the rows whose MonthlyIncome is present; they form the training set
# for the regressor that imputes the missing incomes.
xgb_feature_columns = [
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
]

income_known = train_data['MonthlyIncome'].notna()
xgb_data = train_data[income_known].reset_index(drop=True)

# Feature columns followed by the regression target.
xgb_train = xgb_data[[*xgb_feature_columns, 'MonthlyIncome']]
xgb_train



In [None]:
from model_train import xgb_optuna_regression
# Fit the MonthlyIncome regressor on the rows with known income.
# `xgb_optuna_regression` comes from the project-local `model_train` module and is
# not visible here; presumably it runs an Optuna search of `n_trials` trials and
# returns a fitted model exposing `.predict` - confirm against that module.
xgb_model = xgb_optuna_regression(
    df=xgb_train,
    label_column='MonthlyIncome',
    enable_optuna=True,
    n_trials=50,
)


In [None]:
# Rows whose MonthlyIncome is missing. Take an explicit copy so the assignments
# below write to an independent frame rather than to a slice of `train_data`
# (avoids SettingWithCopyWarning / chained-assignment ambiguity).
pred_data = train_data[train_data['MonthlyIncome'].isna()].copy()

# Impute the missing incomes with the regressor fitted on the known-income rows.
pred_features = pred_data[xgb_feature_columns]
pred_monthly = xgb_model.predict(pred_features)
pred_data['MonthlyIncome'] = pred_monthly

# An income cannot be negative: floor predictions at 0, then round to 1 decimal.
pred_data['MonthlyIncome'] = pred_data['MonthlyIncome'].clip(lower=0).round(1)

pred_data = pred_data.reset_index(drop=True)
pred_data


In [None]:
# Stack the rows with observed income on top of the rows with imputed income
# and renumber the combined frame 0..n-1.
train_data = pd.concat([xgb_data, pred_data], ignore_index=True)
train_data.info()

In [None]:
# Outlier detection (TODO: not implemented)

In [None]:
# Class imbalance: oversample the positive class (SeriousDlqin2yrs == 1) by
# replicating it a whole number of times so it roughly matches the negative class.
# (Counts noted in the original run: 10009 positives vs 139382 negatives.)
#
# NOTE(review): the replication happens before the train/validation split done in
# the training cell, so copies of the same row can end up on both sides of the
# split and validation metrics will be optimistic. Consider splitting first and
# oversampling only the training part.

minority = train_data[train_data['SeriousDlqin2yrs'] == 1]
majority = train_data[train_data['SeriousDlqin2yrs'] == 0]

if minority.empty:
    raise ValueError('No rows with SeriousDlqin2yrs == 1; nothing to oversample.')

# Whole-number replication factor; at least 1 so the positive class is never
# dropped when it is not actually the smaller one.
replication = max(1, len(majority) // len(minority))

# Build the list once and concatenate once: `DataFrame.append` was removed in
# pandas 2.0, and growing a frame inside a loop is quadratic.
sample_data = pd.concat([minority] * replication + [majority])
sample_data.info()

# Label / feature split by position - assumes the label is the first column
# (TODO confirm; the training cell selects it by name instead).
X = sample_data.iloc[:, 1:]
y = sample_data.iloc[:, 0]
y.value_counts()


In [None]:
# train
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import sklearn.metrics
from model_train import logistic_optuna
import time
import optuna


# Train a logistic-regression classifier on the resampled data, optionally
# tuning solver / class_weight / C with Optuna.
df = sample_data.copy()
label_column = 'SeriousDlqin2yrs'
n_trials = 100
enable_optuna = True
random_state = 42  # fixed seed so the split, the search and the fit are reproducible

start_time = time.time()
y_data = df[label_column]
x_data = df.drop(columns=[label_column])
# Stratify so both parts keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    x_data, y_data, test_size=0.2, random_state=random_state, stratify=y_data
)

# Defaults used when the search is disabled. Without them `class_weight` and `c`
# were undefined and the final fit raised NameError when enable_optuna was False.
solver = 'liblinear'
class_weight = None
c = 1.0


def objective(trial):
    """Fit one candidate model and return its validation error (lower is better)."""
    trial_solver = trial.suggest_categorical('solver', ['liblinear', 'sag', 'saga'])
    trial_class_weight = trial.suggest_categorical('class_weight', ['balanced', None])
    # suggest_float replaces the deprecated suggest_uniform (same range).
    trial_c = trial.suggest_float('c', 0.1, 1)
    candidate = LogisticRegression(
        solver=trial_solver,
        class_weight=trial_class_weight,
        C=trial_c,
        random_state=random_state,
    )
    candidate.fit(X_train, y_train)
    # For 0/1 labels the mean squared error equals the misclassification rate.
    return sklearn.metrics.mean_squared_error(y_val, candidate.predict(X_val))


if enable_optuna:
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )
    study.optimize(objective, n_trials=n_trials)
    solver = study.best_params['solver']
    class_weight = study.best_params['class_weight']
    c = study.best_params['c']

# Refit with the selected (or default) hyper-parameters.
model = LogisticRegression(
    solver=solver, class_weight=class_weight, C=c, random_state=random_state
)
model.fit(X_train, y_train)
end_time = time.time()
print('time cost : ', round((end_time - start_time) / 60, 2), 'min')

# Validation outputs consumed by the evaluation cells.
y_prob = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)
y_prob



In [None]:
# AUC

from sklearn import metrics

# ROC curve points and the area under it, from the validation probabilities.
fpr_lr, tpr_lr, threshold_lr = metrics.roc_curve(y_val, y_prob)
auc_lr = metrics.auc(fpr_lr, tpr_lr)

# accuracy_score is the share of correctly predicted labels; it was previously
# printed under the label "precision", which is a different metric.
score_lr = metrics.accuracy_score(y_val, y_pred)
print('accuracy: ', score_lr)
print('AUC: ', auc_lr)

In [None]:
import matplotlib.pyplot as plt

# Plot the validation ROC curve computed in the AUC cell.
plt.rcParams["figure.figsize"] = [5, 5]
fig, ax = plt.subplots()
ax.plot(fpr_lr, tpr_lr, label='ROC')
# Diagonal = performance of a random classifier, for visual reference.
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
ax.set_title('ROC')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend(loc="best", shadow=True)
ax.tick_params(axis='x', labelrotation=45)
ax.grid()
plt.show()

# Predict

In [None]:
import numpy as np

# Load the test set and apply the same preparation as the training data,
# keeping the rows in their original file order so that the predictions made
# from `pred_data` line up row-for-row with the test file.
pred_data = (
    pd.read_csv('../input/GiveMeSomeCredit/cs-test.csv')
    .drop(columns=['Unnamed: 0'])
)

# Plain assignment rather than chained `fillna(..., inplace=True)`, which acts on
# a temporary under pandas Copy-on-Write and would leave the frame unchanged.
pred_data['NumberOfDependents'] = pred_data['NumberOfDependents'].fillna(0)

# Impute missing MonthlyIncome in place with the regressor fitted earlier.
# `xgb_feature_columns` is the list defined in the data-processing section;
# it is reused here instead of being redefined.
income_missing = pred_data['MonthlyIncome'].isna()
if income_missing.any():
    f_monthly = np.asarray(
        xgb_model.predict(pred_data.loc[income_missing, xgb_feature_columns])
    )
    # An income cannot be negative: floor at 0, then round to 1 decimal.
    pred_data.loc[income_missing, 'MonthlyIncome'] = np.round(
        np.clip(f_monthly, 0, None), 1
    )

pred_data.info()


In [None]:
# Score the prepared test rows with the trained classifier; the label column is
# removed first so only the feature columns are passed to the model.
test_features = pred_data.drop(columns=['SeriousDlqin2yrs'])
pred_df = model.predict_proba(test_features)
pred_df
