In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found beneath it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

把Dictionary的信息打印出来，方便查看

In [None]:
!pip install xlrd
import pandas as pd
dict_all = pd.read_excel("/kaggle/input/GiveMeSomeCredit/Data Dictionary.xls")
dict_all

读入数据，查看数据信息

In [None]:
# Read the competition's training and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv("../input/GiveMeSomeCredit/cs-training.csv"),
    pd.read_csv("../input/GiveMeSomeCredit/cs-test.csv"),
)

In [None]:
# Print column dtypes and non-null counts for the training frame.
df_train.info()

In [None]:
# Print column dtypes and non-null counts for the test frame.
df_test.info()

做数据清洗

In [None]:
# The CSVs carry an unnamed first column; give it the explicit name "ID" in both frames.
id_column_map = {"Unnamed: 0": "ID"}
df_train = df_train.rename(columns=id_column_map)
df_test = df_test.rename(columns=id_column_map)

In [None]:
# Count fully duplicated rows (True) versus unique rows (False) in each frame.
train_duplicate_counts = df_train.duplicated().value_counts()
test_duplicate_counts = df_test.duplicated().value_counts()
print(train_duplicate_counts)
print("------------------")
print(test_duplicate_counts)

In [None]:
# Summary statistics of the training frame, used to spot implausible values and gaps.
df_train.describe()

In [None]:
# Summary statistics of the test frame, for comparison with the training frame.
df_test.describe()

训练数据的age列中存在取值为0的记录，不合理；
另外，训练集和测试集的MonthlyIncome列和NumberOfDependents列中都有NaN值。

> 先解决年龄列的问题：

In [None]:
# Show every training row whose recorded age is below 18.
is_underage = df_train['age'].lt(18)
df_train[is_underage]

In [None]:
# Preview the statistics with the invalid age-0 record(s) removed.
# Filter on the condition itself instead of a hard-coded row label: a fixed label
# breaks (KeyError) or drops the wrong row as soon as the index changes.
temp = df_train.loc[df_train['age'] != 0].copy()
temp.describe()

去掉age为0的记录后，age列的标准差为14.771298，因此可以考虑在原df中用age的平均值代替这个0值。

In [None]:
# Replace the invalid age of 0 with the mean age of the remaining records.
# The mean is taken over non-zero ages only, so the bad value does not bias its own
# replacement, and it is rounded to an integer so the column is not upcast to float
# (NOTE(review): assumes 'age' is an integer column — confirm against df_train.info()).
age_is_zero = df_train['age'] == 0
age_fill = int(round(df_train.loc[~age_is_zero, 'age'].mean()))
df_train.loc[age_is_zero, 'age'] = age_fill
df_train.describe()

> 解决有空值的两个列的问题：

首先发现训练集和测试集中NumberOfDependents的std均为1.11左右，可以考虑使用平均数来代替空值；但是平均数为浮点数，而该列需要的是整数，因此对平均数向下取整。

In [None]:
# Impute missing NumberOfDependents in the training frame with the column mean
# truncated to an integer. The result is assigned back to the column: calling
# fillna(inplace=True) on a column selection is chained assignment, which warns in
# pandas 2.x and does not modify the frame once Copy-on-Write is enabled.
train_dependents_fill = int(df_train['NumberOfDependents'].mean())
df_train['NumberOfDependents'] = df_train['NumberOfDependents'].fillna(train_dependents_fill)

In [None]:
# Impute missing NumberOfDependents in the test frame with that frame's own column
# mean truncated to an integer. Assigned back rather than fillna(inplace=True) on a
# column selection, which is chained assignment and is a no-op under Copy-on-Write.
# NOTE(review): the fill value comes from the test data itself, not from the training
# data — confirm this is intended.
test_dependents_fill = int(df_test['NumberOfDependents'].mean())
df_test['NumberOfDependents'] = df_test['NumberOfDependents'].fillna(test_dependents_fill)

对于MonthlyIncome，也使用平均数代替NaN值：

In [None]:
# Impute missing MonthlyIncome in the training frame with the column mean.
# Assigned back rather than fillna(inplace=True) on a column selection, which is
# chained assignment: it warns in pandas 2.x and is a no-op under Copy-on-Write.
train_income_fill = df_train['MonthlyIncome'].mean()
df_train['MonthlyIncome'] = df_train['MonthlyIncome'].fillna(train_income_fill)

In [None]:
# Impute missing MonthlyIncome in the test frame with that frame's own column mean.
# Assigned back rather than fillna(inplace=True) on a column selection, which is
# chained assignment: it warns in pandas 2.x and is a no-op under Copy-on-Write.
# NOTE(review): the fill value comes from the test data itself, not from the training
# data — confirm this is intended.
test_income_fill = df_test['MonthlyIncome'].mean()
df_test['MonthlyIncome'] = df_test['MonthlyIncome'].fillna(test_income_fill)

In [None]:
# Re-check the training frame's summary statistics after imputation.
df_train.describe()

In [None]:
# Re-check the test frame's summary statistics after imputation.
df_test.describe()

从训练数据可以看出，75%以上的数据都是反例，也就是不会遭遇财务危机，可以查看一下：

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Bar chart of how many training rows fall in each target class.
# The column must be passed as `x=`: seaborn >= 0.12 accepts only `data` positionally,
# so a positional column name collides with `data=` and raises a TypeError.
fig, ax = plt.subplots()
sns.countplot(x="SeriousDlqin2yrs", data=df_train, ax=ax)
ax.set_title("Class balance of SeriousDlqin2yrs");

可见，0和1的比例是极其不平衡的。因此考虑使用集成学习+阈值调整的方法。

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are every column except the target and the row identifier.
X = df_train.drop(['SeriousDlqin2yrs', 'ID'], axis = 1)
Y = df_train['SeriousDlqin2yrs']
# Hold out a validation split. `stratify` keeps the (heavily imbalanced) class ratio
# identical in both parts, and a fixed `random_state` makes the split — and therefore
# every AUC reported below — reproducible across Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify = Y, random_state = 0)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the features using statistics learned from the training split only, then
# apply the same transform to the held-out split.
# Gotcha: X_train / X_test are overwritten here, so re-running this cell alone would
# refit the scaler on already-standardized data; re-run from the split cell instead.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import xgboost as xgb
from sklearn import metrics
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [None]:
# Baseline model: 1000 boosting rounds of depth-5 trees with learning rate 0.1.
xgb1 = XGBClassifier(
    n_estimators = 1000,
    max_depth = 5,
    learning_rate = 0.1,
    objective = 'binary:logistic',
)

In [None]:
# Fit the baseline model on the standardized training split.
xgb1.fit(X_train, Y_train)

In [None]:
# Per-class probability estimates of the baseline model on the training split
# (one row per sample, one column per class).
Y_pred = xgb1.predict_proba(X_train)
Y_pred

In [None]:
# Keep only column 1 of the probability matrix
# (presumably the positive class, SeriousDlqin2yrs == 1 — confirm via xgb1.classes_).
Y_pred_true = Y_pred[:, 1]
Y_pred_true

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# ROC curve points (false-positive rate, true-positive rate, thresholds) on the training split.
fpr_train, tpr_train, thresh_train = roc_curve(Y_train, Y_pred_true)

In [None]:
# Draw the training-split ROC curve on an explicit Axes object.
fig, ax = plt.subplots()
ax.plot(fpr_train, tpr_train)
ax.set_xlabel("FP%")
ax.set_ylabel("TP%")
plt.show()

In [None]:
# Area under the ROC curve for the baseline model on the training split.
auc_train = roc_auc_score(Y_train, Y_pred_true)
print("AUC Score = {}".format(auc_train))

这个AUC Score很高，在测试集上看看是不是过拟合了：

In [None]:
# Per-class probability estimates of the baseline model on the held-out split.
Y_pred_test = xgb1.predict_proba(X_test)
Y_pred_test

In [None]:
# Column 1 of the held-out probabilities. Note: this reuses the name Y_pred_true,
# which from here on holds held-out scores instead of the training scores.
Y_pred_true = Y_pred_test[:, 1]
Y_pred_true

In [None]:
# ROC curve points (false-positive rate, true-positive rate, thresholds) on the held-out split.
fpr_test, tpr_test, thresh_test = roc_curve(Y_test, Y_pred_true)

In [None]:
# Draw the held-out ROC curve on an explicit Axes object.
fig, ax = plt.subplots()
ax.plot(fpr_test, tpr_test)
ax.set_xlabel("FP%")
ax.set_ylabel("TP%")
plt.show()

In [None]:
# Area under the ROC curve for the baseline model on the held-out split.
auc_test = roc_auc_score(Y_test, Y_pred_true)
print("AUC Score = {}".format(auc_test))

可以发现确实过拟合了，因此把XGBoost分类器的参数调一下：

In [None]:
# Second model: fewer rounds, shallower trees, a smaller learning rate and 50% row
# subsampling compared with xgb1.
xgb2 = XGBClassifier(
    n_estimators = 500,
    max_depth = 3,
    learning_rate = 0.03,
    objective = 'binary:logistic',
    subsample = 0.5,
    eval_metric = "error",
)

In [None]:
# Fit the second model on the standardized training split.
xgb2.fit(X_train, Y_train)

In [None]:
# Score xgb2 on both splits: column 1 of predict_proba feeds the ROC curve and the AUC.
Y_pred_train = xgb2.predict_proba(X_train)[:, 1]
Y_pred = xgb2.predict_proba(X_test)  # Y_pred ends the cell holding the held-out probabilities
Y_pred_test = Y_pred[:, 1]

fpr_train, tpr_train, thresh_train = roc_curve(Y_train, Y_pred_train)
fpr_test, tpr_test, thresh_test = roc_curve(Y_test, Y_pred_test)

train_auc = roc_auc_score(Y_train, Y_pred_train)
test_auc = roc_auc_score(Y_test, Y_pred_test)
print("AUC Score(train) = {}".format(train_auc))
print("AUC Score(test) = {}".format(test_auc))

过拟合的问题有了很大的改善。

最后输出结果即可：

In [None]:
# Disabled: first submission attempt, scored with xgb2.
# NOTE(review): this version passes df_test_X to xgb2 without scaler.transform, although
# xgb2 was fitted on standardized features — this may explain the poor leaderboard
# result mentioned below; confirm before reusing.
# df_test_X = df_test.drop(['SeriousDlqin2yrs', 'ID'], axis = 1)
# submission_proba = xgb2.predict_proba(df_test_X)
# submission_scores = submission_proba[:, 1]
# submission_scores.shape


In [None]:
# Disabled: writes the submission CSV for the first (xgb2) attempt. An active cell with
# the same statements appears at the end of the notebook.
# ids = np.arange(1, 101504)
# submission = pd.DataFrame({'Id': ids, 'Probability': submission_scores})
# submission.to_csv('submission.csv', index = False)

最后提交的结果不好，因此考虑先对数据欠采样：

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Randomly undersample the standardized training split with a fixed seed.
# Only the training split is resampled; the held-out split is left untouched.
under_samp = RandomUnderSampler(random_state = 0)
resampled = under_samp.fit_resample(X_train, Y_train)
X_sample, Y_sample = resampled

In [None]:
# Third model: xgb1's tree settings plus 50% row subsampling, to be fitted on the
# undersampled data.
xgb3 = XGBClassifier(
    n_estimators = 1000,
    max_depth = 5,
    learning_rate = 0.1,
    objective = 'binary:logistic',
    subsample = 0.5,
    eval_metric = "error",
)

In [None]:
# Fit the third model on the undersampled training data.
xgb3.fit(X_sample, Y_sample)

In [None]:
# Score xgb3 on the full (not undersampled) training split and on the held-out split.
Y_pred_train = xgb3.predict_proba(X_train)[:, 1]
Y_pred = xgb3.predict_proba(X_test)  # Y_pred ends the cell holding the held-out probabilities
Y_pred_test = Y_pred[:, 1]

fpr_train, tpr_train, thresh_train = roc_curve(Y_train, Y_pred_train)
fpr_test, tpr_test, thresh_test = roc_curve(Y_test, Y_pred_test)

train_auc = roc_auc_score(Y_train, Y_pred_train)
test_auc = roc_auc_score(Y_test, Y_pred_test)
print("AUC Score(train) = {}".format(train_auc))
print("AUC Score(test) = {}".format(test_auc))

看起来还是有些过拟合，再调参：

In [None]:
# Fourth model: adds min_child_weight and gamma, halves the learning rate to 0.05,
# raises row subsampling to 0.8 and switches eval_metric to "auc".
xgb4 = XGBClassifier(
    n_estimators = 1000,
    min_child_weight = 10,
    max_depth = 5,
    learning_rate = 0.05,
    objective = 'binary:logistic',
    subsample = 0.8,
    eval_metric = "auc",
    gamma = 0.6,
)

In [None]:
# Fit the fourth model on the undersampled training data.
xgb4.fit(X_sample, Y_sample)

In [None]:
# Score xgb4 on the full (not undersampled) training split and on the held-out split.
Y_pred_train = xgb4.predict_proba(X_train)[:, 1]
Y_pred = xgb4.predict_proba(X_test)  # Y_pred ends the cell holding the held-out probabilities
Y_pred_test = Y_pred[:, 1]

fpr_train, tpr_train, thresh_train = roc_curve(Y_train, Y_pred_train)
fpr_test, tpr_test, thresh_test = roc_curve(Y_test, Y_pred_test)

train_auc = roc_auc_score(Y_train, Y_pred_train)
test_auc = roc_auc_score(Y_test, Y_pred_test)
print("AUC Score(train) = {}".format(train_auc))
print("AUC Score(test) = {}".format(test_auc))

看上去结果不错了，生成提交文件：

In [None]:
# Build the competition features exactly as for training: drop the target and ID
# columns, apply the already-fitted scaler, then keep column 1 of xgb4's probabilities.
df_test_X = df_test.drop(columns=['SeriousDlqin2yrs', 'ID'])
X_df_test = scaler.transform(df_test_X)
submission_proba = xgb4.predict_proba(X_df_test)
submission_scores = submission_proba[:, 1]
submission_scores.shape


In [None]:
# Write the submission file. The ids are taken from the test frame's own ID column
# rather than a hard-coded np.arange(1, 101504), so ids and scores always come from
# the same rows and the cell keeps working if the test file's length changes.
ids = df_test['ID'].to_numpy()
submission = pd.DataFrame({'Id': ids, 'Probability': submission_scores})
submission.to_csv('submission.csv', index = False)

In [None]:
# Display the submitted probabilities as a final sanity check.
submission_scores