In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the competition input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input/GiveMeSomeCredit'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 定义问题与数据读取
* 本次竞赛所要预测的问题：某人在未来两年内遭遇财务困境的可能性。

In [None]:
# Load the training and test CSV files into DataFrames.
train_df = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-training.csv')
test_df = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-test.csv')

#  数据探索与预处理
> **1. 数据探索**

In [None]:
train_df.head()   # Show the first 5 rows of the training set

In [None]:
test_df.head()    # Show the first 5 rows of the test set

In [None]:
train_df.columns   # List the column names (features) of the training set

分类型特征：SeriousDlqin2yrs

数值型特征：
              比率标度：RevolvingUtilizationOfUnsecuredLines、DebtRatio
              连续型特征：age、MonthlyIncome
              离散型特征：NumberOfOpenCreditLinesAndLoans、NumberOfTime30-59DaysPastDueNotWorse、NumberOfTime60-89DaysPastDueNotWorse、NumberOfTimes90DaysLate、NumberOfDependents

In [None]:
train_df.info()        # Show summary information about the training set

训练集总共有150000条数据，数据类型都是int和float，MonthlyIncome（月收入）和NumberOfDependents（家属数量）有缺失值

In [None]:
test_df.info()    # Show summary information about the test set

测试集总共有101503条数据，MonthlyIncome和NumberOfDependents都有缺失值。

**查看缺失值**

In [None]:
train_df.isnull().sum()   # Count missing values per column in the training set

In [None]:
test_df.isnull().sum()    # Count missing values per column in the test set

训练集和测试集的缺失值数量都占比较大，所以不能直接删除。
MonthlyIncome属于连续型数值特征，且缺失较多，可以用平均数填补缺失值。NumberOfDependents可以用中位数填充空值。

**查看异常值**

In [None]:
# Summary statistics of the training set, used here to look for outliers.
train_df.describe()

训练集和测试集RevolvingUtilizationOfUnsecuredLines在75%值为0.55，最大值却为50578，可能分布不均或是异常值存在。DebtRatio也是同样的问题。NumberOfTime30-59DaysPastDueNotWorse, NumberOfTimes90DaysLate, NumberOfTime60-89DaysPastDueNotWorse三个特征的最大值都是98。训练集age最小值存在0，有异常值。家属数量最大值有43。

In [None]:
# Same summary, but with a custom set of percentiles (61%, 62%, 68%, 69%, 75%, 80%, 99%).
train_df.describe(percentiles=[.61, .62, .68, .69, .75, .8, .99])

具体查看一下age的情况

In [None]:
train_df.loc[train_df['age'] < 18]    # Show the training rows whose age is below 18

age小于18的数据只有一条，用中位数填充就好

查看以下三条相近特征的箱线图，看是否有异常值

In [None]:
# Box plots of the three past-due count features on one figure, to check for outliers.
past_due_columns = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]
plt.figure(figsize=(10, 5))
train_df[past_due_columns].boxplot()
plt.show()

可以看到三个特征都存在异常点，将异常值都删掉。

In [None]:
# Distribution of the target column SeriousDlqin2yrs.
plt.figure()
# The column must be passed through the `x` keyword: in current seaborn the only
# positional parameter of countplot is `data`, so passing the column name
# positionally together with `data=` raises a TypeError.
sns.countplot(x='SeriousDlqin2yrs', data=train_df)
plt.show()

可以看到为0的占比大多数，分类非常不平衡，可能会造成预测性能下降。

In [None]:
# Pairwise correlation between all columns of the training set, drawn as an
# annotated heat map on an explicit 13x13 Axes.
correlation = train_df.corr()
f, ax = plt.subplots(figsize=(13, 13))
ax.set_title('heatmap', y=1, size=16)
sns.heatmap(correlation, annot=True, vmax=0.8, ax=ax)

这时的热力图是还没消除异常值的情况下

In [None]:
# Distribution of age (histogram plus KDE curve).
# sns.distplot is deprecated and scheduled for removal; histplot with
# kde=True, stat='density' and kde_kws=dict(cut=3) is the documented equivalent.
age = train_df['age']
sns.histplot(age, kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()

# Distribution of monthly income, restricted to rows with income below 50000
# (the comparison also excludes rows where MonthlyIncome is missing).
mi = train_df[train_df['MonthlyIncome'] < 50000]['MonthlyIncome']
sns.histplot(mi, kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()      # Author's reading of the plots: both age and income look roughly normal

年龄和收入分布近似正态分布。

# 2.数据预处理 

In [None]:
# Remove rows that are exact duplicates across all columns (in place).
# NOTE(review): the unnamed first column is still part of the frame at this point;
# if it is a unique row id, no two rows can be equal and nothing is dropped — confirm,
# and consider de-duplicating on the remaining columns only.
train_df.drop_duplicates(inplace=True)    # de-duplicate

对未命名的第一列重命名为ID，方便观察。

In [None]:
# Give the unnamed first column ('Unnamed: 0') the name 'ID' in both frames.
id_column_rename = {'Unnamed: 0': 'ID'}
for frame in (train_df, test_df):
    frame.rename(columns=id_column_rename, inplace=True)

将NumberOfTime30-59DaysPastDueNotWorse、NumberOfTimes90DaysLate、NumberOfTime60-89DaysPastDueNotWorse中大于等于90的点删除。

In [None]:
# Keep only the training rows where all three past-due counters are below 90.
late_count_columns = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
    'NumberOfTime60-89DaysPastDueNotWorse',
]
rows_to_keep = (train_df[late_count_columns] < 90).all(axis=1)
train_df = train_df[rows_to_keep]

家属人数大于8的设为8。

In [None]:
# Cap NumberOfDependents at 8 in both the training and the test set.
# clip() is vectorised and leaves missing values untouched. The result is stored
# through normal column assignment rather than by writing into `Series.values`,
# which only takes effect when that array happens to be a writable view of the
# frame's data (it is not under pandas Copy-on-Write).
for frame in (train_df, test_df):
    frame['NumberOfDependents'] = frame['NumberOfDependents'].clip(upper=8)

age为0的用中位数填充。

In [None]:
# Replace age == 0 with the median age of the same data set (test set, then training set).
# A boolean mask with .loc assigns through the DataFrame itself instead of writing
# into `Series.values`, which is not guaranteed to propagate back to the frame.
# The median is computed before the assignment, so it still includes the zero rows.
for frame in (test_df, train_df):
    zero_age_rows = frame['age'] == 0
    frame.loc[zero_age_rows, 'age'] = frame['age'].median()

用中位数填充年龄小于18的数据。

In [None]:
# Replace ages below 18 in the training set with the training-set median age.
# Only train_df is modified by this cell.
train_df.loc[train_df['age'] < 18, 'age'] = train_df['age'].median()

用平均数填充MonthlyIncome空值，用中位数填充NumberOfDependents空值。

In [None]:
# Impute missing values in each data set from that data set's own statistics:
# MonthlyIncome gets the column mean, NumberOfDependents gets the column median.
for frame in (train_df, test_df):
    frame['MonthlyIncome'] = frame['MonthlyIncome'].fillna(frame['MonthlyIncome'].mean())
    frame['NumberOfDependents'] = frame['NumberOfDependents'].fillna(frame['NumberOfDependents'].median())

划分训练集和测试集。

In [None]:
from sklearn.model_selection import train_test_split

# Target is SeriousDlqin2yrs; features are every other column except the row ID.
y = train_df['SeriousDlqin2yrs']
x = train_df.drop(columns=['SeriousDlqin2yrs', 'ID'])

# Hold out part of the labelled data for evaluation (default split size, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

写出绘图auc曲线函数

In [None]:
# ROC curve plotting helper
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on a new 8x6 figure.

    Parameters
    ----------
    fpr : array-like
        False positive rates (x values of the curve).
    tpr : array-like
        True positive rates (y values of the curve).
    label : str, optional
        Legend entry for the curve. A legend is drawn only when a label is
        given, so calls without a label produce the same figure as before.
    """
    plt.figure(figsize=(8,6))
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1],[0,1], "k--") # dashed diagonal as a reference line
    plt.axis([0,1,0,1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive rate")
    if label is not None:
        # Without a legend the label passed to plt.plot would never be shown.
        plt.legend(loc="lower right")

# 3.模型选择

In [None]:
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline: fit on the training split, then score the held-out split.
rfc = RandomForestClassifier(random_state=42).fit(x_train, y_train)

# Second column of predict_proba (class index 1) is used as the score.
pred = rfc.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, pred)
roc_auc = auc(fpr, tpr)

plot_roc_curve(fpr, tpr)
print('AUC Score :', roc_auc)

In [None]:
from xgboost import XGBClassifier

# XGBoost baseline: depth-5 trees, logistic objective, AUC as the evaluation metric.
model = XGBClassifier(
    max_depth=5,
    eval_metric='auc',
    objective='binary:logistic',
).fit(x_train, y_train)

# Score the held-out split with the second predict_proba column (class index 1).
y_pred = model.predict_proba(x_test)[:, 1]

# ROC curve and area under it for the held-out split.
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print('AUC Score :', roc_auc)
plot_roc_curve(fpr, tpr)

In [None]:

from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting baseline with default hyper-parameters.
gbc_clf_submission = GradientBoostingClassifier().fit(x_train, y_train)

# Score the held-out split with the second predict_proba column (class index 1).
gbc_clf_proba = gbc_clf_submission.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, gbc_clf_proba)
roc_auc = auc(fpr, tpr)
print('AUC Score :', roc_auc)
plot_roc_curve(fpr, tpr)

# 4.优化调参

先对提升框架内的，迭代次数和学习率做调整，选一个较小的学习率，对迭代次数网格化调参。

In [None]:
# from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# param_test1 = {'n_estimators':range(20,81,10)}
# gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=300,
#                                   min_samples_leaf=20,max_depth=8,max_features='sqrt', subsample=0.8,random_state=10),
#                        param_grid = param_test1, scoring='roc_auc',cv=5)
# gsearch1.fit(x_train,y_train)
# means = gsearch1.cv_results_['mean_test_score']
# params = gsearch1.cv_results_['params']
# print(means)
# print(params)

找到了一个合适的迭代次数，现在开始对决策树进行调参。首先我们对决策树最大深度max_depth和内部节点再划分所需最小样本数min_samples_split进行网格搜索。

In [None]:
# param_test2 = {'max_depth':range(3,10,2), 'min_samples_split':range(100,801,200)}
# gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50, min_samples_leaf=20, 
#       max_features='sqrt', subsample=0.8, random_state=10), 
#    param_grid = param_test2, scoring='roc_auc', cv=5)
# gsearch2.fit(x_train,y_train)
# means = gsearch2.cv_results_['mean_test_score']
# params = gsearch2.cv_results_['params']
# print(means)
# print(params)

由于决策树深度7是一个比较合理的值，所以把它定下来，对于内部节点再划分所需最小样本数min_samples_split，暂时不能一起定下来，因为这个和决策树其他的参数存在关联。再对min_samples_split和min_samples_leaf一起调参。

In [None]:
# param_test3 = {'min_samples_split':range(800,1900,200), 'min_samples_leaf':range(60,101,10)}
# gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50,max_depth=7,
#                                      max_features='sqrt', subsample=0.8, random_state=10), 
#                        param_grid = param_test3, scoring='roc_auc', cv=5)
# gsearch3.fit(x_train,y_train)
# means = gsearch3.cv_results_['mean_test_score']
# params = gsearch3.cv_results_['params']
# print(means)
# print(params)

In [None]:
# gbm1 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50,max_depth=7, min_samples_leaf =120, 
#                min_samples_split =1000, max_features='sqrt', subsample=0.8, random_state=10)
# gbm1.fit(x_train,y_train)
# gbc_clf_proba = gbm1.predict_proba(x_test)[:,1]
# y_pred = gbm1.predict(x_test)
# fpr, tpr, _ = roc_curve(y_test, gbc_clf_proba)
# roc_auc = auc(fpr,tpr)
# plot_roc_curve(fpr,tpr)
# print ('AUC Score :', roc_auc)

In [None]:
# param_test4 = {'max_features':range(1,9)}
# gsearch4 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50,max_depth=7, min_samples_leaf =120, 
#                min_samples_split =1000, subsample=0.8, random_state=10), 
#                        param_grid = param_test4, scoring='roc_auc', cv=5)
# gsearch4.fit(x_train,y_train)
# means = gsearch4.cv_results_['mean_test_score']
# params = gsearch4.cv_results_['params']
# print(means)
# print(params)

In [None]:
# param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
# gsearch5 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50,max_depth=7, min_samples_leaf =120, 
#                min_samples_split =1000, max_features=4, random_state=10), 
#                        param_grid = param_test5, scoring='roc_auc', cv=5)
# gsearch5.fit(x_train,y_train)
# means = gsearch5.cv_results_['mean_test_score']
# params = gsearch5.cv_results_['params']
# print(means)
# print(params)

In [None]:
# Final gradient boosting model: small learning rate with many estimators,
# plus the tree settings chosen above.
gbm1 = GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=900,
    max_depth=7,
    min_samples_leaf=120,
    min_samples_split=1000,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
).fit(x_train, y_train)

# Scores (class index 1) and hard class predictions for the held-out split.
gbc_clf_proba = gbm1.predict_proba(x_test)[:, 1]
y_pred = gbm1.predict(x_test)

# ROC curve and area under it for the held-out split.
fpr, tpr, _ = roc_curve(y_test, gbc_clf_proba)
roc_auc = auc(fpr, tpr)
plot_roc_curve(fpr, tpr)
print('AUC Score :', roc_auc)

In [None]:
# Build the submission file from the final model's predictions on the competition test set.
# errors='ignore' keeps this cell re-runnable: test_df is overwritten here, so on a
# second run the two columns are already gone and a plain drop would raise KeyError.
test_df = test_df.drop(columns=['SeriousDlqin2yrs', 'ID'], errors='ignore')
gbc_clf_proba = gbm1.predict_proba(test_df)[:, 1]

# Ids run 1..N, with N taken from the number of predictions rather than a
# hard-coded row count, so the two columns always have the same length.
ids = np.arange(1, len(gbc_clf_proba) + 1)
submission = pd.DataFrame({'Id': ids, 'Probability': gbc_clf_proba})
submission.to_csv("submision.csv", index=False)

最后运用新参数拟合模型，得到最终模型。