# 相关库

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from  sklearn.ensemble import RandomForestRegressor
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import re 

# 一、导入数据

In [None]:
# Read the training and test CSV files from the Kaggle input directory
train_data = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-training.csv')
test_data = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-test.csv')

In [None]:
# Training set overview.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line after the report.
train_data.info()
train_data.head(5)

In [None]:
# Test set overview.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line after the report.
test_data.info()
test_data.head(5)

# 二、清洗数据

In [None]:
# Work on copies so the frames loaded from disk stay unchanged by the cleaning steps
train_df = train_data.copy()
test_df = test_data.copy()

## 2.1 检查数据

可以从前面发现，属性名‘Unnamed: 0’应该改为‘ID’

In [None]:
# Rename the unnamed index column to 'ID' in both frames

id_column_names = {'Unnamed: 0':'ID'}
train_df = train_df.rename(columns=id_column_names)
test_df = test_df.rename(columns=id_column_names)

In [None]:
# Count fully duplicated rows. The original note reports that none were found,
# so nothing is dropped (the drop call below is intentionally left disabled).

print(train_df.duplicated().value_counts())

# train_df.drop_duplicates()

In [None]:
# Inspect the class distribution of the target SeriousDlqin2yrs.
# The column is passed as the keyword `x`: recent seaborn releases take `data`
# as the only positional argument, so a positional column name fails there.

plt.figure()
sns.countplot(x='SeriousDlqin2yrs', data=train_df)

In [None]:
# The classes are heavily imbalanced; compute the share of each class.
# P holds one row per class: the row count (in column 'ID') and its percentage.
P = train_df.groupby('SeriousDlqin2yrs')['ID'].count().reset_index()
P['Percentage'] = 100 * P['ID'] / P['ID'].sum()
print(P)

数据不平衡会让监督学习算法过多关注多数类，使分类性能下降；因为数据足够多，采用欠采样；采用正则回归模型和集成模型

## 2.2 缺失值处理

缺失值指的是现有数据集中某个或某些属性的值是不完全的。

缺失值的处理方法一般包括：
1. 直接使用含有缺失值的属性（不处理）；
2. 删除含有缺失值的属性（该方法在包含缺失值的属性仅仅包含**极少量**有效值时是有效的）；
3. 直接删除含有缺失值的样本；
4. 缺失值补全：均值插补、建模预测等等

In [None]:
train_df.isnull().sum()

可以看出
1. 变量MonthlyIncome 缺失值较多，不能直接删除样本，同时缺失值也没有多到能直接删除属性，所以需要补全缺失值，这里使用随机森林预测
2. 变量NumberOfDependents的缺失值较少，这里就直接删除含缺失值的样本

In [None]:
# Fill missing MonthlyIncome values using a random-forest regression
def set_missing(df):
    """Impute missing ``MonthlyIncome`` values in ``df`` with a random forest.

    A ``RandomForestRegressor`` is fitted on the rows whose ``MonthlyIncome``
    is known and then predicts the rows where it is missing. Predictions are
    rounded to whole numbers and written back into ``df`` in place.

    Every column is used as a predictor except:

    * ``ID`` / ``Unnamed: 0`` -- the row identifier;
    * ``SeriousDlqin2yrs`` -- the classification label. Building a feature
      from the label leaks it into the model inputs, so it is excluded
      (the previous version selected it by position as a predictor);
    * ``NumberOfDependents`` -- left out, as before (the notebook notes that
      this column has missing values of its own);
    * ``MonthlyIncome`` -- the value being predicted.

    Columns are selected by name rather than by position, so the function
    does not depend on the column order of the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``MonthlyIncome`` and the predictor columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. If no ``MonthlyIncome`` value is missing the
        frame is returned untouched and no model is fitted.
    """
    target = 'MonthlyIncome'
    excluded = {'ID', 'Unnamed: 0', 'SeriousDlqin2yrs', 'NumberOfDependents', target}
    feature_cols = [col for col in df.columns if col not in excluded]

    missing = df[target].isnull()
    # Nothing to impute: predicting on an empty array would raise.
    if not missing.any():
        return df

    rfr = RandomForestRegressor(random_state=0,
                                n_estimators=200, max_depth=3, n_jobs=-1)
    # The target is passed as a 1-D array; a column vector makes scikit-learn
    # emit a DataConversionWarning.
    rfr.fit(df.loc[~missing, feature_cols].values,
            df.loc[~missing, target].values)

    # Predict the unknown incomes and round them to whole numbers
    predicted = rfr.predict(df.loc[missing, feature_cols].values).round(0)
    print(predicted)
    print(len(predicted))

    # Write the predictions into the rows that were missing
    df.loc[missing, target] = predicted
    return df

In [None]:
train_df=set_missing(train_df)# impute the many missing MonthlyIncome values with the random forest
train_df=train_df.dropna()# drop the few rows that still contain a missing value
train_df.shape

In [None]:
test_df.isnull().sum()

测试数据集也有这样的情况，缺失值填充。不能作删除样本的操作，所以对NumberOfDependents缺失值取中位数。

In [None]:
# Fill missing MonthlyIncome values using a random-forest regression
# (variant that does not use the label column as a predictor)
def set_missing2(df):
    """Impute missing ``MonthlyIncome`` values in ``df`` with a random forest.

    A ``RandomForestRegressor`` is fitted on the rows whose ``MonthlyIncome``
    is known and then predicts the rows where it is missing. Predictions are
    rounded to whole numbers and written back into ``df`` in place.

    Every column is used as a predictor except the row identifier
    (``ID`` / ``Unnamed: 0``), the label ``SeriousDlqin2yrs``,
    ``NumberOfDependents`` and ``MonthlyIncome`` itself. These are the same
    columns the previous positional selection left out; they are now chosen
    by name so the function does not depend on the column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``MonthlyIncome`` and the predictor columns.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. If no ``MonthlyIncome`` value is missing the
        frame is returned untouched and no model is fitted.
    """
    target = 'MonthlyIncome'
    excluded = {'ID', 'Unnamed: 0', 'SeriousDlqin2yrs', 'NumberOfDependents', target}
    feature_cols = [col for col in df.columns if col not in excluded]

    missing = df[target].isnull()
    # Nothing to impute: predicting on an empty array would raise.
    if not missing.any():
        return df

    rfr = RandomForestRegressor(random_state=0,
                                n_estimators=200, max_depth=3, n_jobs=-1)
    # The target is passed as a 1-D array; a column vector makes scikit-learn
    # emit a DataConversionWarning.
    rfr.fit(df.loc[~missing, feature_cols].values,
            df.loc[~missing, target].values)

    # Predict the unknown incomes and round them to whole numbers
    predicted = rfr.predict(df.loc[missing, feature_cols].values).round(0)
    print(predicted)
    print(len(predicted))

    # Write the predictions into the rows that were missing
    df.loc[missing, target] = predicted
    return df

In [None]:
# Impute the frequently-missing MonthlyIncome with the random-forest model
test_df=set_missing2(test_df)
# Fill missing NumberOfDependents with the column median. The result is
# assigned back to the column: fillna(..., inplace=True) on a selected column
# is chained assignment, which pandas warns about and which does not update
# the frame when copy-on-write is active.
test_df['NumberOfDependents'] = test_df['NumberOfDependents'].fillna(test_df['NumberOfDependents'].median())
test_df.shape

## 2.3 异常值处理

异常值指在数据集中存在的不合理的值，又称离群点，比如年龄小于0，或者不符合正态分布的数据。

异常值的处理方法一般包括：
1. 删除含有异常值的样本
2. 将异常值视为缺失值，应用缺失值处理方法
3. 用平均值来修正
4. 不处理

In [None]:
# 看是否有异常值
train_df.describe()

可以从这个结果看到年龄的最小值为0，是不合理的值，且这样的样本很少，所以直接删除异常的样本

In [None]:
# Keep only rows with a positive age; rows whose age is not greater than 0 are dropped
train_df = train_df[train_df['age']>0]

In [None]:
# 大致数据分布情况
train_df.hist(bins=50, figsize=(20,15))
plt.show()

可以看出DebtRatio、NumberOfTime30-59DaysPastDueNotWorse、NumberOfTime60-89DaysPastDueNotWorse、NumberOfTimes90DaysLate、NumberRealEstateLoansOrLines、RevolvingUtilizationOfUnsecuredLines的数据分布情况比较异常，应该有一些极端的数值影响了分布图像的呈现。下面分别用箱型图查看一下数值的分布，删去极端值，并检查相关性。

### 2.3.1 DebtRatio

In [None]:
datatemp1=train_df["DebtRatio"]
datatemp1.plot(kind='box',title='DebtRatio Distribution',sym='r+');

In [None]:
# DebtRatio异常值的数量
print(train_df[train_df['DebtRatio'] > 8000].count()) 

可以看出（相较于150000的总样本数量）DebtRatio的异常值很少，不影响整体数据，于是选择删去

In [None]:
# Drop only the rows counted as outliers (DebtRatio > 8000). `<=` keeps rows
# exactly at the threshold, which the strict `<` removed without counting them.
train_df = train_df[train_df['DebtRatio'] <= 8000]

### 2.3.2 MonthlyIncome

In [None]:
datatemp2=train_df["MonthlyIncome"]
datatemp2.plot(kind='box',title='MonthlyIncome Distribution',sym='r+');

In [None]:
print(train_df[train_df['MonthlyIncome'] > 50000].count()) 

In [None]:
# Drop only the rows counted as outliers (MonthlyIncome > 50000). `<=` keeps
# rows exactly at the threshold, which the strict `<` removed without counting them.
train_df = train_df[train_df['MonthlyIncome'] <= 50000]

### 2.3.3 NumberOfDependents

In [None]:
datatemp3=train_df["NumberOfDependents"]
datatemp3.plot(kind='box',title='NumberOfDependents Distribution',sym='r+');

In [None]:
print(train_df[train_df['NumberOfDependents'] > 10].count()) 

In [None]:
# Drop only the rows counted as outliers (NumberOfDependents > 10). `<=` keeps
# rows with exactly 10 dependents, which the strict `<` removed without counting them.
train_df = train_df[train_df['NumberOfDependents'] <= 10]

### 2.3.4 NumberOfTime30-59DaysPastDueNotWorse

### &NumberOfTime60-89DaysPastDueNotWorse

### &NumberOfTimes90DaysLate

In [None]:
# 查看一下三者的箱型图
plt.figure(figsize=(19, 12)) 
train_df[['NumberOfTime30-59DaysPastDueNotWorse', 
          'NumberOfTime60-89DaysPastDueNotWorse',
          'NumberOfTimes90DaysLate']].boxplot()
plt.show()

In [None]:
print(train_df[train_df['NumberOfTime30-59DaysPastDueNotWorse'] > 20].count())

In [None]:
# Keep rows whose past-due counts are at most 20 in all three columns this
# section covers. Previously only the 30-59 day column was filtered, and the
# strict `<` also removed rows exactly at 20 that the count above (`> 20`)
# does not treat as outliers.
past_due_cols = ['NumberOfTime30-59DaysPastDueNotWorse',
                 'NumberOfTime60-89DaysPastDueNotWorse',
                 'NumberOfTimes90DaysLate']
train_df = train_df[(train_df[past_due_cols] <= 20).all(axis=1)]

### 2.3.5 NumberRealEstateLoansOrLines

In [None]:
datatemp5=train_df["NumberRealEstateLoansOrLines"]
datatemp5.plot(kind='box',title='NumberRealEstateLoansOrLines Distribution',sym='r+');

In [None]:
print(train_df[train_df['NumberRealEstateLoansOrLines'] > 30].count())

In [None]:
# Remove the outliers counted above (NumberRealEstateLoansOrLines > 30). `<=`
# keeps rows exactly at the threshold, which the strict `<` removed without
# counting them.
train_df = train_df[train_df['NumberRealEstateLoansOrLines'] <= 30]

### 2.3.6 RevolvingUtilizationOfUnsecuredLines

In [None]:
datatemp6=train_df["RevolvingUtilizationOfUnsecuredLines"]
datatemp6.plot(kind='box',title='RevolvingUtilizationOfUnsecuredLines Distribution',sym='r+');

In [None]:
print(train_df[train_df['RevolvingUtilizationOfUnsecuredLines'] > 3].count()) 

In [None]:
# Drop only the rows counted as outliers (RevolvingUtilizationOfUnsecuredLines > 3).
# `<=` keeps rows exactly at the threshold, which the strict `<` removed
# without counting them.
train_df = train_df[train_df['RevolvingUtilizationOfUnsecuredLines'] <= 3]

### 处理后的训练集

In [None]:
# 大致数据分布情况
train_df.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
test_df.describe()

测试数据集也有这样的情况，主要对NumberOfTime30-59DaysPastDueNotWorse、NumberOfTime60-89DaysPastDueNotWorse、NumberOfTimes90DaysLate的异常值做替换为中位数的处理

In [None]:
# 查看一下三者的箱型图
plt.figure(figsize=(19, 12)) 
test_df[['NumberOfTime30-59DaysPastDueNotWorse', 
          'NumberOfTime60-89DaysPastDueNotWorse',
          'NumberOfTimes90DaysLate']].boxplot()
plt.show()

In [None]:
# Replace the two outlier codes 96 and 98
def replace98and96(column):
    """Return the values of ``column`` as a list with 96 and 98 replaced.

    Each element equal to 96 or 98 is swapped for ``column.median()``; the
    median is taken over the whole column, including those elements. All
    other elements are kept unchanged and the order is preserved.
    """
    median_value = column.median()
    return [median_value if value in (96, 98) else value for value in column]

In [None]:
# Replace the 96/98 codes with the column median in the three past-due count columns of the test set
test_df['NumberOfTime30-59DaysPastDueNotWorse'] = replace98and96(test_df['NumberOfTime30-59DaysPastDueNotWorse'])
test_df['NumberOfTimes90DaysLate'] = replace98and96(test_df['NumberOfTimes90DaysLate'])
test_df['NumberOfTime60-89DaysPastDueNotWorse'] = replace98and96(test_df['NumberOfTime60-89DaysPastDueNotWorse'])

# 三、探索性分析

客户收入和年龄分布如下图所示，可以看到两个变量都大致呈正态分布，符合统计分析的假设。

In [None]:
# Histogram of MonthlyIncome. The y axis shows how many rows fall into each
# bin, so it is labelled as a count rather than as a value of the variable.
plt.figure(figsize=(15,5))
plt.hist(train_df.MonthlyIncome,bins=70,alpha=0.8,rwidth=0.9)
plt.title("MonthlyIncome distribution")
plt.ylabel('count', fontsize=12)
plt.xlabel('MonthlyIncome', fontsize=12)

plt.show()

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the equivalent figure (requires seaborn 0.11 or later).
MI = train_df['MonthlyIncome']
sns.histplot(MI, kde=True, stat='density')

In [None]:
# Histogram of age. The y axis shows how many rows fall into each bin, so it
# is labelled as a count rather than as a value of the variable.
plt.figure(figsize=(15,5))
plt.hist(train_df.age,bins=50,alpha=0.8,rwidth=0.9)
plt.title("age distribution")
plt.ylabel('count', fontsize=12)
plt.xlabel('age', fontsize=12)

plt.show()

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the equivalent figure (requires seaborn 0.11 or later).
age = train_df['age']
sns.histplot(age, kde=True, stat='density')

In [None]:
# Pairwise correlations between the columns, drawn as an annotated heatmap
corr = train_df.corr()
plt.figure(figsize=(19, 15))
sns.heatmap(corr, annot=True, fmt='.2g')

# 四、模型分析

## 4.1 数据设定

In [None]:
# Names other than train/test are used here to avoid confusion with the
# train/validation split made from the training data further down.
# X / y: features and label of the cleaned training data.
X = train_df.drop(['SeriousDlqin2yrs', 'ID'],axis=1)
y = train_df['SeriousDlqin2yrs']
# W / z: features and label column of the competition test set.
W = test_df.drop(['SeriousDlqin2yrs', 'ID'],axis=1)
z = test_df['SeriousDlqin2yrs']

## 4.2 逻辑回归分类

In [None]:
# Logistic regression model (a classifier, despite "regression" in the name)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Split the cleaned training data into a training part and a held-out part
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=111)

# Logistic regression with an L1 penalty; C controls the regularization strength
logit = LogisticRegression(random_state=111, solver='saga', penalty='l1', class_weight='balanced', C=1.0, max_iter=500)

# Fit the standardizer on the training part only
scaler = StandardScaler().fit(X_train)

# Standardize X_train and X_test with the statistics learned from X_train
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the logistic regression
logit.fit(X_train_scaled, y_train)

# Predicted class probabilities for every training sample
logit_scores_proba = logit.predict_proba(X_train_scaled)

# Probability of class 1
logit_scores = logit_scores_proba[:,1]

In [None]:
# Plot helper
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on a new 12x10 figure.

    Parameters
    ----------
    fpr, tpr : array-like
        False positive rates and true positive rates, plotted as x and y.
    label : str, optional
        Name of the curve. When given, a legend is drawn so the label is
        actually visible; previously the label was attached to the line but
        never displayed.
    """
    plt.figure(figsize=(12,10))
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1],[0,1], "k--") # diagonal reference line
    plt.axis([0,1,0,1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive rate")
    if label is not None:
        plt.legend(loc="lower right")


In [None]:
# roc_curve takes the true labels and the predicted scores and returns the
# false positive rates, true positive rates and thresholds
fpr_logit, tpr_logit, thresh_logit = roc_curve(y_train, logit_scores)

# Plot the curve and report the AUC on the training part
plot_roc_curve(fpr_logit,tpr_logit)
print('AUC Score : ', (roc_auc_score(y_train,logit_scores)))

In [None]:
# Evaluate on the held-out part: predicted class probabilities
logit_scores_proba_val = logit.predict_proba(X_test_scaled)

# Probability of class 1
logit_scores_val = logit_scores_proba_val[:,1]

# roc_curve takes the true labels and the predicted scores and returns the
# false positive rates, true positive rates and thresholds
fpr_logit_val, tpr_logit_val, thresh_logit_val = roc_curve(y_test, logit_scores_val)

# Plot the curve and report the AUC on the held-out part
plot_roc_curve(fpr_logit_val,tpr_logit_val)
print('AUC Score :', (roc_auc_score(y_test,logit_scores_val)))

In [None]:
# Use LogisticRegressionCV to choose the regularization parameter C by cross-validation.
# Note: this rebinds `logit`, replacing the model fitted in the earlier cell.
from sklearn.linear_model import LogisticRegressionCV
logit = LogisticRegressionCV(Cs=[0.001, 0.01, 0.1, 1, 10, 100], penalty='l1', solver='saga', max_iter=500, class_weight='balanced', random_state=111)

# Fit the logistic regression on the standardized training part
logit.fit(X_train_scaled, y_train)

# Selected value of C
print(logit.C_)

In [None]:
# Predicted class probabilities for every training sample
logit_scores_proba = logit.predict_proba(X_train_scaled)

# Probability of class 1
logit_scores = logit_scores_proba[:,1]

# roc_curve takes the true labels and the predicted scores and returns the
# false positive rates, true positive rates and thresholds
fpr_logit, tpr_logit, thresh_logit = roc_curve(y_train, logit_scores)

# Plot the curve and report the AUC on the training part
plot_roc_curve(fpr_logit,tpr_logit)
print('AUC Score : ', (roc_auc_score(y_train,logit_scores)))

从结果看，LR方法调参数并不能很好地提高AUC，虽然采用了balanced权重，但是效果还是不理想；接下来尝试先将数据降采样，再采用随机森林法

## 4.3 降采样处理

In [None]:
# Under-sampling utility from imbalanced-learn
from imblearn.under_sampling import RandomUnderSampler

# Counter tallies how many times each value occurs
from collections import Counter
print('Original dataset shape :', Counter(y))

In [None]:
# Create the under-sampler with a fixed seed
rus = RandomUnderSampler(random_state=111)

# Under-sample X / y and show the class counts after resampling
X_resampled, y_resampled = rus.fit_resample(X, y)
print('Resampled dataset shape:', Counter(y_resampled))

In [None]:
# Split the under-sampled data into a training part and a held-out part
from sklearn.model_selection import train_test_split
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(X_resampled, y_resampled, random_state=111)
X_train_rus.shape, y_train_rus.shape

In [None]:
# Logistic regression on the under-sampled data.
# Two changes from the earlier version of this cell:
#   * the features are standardized first, as in section 4.2 -- the saga
#     solver converges poorly on features with very different scales, which
#     made the comparison with 4.2 unfair;
#   * the model is fitted on the training split created in the previous cell
#     instead of the whole resampled set, so the held-out split stays unseen.
# Any conclusion drawn from this cell's AUC should be re-checked after re-running.
scaler_rus = StandardScaler().fit(X_train_rus)
X_train_rus_scaled = scaler_rus.transform(X_train_rus)
X_test_rus_scaled = scaler_rus.transform(X_test_rus)

logit_resampled = LogisticRegression(random_state=111, solver='saga', penalty='l1', class_weight='balanced', C=1.0, max_iter=500)
logit_resampled.fit(X_train_rus_scaled, y_train_rus)

# Class-1 probabilities and ROC curve on the training split
logit_resampled_proba_res = logit_resampled.predict_proba(X_train_rus_scaled)
logit_resampled_scores = logit_resampled_proba_res[:, 1]
fpr_logit_resampled, tpr_logit_resampled, thresh_logit_resampled = roc_curve(y_train_rus, logit_resampled_scores)
plot_roc_curve(fpr_logit_resampled, tpr_logit_resampled)
print('AUC score: ', roc_auc_score(y_train_rus, logit_resampled_scores))

# AUC on the held-out split of the under-sampled data
logit_resampled_scores_val = logit_resampled.predict_proba(X_test_rus_scaled)[:, 1]
print('AUC score (hold-out): ', roc_auc_score(y_test_rus, logit_resampled_scores_val))

AUC反而降低了（这里的评价指标是AUC，而不是准确率）

## 4.4 随机森林分类

In [None]:
# Random forest classifier (GradientBoostingClassifier is imported here as well and used in later cells)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
forest = RandomForestClassifier(n_estimators=300, random_state=111, max_depth=5, class_weight='balanced')
# Fit on the training split of the under-sampled data
forest.fit(X_train_rus, y_train_rus)
# Class-1 probabilities, ROC curve and AUC on that same training split
y_scores_prob = forest.predict_proba(X_train_rus)
y_scores = y_scores_prob[:, 1]
fpr, tpr, thresh = roc_curve(y_train_rus, y_scores)
plot_roc_curve(fpr, tpr)
print('AUC score:', roc_auc_score(y_train_rus, y_scores))

In [None]:
# Hold-out evaluation: score the forest on the held-out split of the under-sampled data
y_test_proba = forest.predict_proba(X_test_rus)
y_scores_test = y_test_proba[:, 1]
fpr_test, tpr_test, thresh_test = roc_curve(y_test_rus, y_scores_test)
plot_roc_curve(fpr_test, tpr_test)
print('AUC Score:', roc_auc_score(y_test_rus, y_scores_test))

In [None]:
# Show how much weight a fitted model gives to each feature
def plot_feature_importances(model):
    """Draw a horizontal bar chart of ``model.feature_importances_``.

    The bars are labelled with the column names of the notebook-level feature
    frame ``X``, so ``model`` is expected to expose one importance per column
    of ``X``.
    """
    plt.figure(figsize=(10,8))
    feature_count = X.shape[1]
    positions = np.arange(feature_count)
    plt.barh(positions, model.feature_importances_, align='center')
    plt.yticks(positions, X.columns)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')
    plt.ylim(-1, feature_count)

plot_feature_importances(forest)

## 4.5 梯度提升法分类

In [None]:
# Gradient boosted trees, fitted on the (not under-sampled) training part
gbc_clf = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, max_depth=8, random_state=112)
gbc_clf.fit(X_train, y_train)
# Class-1 probabilities, ROC curve and AUC on the training part
gbc_clf_proba = gbc_clf.predict_proba(X_train)
gbc_clf_scores = gbc_clf_proba[:, 1]
fpr_gbc, tpr_gbc, thresh_gbc = roc_curve(y_train, gbc_clf_scores)
plot_roc_curve(fpr_gbc, tpr_gbc)
print('AUC Score:', roc_auc_score(y_train, gbc_clf_scores))

In [None]:
# Hold-out evaluation: AUC of the gradient boosting model on the held-out part
gbc_val_proba = gbc_clf.predict_proba(X_test)
gbc_val_scores = gbc_val_proba[:, 1]
print('AUC score:', roc_auc_score(y_test, gbc_val_scores))

调一下参数

In [None]:
# Gradient boosting with adjusted hyper-parameters (fewer, shallower trees); this is the model used for the submission
gbc_clf_submission = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05 ,max_depth=4,  random_state=42)
gbc_clf_submission.fit(X_train,y_train)
# Class-1 probabilities on the training part
gbc_clf_proba = gbc_clf_submission.predict_proba(X_train)
gbc_clf_scores = gbc_clf_proba[:,1]
# Class-1 probabilities on the held-out part
gbc_val_proba = gbc_clf_submission.predict_proba(X_test)
gbc_val_scores = gbc_val_proba[:,1]
fpr_gbc, tpr_gbc, thresh_gbc = roc_curve(y_train, gbc_clf_scores)
# First line: AUC on the training part; second line: AUC on the held-out part
print('AUC Score :', roc_auc_score(y_train, gbc_clf_scores))
print('AUC Score :', roc_auc_score(y_test, gbc_val_scores))

In [None]:
# Show the importances of the tuned model that is used for the submission
# (the earlier `gbc_clf` was plotted here although it was superseded above).
plot_feature_importances(gbc_clf_submission)

# 五、输出数据

In [None]:
submission_proba = gbc_clf_submission.predict_proba(W)
submission_scores = submission_proba[:, 1]
submission_scores.shape

In [None]:
W.shape

In [None]:
# Take the ids from the test set's own ID column instead of a hard-coded
# range, so they always line up with the rows the predictions were made for.
ids = test_df['ID'].to_numpy()
submission = pd.DataFrame( {'Id': ids, 'Probability': submission_scores})
submission.to_csv('submission.csv', index=False)