In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**导入数据**

In [None]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import re as re

In [None]:
# Read the training set and the test set from the Kaggle input directory.
train_df = pd.read_csv('../input/GiveMeSomeCredit/cs-training.csv')
test_df = pd.read_csv('../input/GiveMeSomeCredit/cs-test.csv')

# Print the training frame's column summary, then preview its first five rows
# (five is the default row count of head()).
train_df.info()
train_df.head()

* 向上分析：
15万条数据
12栏项目

**数据审查和整理**

* 向下解释：
首先对数据进行分析

In [None]:
# Show summary statistics of the training columns to spot implausible values.
train_df.describe()

* 向上分析：
`age` 的最小值为 0，这显然不合理（借款人的年龄不可能为 0），需要在后续的数据清洗中处理。

In [None]:
# Show the same summary statistics for the test set, for comparison with the training set.
test_df.describe()

In [None]:
import seaborn as sns

# Bar chart of how many training rows fall in each class of the target column.
plt.figure()
# The column must be passed as the keyword `x`: in current seaborn releases the
# first positional parameter of countplot is `data`, so passing the column name
# positionally together with data=... raises a TypeError.
sns.countplot(x='SeriousDlqin2yrs', data=train_df)

* 向下解释：
训练集中出现未成年（age < 18）的借款人显然是不合理的，因此先把这些记录筛选出来，再将其中年龄为 0 的异常值用年龄的中位数替代。

In [None]:
# Display the training rows whose recorded age is below 18.
train_df.loc[train_df['age'] < 18]

In [None]:
# Overwrite ages recorded as exactly 0 with the median of the age column.
# The right-hand side is evaluated first, so the median still includes the 0 entries.
train_df.loc[train_df['age'] == 0, 'age'] = train_df['age'].median()

In [None]:
# Split the training data at the retirement age: 18-60 inclusive vs. over 60,
# and compute the mean monthly income of each group.
age = train_df['age']
is_working_age = (age >= 18) & (age <= 60)
is_senior = age > 60

working = train_df.loc[is_working_age]
senior = train_df.loc[is_senior]

working_income_mean = working['MonthlyIncome'].mean()
senior_income_mean = senior['MonthlyIncome'].mean()


In [None]:
# Display the mean monthly income of the 18-60 age group.
working_income_mean

In [None]:
# Display the mean monthly income of the over-60 age group.
senior_income_mean

* 向上解释：
在数据不平衡的情况下，少数类样本的数量远少于多数类样本，会产生更多的稀疏样本（那些样本数很少的子类中的样本）。由于缺乏足够的数据，分类器对稀疏样本的刻画能力不足，难以有效的对这些稀疏样本进行分类。数据不均衡导致的分类器决策边界偏移也会影响到最终的分类效果。同时当你在对一个类别不均衡的数据集进行分类时得到了90%的准确度。当你进一步分析发现，数据集的90%的样本是属于同一个类，并且分类器将所有的样本都分类为该类。在这种情况下，显然该分类器是无效的。并且这种无效是由于训练集中类别不均衡而导致的。

* 向下分析：
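按照是否退休（以 60 岁为界）划分得到的两组数据是平衡的，不会出现上述问题，因此可以进行下一步，对数据空值进行处理。需要注意的是，这里的“平衡”指的是年龄分组；目标变量 SeriousDlqin2yrs 本身的类别不平衡仍需在建模阶段处理（例如后文逻辑回归中使用的 class_weight='balanced'）。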

In [None]:
# Fill missing monthly-income values with the column mean.
income_mean = train_df['MonthlyIncome'].mean()
train_df['MonthlyIncome'] = train_df['MonthlyIncome'].fillna(income_mean)

In [None]:
# Fill missing NumberOfDependents values with the column median.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place form acts on an intermediate object, triggers a
# chained-assignment warning in pandas 2.x, and leaves train_df unchanged once
# Copy-on-Write is enabled.
train_df['NumberOfDependents'] = train_df['NumberOfDependents'].fillna(
    train_df['NumberOfDependents'].median()
)

In [None]:
# Inspect the pairwise correlations between the training columns as an annotated heatmap.
corr = train_df.corr()
fig, ax = plt.subplots(figsize=(19, 15))
sns.heatmap(corr, annot=True, fmt='.2g', ax=ax)

* 向上分析：
可见除去对角线以外，有三个相关系数极其接近 1，对应的三个变量分别是 `NumberOfTime30-59DaysPastDueNotWorse`、`NumberOfTime60-89DaysPastDueNotWorse` 和 `NumberOfTimes90DaysLate`，因此可以说这三者之间的相关性很强。

In [None]:
# Box plots of the three past-due counters, drawn side by side on one figure.
past_due_cols = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]
fig, ax = plt.subplots(figsize=(19, 12))
train_df[past_due_cols].boxplot(ax=ax)
plt.show()

In [None]:
# Remove the two outlier codes 98 and 96, then look at the correlations again.
def replace98and96(column):
    """Return the values of ``column`` as a list, with 96 and 98 replaced.

    Every element equal to 96 or 98 is swapped for the median of ``column``
    (computed once, on the unmodified column); all other elements are kept
    unchanged and in their original order.
    """
    median_value = column.median()
    return [median_value if value in (96, 98) else value for value in column]

# Replace the 96/98 codes in the three past-due counters, first in the training
# frame and then in the test frame. Each column uses its own frame's median.
outlier_columns = (
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
    'NumberOfTime60-89DaysPastDueNotWorse',
)
for frame in (train_df, test_df):
    for column_name in outlier_columns:
        frame[column_name] = replace98and96(frame[column_name])

# Re-draw the training correlation heatmap after the replacement.
corr = train_df.corr()
fig, ax = plt.subplots(figsize=(19, 15))
sns.heatmap(corr, annot=True, fmt='.2g', ax=ax)

In [None]:
# Clean age, MonthlyIncome and NumberOfDependents in the test set the same way
# as in the training set. All fill values are computed from test_df itself.
test_df.loc[test_df['age'] == 0, 'age'] = test_df['age'].median()
test_df['MonthlyIncome'] = test_df['MonthlyIncome'].fillna(test_df['MonthlyIncome'].mean())
# Assign back rather than fillna(..., inplace=True) on the selected column: the
# chained in-place form warns in pandas 2.x and has no effect on test_df once
# Copy-on-Write is enabled.
test_df['NumberOfDependents'] = test_df['NumberOfDependents'].fillna(
    test_df['NumberOfDependents'].median()
)

**数据分析**

In [None]:
# Use X/y and W/z instead of "train"/"test" names, to avoid confusion with the
# train/validation split made later from the labelled data.
#
# Besides the target, also drop 'Unnamed: 0' when present. That is the name
# pandas gives a header-less leading CSV column; here it is presumably just the
# row id (TODO confirm against the CSVs), which carries no predictive meaning
# and should not be fed to the models. errors='ignore' keeps this cell working
# if the column is absent (e.g. the CSVs were read with index_col=0).
non_feature_cols = ['SeriousDlqin2yrs', 'Unnamed: 0']
X = train_df.drop(columns=non_feature_cols, errors='ignore')
y = train_df['SeriousDlqin2yrs']
W = test_df.drop(columns=non_feature_cols, errors='ignore')
z = test_df['SeriousDlqin2yrs']

* 向上解释：
除去测试集，把剩余数据进行划分，组合成多组不同的训练集和验证集，某次在训练集中出现的样本下次可能成为验证集中的样本，这就是所谓的“交叉”。最后用各次验证误差的平均值作为模型最终的验证误差。

* 向下分析：
逻辑回归模型（Logistic Regression）

In [None]:
# Logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Split the labelled data into a training part and a held-out part.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=111)

# Standardise the features: the scaler is fitted on the training part only and
# then applied to both parts.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# L1-penalised logistic regression with balanced class weights.
logit = LogisticRegression(
    penalty='l1',
    C=1.0,
    solver='saga',
    class_weight='balanced',
    max_iter=500,
    random_state=111,
)
logit.fit(X_train_scaled, y_train)

# Per-class probabilities for every training sample; column 1 is the
# probability of class 1.
logit_scores_proba = logit.predict_proba(X_train_scaled)
logit_scores = logit_scores_proba[:, 1]

In [None]:
# Plotting helper
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on a new 12x10 figure.

    Parameters
    ----------
    fpr, tpr : array-like
        False-positive and true-positive rates to plot on the x and y axes.
    label : str, optional
        Legend text for the curve. When given, a legend is drawn so the label
        is actually visible; when ``None`` no legend is added.
    """
    plt.figure(figsize=(12,10))
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1],[0,1], "k--")  # dashed diagonal as a reference line
    plt.axis([0,1,0,1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive rate")
    # Without a legend the label passed to plot() is never shown.
    if label is not None:
        plt.legend(loc="lower right")

In [None]:
# ROC curve and AUC of the logistic regression on the training split.
fpr_logit, tpr_logit, thresh_logit = roc_curve(y_train, logit_scores)
plot_roc_curve(fpr_logit, tpr_logit)

train_auc_logit = roc_auc_score(y_train, logit_scores)
print('AUC Score : ', train_auc_logit)

In [None]:
# Evaluate on the held-out split: per-class probabilities, of which column 1
# is the probability of class 1.
logit_scores_proba_val = logit.predict_proba(X_test_scaled)
logit_scores_val = logit_scores_proba_val[:, 1]

# roc_curve takes the true labels and the scores and returns the false
# positive rates, true positive rates and thresholds.
fpr_logit_val, tpr_logit_val, thresh_logit_val = roc_curve(y_test, logit_scores_val)
plot_roc_curve(fpr_logit_val, tpr_logit_val)

val_auc_logit = roc_auc_score(y_test, logit_scores_val)
print('AUC Score :', val_auc_logit)

* 向下分析：
梯度提升法分类

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosting classifier fitted on the unscaled training split.
gbc_clf_submission = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42,
)
gbc_clf_submission.fit(X_train, y_train)

# Column 1 of predict_proba for the training split and for the held-out split.
gbc_clf_proba = gbc_clf_submission.predict_proba(X_train)
gbc_clf_scores = gbc_clf_proba[:, 1]
gbc_val_proba = gbc_clf_submission.predict_proba(X_test)
gbc_val_scores = gbc_val_proba[:, 1]

fpr_gbc, tpr_gbc, thresh_gbc = roc_curve(y_train, gbc_clf_scores)

# First printed line: AUC on the training split; second: AUC on the held-out split.
for labels, scores in ((y_train, gbc_clf_scores), (y_test, gbc_val_scores)):
    print ('AUC Score :', roc_auc_score(labels, scores))

**数据输出**

In [None]:
# Score the test-set features W with the fitted gradient-boosting model.
submission_proba = gbc_clf_submission.predict_proba(W)
# Keep column 1 of the probability matrix as the submitted score.
submission_scores = submission_proba[:, 1]
# Display the shape of the score vector to check the number of scored rows.
submission_scores.shape

In [None]:
# Ids are consecutive integers starting at 1, one per scored row. Deriving the
# count from submission_scores (instead of hard-coding the row count) keeps the
# two columns the same length whatever the size of the test set.
ids = np.arange(1, len(submission_scores) + 1)
submission = pd.DataFrame({'Id': ids, 'Probability': submission_scores})
submission.to_csv('submission_1.csv', index=False)