In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#导入包
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#一、导入数据

In [None]:
# Load the training and test sets from the Kaggle input directory
train_df = pd.read_csv('../input/GiveMeSomeCredit/cs-training.csv')
test_df = pd.read_csv('../input/GiveMeSomeCredit/cs-test.csv')

In [None]:
#二、清洗数据

In [None]:
# Inspect column dtypes and non-null counts of both data sets.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
test_df.info()
train_df.info()
# The report shows that some columns have missing values; they are filled or dropped in later cells.

In [None]:
# Preview the first rows of the test set
test_df.head()

In [None]:
train_df.describe()
# The summary statistics unexpectedly show an age of 0, which is not plausible. In practice only adults (18 or older) should be able to apply,
# so the plan is to find every age below 18 and replace it with the median.
# Also note that NumberOfTime30-59DaysPastDueNotWorse, NumberOfTimes90DaysLate and NumberOfTime60-89DaysPastDueNotWorse all have a maximum of 98, which makes their means very close;
# these columns should be checked for outliers.

In [None]:
# List every row whose age is below 18

train_df[train_df['age'].lt(18)]


In [None]:
# Replace every implausible age (below 18, which covers the age-0 record found above)
# with the median age. The median is taken over the remaining rows only, so the
# invalid values being replaced do not influence their own replacement.
underage = train_df['age'] < 18
train_df.loc[underage, 'age'] = train_df.loc[~underage, 'age'].median()

In [None]:
# Remove duplicate records.
# At this point the frame still carries its leading column, which is only sliced off
# later with iloc[:, 1:]. If that column is a unique row id, no two full rows can ever
# be equal and a plain drop_duplicates() removes nothing, so duplicates are detected
# on the remaining columns only.
# NOTE(review): assumes the first column is the CSV row id - confirm against the file.
train_df = train_df.drop_duplicates(subset=list(train_df.columns[1:]))

In [None]:
# Count how often each non-null NumberOfDependents value occurs
train_df['NumberOfDependents'].value_counts()

In [None]:
# Fill missing NumberOfDependents values with the column median.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on the selected Series: the in-place form mutates an intermediate object, is
# deprecated by pandas, and leaves train_df unchanged when Copy-on-Write is active.
dependents_median = train_df['NumberOfDependents'].median()
train_df['NumberOfDependents'] = train_df['NumberOfDependents'].fillna(dependents_median)

In [None]:
# MonthlyIncome has many missing values, so they are imputed with a random-forest
# regressor trained on the rows where the income is known.
from sklearn.ensemble import RandomForestRegressor

# Strip the leading column from both frames. The positional selection below needs
# MonthlyIncome at position 5 afterwards, so the slice is skipped once that already
# holds; without the guard every re-run of this cell would strip one more column.
# .copy() detaches the result from the frame it was sliced from, so the .loc
# write-back at the end of the cell updates train_df itself.
# NOTE(review): the test_df guard assumes it has the same column layout as train_df - confirm.
if train_df.columns[5] != 'MonthlyIncome':
    train_df = train_df.iloc[:, 1:].copy()
if test_df.columns[5] != 'MonthlyIncome':
    test_df = test_df.iloc[:, 1:].copy()

# Position 5 (MonthlyIncome) is placed first as the regression target; the other
# selected positions are the predictors.
# NOTE(review): position 0 is presumably the label SeriousDlqin2yrs; using it as a
# predictor means this imputation cannot be repeated on the unlabeled test set - confirm.
Data = train_df.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]
income_missing = Data.MonthlyIncome.isnull()
train_known = Data[~income_missing].values
train_unknown = Data[income_missing].values
train_X = train_known[:, 1:]
train_y = train_known[:, 0]
rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3, n_jobs=-1)
rfr.fit(train_X, train_y)

# predict() rejects an empty array, so only impute when some income is actually missing
# (this also keeps the cell runnable after the gaps have been filled once).
predicted_y = np.empty(0)
if income_missing.any():
    predicted_y = rfr.predict(train_unknown[:, 1:]).round(0)
    train_df.loc[income_missing, 'MonthlyIncome'] = predicted_y



In [None]:
# Box-plot the columns at positions 3, 7 and 9 of train_df to look for outliers
train_box = train_df.iloc[:,[3,7,9]]
train_box.boxplot()

In [None]:
#观察发现三个特征中有两组数据远远偏离正常值，应该考虑删去

In [None]:
# Drop the outlier rows: keep only records whose 30-59-days-past-due count is below 90
train_df = train_df.loc[train_df['NumberOfTime30-59DaysPastDueNotWorse'].lt(90)]

In [None]:
# Inspect the pairwise correlations between the columns
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(19, 15))
corr = train_df.corr()
sns.heatmap(data=corr, annot=True, fmt='.2g')

In [None]:
# Clean the test set as well: an age of 0 becomes the median age, missing MonthlyIncome
# becomes the column mean, and missing NumberOfDependents becomes the column median.
test_df.loc[test_df['age'] == 0, 'age'] = test_df['age'].median()
test_df['MonthlyIncome'] = test_df['MonthlyIncome'].fillna(test_df['MonthlyIncome'].mean())
# Assign the filled column back instead of fillna(inplace=True) on the selected Series:
# the in-place form works on an intermediate object, is deprecated by pandas, and
# leaves test_df unchanged when Copy-on-Write is active.
test_df['NumberOfDependents'] = test_df['NumberOfDependents'].fillna(test_df['NumberOfDependents'].median())

In [None]:
#三、数据分析

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler


In [None]:
# Use names other than train/test so these are not confused with the validation split made later
label_col = 'SeriousDlqin2yrs'
y = train_df[label_col]
X = train_df.drop(columns=[label_col])
z = test_df[label_col]
W = test_df.drop(columns=[label_col])

In [None]:
# Start with a logistic-regression model.
# Hold out part of the data for validation
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

# Fit the scaler on the training split only, then standardise both splits with it
sca = StandardScaler()
X_train_scaled = sca.fit_transform(X_train)
X_test_scaled = sca.transform(X_test)

# L1-penalised logistic regression; C controls the regularisation
log = LogisticRegression(
    C=1.0,
    penalty='l1',
    solver='saga',
    class_weight='balanced',
    max_iter=1000,
    random_state=100,
)
log.fit(X_train_scaled, y_train)

# Per-class probabilities for every training sample
log_scores_proba = log.predict_proba(X_train_scaled)

# Keep the probability of class 1 (column index 1)
log_scores = log_scores_proba[:, 1]

In [None]:
# Plotting helper
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on a new 12x10 figure.

    Args:
        fpr: false-positive rates, used as x values.
        tpr: true-positive rates, used as y values.
        label: optional legend text for the curve. When given, a legend is
            drawn so the label is actually visible; when None the plot has
            no legend.

    A dashed black diagonal from (0, 0) to (1, 1) is added as a reference
    line and both axes are limited to [0, 1].
    """
    plt.figure(figsize=(12,10))
    plt.plot(fpr, tpr, linewidth=3, label=label)
    plt.plot([0,1],[0,1], "k--")
    plt.axis([0,1,0,1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive rate")
    # Without an explicit legend() call the label passed to plot() is never rendered.
    if label is not None:
        plt.legend(loc="lower right")

In [None]:
# roc_curve turns the true labels and predicted scores into false-positive and true-positive rates
fpr_logit, tpr_logit, thresh_logit = roc_curve(y_train, log_scores)

# Plot the curve and report the AUC on the training split
plot_roc_curve(fpr_logit,tpr_logit)
print ('AUC Score :'   , (roc_auc_score(y_train,log_scores)))

# The score above is measured on the rows the model was fitted on, so it says little
# about generalisation. Also score the held-out split, which was already standardised
# with the training-set scaler.
log_val_scores = log.predict_proba(X_test_scaled)[:, 1]
print ('Validation AUC Score :', roc_auc_score(y_test, log_val_scores))


In [None]:
#或许不是那么满意，故尝试其他方法

In [None]:
# Try gradient-boosted trees
from sklearn.ensemble import GradientBoostingClassifier
gbc_clf = GradientBoostingClassifier(
    random_state=100,
    max_depth=4,
    learning_rate=0.05,
    n_estimators=200,
).fit(X_train, y_train)

# Class-1 probabilities on the training split, then its ROC curve and AUC
gbc_clf_proba = gbc_clf.predict_proba(X_train)
gbc_clf_scores = gbc_clf_proba[:, 1]
fpr_gbc, tpr_gbc, thresh_gbc = roc_curve(y_train, gbc_clf_scores)
plot_roc_curve(fpr_gbc, tpr_gbc)
print('AUC Score:', roc_auc_score(y_train, gbc_clf_scores))

In [None]:
# Score the gradient-boosting model on the held-out split (X_test / y_test)
gbc_val_proba = gbc_clf.predict_proba(X_test)
# Column index 1 holds the probability of the second class
gbc_val_scores = gbc_val_proba[:, 1]
print ( 'AUC Score :', roc_auc_score(y_test, gbc_val_scores))

In [None]:
#我认为上述结果是非常好的，于是输出结果

In [None]:
# Predict class probabilities for the test features W and keep column index 1
ans_proba = gbc_clf.predict_proba(W)
ans_scores = ans_proba[:, 1]
# Display the number of predictions
ans_scores.shape

In [None]:
# Number of rows and columns in the test feature matrix
W.shape

In [None]:
# Build the submission file with one 1-based Id per prediction.
# The id range is derived from the number of predictions instead of a hard-coded
# row count, so the DataFrame construction cannot fail with a length mismatch if
# the test set size changes.
ids = np.arange(1, len(ans_scores) + 1)
submission = pd.DataFrame( {'Id': ids, 'Probability': ans_scores})
submission.to_csv('submission.csv', index=False)