In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    paths = [os.path.join(folder, name) for name in names]
    for path in paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**数据导入**

In [None]:
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import re as re
%matplotlib inline

In [None]:
# Load the training and test sets, then look at the rough distribution of
# the training columns.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'/kaggle/input/GiveMeSomeCredit/cs-training.csv',
        r'/kaggle/input/GiveMeSomeCredit/cs-test.csv',
    )
)
train.describe()


In [None]:
# Count the missing values in each training column.
train.isna().sum()

**异常数据处理**

In [None]:
# Taken together, the anomalies seen in the data are:
# 1. The age column has a minimum of 0 -- replace with the median
# 2. One column is unnamed -- rename it to ID
# 3. Monthly income has many missing values -- replace with the median

# Look at the test set to see whether it shows the same picture:
test.describe()

In [None]:
# Count the missing values in each test column.
test.isna().sum()

In [None]:
# The test set looks almost the same as the training set.
# Start repairing the anomalies.
# 1. Replace an age of 0 with the median age:
zero_age_rows = train['age'] == 0
train.loc[zero_age_rows, 'age'] = train['age'].median()

In [None]:
# 2. Rename the unnamed first column to ID in both frames:
for frame in (train, test):
    frame.rename(columns={'Unnamed: 0': 'ID'}, inplace=True)

In [None]:
# 3. Fill missing monthly income with the median.
# The cleaning plan in this notebook calls for the median, but the previous
# code filled with the mean. The median is also not pulled upwards by the very
# large incomes that are removed as outliers further down.
train['MonthlyIncome'] = train['MonthlyIncome'].fillna(train['MonthlyIncome'].median())

In [None]:
# Fill missing dependents counts with the median.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, triggers
# a pandas warning, and does not update `train` under Copy-on-Write.
train['NumberOfDependents'] = train['NumberOfDependents'].fillna(train['NumberOfDependents'].median())

**箱型图和相关性检查处理**

In [None]:
# Correlation heat map of all training columns:
corr = train.corr()
_, heatmap_ax = plt.subplots(figsize=(19, 15))
sns.heatmap(corr, annot=True, fmt='.2g', ax=heatmap_ax)

In [None]:
# Box plot of DebtRatio to look for outliers:
debt = train["DebtRatio"]
debt.plot.box(title='DebtRatio Distribution', sym='r+');

In [None]:
# Drop outliers: keep only rows whose DebtRatio is below 8000.
train = train.loc[train['DebtRatio'].lt(8000)]

In [None]:
# Box plot of MonthlyIncome to look for outliers:
monthly = train['MonthlyIncome']
monthly.plot.box(title='MonthlyIncome Distribution', sym='r+')

In [None]:
# Drop outliers: keep only rows whose MonthlyIncome is below 50000.
train = train.loc[train['MonthlyIncome'].lt(50000)]

In [None]:
# Box plot of NumberOfDependents to look for outliers:
depend = train["NumberOfDependents"]
depend.plot.box(title='NumberOfDependents Distribution', sym='r+');

In [None]:
# Drop outliers: keep only rows with fewer than 10 dependents.
train = train.loc[train['NumberOfDependents'].lt(10)]

In [None]:
# Box plot of the 30-59 days past-due counter to look for outliers:
time30_50 = train["NumberOfTime30-59DaysPastDueNotWorse"]
time30_50.plot.box(title='NumberOfTime30-59DaysPastDueNotWorse Distribution', sym='r+');

In [None]:
# Drop outliers: keep only rows where all three late-payment counters are
# below 20.
late_cols = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
    'NumberOfTime60-89DaysPastDueNotWorse',
]
train = train[(train[late_cols] < 20).all(axis=1)]

In [None]:
# Count the missing values in each test column before cleaning it.
test.isna().sum()

In [None]:
def changeTest(item, maxnum, data=None):
    """Replace outliers in one column with that column's median.

    Parameters
    ----------
    item : str
        Name of the column to clean.
    maxnum : int or float
        Threshold; every value greater than or equal to it is replaced.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``test`` frame is used, so existing two-argument calls keep working.

    Returns
    -------
    list
        The column's values in their original order, with each value
        ``>= maxnum`` replaced by the column median. Missing values compare
        False against the threshold and are passed through unchanged. The
        source frame is not modified.
    """
    frame = test if data is None else data
    column = frame[item]
    # Series.mask replaces the flagged entries in one vectorised pass instead
    # of looping over every row in Python. The median is computed on the
    # untouched column.
    return column.mask(column >= maxnum, column.median()).tolist()

In [None]:
# Clean the test set along the same lines as the training set.
# The thresholds match the ones used to filter the training rows. Test rows
# cannot be dropped, so changeTest substitutes the column median for
# out-of-range values.
outlier_caps = [
    ('DebtRatio', 8000),
    ('MonthlyIncome', 50000),
    ('NumberOfDependents', 10),
    ('NumberOfTime30-59DaysPastDueNotWorse', 20),
    ('NumberOfTimes90DaysLate', 20),
    ('NumberOfTime60-89DaysPastDueNotWorse', 20),
]
for item, num in outlier_caps:
    test[item] = changeTest(item, num)

# An age of 0 is invalid; replace it with the median age.
test.loc[test['age'] == 0, 'age'] = test['age'].median()

# Fill missing income with the median, as laid out in the notebook's cleaning
# plan (the previous code used the mean).
test['MonthlyIncome'] = test['MonthlyIncome'].fillna(test['MonthlyIncome'].median())

# Assign the filled column back: fillna(inplace=True) on a selected column
# acts on an intermediate object and does not update `test` under
# Copy-on-Write.
test['NumberOfDependents'] = test['NumberOfDependents'].fillna(test['NumberOfDependents'].median())

**数据分析与建模（梯度提升分类器）**

In [None]:
# Training features are every column except the target and the row ID.
X = train.drop(columns=['SeriousDlqin2yrs', 'ID'])
y = train['SeriousDlqin2yrs']
# The same column split applied to the test frame.
W = test.drop(columns=['SeriousDlqin2yrs', 'ID'])
z = test['SeriousDlqin2yrs']

In [None]:
# Show the shape of the test frame's target column.
test.loc[:, 'SeriousDlqin2yrs'].shape

In [None]:
# scikit-learn utilities: logistic regression, data splitting, ROC/AUC metrics, feature scaling
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Hold out part of the labelled data for validation, using
# train_test_split's default proportions and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=111)

gbc_clf_submission = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42,
)
gbc_clf_submission.fit(X_train, y_train)

# predict_proba returns one column per class; column 1 is used as the score.
gbc_clf_proba = gbc_clf_submission.predict_proba(X_train)
gbc_clf_scores = gbc_clf_proba[:, 1]
gbc_val_proba = gbc_clf_submission.predict_proba(X_test)
gbc_val_scores = gbc_val_proba[:, 1]

fpr_gbc, tpr_gbc, thresh_gbc = roc_curve(y_train, gbc_clf_scores)

# Label the two scores separately: both lines used to print the same
# 'AUC Score :' text, so the fitted-data score and the held-out score could
# not be told apart in the output.
print('Train AUC Score :', roc_auc_score(y_train, gbc_clf_scores))
print('Validation AUC Score :', roc_auc_score(y_test, gbc_val_scores))

**输出**

In [None]:
# Score the test features; column 1 of predict_proba is used as the
# submitted probability.
submission_proba = gbc_clf_submission.predict_proba(W)
submission_scores = submission_proba[:, 1]

# Ids are 1-based row numbers. Size them from the predictions instead of the
# hard-coded np.arange(1, 101504), so a different number of test rows no
# longer makes the DataFrame construction fail on mismatched lengths.
ids = np.arange(1, len(submission_scores) + 1)

submission = pd.DataFrame({'Id': ids, 'Probability': submission_scores})
submission.to_csv('submission.csv', index=False)

****