In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Load the training and test tables, then preview the first training rows.
TRAIN_CSV = '../input/GiveMeSomeCredit/cs-training.csv'
TEST_CSV = '../input/GiveMeSomeCredit/cs-test.csv'
df, testdf = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)
df.head()

**一、对数据性质的考察**


全属性一览：
1. Unnamed:0，编号；
2. SeriousDlqin2yrs：是否违约；
3. RevolvingUtilizationOfUnsecuredLines：无担保信贷额度（信用卡、个人信用额度等）的循环使用率，即欠款余额与信用额度总和的比率；
4. age：年龄；
5. NumberOfTime30-59DaysPastDueNotWorse：过了30-59天还没还的次数
6. DebtRatio：负债比率，即每月债务支出（还款、赡养费、生活费用等）与月总收入的比率；
7. MonthlyIncome：月收入；
8. NumberOfOpenCreditLinesAndLoans：未结清的贷款（如车贷、房贷）与信用额度（如信用卡）的数量；
9. NumberOfTimes90DaysLate：过了90+天还没还的次数
10. NumberRealEstateLoansOrLines：抵押贷款和房地产贷款的数量（含房屋净值信用额度）；
11. NumberOfTime60-89DaysPastDueNotWorse：过了60-89天还没还的次数
12. NumberOfDependents：家属人数；

In [None]:
# 1: Age histograms, one panel per value of the default flag, to see how age
# relates to default.
# Author's note: the concern about older applicants still getting loans looks fine.
age_hist = df['age'].hist(by=df['SeriousDlqin2yrs'], bins=20, layout=(2,1))
panel_titles = ('SeriousDlqin2yrs = 0', 'SeriousDlqin2yrs = 1')
# Give both panels the same 0-100 x-range so they can be compared directly.
for panel, panel_title in zip(age_hist, panel_titles):
    panel.set_xlim((0,100))
    panel.set_title(panel_title)

In [None]:
# 2: DebtRatio sanity check - look up the 92nd percentile of the column.
# Author's note: it came out as 1685, which is implausibly large for a ratio.
df['DebtRatio'].quantile([0.92])

In [None]:
# Summarise the rows whose DebtRatio lies above the 1685 cut-off.
# Author's notes:
#   - these rows were observed to be defaulters with a monthly income of 0 or 1,
#     so this slice of the data should be removed;
#   - when deciding what to drop, also consider the actual amount owed: some of
#     these borrowers owe no more than usual, which suggests data-entry errors.
extreme_debt = df['DebtRatio'] > 1685
df.loc[extreme_debt, ['Unnamed: 0', 'SeriousDlqin2yrs', 'MonthlyIncome']].describe()

In [None]:
# 3: NumberOfTimes[30-59][60-89][90+]DaysLate
# Count how many borrowers share each value of the 90+ days-late counter.
# Author's note: several hundred borrowers are recorded with the value 98.
late_90 = 'NumberOfTimes90DaysLate'
df.groupby(late_90)[late_90].count()

In [None]:
# Describe the rows whose 90+ days-late counter exceeds 95 (the 96/98 values).
# Author's notes: an SVM would need these extreme values fixed, a random forest
# can tolerate them; these borrowers show a very high default rate.
late_columns = [
    'SeriousDlqin2yrs',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]
df.loc[df['NumberOfTimes90DaysLate'] > 95, late_columns].describe()

In [None]:
# 4: RevolvingUtilizationOfUnsecuredLines
# Author's note: this is a balance-to-credit-limit ratio, so it should not
# exceed 1. Look at the default flag for values close to and above 1: (0.9, 4].
utilization = df['RevolvingUtilizationOfUnsecuredLines']
near_or_above_one = (utilization > .9) & (utilization <= 4)
df.loc[near_or_above_one, 'SeriousDlqin2yrs'].describe()

In [None]:
# Default-flag summary for utilization values in the range (4, 10].
utilization = df['RevolvingUtilizationOfUnsecuredLines']
df.loc[(utilization > 4) & (utilization <= 10), 'SeriousDlqin2yrs'].describe()

In [None]:
# Full summary of the rows whose utilization exceeds 10.
utilization_over_10 = df['RevolvingUtilizationOfUnsecuredLines'].gt(10)
df.loc[utilization_over_10].describe()

In [None]:
# 5: Missing values in MonthlyIncome and NumberOfDependents.
# Count the missing values per column of the test set; it needs the same
# imputation as the training set.
testdf.isna().sum()

In [None]:
# Preview the first five training rows before cleaning.
df.head(n=5)

**二、根据以上分析，处理数据集**

In [None]:
# Cleaning thresholds taken from the exploratory cells above.
LATE_SENTINEL_MIN = 90   # late-payment counters above this are the 96/98 codes
LATE_SENTINEL_CAP = 18   # replacement value the author chose for those codes
UTILIZATION_MAX = 10     # training rows above this utilization are dropped
DEBT_RATIO_MAX = 1685    # training rows above this DebtRatio are dropped
LATE_COLUMNS = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]


def _clean_credit_frame(dataset, drop_outliers):
    """Clean one credit data frame in place.

    Steps, in order:
      5. fill missing MonthlyIncome with the frame's own median, cast to int;
      3. replace late-payment counters above LATE_SENTINEL_MIN with
         LATE_SENTINEL_CAP;
      4/2. when ``drop_outliers`` is true, drop rows whose utilization exceeds
         UTILIZATION_MAX or whose DebtRatio exceeds DEBT_RATIO_MAX;
      0. fill missing NumberOfDependents with 0, cast to int.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to clean; it is modified in place.
    drop_outliers : bool
        True for the training frame. The test frame must keep every row, so
        pass False for it.

    Returns
    -------
    None
    """
    # Columns are assigned as a whole instead of via chained indexing
    # (dataset[col][mask] = ...): chained assignment triggers
    # SettingWithCopyWarning and writes nothing under pandas Copy-on-Write,
    # which would leave NaN behind and make astype(int) fail.
    # NOTE(review): each frame is imputed with its own median (computed before
    # any rows are dropped), as in the original; consider reusing the training
    # median for the test frame.
    income = dataset['MonthlyIncome']
    dataset['MonthlyIncome'] = income.fillna(income.median()).astype(int)

    for column in LATE_COLUMNS:
        dataset.loc[dataset[column] > LATE_SENTINEL_MIN, column] = LATE_SENTINEL_CAP

    if drop_outliers:
        # Both masks are built from `dataset` itself rather than the global
        # training frame.
        outliers = (
            (dataset['RevolvingUtilizationOfUnsecuredLines'] > UTILIZATION_MAX)
            | (dataset['DebtRatio'] > DEBT_RATIO_MAX)
        )
        dataset.drop(index=dataset.index[outliers], inplace=True)

    dataset['NumberOfDependents'] = dataset['NumberOfDependents'].fillna(0).astype(int)


fulldata = [df, testdf]
# Only the training frame (first entry) has its outlier rows removed.
for dataset, is_training in zip(fulldata, (True, False)):
    _clean_credit_frame(dataset, drop_outliers=is_training)
testdf.isnull().sum()

**三、随机森林+网格搜索**

In [None]:
# Preview the first five training rows after cleaning.
df.head(n=5)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

TARGET = 'SeriousDlqin2yrs'
ID_COLUMN = 'Unnamed: 0'

# Split the label off the training frame exactly once, so that re-running this
# cell does not raise KeyError on the already-removed column.
if TARGET in df.columns:
    y_train = df[TARGET].astype('uint8')
    df = df.drop(columns=[TARGET])

# 'Unnamed: 0' is a row identifier (it becomes the submission 'Id'), not a
# predictor, so it is kept out of both feature matrices.
x_train = df.drop(columns=[ID_COLUMN]).values
x_test = testdf.drop(columns=[TARGET, ID_COLUMN]).values
print(y_train)


def Tuning(cv_params, other_params, scoring='roc_auc'):
    """Grid-search a RandomForestClassifier and print the best setting.

    The search is fitted on the notebook-level ``x_train`` / ``y_train`` with
    3-fold cross-validation and 4 parallel jobs.

    Parameters
    ----------
    cv_params : dict
        Parameter grid handed to GridSearchCV as ``param_grid``.
    other_params : dict
        Fixed keyword arguments for RandomForestClassifier.
    scoring : str, optional
        Metric used to rank candidates. Defaults to 'roc_auc' because the
        notebook submits predicted probabilities; the estimator's default
        accuracy score ignores how well those probabilities rank borrowers.

    Returns
    -------
    None
        The best parameters and best score are printed.
    """
    forest = RandomForestClassifier(**other_params)
    search = GridSearchCV(estimator=forest,
                          param_grid=cv_params,
                          scoring=scoring,
                          cv=3,
                          n_jobs=4)
    search.fit(x_train, y_train)
    print(search.best_params_)
    print(search.best_score_)


# Tune n_estimators with the tree depth held at 8.
cv_params = {'n_estimators': range(6, 30, 1)}
other_params = {
    'max_depth': 8,
    'random_state': 0,
}
Tuning(cv_params, other_params)


In [None]:
# Tune max_depth with the forest size held at 17 trees.
cv_params = {'max_depth': range(2, 30)}
other_params = {'n_estimators': 17, 'random_state': 0}
Tuning(cv_params, other_params)

**四、调参完毕，将训练模型用于测试集，提交**

In [None]:
# Keep the row identifiers; the test ids label the submission rows.
trainunnamed = df['Unnamed: 0']
Unnamed = testdf['Unnamed: 0']

# Fit the final forest with the chosen hyper-parameters.
rfc = RandomForestClassifier(n_estimators=17, max_depth=6, random_state=0)
rfc.fit(x_train, y_train)

# Column 1 of predict_proba is used as the submitted probability.
rfc_predictions = rfc.predict_proba(x_test)[:, 1]
StackingSubmission = pd.DataFrame({'Id': Unnamed, 'Probability': rfc_predictions})
StackingSubmission.to_csv("RFC.csv", index=False)

In [None]:
# Preview the first five rows of the submission table.
StackingSubmission.head(n=5)