In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# 绘图用
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# Plot styling: dark grid theme, larger font, wider figures, transparent figure background.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',
})

# Let pandas render up to 120 columns / rows before truncating the display.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 120)

# 建模用
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

## 七步流程
  1. 定义问题
  2. 获取训练数据和测试数据
  3. 整理、准备、清洗数据
  4. 分析、发现模式、探索数据
  5. 建模、预测、求解问题
  6. 可视化、报告、呈现问题求解步骤和最终结论
  7. 提交

### 1. 定义问题
训练集包含约150k条借贷者样本，每条样本都带有一个标记，表示该借贷者**在接下来两年内是否经历严重经济困难（*借贷逾期3个月以上*）**。我们的目标是训练一个模型，预测测试集中的借贷者在接下来两年内经历严重经济困难的概率。

### 2. 获取数据

In [None]:
def _read_credit_csv(path):
    """Read one competition CSV and discard any header-less column.

    pandas names a column with an empty header ``Unnamed: N``. Such a column
    is presumably a saved row id rather than a borrower attribute
    (NOTE(review): confirm with ``pd.read_csv(path).columns``); leaving it in
    would let the model train on it as if it were a feature.

    Args:
        path: Location of the CSV file.

    Returns:
        DataFrame with every column whose name starts with ``Unnamed`` removed.
        Files without such a column are returned unchanged.
    """
    frame = pd.read_csv(path)
    unnamed_cols = [col for col in frame.columns if str(col).startswith('Unnamed')]
    return frame.drop(columns=unnamed_cols)


df = _read_credit_csv('/kaggle/input/GiveMeSomeCredit/cs-training.csv')
test_df = _read_credit_csv('/kaggle/input/GiveMeSomeCredit/cs-test.csv')
df.describe()

### 3. 探索性数据分析
由于是分类问题，我们先观察一下数据分布的平衡性

In [None]:
# Bar chart of how many samples fall into each target class.
balance_ax = sns.countplot(data=df, x='SeriousDlqin2yrs', palette='RdBu_r')
balance_ax.set_title('Binary Class Comparison')

接下来关注一下可能有用的一些数据，由于不同年龄段的人，资产与消费观念都不一样，我们先观察一下年龄与结果的相关性

In [None]:
# Age: one histogram per target class, stacked vertically on a shared 0-100 x-range.
age_hist = df['age'].hist(by=df['SeriousDlqin2yrs'], bins=20, layout=(2, 1))
for class_label, class_ax in enumerate(age_hist[:2]):
    class_ax.set_xlim((0, 100))
    class_ax.set_title('SeriousDlqin2yrs = %d' % class_label)

In [None]:
# Summary statistics of the MonthlyIncome column.
df['MonthlyIncome'].describe()

再观察一下负债率

In [None]:
# Summary statistics of the DebtRatio column.
df['DebtRatio'].describe()

这里发现，绝大多数人的负债率都比较低，同时有着一些极端值，我们再仔细对其进行分析

In [None]:
# 97.5th percentile of DebtRatio
df['DebtRatio'].quantile([0.975])

有2.5%的人负债率超过了3489（接近3500倍），这部分人违约的概率可能相对较高，我们来观察一下。

In [None]:
# Target and income statistics for the rows whose DebtRatio exceeds 3489.025.
high_debt_rows = df.loc[df['DebtRatio'] > 3489.025, ['SeriousDlqin2yrs', 'MonthlyIncome']]
high_debt_rows.describe()

负债率最高的这部分人中，只有极少数记录了月收入（`MonthlyIncome`），且取值只有0和1；同时他们的违约率并不比训练集整体的平均水平高。因此这部分数据很可能存在问题，为了方便处理，我们将这部分样本全部去除。接下来观察一下过去的违约情况。
   
**去除负债率>3489**

In [None]:
# Target statistics for each distinct count of 90-days-late events.
df.groupby('NumberOfTimes90DaysLate')['SeriousDlqin2yrs'].describe()

在过去逾期90天以上的次数中，17到96之间的取值完全缺失，而次数为98的样本却特别多，这说明数据可能存在一些错误。同时，过去有过逾期记录的人，将来违约的可能性远大于从未逾期过的人。

In [None]:
# Target statistics for each distinct count of 30-59-days-past-due events.
df.groupby('NumberOfTime30-59DaysPastDueNotWorse')['SeriousDlqin2yrs'].describe()

In [None]:
# Target statistics for each distinct count of 60-89-days-past-due events.
df.groupby('NumberOfTime60-89DaysPastDueNotWorse')['SeriousDlqin2yrs'].describe()

In [None]:
import time
# 模型｜数据 效果测试器
# A utility class to test all of our models on different datasets
class Tester():
    """Cross-validates every registered model on every registered dataset.

    Datasets are DataFrames that contain the target column; models are
    estimators accepted by ``cross_validate``. Scores are cached per
    (model name, dataset name, sample size, cv) so repeated runs are cheap.
    """

    def __init__(self, target, random_state=0):
        """
        Args:
            target: Name of the label column present in every dataset.
            random_state: Seed used when sub-sampling a dataset, so that
                scores are reproducible. Pass None for unseeded sampling.
        """
        self.target = target
        self.random_state = random_state
        self.datasets = {}
        self.models = {}
        self.cache = {}  # (m_name, df_name, sample_len, cv) -> cross_validate result

    def addDataset(self, name, df):
        """Register a copy of ``df`` under ``name``, replacing any previous entry."""
        # Scores cached for an older dataset with this name are no longer valid.
        self._dropCached(df_name=name)
        self.datasets[name] = df.copy()

    def addModel(self, name, model):
        """Register ``model`` under ``name``, replacing any previous entry."""
        # Scores cached for an older model with this name are no longer valid.
        self._dropCached(m_name=name)
        self.models[name] = model

    def clearModels(self):
        """Forget every registered model (cached scores are kept)."""
        self.models = {}

    def clearCache(self):
        """Forget every cached score."""
        self.cache = {}

    def _dropCached(self, m_name=None, df_name=None):
        """Remove cached scores that belong to the given model or dataset name."""
        stale_keys = [
            key for key in self.cache
            if (m_name is not None and key[0] == m_name)
            or (df_name is not None and key[1] == df_name)
        ]
        for key in stale_keys:
            del self.cache[key]

    def testModelWithDataset(self, m_name, df_name, sample_len, cv):
        """Cross-validate one model on one dataset and return the score dict.

        Args:
            m_name: Name of a registered model.
            df_name: Name of a registered dataset.
            sample_len: Number of rows to sample; a falsy value, or a value
                not smaller than the dataset, means the whole dataset is used.
            cv: Forwarded to ``cross_validate``.

        Returns:
            The mapping returned by ``cross_validate`` (scored with ROC AUC).

        Raises:
            KeyError: If the model or dataset name is not registered.
        """
        cache_key = (m_name, df_name, sample_len, cv)
        if cache_key in self.cache:
            return self.cache[cache_key]

        clf = self.models[m_name]
        data = self.datasets[df_name]

        # DataFrame.sample raises when asked for more rows than exist, so only
        # sub-sample when the request is actually smaller than the dataset.
        if sample_len and sample_len < len(data):
            sample = data.sample(sample_len, random_state=self.random_state)
        else:
            sample = data

        X = sample.drop([self.target], axis=1)
        Y = sample[self.target]

        s = cross_validate(clf, X, Y, scoring=['roc_auc'], cv=cv, n_jobs=-1)
        self.cache[cache_key] = s

        return s

    def runTests(self, sample_len=80000, cv=4, verbose=False):
        """Score all model/dataset pairs and print the ten best by mean AUC.

        Args:
            sample_len: Rows sampled per dataset (see ``testModelWithDataset``).
            cv: Forwarded to ``cross_validate``.
            verbose: When True, also print each pair and how long it took.
        """
        scores = {}
        for m_name in self.models:
            for df_name in self.datasets:
                if verbose:
                    print('Testing %s' % str((m_name, df_name)), end='')
                start = time.time()

                scores[(m_name, df_name)] = self.testModelWithDataset(m_name, df_name, sample_len, cv)

                if verbose:
                    print(' -- %0.2fs ' % (time.time() - start))

        print('--- Top 10 Results ---')
        for score in sorted(scores.items(), key=lambda x: -1 * x[1]['test_roc_auc'].mean())[:10]:
            auc = score[1]['test_roc_auc']
            print("%s --> AUC: %0.4f (+/- %0.4f)" % (str(score[0]), auc.mean(), auc.std()))

            
# A single Tester instance is shared by every model/dataset experiment.
tester = Tester(target='SeriousDlqin2yrs')

# Baseline dataset: every row containing a missing value is discarded.
tester.addDataset(name='Drop Missing', df=df.dropna())

# Baseline model: a small, shallow random forest with a fixed seed.
rfc = RandomForestClassifier(n_estimators=15, max_depth=6, random_state=0)
tester.addModel(name='Simple Random Forest', model=rfc)

# Cross-validate every registered model on every registered dataset.
tester.runTests()

In [None]:
# Median Fill, Outliers Removed:
# keep rows whose DebtRatio is not above 3489.025, then fill gaps with column medians.
debt_ratio_ok = ~(df['DebtRatio'] > 3489.025)
removed_debt_outliers = df.loc[debt_ratio_ok]
column_medians = removed_debt_outliers.median()
removed_debt_outliers = removed_debt_outliers.fillna(column_medians)

# Removed utilization outliers: keep rows whose revolving utilization is not above 10.
utilization_ok = ~(removed_debt_outliers['RevolvingUtilizationOfUnsecuredLines'] > 10)
dfus = removed_debt_outliers.loc[utilization_ok]

# Removed 98s: past-due counters above 90 are overwritten with 18.
dfn98 = dfus.copy()
past_due_columns = (
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
)
for past_due_col in past_due_columns:
    dfn98.loc[dfn98[past_due_col] > 90, past_due_col] = 18

In [None]:
"""
tester.addDataset('Median Fill', df.fillna(df.median()))
tester.addDataset('Median Fill, Outliers Removed', removed_debt_outliers)
tester.addDataset('Removed 98s', dfn98)
tester.addDataset('Removed utilization outliers', dfus)

# tester.runTests()

from sklearn.ensemble import RandomForestClassifier

for i in range(5,10):
    for j in range(10,20):
        rfc = RandomForestClassifier(n_estimators=j,max_depth = i, random_state=0)
        tester.addModel('Random Forest '+'d: '+str(i)+' est: '+str(j)  ,rfc)

tester.runTests()
"""

In [None]:
# Features and label for the final fit come from the cleaned training frame.
x_train = dfus.drop('SeriousDlqin2yrs', axis=1)
y_train = dfus['SeriousDlqin2yrs']

# Test features: drop the target column and put the remaining columns in the
# same order the model is trained on (raises KeyError if one is missing).
x_test = test_df.drop('SeriousDlqin2yrs', axis=1)[x_train.columns]

# Impute with statistics of the training features rather than of the test
# set, so both sets are filled with the same values.
x_test = x_test.fillna(x_train.median())

rfbest = RandomForestClassifier(n_estimators=18, max_depth=8, random_state=0)
rfbest.fit(x_train, y_train)

# Column 1 of predict_proba is the probability of the second entry in rfbest.classes_.
y_pred = rfbest.predict_proba(x_test)[:, 1]

In [None]:
# Probability of the second entry in rfbest.classes_ for every test row.
rf_clf_res = rfbest.predict_proba(x_test)[:, 1]

# Ids run 1..N; N is taken from the predictions instead of being hard-coded,
# so the cell keeps working if the test set size changes.
ids = np.arange(1, len(rf_clf_res) + 1)

submission = pd.DataFrame({'Id': ids, 'Probability': rf_clf_res})
submission.to_csv("submission.csv", index=False)
submission.head()