In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 七步流程
1. 定义问题
2. 获取训练数据和测试数据
3. 整理、准备、清洗数据
4. 分析、发现模式、探索数据
5. 建模、预测、求解问题
6. 可视化、报告、呈现问题求解步骤和最终结论
7. 提交

# 1.定义问题
银行在市场经济中发挥着至关重要的作用。他们决定谁能获得融资，以什么条件获得融资，并能做出或破坏投资决定。为了使市场和社会运作，个人和公司需要获得信贷。

信用评分算法，对违约的概率进行猜测，是银行用来决定是否应该发放贷款的方法。本次比赛要求参赛者通过预测某人在未来两年内遭遇财务困境的概率，来改善信用评分的技术水平。

**导包**

In [None]:
from  sklearn.ensemble import RandomForestRegressor
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
import re 
import warnings
# Install a global "ignore" filter so that no warnings are shown from here on.
# NOTE(review): this also hides deprecation warnings raised by later cells — confirm that is intended.
warnings.filterwarnings("ignore")

# 2.获取数据

In [None]:
# Read the competition's training and test CSV files into DataFrames.
train_csv = '/kaggle/input/GiveMeSomeCredit/cs-training.csv'
test_csv = '/kaggle/input/GiveMeSomeCredit/cs-test.csv'
data_train = pd.read_csv(train_csv)
data_test = pd.read_csv(test_csv)

# 3.整理、准备、清洗数据
## 3.1 查看数据
**1.数据集中包含哪些特征**

In [None]:
# Print column dtypes and non-null counts, training frame first, then test frame.
for frame in (data_train, data_test):
    frame.info()

In [None]:
# Print the column labels of the training frame as an array.
print(data_train.columns.values)

| 字段名 | 定义                                 | 值                                            |
|----------|--------------------------------------------|------------------------------------------------|
| SeriousDlqin2yrs|是否有超过90天或更长时间逾期未还的不良行为|Y/N
| RevolvingUtilizationOfUnsecuredLines | 可用额度比值| percentage                           |
| age   | 年龄       | integer    |
| NumberOfTime30-59DaysPastDueNotWorse | 逾期30-59天笔数|integer|
| DebtRatio      | 每月债务支付、赡养费和生活费费用之和除以月总收入 |percentage|
| MonthlyIncome      | 月收入                       |real|
| NumberOfOpenCreditLinesAndLoans    | 信贷数量               |integer|
| NumberOfTimes90DaysLate    | 逾期90天笔数 |integer|
| NumberRealEstateLoansOrLines   | 固定资产贷款量    |integer|
| NumberOfTime60-89DaysPastDueNotWorse    | 逾期60-89天笔数 |integer|
| NumberOfDependents    | 家属数量                               |integer|


In [None]:
# Display the first rows of the training frame (head() defaults to five rows).
data_train.head()

## 3.2检查数据
**1.查看哪些数据具有缺失值**

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data_train.describe()

In [None]:
# Number of missing values in each column of the training frame.
data_train.isna().sum()

**可以看出以下结论**
1. Unnamed: 0列为序号，可以直接删去
2. 训练集中年龄age的最小值min为0，是异常值；
3. 变量MonthlyIncome和NumberOfDependents存在缺失值，分别为29731，3924。

## 3.3 清洗数据

**1.直接删除序列号**

In [None]:
# Drop the serial-number column from the training frame and preview the result.
data_train = data_train.drop(columns=["Unnamed: 0"])
data_train.head()

In [None]:
# Drop the serial-number column from the test frame and preview the result.
data_test = data_test.drop(columns=["Unnamed: 0"])
data_test.head()

**2.清除重复值**

In [None]:
# Count rows flagged as duplicates (True) versus first occurrences (False).
data_train.duplicated().value_counts()

True表示样本之前已经出现，False表示样本第一次出现，说明训练集中存在重复样本609个

In [None]:
# Remove repeated rows, keeping the first occurrence of each (the pandas default),
# then re-count to confirm no duplicates remain.
data_train = data_train.drop_duplicates(keep='first')
data_train.duplicated(keep='first').value_counts()

可以看到，删去重复值之后还剩149391条样本数据

**3.处理age异常值**

age 字段中包含有为 0 的值，通常认为该值为异常值，直接删去该条数据

In [None]:
# Keep only rows with a positive age; rows with age 0 are treated as invalid records.
# .copy() turns the filtered selection into an independent frame, so the column
# assignments performed by the later cleaning cells operate on data_train itself
# instead of on a selection of the previous frame (no SettingWithCopy ambiguity).
data_train = data_train[data_train['age'] > 0].copy()

**4.处理MonthlyIncome缺失值**

MonthlyIncome字段缺失值，用该字段平均值填充

In [None]:
# Replace missing MonthlyIncome values with the column mean, then re-check non-null counts.
income_mean = data_train['MonthlyIncome'].mean()
data_train['MonthlyIncome'] = data_train['MonthlyIncome'].fillna(income_mean)
data_train.info()

**4.处理NumberOfDependents缺失值**

NumberOfDependents字段缺失值，用该字段中位数填充

In [None]:
# Replace missing NumberOfDependents values with the column median.
# The filled Series is assigned back to the frame rather than calling
# fillna(inplace=True) on the selected column: in-place modification of a
# column selection is chained assignment, which is deprecated in pandas and
# leaves the frame unchanged when Copy-on-Write is active.
dependents_median = data_train['NumberOfDependents'].median()
data_train['NumberOfDependents'] = data_train['NumberOfDependents'].fillna(dependents_median)
data_train.info()

**5.检查处理结果**

再次查看数据集中样本各字段属性的个数，可以看到此时不再存在空值

In [None]:
# Print per-column non-null counts again to verify the result of the fills above.
data_train.info()

# 4. 分析、发现模式、探索数据

## 4.1相关性分析

In [None]:
# Histogram of customer age with 30 bins; the trailing semicolon hides the Axes repr.
data_train['age'].plot(kind='hist', bins=30);

In [None]:
# Pairwise correlation matrix of the training columns, drawn as an annotated heatmap.
corr_train = data_train.corr()
fig, ax = plt.subplots(figsize=(19, 15))
sns.heatmap(corr_train, annot=True, fmt='.2g', ax=ax)

观察图表可以发现，训练集的'NumberOfTime'字段(逾期30-59天，60-89天，90天以上)间系数过大，存在极大的相关性，表明数据间存在问题

查看NumberOfTime三个字段

In [None]:
import matplotlib.pyplot as plt

# The three past-due counter columns to inspect for outliers.
columns = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]
# Horizontal box plot of the three columns.
data_train[columns].plot.box(vert=False)

观察可以发现，训练集的三个字段间存在离群点，大约在90至100之间，从业务上考虑，不应当出现这样的高的次数，这里同样删除掉这些异常数据。

In [None]:
# Keep only rows where every past-due counter is below 90, then redraw the box plot.
below_threshold = (data_train[columns] < 90).all(axis=1)
data_train = data_train.loc[below_threshold]
data_train[columns].plot.box(vert=False)

In [None]:
# Recompute the correlation matrix on the filtered training frame and redraw the heatmap.
corr_train = data_train.corr()
fig, ax = plt.subplots(figsize=(19, 15))
sns.heatmap(corr_train, annot=True, fmt='.2g', ax=ax)

处理后三个字段间的相关性已经被消除

## 4.2 分离数据

**1 在训练集中分离出X_train和y_train, 用测试集定义X_test**

In [None]:
# Column 0 is the target; every remaining column is a feature.
# The first column of the test frame is skipped in the same way.
X_train = data_train.iloc[:, 1:].to_numpy()
y_train = data_train.iloc[:, 0].to_numpy()
X_test = data_test.iloc[:, 1:].to_numpy()

# Show the resulting array shapes.
X_train.shape, y_train.shape, X_test.shape

**2.标准化**

In [None]:
# Rebuild the feature/target arrays from the frames (column 0 = target, rest = features).
train_features = data_train.iloc[:, 1:]
train_target = data_train.iloc[:, 0]
test_features = data_test.iloc[:, 1:]

X_train = train_features.values
y_train = train_target.values
X_test = test_features.values

# Show the resulting array shapes.
tuple(arr.shape for arr in (X_train, y_train, X_test))

In [None]:
from sklearn import preprocessing


def _fit_and_report(label, features):
    """Print `label`, fit a StandardScaler on `features`, print its mean_/scale_, and return it."""
    print(label)
    scaler = preprocessing.StandardScaler().fit(features)
    print('mean_:', '\n', scaler.mean_)
    print('scale_:', '\n', scaler.scale_)
    return scaler


train_scaler = _fit_and_report('X_train:', X_train)

print('\n', '='*50, '\n')

# NOTE(review): this second scaler is fitted on the test features themselves;
# confirm whether the test set should instead reuse the training statistics.
test_scaler = _fit_and_report('X_test:', X_test)

In [None]:
# Standardize both feature matrices with the statistics learned from the training set.
# The test set must go through the same transformation as the training data;
# scaling it with a scaler fitted on the test set would map the two sets into
# different feature spaces and make a model trained on one invalid for the other.
X_train_scaled = train_scaler.transform(X_train)
X_test_scaled = train_scaler.transform(X_test)

# Per-column mean and standard deviation after scaling, for a quick sanity check.
X_train_scaled.mean(axis=0), X_train_scaled.std(axis=0), X_test_scaled.mean(axis=0), X_test_scaled.std(axis=0)

## 4.3划分训练集

In [None]:
from sklearn.model_selection import train_test_split

# Split the scaled training data into a learning part and a held-out validation part
# (fixed random_state so the split is repeatable).
X_learn, X_valid, y_learn, y_valid = train_test_split(
    X_train_scaled,
    y_train,
    random_state=0,
)
tuple(part.shape for part in (X_learn, X_valid, y_learn, y_valid))

# 5.建模、预测、求解问题
## 5.1 Logistic Regression

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegressionCV

# Try each solver, score it by ROC AUC on the validation split, and remember the best one.
solvers = ('newton-cg', 'lbfgs', 'liblinear')
arg, maxauc = 'none', 0
for s in solvers:
    model = LogisticRegressionCV(scoring='roc_auc', solver=s).fit(X_learn, y_learn)

    # Evaluate on the held-out validation split (probability of the positive class).
    y_pred = model.predict_proba(X_valid)[:, 1]
    score = roc_auc_score(y_valid, y_pred)
    print(s, score)

    # Strict comparison: on a tie the earlier solver is kept.
    if score > maxauc:
        arg, maxauc = s, score

print()
print(arg, maxauc)

## 5.2 Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the learning split.
gaussian = GaussianNB().fit(X_learn, y_learn)

# Evaluate on the held-out validation split (probability of the positive class).
valid_proba = gaussian.predict_proba(X_valid)
y_pred = valid_proba[:, 1]
score = roc_auc_score(y_valid, y_pred)
print(score)

## 5.3 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Cross-validated search over the number of trees, scored by ROC AUC.
rf_param_grid = {'n_estimators': [30, 50, 80, 100, 200]}
grid = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=rf_param_grid,
    scoring='roc_auc',
    verbose=3,
)
grid.fit(X_learn, y_learn)

# Dump every entry of the cross-validation report.
for key, values in grid.cv_results_.items():
    print(key, values)

In [None]:
# Refit a forest with the tree count selected by the grid search.
best_n_trees = grid.best_params_['n_estimators']
rfc = RandomForestClassifier(n_estimators=best_n_trees)
rfc.fit(X_learn, y_learn)

# Evaluate on the held-out validation split (probability of the positive class).
valid_proba = rfc.predict_proba(X_valid)
y_pred = valid_proba[:, 1]
score = roc_auc_score(y_valid, y_pred)
print(score)

## 5.4 GBDT(LightGBM)

In [None]:
import lightgbm as lgb

# Cross-validated search over the number of boosting rounds (40..59), scored by ROC AUC.
lgb_search_space = {'n_estimators': range(40, 60)}
grid = GridSearchCV(
    estimator=lgb.LGBMClassifier(),
    param_grid=lgb_search_space,
    scoring='roc_auc',
    verbose=1,
)
grid.fit(X_learn, y_learn)

# Dump every entry of the cross-validation report.
for key, values in grid.cv_results_.items():
    print(key, values)

In [None]:
# Refit LightGBM with the number of boosting rounds selected by the grid search.
best_rounds = grid.best_params_['n_estimators']
clf = lgb.LGBMClassifier(n_estimators=best_rounds)
clf.fit(X_learn, y_learn)

# Evaluate on the held-out validation split (probability of the positive class).
valid_proba = clf.predict_proba(X_valid)
y_pred = valid_proba[:, 1]
score = roc_auc_score(y_valid, y_pred)
print(score)

## 5.5 模型优化

In [None]:
# Joint search over boosting rounds (45..54) and learning rate, scored by ROC AUC.
lgb_tuning_space = {
    'n_estimators': range(45, 55),
    'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
}
lgb_grid = GridSearchCV(
    estimator=lgb.LGBMClassifier(),
    param_grid=lgb_tuning_space,
    scoring='roc_auc',
    verbose=1,
)
lgb_grid.fit(X_learn, y_learn)

# Show the selected (n_estimators, learning_rate) pair.
best_params = lgb_grid.best_params_
best_params['n_estimators'], best_params['learning_rate']

In [None]:

from sklearn.model_selection import GridSearchCV, StratifiedKFold

LGB = lgb.LGBMClassifier()

# Pin both hyper-parameters to the optimum selected by `lgb_grid`.
# n_estimators is wrapped in a one-element list, like learning_rate:
# range(best) would start at 0 boosting rounds and stop one short of the
# selected value, so the best setting itself would never be evaluated.
lgb_param_grid = {
    'n_estimators': [lgb_grid.best_params_['n_estimators']],
    'learning_rate': [lgb_grid.best_params_['learning_rate']],
}

# 10-fold stratified cross-validation keeps the class ratio equal in every fold.
kfold = StratifiedKFold(n_splits=10)

# Score with ROC AUC, the metric used by every other search and evaluation in
# this notebook, instead of accuracy.
gsLGB = GridSearchCV(LGB, param_grid=lgb_param_grid, cv=kfold, scoring="roc_auc", verbose=1)

gsLGB.fit(X_learn, y_learn)

# Evaluate on the held-out validation split (probability of the positive class).
y_pred_gslgb = gsLGB.predict_proba(X_valid)[:, 1]
score = roc_auc_score(y_valid, y_pred_gslgb)
print(score)

## 5.6 根据模型预测

In [None]:
# Refit the search on the full training arrays (X_train, y_train), then predict
# positive-class probabilities for the test features X_test.
gsLGB.fit(X_train, y_train)
test_proba = gsLGB.predict_proba(X_test)
y_pred_gsLGB = test_proba[:, 1]

In [None]:
# Load the sample submission file and fill its Probability column with the predictions.
sample = pd.read_csv('../input/GiveMeSomeCredit/sampleEntry.csv')
sample = sample.assign(Probability=y_pred_gsLGB)
sample


# 6.提交结果

In [None]:
# Write the submission frame to the working directory, without the row index column.
sample.to_csv('./my_submit.csv', index = False)