In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline  

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 七步流程
1. 定义问题
2. 获取训练数据和测试数据
3. 整理，准备，清洗数据
4. 探索数据
5. 建模
6. 可视化，报告，呈现问题求解步骤和最终结论
7. 提交

# 1.定义问题
通过个人的数据判别将来的还款能力与还款意愿

# 2.读取数据

In [None]:
# Load both splits; the first CSV column is used as the DataFrame index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '/kaggle/input/GiveMeSomeCredit/cs-training.csv',
        "/kaggle/input/GiveMeSomeCredit/cs-test.csv",
    )
)

In [None]:
# Print (rows, columns) for each split, then show summary statistics of the training frame.
for frame in (train, test):
    print(frame.shape)
train.describe()

* 从上面的输出可以看出，训练集有150000条数据，测试集有101503条数据

# 3.数据预处理
## 3.1缺失值处理

In [None]:
# Number of missing values in each column of the training frame.
train.isna().sum()


In [None]:
# Fraction of rows with a missing value, per column.
missing_per_column = train.isna().sum()
missing_per_column / len(train)


* 从表格中可以发现 MonthlyIncome 和 NumberOfDependents 存在缺失值，分别为29731条和3924条，缺失比例分别为19.82%和2.6%。由于 MonthlyIncome 的缺失值数量较大，不能直接删除，因此用平均值进行填充；而 NumberOfDependents 的缺失值占比不大，可以直接删除对应的行。

In [None]:
# Fill missing MonthlyIncome with the column mean, then drop every row that still
# contains a missing value in any column.
income_mean = train['MonthlyIncome'].mean()
train = train.assign(MonthlyIncome=train['MonthlyIncome'].fillna(income_mean)).dropna()
train.info()

## 3.2异常值处理
3.2.1 RevolvingUtilizationOfUnsecuredLines


In [None]:
# Box plot and summary statistics of RevolvingUtilizationOfUnsecuredLines.
utilization = train['RevolvingUtilizationOfUnsecuredLines']
plt.boxplot(utilization)
plt.show()
utilization.describe()

* 通过箱线图以及表格可以看出，其75%分位数为0.56，但最大值为50708，可以判断此特征存在异常值。这里将异常值定义为落在开区间 (Q1 − 1.5×IQR, Q3 + 1.5×IQR) 之外的值，其中 Q1、Q3 分别为下、上四分位数，IQR = Q3 − Q1。我们按此公式计算上下界，并删除异常值

In [None]:
# Keep only rows strictly inside the fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
utilization = train['RevolvingUtilizationOfUnsecuredLines']
q1 = utilization.quantile(0.25)  # lower quartile (25th percentile)
q3 = utilization.quantile(0.75)  # upper quartile (75th percentile)
iqr = q3 - q1
low = q1 - 1.5 * iqr
up = q3 + 1.5 * iqr
train = train[utilization.gt(low) & utilization.lt(up)]

# Re-inspect the distribution after trimming.
trimmed_utilization = train['RevolvingUtilizationOfUnsecuredLines']
plt.boxplot(trimmed_utilization)
plt.show()
trimmed_utilization.describe()

3.2.2 Age

In [None]:
# Box plot and summary statistics of age.
ages = train['age']
plt.boxplot(ages)
plt.show()
ages.describe()

* 通过查看箱线图以及表格，我们可以看出其主要的异常值是其最小值0，根据日常情况，岁数为0的贷款人不可能存在，所以在处理此特征的时候，我们只需要取大于0的数值。

In [None]:
# Keep only rows whose age is strictly positive, then re-inspect the column.
train = train.loc[train['age'].gt(0)]
positive_ages = train['age']
plt.boxplot(positive_ages)
plt.show()
positive_ages.describe()

3.2.3 NumberOfTime30-59DaysPastDueNotWorse

In [None]:
# Box plot and summary statistics of the 30-59 days past-due counter.
past_due_30_59 = train['NumberOfTime30-59DaysPastDueNotWorse']
plt.boxplot(past_due_30_59)
plt.show()
past_due_30_59.describe()

* 从上图可以发现，在大于80的位置有两个异常值。根据实际情况，逾期次数达到98的情况不太可能存在，我们对其进行删除处理。由于与逾期相关的特征还有另外两个，我们也对它们进行分析，查看是否存在相同的情况

In [None]:
# Side-by-side box plots of the two remaining past-due counters on one 5x5 figure.
fig, a = plt.subplots(figsize=(5, 5))
late_counters = [
    train['NumberOfTimes90DaysLate'],
    train['NumberOfTime60-89DaysPastDueNotWorse'],
]
a.boxplot(late_counters)
plt.show()

* 基于上述箱线图，我们可以发现这两个变量拥有相同的异常值，我们可以对这三样变量的异常值进行删除

In [None]:
# A row is kept only if all three past-due counters are below 80.
past_due_columns = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
    'NumberOfTime60-89DaysPastDueNotWorse',
]
train = train[train[past_due_columns].lt(80).all(axis=1)]


3.2.4 DebtRatio


In [None]:
# Box plot and summary statistics of DebtRatio.
debt_ratio = train['DebtRatio']
plt.boxplot(debt_ratio)
plt.show()
debt_ratio.describe()

* 此变量具有和RevolvingUtilizationOfUnsecuredLines一样的异常值问题，我们可以也利用计算公式去除异常值

In [None]:
# Keep only rows strictly inside the fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
debt_ratio = train['DebtRatio']
q1 = debt_ratio.quantile(0.25)  # lower quartile (25th percentile)
q3 = debt_ratio.quantile(0.75)  # upper quartile (75th percentile)
iqr = q3 - q1
low = q1 - 1.5 * iqr
up = q3 + 1.5 * iqr
train = train[debt_ratio.gt(low) & debt_ratio.lt(up)]

# Re-inspect the distribution after trimming.
trimmed_debt_ratio = train['DebtRatio']
plt.boxplot(trimmed_debt_ratio)
plt.show()
trimmed_debt_ratio.describe()

3.2.5 MonthlyIncome

In [None]:
# Box plot and summary statistics of MonthlyIncome.
monthly_income = train['MonthlyIncome']
plt.boxplot(monthly_income)
plt.show()
monthly_income.describe()

* 基于月收入的范围可以很广，对此变量我们不做异常值处理

3.2.6 NumberOfOpenCreditLinesAndLoans

In [None]:
# Box plot and summary statistics of NumberOfOpenCreditLinesAndLoans.
open_lines = train['NumberOfOpenCreditLinesAndLoans']
plt.boxplot(open_lines)
plt.show()
open_lines.describe()

* 通过表格以及箱线图，我们发现其具有离群值（outlier），我们可以通过公式进行异常值删除处理


In [None]:
# Keep only rows strictly inside the fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
open_lines = train['NumberOfOpenCreditLinesAndLoans']
q1 = open_lines.quantile(0.25)  # lower quartile (25th percentile)
q3 = open_lines.quantile(0.75)  # upper quartile (75th percentile)
iqr = q3 - q1
low = q1 - 1.5 * iqr
up = q3 + 1.5 * iqr
train = train[open_lines.gt(low) & open_lines.lt(up)]

# Re-inspect the distribution after trimming.
trimmed_open_lines = train['NumberOfOpenCreditLinesAndLoans']
plt.boxplot(trimmed_open_lines)
plt.show()
trimmed_open_lines.describe()

3.2.7 NumberRealEstateLoansOrLines

In [None]:
# Box plot and summary statistics of NumberRealEstateLoansOrLines.
real_estate_loans = train['NumberRealEstateLoansOrLines']
plt.boxplot(real_estate_loans)
plt.show()
real_estate_loans.describe()

* 通过表格以及箱线图，我们发现其具有离群值（outlier），我们可以通过公式进行异常值删除处理


In [None]:
# Keep only rows strictly inside the fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
real_estate_loans = train['NumberRealEstateLoansOrLines']
q1 = real_estate_loans.quantile(0.25)  # lower quartile (25th percentile)
q3 = real_estate_loans.quantile(0.75)  # upper quartile (75th percentile)
iqr = q3 - q1
low = q1 - 1.5 * iqr
up = q3 + 1.5 * iqr
train = train[real_estate_loans.gt(low) & real_estate_loans.lt(up)]

# Re-inspect the distribution after trimming.
trimmed_real_estate_loans = train['NumberRealEstateLoansOrLines']
plt.boxplot(trimmed_real_estate_loans)
plt.show()
trimmed_real_estate_loans.describe()

3.2.8 NumberOfDependents

In [None]:
# Box plot and summary statistics of NumberOfDependents.
dependents = train['NumberOfDependents']
plt.boxplot(dependents)
plt.show()
dependents.describe()

* 此处考虑实际情况，有可能依靠的家庭数量有此范围，我们不做处理

## 4.探索数据

In [None]:

# Histogram of every numeric column of the cleaned training frame.
# plt.show() renders the figure like the other plotting cells do and keeps the
# array-of-Axes value returned by DataFrame.hist out of the cell output.
train.hist(figsize=(20, 15))
plt.show()


* 数据探索 判断各个特征变量是否满足统计基本假设，分别绘制直方图进行分析。
* 从上我们发现其连续性变量都是符合统计基本假设


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


# 5.模型预测

5.1 划分训练集和测试集

In [None]:

# Separate the feature columns from the target column.
x = train.drop(['SeriousDlqin2yrs'], axis=1)
y = train['SeriousDlqin2yrs']

# Hold out 30% of the rows for the final evaluation. stratify=y keeps the class
# proportions of the target the same in both parts, matching the stratified folds
# used by the cross-validation cells.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=20, stratify=y
)


5.2 对比模型

In [None]:

# Mean ROC AUC of a random forest over 10 stratified folds.
# random_state is fixed (same seed as the train/test split) so the score is
# reproducible from one run of the notebook to the next.
rf = RandomForestClassifier(random_state=20)
result = cross_val_score(rf, x_train, y_train, scoring='roc_auc', cv=StratifiedKFold(n_splits=10))
rfAuc = result.mean()
rfAuc

In [None]:
# Mean ROC AUC of a default-parameter SVC over 10 stratified folds.
svc = SVC()
result = cross_val_score(
    estimator=svc,
    X=x_train,
    y=y_train,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=10),
)
sucAuc = result.mean()  # name kept as-is: the model comparison table reads `sucAuc`
sucAuc

In [None]:
# Mean ROC AUC of a default-parameter Perceptron over 10 stratified folds.
perceptron = Perceptron()
folds = StratifiedKFold(n_splits=10)
result = cross_val_score(perceptron, x_train, y_train, cv=folds, scoring='roc_auc')
perceptronAuc = result.mean()
perceptronAuc

In [None]:
# Mean ROC AUC of a decision tree over 10 stratified folds.
# random_state is fixed (same seed as the train/test split) so the score is
# reproducible from one run of the notebook to the next.
tree = DecisionTreeClassifier(random_state=20)
result = cross_val_score(tree, x_train, y_train, scoring='roc_auc', cv=StratifiedKFold(n_splits=10))
treeAuc = result.mean()
treeAuc

In [None]:
# Mean ROC AUC of a default-parameter LightGBM classifier over 10 stratified folds.
lgbm = LGBMClassifier()
result = cross_val_score(
    lgbm,
    x_train,
    y_train,
    cv=StratifiedKFold(n_splits=10),
    scoring='roc_auc',
)
lgbmAuc = result.mean()
lgbmAuc

In [None]:
# Mean ROC AUC of a default-parameter XGBoost classifier over 10 stratified folds.
xg = XGBClassifier()
stratified_folds = StratifiedKFold(n_splits=10)
result = cross_val_score(xg, x_train, y_train, cv=stratified_folds, scoring='roc_auc')
xgAuc = result.mean()
xgAuc

In [None]:
# Collect the mean cross-validated AUC of every candidate and show them best-first.
cv_auc_by_model = [
    ('Random Forest', rfAuc),
    ('Support Vector Machines', sucAuc),
    ('Perceptron', perceptronAuc),
    ('Decision Tree', treeAuc),
    ('lgbm', lgbmAuc),
    ('xgboost', xgAuc),
]
models = pd.DataFrame(cv_auc_by_model, columns=['Model', 'CV-Auc'])
models.sort_values(by='CV-Auc', ascending=False)

In [None]:
# Class balance of the target. The Series is passed as `x=` because the first
# positional parameter of countplot is `data` in seaborn >= 0.12; the keyword form
# also works on older releases.
sns.countplot(x=y)

* 对比各个模型后我们发现 SVM，Decision Tree Perceptron的分数非常低，其原因通过查看y值，我们可以看到其分布不平衡，导致模型的正确率低。通过对比其他模型，我们可以发现lgbm具有最好数值，所以我们最后利用lgbm进行预测和调参
* 超参数优化：需要调节的参数为 'n_estimators'、'learning_rate'、'max_depth'、'num_leaves'。树的深度与叶子数相关，所以我们将 'max_depth' 和 'num_leaves' 作为一组，用10折交叉验证一起优化；剩余两个参数作为另外一组。这样做也可以更好地节省运行时间

In [None]:
from sklearn.model_selection import GridSearchCV

# First search: number of trees x learning rate, scored by ROC AUC over
# 10 stratified folds.
hyper_space = {
    'n_estimators': [1000, 1500, 2000],
    'learning_rate': [0.01, 0.02, 0.03],
}
kfold = StratifiedKFold(n_splits=10)
lgbm = LGBMClassifier()

grid = GridSearchCV(
    estimator=lgbm,
    param_grid=hyper_space,
    scoring="roc_auc",
    cv=kfold,
    verbose=1,
)
grid.fit(x_train, y_train)

# Best (n_estimators, learning_rate) combination found by the search.
bestEL = grid.best_params_


In [None]:
from sklearn.model_selection import GridSearchCV

# Second search: tree depth x number of leaves, scored by ROC AUC over
# 10 stratified folds.
hyper_space = {
    'max_depth': [4, 5, 8],
    'num_leaves': [15, 31, 63],
}
kfold = StratifiedKFold(n_splits=10)
lgbm = LGBMClassifier()

grid = GridSearchCV(
    estimator=lgbm,
    param_grid=hyper_space,
    scoring="roc_auc",
    cv=kfold,
    verbose=1,
)
grid.fit(x_train, y_train)

# Best (max_depth, num_leaves) combination found by the search.
bestMN = grid.best_params_



In [None]:
# Show the winning parameters of both grid searches, one dict per line.
print(bestEL, bestMN, sep='\n')

利用调优后的参数训练模型

In [None]:
from sklearn.metrics import roc_auc_score

# Train the final model with the parameters selected by the two grid searches rather
# than hard-coded copies, so a re-run always uses the values the searches actually found.
# bestEL holds n_estimators/learning_rate and bestMN holds max_depth/num_leaves; the
# key sets are disjoint, so both dicts can be unpacked into one call.
lgbm = LGBMClassifier(**bestEL, **bestMN)
lgbm.fit(x_train, y_train)

# ROC AUC on the held-out 30%, using the second column of predict_proba.
predict = lgbm.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, predict)

# 7.Submission

In [None]:
# Score the competition test set. Separate names are used so the training `x`/`y`
# defined in the split cell are not overwritten when this cell runs.
test_features = test.drop(['SeriousDlqin2yrs'], axis=1)
test_probability = lgbm.predict_proba(test_features)[:, 1]

# Take the ids from the frame's index (the first CSV column, loaded with index_col=0)
# instead of a hard-coded 1..101503 range, so the Id column always has the same length
# as the predictions.
# NOTE(review): the test frame is scored without the MonthlyIncome imputation applied
# to the training data - confirm this is intended.
sub = pd.DataFrame({'Id': test_features.index, 'Probability': test_probability})
sub.to_csv("submission.csv", index=False)
