In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 目录
[1. 定义问题](#identify)

[2. 获取数据](#data)

[3. 探索性数据分析（EDA）](#eda)

[4. 数据预处理](#preprocessing)

[5. 建立模型，预测求解](#modeling)

[6. 提交](#submit)

# <a name="identify">1.定义问题</a>

Improve on the state of the art in credit scoring by predicting the probability that somebody will experience financial distress in the next two years.<br>
建立一个模型帮助银行做出最佳财务决策，即预测某人在未来两年内遇到财务困境的可能性。

In [None]:
# 数据整理和分析
import pandas as pd
import numpy as np
import random as rnd

# 可视化
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter

# 机器学习
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from lightgbm import LGBMClassifier


from sklearn.metrics import roc_auc_score, confusion_matrix, auc
from sklearn.metrics import roc_auc_score, precision_recall_curve,roc_curve

# <a name="data">2. 获取数据</a>

In [None]:
train_df = pd.read_csv("../input/GiveMeSomeCredit/cs-training.csv")
test_df = pd.read_csv("../input/GiveMeSomeCredit/cs-test.csv")
combine = [train_df, test_df]

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
train_df.head()

# <a name="eda">3. 探索性数据分析（EDA）</a>

## 3.1 数据集特征介绍

|属性|含义|类型
|------ | ------ |------|
|**1. SeriousDlqin2yrs**|预测标签，逾期 90 天或更糟的人|Y/N|
|**2. RevolvingUtilizationOfUnsecuredLines**|信用卡、个人信用额度（房地产除外）和汽车贷款等无分期付款债务的总余额除以信用额度总和<br>**理解**：债务总额/信用额度，一定程度上代表欠债严重程度，该值越大，违约的可能应该越大|percentage|
|**3. age**|借款人年龄（年）|integer|
|**4. NumberOfTime30-59DaysPastDueNotWorse**|借款人逾期30-59天的次数|integer|
|**5. DebtRatio**|每月还款额、赡养费、生活费除以每月总收入|percentage|
|**6. MonthlyIncome**|月收入|real|
|**7. NumberOfOpenCreditLinesAndLoans**|贷款（分期付款，如汽车贷款或抵押贷款）和信用额度（例如信用卡）的数量|integer|
|**8. NumberOfTimes90DaysLate**|借款人逾期 90 天或更长时间的次数|integer|
|**9. NumberRealEstateLoansOrLines**|抵押贷款和房地产贷款的数量|integer|
|**10. NumberOfTime60-89DaysPastDueNotWorse**|借款人逾期60-89天的次数|integer|
|**11. NumberOfDependents**|家庭受抚养人数量（不包括自己）|integer|

**特征类型**
- 标称属性：
 - 二元属性：SeriousDlqin2yrs
- 数值属性：
 - 连续型特征有：
     - RevolvingUtilizationOfUnsecuredLines
     - MonthlyIncome
     - DebtRatio
 - 离散型特征有：
     - age
     - NumberOfTime30-59DaysPastDueNotWorse
         <br>NumberOfTimes90DaysLate
         <br>NumberOfTime60-89DaysPastDueNotWorse
     - NumberOfOpenCreditLinesAndLoans
     - NumberRealEstateLoansOrLines
     - NumberOfDependents

In [None]:
train_df.info()
print('_'*64)
test_df.info()

## 3.2 缺失值检测

- 训练集
 - MonthlyIncome (19.82%)
 - NumberOfDependents (2.62%)
- 测试集
 - MonthlyIncome (19.81%)
 - NumberOfDependents (2.59%)

In [None]:
pd.DataFrame({"Number of Null Values":train_df.isnull().sum(),
             "Ratio":round(train_df.isnull().sum()/len(train_df)*100,2)})

In [None]:
pd.DataFrame({"Number of Null Values":test_df.isna().sum(),
             "Ratio":round(test_df.isnull().sum()/len(test_df)*100,2)})

## 3.3离群值检测

异常值检测 Tukey's Test（基于四分位距 IQR）：这里检测在超过三个特征上出现异常值的样本

参考：https://www.kaggle.com/yassineghouzam/titanic-top-4-with-ensemble-modeling/notebook

In [None]:
def detect_outliers(df,n,features):
    """
    Return the index labels of the rows that are Tukey outliers in more than
    ``n`` of the given features.

    For each column in ``features`` the quartiles Q1/Q3 and the interquartile
    range (IQR) are computed; a value is an outlier when it is below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.  Missing values are
    ignored when the quartiles are computed and are never flagged themselves.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named in ``features``.
    n : int
        A row is returned only if it is an outlier in strictly more than
        ``n`` features.
    features : iterable
        Column labels to inspect.

    Returns
    -------
    list
        Index labels of the selected rows, in order of first detection.
    """
    outlier_indices = []

    # iterate over features (columns)
    for col in features:
        # np.percentile yields NaN as soon as the column holds a NaN, which
        # makes both comparisons below False and silently hides every outlier
        # of that column; nanpercentile skips the missing entries instead.
        Q1 = np.nanpercentile(df[col], 25)
        Q3 = np.nanpercentile(df[col], 75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1

        # outlier step
        outlier_step = 1.5 * IQR

        # Index labels of the rows that are outliers for this column
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index

        # collect them; a row appears once per feature in which it is an outlier
        outlier_indices.extend(outlier_list_col)

    # keep the rows flagged in more than n features
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )

    return multiple_outliers

In [None]:
# 十个特征的异常值检测：超过3个样本异常将被划分为异常样本
Outliers_to_drop = detect_outliers(train_df,3,train_df.columns.values[2:])
print("异常值：",round(len(Outliers_to_drop)/train_df.shape[0]*100,2),"%")
train_df.loc[Outliers_to_drop] # Show the outliers rows

## 3.4 重复样本
- 无重复样本

In [None]:
train_df.duplicated().value_counts()

## 3.5 特征分布

- 数据集中各个特征的分布如何，是否类别不平衡


参考：https://www.kaggle.com/nicholasgah/eda-credit-scoring-top-100-on-leaderboard

In [None]:
train_df.describe()

In [None]:
# 类别分布
sns.countplot(x="SeriousDlqin2yrs", data=train_df)
print("客户违约率: {}%".format(train_df["SeriousDlqin2yrs"].sum() / len(train_df)*100))

只有6.68%客户违约（SeriousDlqin2yrs=1），分类分布非常不平衡。

**特征是否有大量重复值**

In [None]:
# For every column, record its most frequent value, how many rows carry it
# and which percentage of the training rows that represents.
features = train_df.columns.values
unique_max_train = []
for feature in features:
    values = train_df[feature].value_counts()
    top_count = values.max()
    unique_max_train.append([feature, top_count, values.idxmax(), round(top_count/train_df.shape[0]*100,2)])

# Show the columns with the most repeated value first, one column of the
# display per feature.
(pd.DataFrame(unique_max_train, columns=['Feature', 'Max duplicates', 'Value', 'Ratio'])
   .sort_values(by = 'Max duplicates', ascending=False)
   .head(15)
   .T)

- 特征大多数为重复值0：
    - NumberOfTime60-89DaysPastDueNotWorse(94.93%)
    - NumberOfTimes90DaysLate(94.44%)
    - SeriousDlqin2yrs(93.32%)
    - NumberOfTime30-59DaysPastDueNotWorse(84.01%)
    - NumberOfDependents(60.55%)

**1. RevolvingUtilizationOfUnsecuredLines** RUUL

随着RUUL值变大（代表欠债严重），违约率应该增大。下面输出随着RUUL最小值增大，违约率的变化。

In [None]:
'''default_prop = [] # 违约率
for i in range(int(train_df["RevolvingUtilizationOfUnsecuredLines"].max())):
    temp_ = train_df.loc[train_df["RevolvingUtilizationOfUnsecuredLines"] >= i]
    default_prop.append([i, temp_["SeriousDlqin2yrs"].mean()])
default_prop'''

RUUL最小值从12到13，违约率有一个明显的下降，RUUL最小值到达12时，违约率6.667%已经低于总体违约率6.684%。可见，RUUL>=12的数据是异常的。

In [None]:
'''sns.lineplot(x=[i[0] for i in default_prop], y=[i[1] for i in default_prop])
plt.title("Proportion of Defaulters As Minimum RUUL Increases")'''

**2. age** 
- 有1个年龄为0的样本，这显然是不合理的，稍后删除该样本
- 违约客户偏向于年轻人群

In [None]:
sns.boxplot(train_df['age'])

In [None]:
len(train_df.loc[train_df['age'] == 0])

In [None]:
sns.distplot(train_df.loc[train_df['age'] > 0]["age"])

In [None]:
sns.distplot(train_df.loc[(train_df['age'] > 0)&(train_df["SeriousDlqin2yrs"] == 0)]["age"])
plt.title('SeriousDlqin2yrs = 0')

In [None]:
sns.distplot(train_df.loc[(train_df['age'] > 0)&(train_df["SeriousDlqin2yrs"] == 1)]["age"])
plt.title('SeriousDlqin2yrs = 1')

**3. 逾期次数相关特征**

- **NumberOfTime30-59DaysPastDueNotWorse**
- **NumberOfTimes90DaysLate**
- **NumberOfTime60-89DaysPastDueNotWorse** 

没有客户逾期30-59天的次数在(13, 96)范围内，没有客户逾期90天以上的次数在(17, 96)范围内，没有客户逾期60-89天的次数在(11, 96)范围内。

In [None]:
late_pay_cols = ['NumberOfTime30-59DaysPastDueNotWorse','NumberOfTimes90DaysLate','NumberOfTime60-89DaysPastDueNotWorse']

In [None]:
train_df["NumberOfTime30-59DaysPastDueNotWorse"].value_counts().sort_index()

In [None]:
train_df["NumberOfTimes90DaysLate"].value_counts().sort_index()

In [None]:
train_df["NumberOfTime60-89DaysPastDueNotWorse"].value_counts().sort_index()

In [None]:
train_df.loc[train_df["NumberOfTimes90DaysLate"] > 17][late_pay_cols].describe()

NumberOfTimes90DaysLate>17情况下，三个特征的分布是一样的,下面查看它们的取值情况。

In [None]:
# Count how many rows share each distinct combination of the three late-payment
# columns among the rows with NumberOfTimes90DaysLate > 17.
distinct_triples_counts = dict()
for arr in train_df.loc[train_df["NumberOfTimes90DaysLate"] > 17][late_pay_cols].values:
    triple = ",".join(list(map(str, arr)))
    # Start at 0 and always add 1 so that the first occurrence is counted too
    # (initialising to 0 without incrementing under-counts every key by one).
    distinct_triples_counts[triple] = distinct_triples_counts.get(triple, 0) + 1
distinct_triples_counts

NumberOfTimes90DaysLate>17时，这三个特征取值完全一致，且只有两种情况（全是96或全是98）。经过简单计算可知，一个人在2年内逾期超过90天以上的次数不可能是96或者是98，这些样本显然是异常的。

因此，考虑删除NumberOfTimes90DaysLate>17的样本。

**4. DebtRatio** 每月还款额、赡养费、生活费/每月总收入

In [None]:
train_df["DebtRatio"].quantile(0.765)

大约76.5%的客户每月要支出小于月收入

In [None]:
train_df["DebtRatio"].quantile(0.975)

2.5%的客户每月要支出月收入的3490倍或更多

In [None]:
train_df.loc[train_df["DebtRatio"] > train_df["DebtRatio"].quantile(0.975)][['DebtRatio','MonthlyIncome']].describe()

In [None]:
train_df.loc[train_df["DebtRatio"] > train_df["DebtRatio"].quantile(0.975)]['MonthlyIncome'].value_counts()

这2.5%客户的月收入取值有异常：3750个样本月收入特征有大量缺失，只有185个样本的月收入不缺失，且取值只有0和1。

In [None]:
# Among the top 2.5% of DebtRatio (the same cut-off as the cells above), count
# the rows whose MonthlyIncome equals the label: income 0 with no default, or
# income 1 with default.
len(train_df[(train_df["DebtRatio"] > train_df["DebtRatio"].quantile(0.975)) & (train_df['SeriousDlqin2yrs'] == train_df['MonthlyIncome'])])

这185人中有164人属于以下两种类型：
1. 月收入为0，不违约
2. 月收入为1，违约

这2.5%的样本显然是异常的，考虑删除这些样本。

**5. MonthlyIncome**

In [None]:
sns.distplot(train_df['MonthlyIncome'].dropna().loc[train_df['DebtRatio']<train_df['DebtRatio'].quantile(0.975)])

- 分布右偏
    1. 可以考虑用中位数、正态分布（均值、方差）进行插补
    2. 通过随机森林算法插补

**6. NumberOfOpenCreditLinesAndLoans** 贷款数量

In [None]:
#train_df["NumberOfOpenCreditLinesAndLoans"].value_counts()
sns.distplot(train_df["NumberOfOpenCreditLinesAndLoans"])

**7. NumberRealEstateLoansOrLines** 抵押贷款和房地产贷款的数量

In [None]:
train_df["NumberRealEstateLoansOrLines"].value_counts()

In [None]:
sns.countplot(x="NumberRealEstateLoansOrLines", data=train_df.loc[train_df["NumberRealEstateLoansOrLines"] <= 13])

In [None]:
train_df.loc[train_df["NumberRealEstateLoansOrLines"] > 13]["SeriousDlqin2yrs"].describe()

当NumberRealEstateLoansOrLines>13时，违约率比总样本的违约率6.684%高很多，符合逻辑。

**8. NumberOfDependents**

- NumberOfDependents没有依赖项，可以考虑用它的众数0补全缺失值。

In [None]:
train_df["NumberOfDependents"].value_counts()

In [None]:
train_df.loc[train_df["NumberOfDependents"] <= 10]["SeriousDlqin2yrs"].describe()

In [None]:
sns.countplot(x="NumberOfDependents", data=train_df.loc[train_df["NumberOfDependents"] <= 10])

经过以上分析，将进行以下预处理：
- 异常值处理
    1. 删除年龄为0的样本
    2. 只保留 0 < RevolvingUtilizationOfUnsecuredLines < 12 的样本
    3. 只保留NumberOfTimes90DaysLate <= 17 的样本
    4. 删除DebtRatio超过97.5%的样本
- 缺失值处理
    1. MonthlyIncome用随机森林预测补全
    2. 用众数0填充NumberOfDependents

# <a name="preprocessing">4. 数据预处理</a>

In [None]:
# 将train_df，test_df第一列列名重命名为“ID”
for df in combine:
    df.rename(columns={'Unnamed: 0':'ID'}, inplace=True)

## 4.1 缺失值处理

缺失值指的是现有数据集中某个或某些属性的值是不完全的。

缺失值的处理方法一般包括：

1. 直接使用含有缺失值的属性（不处理）；
2. 删除含有缺失值的属性（该方法在包含缺失值的属性仅仅包含极少量有效值时是有效的）；
3. 直接删除含有缺失值的样本；
4. 缺失值补全：均值插补、建模预测等等

In [None]:
# Fill missing NumberOfDependents with its mode, 0, in both frames.
for df in combine:
    # Assign the filled column back to the frame: calling fillna(inplace=True)
    # on the result of df[...] acts on an intermediate object, which pandas
    # warns about and which no longer updates df under copy-on-write.
    df['NumberOfDependents'] = df['NumberOfDependents'].fillna(0)

In [None]:
#MonthlyIncome缺失值较多，不能直接删除样本，同时缺失值也没有多到能直接删除属性，这里使用随机森林预测补全缺失值
def randomforest_filled_func(df):
    """
    Fill the missing ``MonthlyIncome`` values of ``df`` with random-forest
    predictions.

    Rows with a known MonthlyIncome are used to fit a RandomForestRegressor
    on every column from the third one onwards (MonthlyIncome excluded); the
    fitted model then predicts the income of the rows where it is missing.
    Predictions are rounded to whole numbers and printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``MonthlyIncome`` column; the first two columns are not
        used as predictors.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.  When no value is missing
        it is returned unchanged.
    """
    missing = df['MonthlyIncome'].isnull()

    # Nothing to impute (for instance when the cell is run a second time):
    # return early instead of asking the model to predict on zero rows.
    if not missing.any():
        return df

    # Split into rows where the target is known and rows where it is missing
    known = df[~missing]
    unknown = df[missing]

    train_x = known.iloc[:,2:].drop('MonthlyIncome',axis=1)
    train_y = known['MonthlyIncome']

    test_x = unknown.iloc[:,2:].drop('MonthlyIncome',axis=1)

    # Fit the regressor on the rows with a known income
    rfr = RandomForestRegressor(random_state=2021, n_estimators=200,max_depth=3,n_jobs=-1)
    rfr.fit(train_x, train_y)

    # Predict the missing incomes
    predicted = rfr.predict(test_x).round(0)
    print(predicted)

    df.loc[missing, 'MonthlyIncome'] = predicted
    return df

In [None]:
# Impute MonthlyIncome in both frames; the helper writes into the frame it is
# given, so its return value does not need to be kept.
for df in combine:
    randomforest_filled_func(df)

## 4.2 异常值处理

In [None]:
# 删除年龄为0的样本
train_df = train_df[train_df['age'] > 0]

In [None]:
# Keep only the samples with RevolvingUtilizationOfUnsecuredLines < 13 (no lower bound is applied here)
train_df = train_df[train_df['RevolvingUtilizationOfUnsecuredLines'] < 13]

In [None]:
# 只保留NumberOfTimes90DaysLate <= 17 的样本
train_df = train_df[train_df['NumberOfTimes90DaysLate'] <= 17]

In [None]:
# 删除DebtRatio超过97.5%的样本
train_df = train_df.loc[train_df["DebtRatio"] <= train_df["DebtRatio"].quantile(0.975)]

In [None]:
train_df.info()

## 4.3 重复值处理

In [None]:
# Compare every column except the row identifier: with ID included each row
# differs from all others and no duplicate can ever be reported.
train_df.duplicated(subset=[col for col in train_df.columns if col != 'ID']).value_counts()

In [None]:
# Drop rows that repeat an earlier row in every column except the row
# identifier: with ID included each row differs from all others, so
# drop_duplicates() would remove nothing.
train_df = train_df.drop_duplicates(subset=[col for col in train_df.columns if col != 'ID'])

# <a name="modeling">5. 建立模型，预测求解</a>

**数据挖掘任务**：训练一个用于信用评分的模型，用于预测违约概率，帮助银行取得最大经济效益。

分类类别不平衡会让监督学习算法过多关注多数类，分类性能下降。
可采用的方法有：
- 随机过采样
- 基于聚类的过采样
- 信息性过采样SMOTE
- 随机欠采样
- 集成技术
- 阈值移动
- 调整代价或权重

参考：https://www.cnblogs.com/shenggang/p/12133016.html

In [None]:
#auc绘图函数
def plot_roc_curve(fpr, tpr, label=None):
    """
    Plot a ROC curve together with the diagonal reference line.

    Parameters
    ----------
    fpr, tpr : array-like
        False-positive and true-positive rates of the curve.
    label : str, optional
        Legend text for the curve.  When omitted, the legend shows the AUC
        computed from ``fpr`` and ``tpr``.
    """
    if label is None:
        # Derive the AUC from the curve itself rather than reading a global
        # `roc_auc`, so the function works wherever it is called from.
        label = 'AUC= %0.3f' % auc(fpr, tpr)

    plt.figure(figsize=(8,6))
    plt.plot(fpr,tpr,'b',label=label) # ROC curve
    plt.legend(loc='lower right')
    plt.plot([0,1],[0,1],'r--') # diagonal reference line
    plt.axis([0,1,0,1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive rate")
    plt.show()

In [None]:
# 划分训练集和测试集
x_train = train_df.iloc[:,2:]
y_train = train_df['SeriousDlqin2yrs'].astype('uint8')
train_X, test_X, train_y, test_y = train_test_split(x_train,y_train,test_size=.1,random_state=2021, stratify = y_train)

### 5.1 模型选择
- 采用集成算法

#### **随机森林**

In [None]:
# Baseline: random forest with default hyper-parameters, evaluated on the hold-out split
rfc = RandomForestClassifier(random_state=2021)
rfc.fit(train_X,train_y)

pred = rfc.predict_proba(test_X)[:,1]
fpr, tpr, _ = roc_curve(test_y, pred) # false/true positive rates per threshold; the thresholds themselves are discarded
roc_auc = auc(fpr,tpr) # area under the ROC curve
plot_roc_curve(fpr,tpr)
print ('AUC Score :', roc_auc)

#### **LightGBM**

In [None]:
lgbm_clf = LGBMClassifier(is_unbalance = True) # only is_unbalance is set; every other parameter keeps its default
lgbm_clf.fit(train_X,train_y)
lgbm_clf_proba = lgbm_clf.predict_proba(test_X)[:,1]
fpr, tpr, _ = roc_curve(test_y, lgbm_clf_proba)
roc_auc = auc(fpr,tpr)
print ('AUC Score :', roc_auc)
plot_roc_curve(fpr,tpr)

### 5.2 参数优化

参考：https://www.kaggle.com/wuqiong123/givemesomecredit123/notebook


为防止过拟合，应当注意正则化参数的调优
根据lightGBM文档，当面临过拟合时，可能需要做以下参数调优:

1. 使用更小的max_bin（默认值255）
2. 使用更小的num_leaves
3. 使用 `min_data_in_leaf` 和 `min_sum_hessian_in_leaf`
4. 通过设置 `bagging_fraction` 和 `bagging_freq` 使用 bagging
5. 通过设置feature_fraction使用特征子采样
6. 使用更大的训练数据
7. 尝试 `lambda_l1`、`lambda_l2` 和 `min_gain_to_split` 进行正则化
8. 尝试max_depth以避免树的深度增长

参考：https://baijiahao.baidu.com/s?id=1677319446132263721&wfr=spider&for=pc

In [None]:
# Sweep n_estimators and record the AUC on the training and hold-out splits.
num_estimators = range(100,401,10)
train_scores, test_scores = [], []
for num_estimator in num_estimators:
    lgbm = LGBMClassifier(
        is_unbalance=True,
        num_leaves=60,
        learning_rate=0.02,
        n_estimators=num_estimator,
    )
    lgbm.fit(train_X, train_y)
    pre_y1 = lgbm.predict_proba(train_X)[:, 1]
    pre_y2 = lgbm.predict_proba(test_X)[:, 1]
    train_scores.append(roc_auc_score(train_y, pre_y1))
    test_scores.append(roc_auc_score(test_y, pre_y2))

# Hold-out AUC as a function of n_estimators
fig, ax = plt.subplots()
ax.plot(num_estimators, test_scores, label='Testing Score')
ax.set(xlabel='num', ylabel='auc', title='num_estimators')
ax.legend(loc='best')
plt.show()

In [None]:
num_estimators[test_scores.index(max(test_scores))]

In [None]:
# Sweep learning_rate over 0.005 .. 0.05 and record the AUC on the training
# and hold-out splits.
learning_rates = np.arange(1,11,1) / 200.0
train_scores, test_scores = [], []
for learning_rate in learning_rates:
    lgbm = LGBMClassifier(
        is_unbalance=True,
        num_leaves=60,
        learning_rate=learning_rate,
        n_estimators=180,
    )
    lgbm.fit(train_X, train_y)
    pre_y1 = lgbm.predict_proba(train_X)[:, 1]
    pre_y2 = lgbm.predict_proba(test_X)[:, 1]
    train_scores.append(roc_auc_score(train_y, pre_y1))
    test_scores.append(roc_auc_score(test_y, pre_y2))

# Hold-out AUC as a function of learning_rate
fig, ax = plt.subplots()
ax.plot(learning_rates, test_scores, label='Testing Score')
ax.set(xlabel='rate', ylabel='auc', title='learning_rates')
ax.legend(loc='best')
plt.show()

In [None]:
learning_rates [test_scores.index(max(test_scores))]

In [None]:
# Sweep max_bin (a smaller max_bin is used to limit over-fitting) and record
# the hold-out AUC of each setting.
# NOTE(review): the best value is picked on the same hold-out split that is
# later used to report the final AUC, so that final figure is optimistic.
max_bins = range(10,71,5)
test_scores = []
for max_bin in max_bins:
    lgbm = LGBMClassifier(is_unbalance = True,max_bin = max_bin,num_leaves=60,learning_rate=0.025,n_estimators=180)
    lgbm.fit(train_X,train_y)
    # Only the hold-out score is used below, so no prediction is made on the
    # training set.
    pre_y2 = lgbm.predict_proba(test_X)[:,1]
    test_scores.append(roc_auc_score(test_y, pre_y2))

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(max_bins,test_scores,label='Testing Score')
ax.set_xlabel(r'num')
ax.set_ylabel(r'auc')
ax.set_title('max_bins')
ax.legend(loc='best')
plt.show()

In [None]:
max_bins[test_scores.index(max(test_scores))]

In [None]:
# Sweep num_leaves and record the hold-out AUC of each setting.
num_leaves = range(10,81,5)
test_scores = []
for num_leave in num_leaves:
    lgbm = LGBMClassifier(is_unbalance = True,num_leaves=num_leave,max_bin=40,learning_rate=0.025,n_estimators=180)
    lgbm.fit(train_X,train_y)
    # Only the hold-out score is used below, so no prediction is made on the
    # training set.
    pre_y2 = lgbm.predict_proba(test_X)[:,1]
    test_scores.append(roc_auc_score(test_y, pre_y2))

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(num_leaves,test_scores,label='Testing Score')
ax.set_xlabel(r'num')
ax.set_ylabel(r'auc')
ax.set_title('num_leaves')
ax.legend(loc='best')
plt.show()

In [None]:
num_leaves[test_scores.index(max(test_scores))]

In [None]:
# Sweep max_depth and record the hold-out AUC of each setting.
max_depths = range(5,21,1)
test_scores = []
for max_depth in max_depths:
    lgbm = LGBMClassifier(is_unbalance = True,max_depth=max_depth,num_leaves=30,max_bin=40,learning_rate=0.025,n_estimators=180)
    lgbm.fit(train_X,train_y)
    # Only the hold-out score is used below, so no prediction is made on the
    # training set.
    pre_y2 = lgbm.predict_proba(test_X)[:,1]
    test_scores.append(roc_auc_score(test_y, pre_y2))

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(max_depths,test_scores,label='Testing Score')
ax.set_xlabel(r'num')
ax.set_ylabel(r'auc')
ax.set_title('max_depths')
ax.legend(loc='best')
plt.show()

In [None]:
max_depths[test_scores.index(max(test_scores))]

In [None]:
# Sweep feature_fraction over 0.1 .. 1.0 and record the hold-out AUC of each
# setting.
feature_fractions = np.arange(1,11,1)
feature_fractions = feature_fractions/10.0
test_scores = []
for feature_fraction in feature_fractions:
    lgbm = LGBMClassifier(is_unbalance = True,max_depth=14,num_leaves=30,max_bin=40,learning_rate=0.025,n_estimators=180,feature_fraction=feature_fraction)
    lgbm.fit(train_X,train_y)
    # Only the hold-out score is used below, so no prediction is made on the
    # training set.
    pre_y2 = lgbm.predict_proba(test_X)[:,1]
    test_scores.append(roc_auc_score(test_y, pre_y2))

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(feature_fractions,test_scores,label='Testing Score')
ax.set_xlabel(r'rate')
ax.set_ylabel(r'auc')
ax.set_title('feature_fractions')
ax.legend(loc='best')
plt.show()

In [None]:
feature_fractions[test_scores.index(max(test_scores))]

In [None]:
#训练模型
lgbm = LGBMClassifier(is_unbalance = True,max_depth=14,num_leaves=30,max_bin=40,learning_rate=0.025,n_estimators=180,feature_fraction=0.6)
lgbm.fit(train_X,train_y)
pre_y = lgbm.predict_proba(test_X)[:,1]

In [None]:
# Hold-out AUC of the tuned model
score = roc_auc_score(test_y, pre_y)
print(score)

# ROC curve of the tuned model with the diagonal as reference
fpr, tpr, threshold = roc_curve(test_y, pre_y)
roc_auc = auc(fpr, tpr)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, 'b', label='AUC= %0.3f' % roc_auc)
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('tpr')
ax.set_xlabel('fpr')
plt.show()

# <a name="submit">6. 提交</a>

Id & Probability

In [None]:
#模型预测
test_df_x = test_df.iloc[:,2:]
pre_y2 = lgbm.predict_proba(test_df_x)[:,1]

In [None]:
#结果导出
result = pd.read_csv('/kaggle/input/GiveMeSomeCredit/sampleEntry.csv') 
result['Probability'] = pre_y2
result.to_csv('./submit.csv',index=False)
reload = pd.read_csv('./submit.csv')
reload