In [None]:
#导入包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,roc_curve, auc
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMClassifier

In [None]:
# Load the training data
train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/GiveMeSomeCredit/cs-training.csv',  # training set
)

# Get to know the training set: print an overview of the frame
train.info()


In [None]:
# Box plots: one panel per column, skipping the first two columns of the frame
plt.figure(figsize=[10, 18])
for panel, col in enumerate(train.columns[2:], start=1):
    # Panels are laid out on a 5x2 grid; subplot numbering starts at 1
    plt.subplot(5, 2, panel)
    sns.boxplot(data=train[col])
    plt.ylabel(col)

In [None]:
# Outlier handling
# Keep only rows with a positive age. The explicit .copy() makes `train` an
# independent frame, so the .loc assignments below write to it directly instead
# of raising pandas' SettingWithCopyWarning on a filtered slice.
train = train[train['age'] > 0].copy()

# Past-due counters greater than 20 are overwritten with a fixed value per column.
# NOTE(review): the replacement values (6, 2, 3) are hard-coded; how they were
# chosen is not visible here — confirm.
past_due_replacements = {
    'NumberOfTime30-59DaysPastDueNotWorse': 6,
    'NumberOfTimes90DaysLate': 2,
    'NumberOfTime60-89DaysPastDueNotWorse': 3,
}
for col, replacement in past_due_replacements.items():
    train.loc[train[col] > 20, col] = replacement

In [None]:
# Histograms of the training-frame columns, drawn on a 20x15 figure
train.hist(figsize=(20,15))

In [None]:
# Missing-value handling
# Fill missing NumberOfDependents with the most frequent value. The filled column is
# written back through assign() instead of calling fillna(inplace=True) on the selected
# column: the chained in-place form acts on an intermediate object, is deprecated in
# pandas 2.x and has no effect once copy-on-write is enabled.
train = train.assign(
    NumberOfDependents=train["NumberOfDependents"].fillna(train["NumberOfDependents"].mode()[0])
)

# Impute missing MonthlyIncome with a random forest regressor fitted on the rows where
# the income is known. The predictors are every column from position 2 onwards except
# MonthlyIncome itself (selected by name rather than by a hard-coded position list).
income_features = [c for c in train.columns[2:] if c != 'MonthlyIncome']
income_missing = train['MonthlyIncome'].isnull()

# The regressor's inputs are passed inline rather than bound to train_X / train_y,
# names that the later classifier split uses for different data.
rfr = RandomForestRegressor(random_state=0, n_estimators=200, max_depth=3, n_jobs=-1)
rfr.fit(
    train.loc[~income_missing, income_features].values,
    train.loc[~income_missing, 'MonthlyIncome'].values,
)

# Only predict when there is something to impute; predict() rejects an empty array.
if income_missing.any():
    predicted_y = rfr.predict(train.loc[income_missing, income_features].values).round(0)
    train.loc[income_missing, 'MonthlyIncome'] = predicted_y


In [None]:
# train_2 is a second name bound to the same DataFrame object as train (no copy is made)
train_2 = train
# Summary statistics of the cleaned training frame
train_2.describe()

In [None]:
# Split the training data
# The column at position 1 is used as the target; columns from position 2 onwards
# are the features. 10% of the rows are held out, stratified on the target.
feature_frame = train_2.iloc[:, 2:]
target = train_2.iloc[:, 1]
train_X, test_X, train_y, test_y = train_test_split(
    feature_frame,
    target,
    test_size=0.1,
    random_state=20,
    stratify=target,
)

In [None]:
# Distribution of age: density-normalised histogram with a KDE curve on top.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is the
# documented replacement for the histogram-plus-KDE view.
age = train['age']
sns.histplot(age, kde=True, stat='density')


In [None]:
# Distribution of MonthlyIncome: density-normalised histogram with a KDE curve on top.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is the
# documented replacement for the histogram-plus-KDE view.
mi = train[['MonthlyIncome']]
# Pass the column as a Series so histplot treats it as a single variable rather than
# as wide-form data.
sns.histplot(mi['MonthlyIncome'], kde=True, stat='density')

In [None]:
#参数调节

# Sweep n_estimators: fit one LGBMClassifier per candidate value and record the
# ROC AUC obtained on the hold-out split (test_X / test_y).
# NOTE(review): candidates are scored on test_X / test_y; if that same split is also
# used to report the final AUC, the reported figure will be optimistic — consider a
# separate validation split or cross-validation.
num_estimators = range(100, 401, 10)
test_scores = []
for num_estimator in num_estimators:
    lgbm = LGBMClassifier(num_leaves=60, learning_rate=0.02, n_estimators=num_estimator)
    lgbm.fit(train_X, train_y)
    # Only the hold-out AUC is plotted, so the training set is not scored here.
    pre_y2 = lgbm.predict_proba(test_X)[:, 1]
    test_scores.append(roc_auc_score(test_y, pre_y2))

# Hold-out AUC as a function of the number of estimators
fig, ax = plt.subplots()
ax.plot(num_estimators, test_scores, label='Testing Score')
ax.set_xlabel(r'num')
ax.set_ylabel(r'auc')
ax.set_title('num_estimators')
ax.legend(loc='best')
plt.show()


In [None]:
# Sweep learning_rate over 0.005, 0.010, ..., 0.050: fit one LGBMClassifier per
# candidate value and record the ROC AUC obtained on the hold-out split (test_X / test_y).
# NOTE(review): candidates are scored on test_X / test_y; if that same split is also
# used to report the final AUC, the reported figure will be optimistic — consider a
# separate validation split or cross-validation.
learning_rates = np.arange(1, 11, 1) / 200.0
test_scores = []
for learning_rate in learning_rates:
    lgbm = LGBMClassifier(num_leaves=60, learning_rate=learning_rate, n_estimators=250)
    lgbm.fit(train_X, train_y)
    # Only the hold-out AUC is plotted, so the training set is not scored here.
    pre_y2 = lgbm.predict_proba(test_X)[:, 1]
    test_scores.append(roc_auc_score(test_y, pre_y2))

# Hold-out AUC as a function of the learning rate
fig, ax = plt.subplots()
ax.plot(learning_rates, test_scores, label='Testing Score')
ax.set_xlabel(r'rate')
ax.set_ylabel(r'auc')
ax.set_title('learning_rates')
ax.legend(loc='best')
plt.show()

In [None]:
# Sweep num_leaves over 10, 15, ..., 50: fit one LGBMClassifier per candidate value
# and record the ROC AUC obtained on the hold-out split (test_X / test_y).
# NOTE(review): candidates are scored on test_X / test_y; if that same split is also
# used to report the final AUC, the reported figure will be optimistic — consider a
# separate validation split or cross-validation.
num_leaves = range(10, 51, 5)
test_scores = []
for num_leave in num_leaves:
    lgbm = LGBMClassifier(num_leaves=num_leave, learning_rate=0.02, n_estimators=250)
    lgbm.fit(train_X, train_y)
    # Only the hold-out AUC is plotted, so the training set is not scored here.
    pre_y2 = lgbm.predict_proba(test_X)[:, 1]
    test_scores.append(roc_auc_score(test_y, pre_y2))

# Hold-out AUC as a function of the number of leaves
fig, ax = plt.subplots()
ax.plot(num_leaves, test_scores, label='Testing Score')
ax.set_xlabel(r'num')
ax.set_ylabel(r'auc')
ax.set_title('num_leaves')
ax.legend(loc='best')
plt.show()

In [None]:
# Sweep max_depth over 10..30: fit one LGBMClassifier per candidate value and record
# the ROC AUC obtained on the hold-out split (test_X / test_y).
# NOTE(review): candidates are scored on test_X / test_y; if that same split is also
# used to report the final AUC, the reported figure will be optimistic — consider a
# separate validation split or cross-validation.
max_depths = range(10, 31, 1)
test_scores = []
for max_depth in max_depths:
    lgbm = LGBMClassifier(max_depth=max_depth, num_leaves=30, learning_rate=0.02, n_estimators=250)
    lgbm.fit(train_X, train_y)
    # Only the hold-out AUC is plotted, so the training set is not scored here.
    pre_y2 = lgbm.predict_proba(test_X)[:, 1]
    test_scores.append(roc_auc_score(test_y, pre_y2))

# Hold-out AUC as a function of the maximum tree depth
fig, ax = plt.subplots()
ax.plot(max_depths, test_scores, label='Testing Score')
ax.set_xlabel(r'num')
ax.set_ylabel(r'auc')
ax.set_title('max_depths')
ax.legend(loc='best')
plt.show()

In [None]:
# Sweep feature_fraction over 0.1, 0.2, ..., 1.0: fit one LGBMClassifier per candidate
# value and record the ROC AUC obtained on the hold-out split (test_X / test_y).
# NOTE(review): candidates are scored on test_X / test_y; if that same split is also
# used to report the final AUC, the reported figure will be optimistic — consider a
# separate validation split or cross-validation.
# NOTE(review): feature_fraction is passed as an extra keyword; LightGBM documents it as
# an alias of the wrapper's colsample_bytree parameter — confirm against the installed version.
feature_fractions = np.arange(1, 11, 1) / 10.0
test_scores = []
for feature_fraction in feature_fractions:
    lgbm = LGBMClassifier(
        max_depth=20,
        num_leaves=30,
        learning_rate=0.02,
        n_estimators=275,
        feature_fraction=feature_fraction,
    )
    lgbm.fit(train_X, train_y)
    # Only the hold-out AUC is plotted, so the training set is not scored here.
    pre_y2 = lgbm.predict_proba(test_X)[:, 1]
    test_scores.append(roc_auc_score(test_y, pre_y2))

# Hold-out AUC as a function of the feature fraction
fig, ax = plt.subplots()
ax.plot(feature_fractions, test_scores, label='Testing Score')
ax.set_xlabel(r'rate')
ax.set_ylabel(r'auc')
ax.set_title('feature_fractions')
ax.legend(loc='best')
plt.show()

In [None]:
# Train the model with the chosen hyper-parameters
final_params = dict(
    max_depth=20,
    num_leaves=30,
    learning_rate=0.02,
    n_estimators=270,
    feature_fraction=0.7,
)
lgbm = LGBMClassifier(**final_params)
lgbm.fit(train_X, train_y)

# Keep the second column of predict_proba (index 1) for every hold-out row
hold_out_proba = lgbm.predict_proba(test_X)
pre_y = hold_out_proba[:, 1]

In [None]:
# Show the score: ROC AUC of the hold-out predictions
score = roc_auc_score(test_y, pre_y)
print(score)

# roc_curve returns the false positive rate (fpr), the true positive rate (tpr) and the
# thresholds; scores above a threshold are treated as 1, i.e. a bad customer
fpr, tpr, threshold = roc_curve(test_y, pre_y)
roc_auc = auc(fpr, tpr)  # area under the ROC curve

fig, ax = plt.subplots()
ax.plot(fpr, tpr, 'b', label='AUC= %0.3f' % roc_auc)  # the ROC curve itself
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('tpr')
ax.set_xlabel('fpr')
plt.show()

In [None]:
# Load the test data
test = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-test.csv')  # test set
test.info()

# Outlier handling: past-due counters greater than 20 are overwritten with a fixed
# value per column
for column, replacement in (
    ('NumberOfTime30-59DaysPastDueNotWorse', 6),
    ('NumberOfTimes90DaysLate', 2),
    ('NumberOfTime60-89DaysPastDueNotWorse', 3),
):
    test.loc[test[column] > 20, column] = replacement

test.describe()

In [None]:
# Missing-value handling for the test set
# Fill missing NumberOfDependents with the mode of the training frame. The filled column
# is assigned back instead of calling fillna(inplace=True) on the selected column: the
# chained in-place form acts on an intermediate object, is deprecated in pandas 2.x and
# has no effect once copy-on-write is enabled.
test["NumberOfDependents"] = test["NumberOfDependents"].fillna(train["NumberOfDependents"].mode()[0])

# Impute missing MonthlyIncome with `rfr`, the regressor already fitted on the training
# frame, so that train and test incomes are imputed by the same model (fitting a second
# regressor on the test rows would give the classifier differently-imputed inputs).
# The predictors are every column from position 2 onwards except MonthlyIncome itself,
# which assumes the test frame has the same column layout as the training frame.
test_income_features = [c for c in test.columns[2:] if c != 'MonthlyIncome']
test_income_missing = test['MonthlyIncome'].isnull()

# Only predict when there is something to impute; predict() rejects an empty array.
if test_income_missing.any():
    predicted_y = rfr.predict(test.loc[test_income_missing, test_income_features].values).round(0)
    test.loc[test_income_missing, 'MonthlyIncome'] = predicted_y

In [None]:
# test_2 is a second name bound to the same DataFrame object as test (no copy is made)
test_2 = test
# Summary statistics of the cleaned test frame
test_2.describe()

In [None]:
# Model prediction
# Features are the columns from position 2 onwards; keep the second column of
# predict_proba (index 1) for every row
test2 = test.iloc[:, 2:]
test_proba = lgbm.predict_proba(test2)
pre_y2 = test_proba[:, 1]

In [None]:
# Export the results
# Start from the sample submission file and set its Probability column to the predictions
result = pd.read_csv('/kaggle/input/GiveMeSomeCredit/sampleEntry.csv').assign(
    Probability=pre_y2
)
result.to_csv('./submit.csv', index=False)

# Read the written file back and display it
reload = pd.read_csv('./submit.csv')
reload