# 读入数据


In [None]:
# Locate the data files: print the full path of every file found under
# the Kaggle input directory (sub-directories included).
import os

input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt

# Load the training and test sets; the first CSV column is used as the row index.
train = pd.read_csv('/kaggle/input/GiveMeSomeCredit/cs-training.csv', index_col=0)
test = pd.read_csv("/kaggle/input/GiveMeSomeCredit/cs-test.csv", index_col=0)

# Rename the columns to shorter names (the same names, in the same order, for both frames).
for df in [train, test]:
    df.columns = ["ifPastDue90", "creditPercentage", "age", "pastDue59", "debtRatio", "income", "numLoan", "pastDue90",
                "houseLoan", "pastDue89", "numDependent"]

# Stack both sets to get a combined overview.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
full = pd.concat([train, test])
# info() prints its report itself and returns None, so it is not wrapped in print().
full.info()
print(train.shape, test.shape)

# 数据清理

In [None]:
# Count the missing values per column in both sets.
print(train.isnull().sum(), '\t', test.isnull().sum())

# Original author's note: only a few numDependent values are missing in train,
# so the affected training rows are simply dropped.
train.dropna(subset=["numDependent"], inplace=True)

# Test rows cannot be dropped (a prediction is needed for every one of them),
# so missing numDependent values are filled with the column median instead.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# Series returned by attribute access is a chained assignment, which does not
# reliably update `test` (it has no effect under pandas Copy-on-Write).
test["numDependent"] = test["numDependent"].fillna(test["numDependent"].median())

print(train.isnull().sum(), '\t', test.isnull().sum())

In [None]:
# Inspect the income distribution (missing values excluded).
sns.boxplot(x=train.income.dropna().values)

# Original author's observation: income contains many extreme outliers, so the
# median (rather than the mean) is used to fill the missing values.
# Only the income column is filled. A frame-wide train.fillna(<income median>)
# would also write the income median into any other column that has missing values.
train["income"] = train["income"].fillna(train["income"].median())
test["income"] = test["income"].fillna(test["income"].median())
print(train.isnull().sum())
print(test.isnull().sum())

In [None]:
# Separate the features and the label: x_train, y_train, x_test.
# The label column is selected by name instead of by position, and both feature
# matrices are converted to NumPy arrays so that models fitted on x_train and
# applied to x_test see the same input type (fitting on a DataFrame and
# predicting on an array triggers scikit-learn's feature-name mismatch warning).
label_col = "ifPastDue90"

x_train = train.drop(columns=label_col).to_numpy()
y_train = train[label_col].values
x_test = test.drop(columns=label_col).to_numpy()
print(x_train.shape, y_train.shape, x_test.shape)

In [None]:
# Feature standardisation.
from sklearn import preprocessing

# Fit the scaler on the training features only and show the learned statistics.
train_scaler = preprocessing.StandardScaler().fit(x_train)
print( train_scaler.mean_ , '\n'+'-'*50+'\n', train_scaler.scale_)
print('='*50)
# The test set must be transformed with the statistics learned from the training
# set; fitting a second scaler on the test data would map train and test features
# onto different scales. `test_scaler` is kept as an alias of `train_scaler` so
# that code referring to it keeps working.
test_scaler = train_scaler
# For comparison only: the test set's own per-feature mean and standard deviation.
print( np.mean(x_test, axis=0) , '\n'+'-'*50+'\n', np.std(x_test, axis=0))

In [None]:
x_train_scaled = train_scaler.transform(x_train)
# Scale the test features with the scaler fitted on the training data, so both
# sets go through exactly the same transformation.
x_test_scaled = train_scaler.transform(x_test)

# Sanity check (notebook display): per-feature mean and standard deviation after
# scaling. The test-set values are only approximately 0 and 1 because the
# training statistics are used.
x_train_scaled.mean(axis=0), x_train_scaled.std(axis=0),  x_test_scaled.mean(axis=0), x_test_scaled.std(axis=0)


# 模型比较

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score  # ROC AUC is the evaluation metric used throughout

# Collects one [model description, validation AUC] entry per model for the final comparison.
comparison = list()

# Split the scaled training data into a "learn" part (for fitting) and a
# "valid" part (for scoring); random_state is fixed so the split is reproducible.
learn_valid_parts = train_test_split(x_train_scaled, y_train, random_state=0)
x_learn, x_valid, y_learn, y_valid = learn_valid_parts

In [None]:
# Logistic regression
from sklearn.linear_model import LogisticRegressionCV

# Try several solvers; keep the one with the highest validation AUC.
best_solver, best_auc = 'none', 0
for solver_name in ('newton-cg', 'lbfgs', 'liblinear'):
    # The internal cross-validation is scored with ROC AUC as well.
    model = LogisticRegressionCV(scoring='roc_auc', solver=solver_name)
    model.fit(x_learn, y_learn)
    valid_proba = model.predict_proba(x_valid)
    score = roc_auc_score(y_valid, valid_proba[:, 1])
    print(solver_name, score)
    if score > best_auc:
        best_solver, best_auc = solver_name, score

arg, maxauc = best_solver, best_auc
print()
print(arg, maxauc)
# Original author's observation: logistic regression does not perform very well here.
comparison.append(['LogisticRegressionCV_'+arg, maxauc])

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
import itertools

# Every weights/algorithm combination to try. The product is materialised as a
# list so that printing it shows the actual pairs rather than the repr of an
# itertools.product object.
arglist = list(itertools.product(['uniform', 'distance'], ['auto', 'ball_tree', 'kd_tree', 'brute']))
print(arglist, end='\n*******\n')

# Keep the combination with the highest validation AUC.
weight, algorithm, maxauc = '', '', 0
for wei, algo in arglist:
    knn = KNeighborsClassifier(weights=wei, algorithm=algo)
    knn.fit(x_learn, y_learn)
    y_pred = knn.predict_proba(x_valid)[:, 1]
    score = roc_auc_score(y_valid, y_pred)
    print(wei, algo, score)
    if score > maxauc:
        weight, algorithm, maxauc = wei, algo, score
print()
print(weight, algorithm, maxauc)

# Original author's observations: KNN scores slightly below logistic regression,
# i.e. also not very well; 'distance' + 'brute' was the best combination.

In [None]:
# Coarse search over n_neighbors (100, 200, ..., 1000) using the weights and
# algorithm selected above; every validation AUC is recorded in `scorelist`.
n_neighbors, maxauc = -1, 0
scorelist = []
for k in range(100, 1000 + 1, 100):
    knn = KNeighborsClassifier(n_neighbors=k, weights=weight, algorithm=algorithm)
    knn.fit(x_learn, y_learn)
    valid_proba = knn.predict_proba(x_valid)
    score = roc_auc_score(y_valid, valid_proba[:, 1])
    scorelist.append(score)
    print(k, score)

    # Remember the k with the highest AUC seen so far.
    if score > maxauc:
        n_neighbors, maxauc = k, score

print()
print(n_neighbors, maxauc)


In [None]:
# Plot the validation AUC against each n_neighbors value of the coarse search.

coarse_k_values = range(100, 1000+1, 100)
plt.plot(coarse_k_values, scorelist)
plt.xlabel('n_neighbors')
plt.title('score - n_neighbors')
plt.ylabel('score(AUC)')

# Original author's observation: the AUC peaks at around k=300.


In [None]:
# Refine the search: use a smaller step (10) over k = 250..330 to narrow down
# the best n_neighbors.
n_neighbors, maxauc = -1, 0
scorelist = []
for k in range(250, 330 + 1, 10):
    knn = KNeighborsClassifier(n_neighbors=k, weights=weight, algorithm=algorithm)
    knn.fit(x_learn, y_learn)
    valid_proba = knn.predict_proba(x_valid)
    score = roc_auc_score(y_valid, valid_proba[:, 1])
    scorelist.append(score)
    print(k, score)
    if score > maxauc:
        n_neighbors, maxauc = k, score

print()
print(n_neighbors, maxauc)

# Record the best KNN configuration for the final model comparison.
knn_label = 'KNN_{}_{}_{}'.format(weight, algorithm, n_neighbors)
comparison.append([knn_label, maxauc])

# Original author's observation: the score is best at k=260.

In [None]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Fit on the learn split and score the positive-class probabilities on the valid split.
gaussian = GaussianNB()
gaussian.fit(x_learn, y_learn)
valid_proba = gaussian.predict_proba(x_valid)
y_pred = valid_proba[:, 1]
score = roc_auc_score(y_valid, y_pred)
print(score)

comparison.append(['GaussianNB', score])

In [None]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Cross-validated grid search over the split criterion, scored with ROC AUC.
criterion_grid = {'criterion': ['gini', 'entropy']}
grid = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=criterion_grid,
    scoring='roc_auc',
    verbose=3,
)
grid.fit(x_learn, y_learn)

# Dump every entry of the cross-validation results, then the winning criterion.
for key, values in grid.cv_results_.items():
    print(key, values)
best_criterion = grid.best_params_['criterion']
print(best_criterion)

# Fit a tree with the best criterion on the learn split and score it on the valid split.
tree = DecisionTreeClassifier(criterion=best_criterion)
tree.fit(x_learn, y_learn)
valid_proba = tree.predict_proba(x_valid)
y_pred = valid_proba[:, 1]
score = roc_auc_score(y_valid, y_pred)
print(score)
comparison.append(['DecisionTreeClassifier_{}'.format(best_criterion), score])

In [None]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

# Cross-validated grid search over the number of trees, scored with ROC AUC.
tree_count_grid = {'n_estimators': [30, 50, 80, 100, 200]}
grid = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=tree_count_grid,
    scoring='roc_auc',
    verbose=3,
)
grid.fit(x_learn, y_learn)

# Dump every entry of the cross-validation results, then the winning tree count.
for key, values in grid.cv_results_.items():
    print(key, values)
best_n_estimators = grid.best_params_['n_estimators']
print(best_n_estimators)

# Fit a forest with the best tree count on the learn split and score it on the valid split.
rfc = RandomForestClassifier(n_estimators=best_n_estimators)
rfc.fit(x_learn, y_learn)
valid_proba = rfc.predict_proba(x_valid)
y_pred = valid_proba[:, 1]
score = roc_auc_score(y_valid, y_pred)
print(score)

comparison.append(['RandomForestClassifier_{}'.format(best_n_estimators), score])

In [None]:
comparison # Display the collected [model description, AUC] entries to compare the models

In [None]:
# Original author's observation: the random forest performs best on this data
# set (AUC of about 0.84), well ahead of the other models.
# Tune it more finely over both n_estimators and criterion.

refined_grid = {
    'n_estimators': np.arange(180, 230, 10),
    'criterion': ['gini', 'entropy'],
}
grid = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=refined_grid,
    scoring='roc_auc',
    verbose=3,
)
grid.fit(x_learn, y_learn)

# Dump every entry of the cross-validation results.
for key, values in grid.cv_results_.items():
    print(key, values)

In [None]:
# Score a forest built with the tuned hyperparameters on the validation split.
tuned_params = grid.best_params_
rfc = RandomForestClassifier(
    n_estimators=tuned_params['n_estimators'],
    criterion=tuned_params['criterion'],
)
rfc.fit(x_learn, y_learn)
valid_proba = rfc.predict_proba(x_valid)
y_pred = valid_proba[:, 1]
score = roc_auc_score(y_valid, y_pred)
print(score)

# 生成结果

In [None]:
# Final run: train a random forest with the tuned hyperparameters on the whole
# training set (the unscaled x_train / y_train), predict the positive-class
# probability for x_test, and write the submission file.
tuned_params = grid.best_params_
rfc1 = RandomForestClassifier(
    n_estimators=tuned_params['n_estimators'],
    criterion=tuned_params['criterion'],
)
rfc1.fit(x_train, y_train)
test_proba = rfc1.predict_proba(x_test)
y_pred = test_proba[:, 1]

# Start from the sample submission and overwrite its Probability column.
sample = pd.read_csv('/kaggle/input/GiveMeSomeCredit/sampleEntry.csv')
sample = sample.assign(Probability=y_pred)
sample.to_csv('submission.csv', index=False)