In [21]:
# 导入库
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from IPython.display import Image
import pydotplus
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV


# 导入数据
data = pd.read_csv('loan_data.txt',sep='\s+',encoding='utf-8',index_col='nameid')
print(data)
x = data.drop(['approve'],axis=1).values
print(x)
y = data['approve'].values
print(x.shape,y.shape)


        profession  education  house_loan  car_loan  married  child  revenue  \
nameid                                                                         
1                5          1           0         0        1      1     8204   
2                3          1           1         1        0      0     5674   
3                2          3           1         0        1      0    10634   
4                2          2           0         0        0      0    43551   
5                4          2           0         1        0      1    14065   
...            ...        ...         ...       ...      ...    ...      ...   
996              3          2           1         1        0      0    30535   
997              3          5           0         0        0      0    34315   
998              4          2           1         1        0      1    15509   
999              4          1           0         0        0      0    33619   
1000             5          4           

In [22]:
# 划分训练集和测试集
x1 = x[:900]
y1 = y[:900]
x2 = x[900:]
y2 = y[900:]

# 在训练集中再划分出训练集和验证集
x_train,x_test,y_train,y_test = train_test_split(x1,y1,test_size=0.2)

In [23]:
# 生成决策树
clf = DecisionTreeClassifier()
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
print('训练集评分:', clf.score(x_train,y_train))
print('验证集评分:', clf.score(x_test,y_test))
print('测试集评分', clf.score(x2,y2))
print("查准率：", metrics.precision_score(y_test,y_pred))
print('召回率:',metrics.recall_score(y_test,y_pred))
print('f1分数:', metrics.f1_score(y_test,y_pred))


训练集评分: 1.0
验证集评分: 0.7611111111111111
测试集评分 0.74
查准率： 0.8434782608695652
召回率: 0.7950819672131147
f1分数: 0.818565400843882


In [24]:
# 混淆矩阵查看分类结果
print(confusion_matrix(y_true=y_test,y_pred=y_pred,labels=list(set(y))))
# 分类报告查看各类的评分
print(metrics.classification_report(y_test,y_pred,labels=list(set(y))))


[[40 18]
 [25 97]]
              precision    recall  f1-score   support

           0       0.62      0.69      0.65        58
           1       0.84      0.80      0.82       122

    accuracy                           0.76       180
   macro avg       0.73      0.74      0.73       180
weighted avg       0.77      0.76      0.76       180



In [25]:
# 混淆矩阵查看分类结果
print(confusion_matrix(y_true=y_test,y_pred=y_pred,labels=list(set(y))))
# 分类报告查看各类的评分
print(metrics.classification_report(y_test,y_pred,labels=list(set(y))))

[[40 18]
 [25 97]]
              precision    recall  f1-score   support

           0       0.62      0.69      0.65        58
           1       0.84      0.80      0.82       122

    accuracy                           0.76       180
   macro avg       0.73      0.74      0.73       180
weighted avg       0.77      0.76      0.76       180



In [26]:
param = {'max_depth': [5,10,20],'min_samples_leaf': np.arange(3,10,1),'min_impurity_split':np.linspace(0.1,0.6,10),}
clf = GridSearchCV(DecisionTreeClassifier(),param_grid=param,cv=8)
clf.fit(x_train,y_train)
print(clf.best_params_,clf.best_score_)




{'max_depth': 5, 'min_impurity_split': 0.1, 'min_samples_leaf': 9} 0.8263888888888888




In [27]:
clf = DecisionTreeClassifier(max_depth=5,min_samples_split=5,min_impurity_split=0.37)
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
print('训练集评分:', clf.score(x_train,y_train))
print('验证集评分:', clf.score(x_test,y_test))
print('测试集评分', clf.score(x2,y2))
print("查准率：", metrics.precision_score(y_test,y_pred))
print('召回率:',metrics.recall_score(y_test,y_pred))
print('f1分数:', metrics.f1_score(y_test,y_pred))


训练集评分: 0.8277777777777777
验证集评分: 0.8555555555555555
测试集评分 1.0
查准率： 0.8287671232876712
召回率: 0.9918032786885246
f1分数: 0.9029850746268656




In [28]:
print(confusion_matrix(y_true=y_test,y_pred=y_pred,labels=list(set(y))))
print(metrics.classification_report(y_test,y_pred,labels=list(set(y))))


[[ 33  25]
 [  1 121]]
              precision    recall  f1-score   support

           0       0.97      0.57      0.72        58
           1       0.83      0.99      0.90       122

    accuracy                           0.86       180
   macro avg       0.90      0.78      0.81       180
weighted avg       0.87      0.86      0.84       180



In [29]:
import os     
os.environ["PATH"] += os.pathsep + 'D:\\软件\\Graphviz\\bin\\'
dot_data = tree.export_graphviz(clf, out_file=None,
                         feature_names=data.columns[:-1],
                         class_names=data.columns[-1],
                         filled=True, rounded=True,
                         special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("./银行借贷模型.pdf")


True