In [1]:
import matplotlib.pyplot as plt
import matplotlib as mpl

#使用中文字体
mpl.rcParams['font.family']=['Microsoft Yahei','sans-serif']
mpl.rcParams['axes.unicode_minus']=False

#使用svg格式，避免图形模糊
%matplotlib inline
%config InlineBackend.figure_format="svg"

In [2]:
import pandas as pd
df = pd.read_csv("default.csv", index_col="no")
df.head()

Unnamed: 0_level_0,default,student,balance,income
no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,No,No,729.526495,44361.625074
2,No,Yes,817.180407,12106.1347
3,No,No,1073.549164,31767.138947
4,No,No,529.250605,35704.493935
5,No,No,785.655883,38463.495879


In [3]:
# 将default列转换为0-1变量
isyes = (df["default"] == 'Yes')
df["default"] = isyes.astype(int)
df.groupby("default").count()

Unnamed: 0_level_0,student,balance,income
default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9667,9667,9667
1,333,333,333


In [4]:
# 将student列转换为0-1变量
isyes = (df["student"] == 'Yes')
df["student"] = isyes.astype(int)
df.groupby("student").count()

Unnamed: 0_level_0,default,balance,income
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,7056,7056,7056
1,2944,2944,2944


In [5]:
df

Unnamed: 0_level_0,default,student,balance,income
no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0,729.526495,44361.625074
2,0,1,817.180407,12106.134700
3,0,0,1073.549164,31767.138947
4,0,0,529.250605,35704.493935
5,0,0,785.655883,38463.495879
...,...,...,...,...
9996,0,0,711.555020,52992.378914
9997,0,0,757.962918,19660.721768
9998,0,0,845.411989,58636.156984
9999,0,0,1569.009053,36669.112365


In [6]:
#将df分成x(属性矩阵)和y(标签列)两部分
x = df[["student","balance","income"]]
y = df["default"]

In [7]:
#将数据分为训练集和验证集
from sklearn.model_selection import train_test_split

train_x,val_x,train_y,val_y=train_test_split(
    x,y,train_size=0.7)

In [10]:
#拟合模型
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty=None).fit(train_x,train_y)

In [11]:
#计算验证集中各样本为正例的概率
pred_prob = model.predict_proba(val_x)
pred_prob

array([[9.98034681e-01, 1.96531875e-03],
       [9.47005073e-01, 5.29949275e-02],
       [8.04624182e-01, 1.95375818e-01],
       ...,
       [9.94281395e-01, 5.71860505e-03],
       [9.93809497e-01, 6.19050305e-03],
       [9.99823398e-01, 1.76601686e-04]])

In [12]:
#我们仅需要第2列
pred_prob = pred_prob[:,1]
pred_prob

array([1.96531875e-03, 5.29949275e-02, 1.95375818e-01, ...,
       5.71860505e-03, 6.19050305e-03, 1.76601686e-04])

In [13]:
#判断验证集中各样本是否为正例（scikit-learn使用0.5作为阈值）
pred_y = model.predict(val_x)
pred_y

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
# 计算模型的指标值
from sklearn.metrics import classification_report
classification_report(val_y,pred_y, output_dict=True)

{'0': {'precision': 0.9763433592429875,
  'recall': 0.9958634953464323,
  'f1-score': 0.9860068259385666,
  'support': 2901.0},
 '1': {'precision': 0.7073170731707317,
  'recall': 0.29292929292929293,
  'f1-score': 0.4142857142857143,
  'support': 99.0},
 'accuracy': 0.9726666666666667,
 'macro avg': {'precision': 0.8418302162068596,
  'recall': 0.6443963941378625,
  'f1-score': 0.7001462701121405,
  'support': 3000.0},
 'weighted avg': {'precision': 0.9674654918026031,
  'recall': 0.9726666666666667,
  'f1-score': 0.9671400292540224,
  'support': 3000.0}}