In [1]:
import matplotlib.pyplot as plt
import matplotlib as mpl

#使用中文字体
mpl.rcParams['font.family']=['Microsoft Yahei','sans-serif']
mpl.rcParams['axes.unicode_minus']=False

#使用svg格式，避免图形模糊
%matplotlib inline
%config InlineBackend.figure_format="svg"

In [2]:
import pandas as pd
# Load the data set, using its "no" column as the row index, then preview the first rows.
df = pd.read_csv("default.csv", index_col="no")
df.head()

Unnamed: 0_level_0,default,student,balance,income
no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,No,No,729.526495,44361.625074
2,No,Yes,817.180407,12106.1347
3,No,No,1073.549164,31767.138947
4,No,No,529.250605,35704.493935
5,No,No,785.655883,38463.495879


In [3]:
# Recode the `default` column as a 0/1 indicator (1 means 'Yes').
# Matching both 'Yes' and 1 keeps this cell idempotent: comparing against 'Yes'
# alone would silently turn every label into 0 if the cell were run a second time,
# because the column no longer contains strings after the first run.
isyes = df["default"].isin(["Yes", 1])
df["default"] = isyes.astype(int)
# Count the rows in each class.
df.groupby("default").count()

Unnamed: 0_level_0,student,balance,income
default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9667,9667,9667
1,333,333,333


In [4]:
# Recode the `student` column as a 0/1 indicator (1 means 'Yes').
# Matching both 'Yes' and 1 keeps this cell idempotent: comparing against 'Yes'
# alone would silently turn every value into 0 if the cell were run a second time,
# because the column no longer contains strings after the first run.
isyes = df["student"].isin(["Yes", 1])
df["student"] = isyes.astype(int)
# Count the rows in each group.
df.groupby("student").count()

Unnamed: 0_level_0,default,balance,income
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,7056,7056,7056
1,2944,2944,2944


In [5]:
# Show the frame after `default` and `student` were recoded to 0/1.
df

Unnamed: 0_level_0,default,student,balance,income
no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0,729.526495,44361.625074
2,0,1,817.180407,12106.134700
3,0,0,1073.549164,31767.138947
4,0,0,529.250605,35704.493935
5,0,0,785.655883,38463.495879
...,...,...,...,...
9996,0,0,711.555020,52992.378914
9997,0,0,757.962918,19660.721768
9998,0,0,845.411989,58636.156984
9999,0,0,1569.009053,36669.112365


In [6]:
# Split df into the feature matrix (x) and the label column (y).
x = df.loc[:, ["student", "balance", "income"]]
y = df.loc[:, "default"]

In [None]:
# Split the data into a training set (70%) and a validation set (30%).
from sklearn.model_selection import train_test_split

# random_state pins the shuffle so that Restart & Run All reproduces the same split
# and therefore the same metrics. stratify=y keeps the share of the rare positive
# class (333 of 10000 rows) the same in both parts.
train_x, val_x, train_y, val_y = train_test_split(
    x, y, train_size=0.7, random_state=42, stratify=y)

In [None]:
# Display the training features produced by the split.
train_x

In [None]:
# Display the training labels produced by the split.
train_y

In [None]:
# Fit the model: logistic regression with no penalty term, trained on the training split.
# NOTE(review): balance and income are on a much larger scale than the 0/1 student
# flag; if the solver emits a ConvergenceWarning here, raise max_iter or standardise
# the features first - to be confirmed by running the cell.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty=None).fit(train_x,train_y)

In [None]:
# Predict the class probabilities of every validation sample.
pred_prob = model.predict_proba(val_x)
pred_prob

In [None]:
# Only the second column of predict_proba is needed: the probability of class 1.
# It is recomputed from the model instead of slicing pred_prob in place, so the cell
# can be re-run safely; slicing an already 1-D pred_prob again raises IndexError.
pred_prob = model.predict_proba(val_x)[:, 1]
pred_prob

In [None]:
# Predict the class label of every validation sample.
# predict() does not tune a threshold: for a binary model it returns the class with
# the larger predicted probability, i.e. a fixed 0.5 cut-off on the positive class.
pred_y = model.predict(val_x)
pred_y

In [None]:
# Compute the model's metrics on the validation set, returned as a dict.
from sklearn.metrics import classification_report
classification_report(
    y_true=val_y,
    y_pred=pred_y,
    output_dict=True,
)

In [None]:
#比较引入不同惩罚项的回归模型
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
def calcScore(panelty):
    """Fit a logistic regression with the given penalty and report validation metrics.

    The model is trained on the notebook-level ``train_x`` / ``train_y`` and
    scored on ``val_x`` / ``val_y``.

    Args:
        panelty: Value forwarded to ``LogisticRegression(penalty=...)``, e.g.
            ``None`` or ``'l2'``. The misspelled parameter name is kept so that
            existing callers keep working.

    Returns:
        dict with the keys ``"f1"``, ``"acc"``, ``"p"`` and ``"r"`` holding the
        F1, accuracy, precision and recall values that are also printed.
    """
    model = LogisticRegression(penalty=panelty).fit(train_x, train_y)
    pred_y = model.predict(val_x)
    scores = {
        "f1": f1_score(val_y, pred_y),
        "acc": accuracy_score(val_y, pred_y),
        "p": precision_score(val_y, pred_y),
        "r": recall_score(val_y, pred_y),
    }
    # The printed label used to read "Panelty"; the typo is fixed in the output text.
    print(f"Penalty {panelty}: F1={scores['f1']}, acc={scores['acc']}, "
          f"p={scores['p']}, r={scores['r']}")
    # Returning the values lets callers tabulate the comparison instead of
    # parsing printed text.
    return scores


In [None]:
# Score the unpenalised model and the L2-penalised model one after the other.
for penalty_option in (None, 'l2'):
    calcScore(penalty_option)