In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
# Load the example dataset from the CSV file and display it
data = pd.read_csv(filepath_or_buffer="smoke.csv")
data

Unnamed: 0,age,gender,smoking,cancer
0,66.2,female,no,0
1,43.9,male,no,0
2,44.7,male,no,1
3,39.3,female,no,0
4,58.7,male,yes,1
5,27.0,male,no,1
6,67.4,female,no,0
7,42.4,male,yes,1
8,53.2,female,no,0
9,47.5,female,no,0


In [3]:
# Encode the categorical columns "gender" and "smoking" of `data` as integer
# dummy (indicator) variables. drop_first=True omits the first category of
# each variable, which therefore acts as the reference level.
logistics_data = pd.get_dummies(
    data,
    columns=["gender", "smoking"],
    drop_first=True,
    dtype=int,
)
logistics_data

Unnamed: 0,age,cancer,gender_male,smoking_yes
0,66.2,0,0,0
1,43.9,0,1,0
2,44.7,1,1,0
3,39.3,0,0,0
4,58.7,1,1,1
5,27.0,1,1,0
6,67.4,0,0,0
7,42.4,1,1,1
8,53.2,0,0,0
9,47.5,0,0,0


In [4]:
# Multicollinearity screen: flag every pairwise correlation whose absolute
# value is greater than 0.8.
# NOTE: `logistics_data` still contains the dependent variable `cancer`, so
# its row and column appear in this matrix too; only the entries between
# predictors matter for multicollinearity.
logistics_data.corr().abs().gt(0.8)

Unnamed: 0,age,cancer,gender_male,smoking_yes
age,True,False,False,False
cancer,False,True,False,False
gender_male,False,False,True,False
smoking_yes,False,False,False,True


In [5]:
# Dependent variable for the logistic regression
y = logistics_data['cancer']
# Independent variables: every remaining column, plus a constant column
# added by add_constant so that the model includes an intercept
X = sm.add_constant(logistics_data.drop(columns='cancer'))

In [6]:
# Build the logistic-regression model from y and X, then fit it to the data
logit_model = sm.Logit(y, X)
result = logit_model.fit()
# Display the summary table of the fit
result.summary()

Optimization terminated successfully.
         Current function value: 0.426845
         Iterations 7


0,1,2,3
Dep. Variable:,cancer,No. Observations:,50.0
Model:,Logit,Df Residuals:,46.0
Method:,MLE,Df Model:,3.0
Date:,"Sun, 05 Nov 2023",Pseudo R-squ.:,0.3835
Time:,21:49:36,Log-Likelihood:,-21.342
converged:,True,LL-Null:,-34.617
Covariance Type:,nonrobust,LLR p-value:,7.314e-06

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.0716,2.489,0.832,0.405,-2.806,6.949
age,-0.0789,0.051,-1.552,0.121,-0.179,0.021
gender_male,1.8148,0.817,2.222,0.026,0.214,3.415
smoking_yes,4.4339,1.346,3.294,0.001,1.796,7.072


In [7]:
# Interpret the coefficients: exp(coef) is the multiplicative change in the
# odds of `cancer` associated with the indicator being 1 instead of 0.
# The coefficients are read from the fitted model by name rather than being
# copied by hand from the summary table, so these values stay correct if the
# data or the model changes. They use the full-precision estimates, so the
# trailing digits can differ slightly from exp() of the rounded table values.
print(np.exp(result.params["gender_male"]))
print(np.exp(result.params["smoking_yes"]))

6.139848085022786
84.25938856524215
