In [2]:
import pandas as pd

In [3]:
df = pd.read_excel('small_used_car.xlsx')
df

Unnamed: 0,mileage,model,price,year,my_car_damage,other_car_damage
0,63608,K3,970,2017,0,564596
1,69336,K3,1130,2015,1839700,1140150
2,36000,K3,1380,2016,446520,2244910
3,19029,K3,1390,2017,889000,4196110
4,97090,K3,760,2015,2339137,2029570
...,...,...,...,...,...,...
269,235000,Avante,390,2007,1473730,507260
270,140000,Avante,430,2009,14399366,4592589
271,96757,Avante,390,2006,303080,0
272,113853,Avante,390,2008,320780,1857718


In [4]:
# 위 데이터들을 기반으로 K3인지, Avante인지 맞출 수 있을까?
# 일단 로지스틱 회귀분석
from statsmodels.formula.api import logit

In [38]:
# 아래처럼 해서 컬럼을 추가
# pd.get_dummies(df['model'])['K3']
df['md'] = df['model'].map({'Avante':0, 'K3':1})

In [39]:
df

Unnamed: 0,mileage,model,price,year,my_car_damage,other_car_damage,md
0,63608,K3,970,2017,0,564596,1
1,69336,K3,1130,2015,1839700,1140150,1
2,36000,K3,1380,2016,446520,2244910,1
3,19029,K3,1390,2017,889000,4196110,1
4,97090,K3,760,2015,2339137,2029570,1
...,...,...,...,...,...,...,...
269,235000,Avante,390,2007,1473730,507260,0
270,140000,Avante,430,2009,14399366,4592589,0
271,96757,Avante,390,2006,303080,0,0
272,113853,Avante,390,2008,320780,1857718,0


In [40]:
res = logit('md ~ mileage + price + year', data=df).fit()
res.summary()

Optimization terminated successfully.
         Current function value: 0.539595
         Iterations 7


0,1,2,3
Dep. Variable:,md,No. Observations:,274.0
Model:,Logit,Df Residuals:,270.0
Method:,MLE,Df Model:,3.0
Date:,"Mon, 26 Oct 2020",Pseudo R-squ.:,0.04383
Time:,12:33:02,Log-Likelihood:,-147.85
converged:,True,LL-Null:,-154.63
Covariance Type:,nonrobust,LLR p-value:,0.003578

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-643.2888,215.572,-2.984,0.003,-1065.801,-220.776
mileage,-4.779e-06,4.93e-06,-0.969,0.332,-1.44e-05,4.89e-06
price,-0.0016,0.001,-1.827,0.068,-0.003,0.000
year,0.3198,0.107,2.980,0.003,0.109,0.530


In [41]:
import numpy as np
np.exp(-147.85)

6.159718291577813e-65

In [42]:
res.aic, res.bic, res.prsquared # price + year + mileage

(303.69828531628326, 318.15079774183556, 0.043831781198843345)

In [51]:
# 이제 확률 예측하기
import numpy as np
prob = res.predict(df)
prob

0      0.476482
1      0.266082
2      0.282609
3      0.366676
4      0.363306
         ...   
269    0.039457
270    0.103207
271    0.054605
272    0.091657
273    0.059491
Length: 274, dtype: float64

In [52]:
# 확률이 0.5보다 크면 1 아니면 0으로 예측
prediction = np.where(prob > 0.2, 1, 0)

In [54]:
from sklearn.metrics import confusion_matrix
confusion_matrix(df['md'], prediction)
# 혼동행렬 의미1: 아반떼를 아반떼로 맞춘 경우 75
# 혼동행렬 의미2: 아반떼를 K3로 말해 맞춘 틀린 경우 130 --- False Positive
# 혼동행렬 의미3: K3를 아반떼로 말해 틀린 경우 13 --- True Negative
# 혼동행렬 의미4: K3를 K3로 맞춘 경우 56

array([[ 75, 130],
       [ 13,  56]])

In [55]:
# 정확도 계산
(75+56)/(75+130+13+56)*100

47.81021897810219

In [56]:
# 정밀도 계산: 양성 (여기서는 K3) 예측 중 맞은 비율
56/130*100

43.07692307692308

In [57]:
# 재현도 계산: 실제 양성 (여기서는 K3) 중 맞춘 비율
56/(13+56)*100

81.15942028985508

In [58]:
# 특이도 계산: 실제 음성 (여기서는 Avante) 중 맞춘 비율
13/(13+56)*100

18.84057971014493