# 회귀분석 예제 - 중고차 가격 구조
## 단순회귀분석

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm # 통계모형
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt  # 박스플랏, 산점도
# Load the first used-car data set from the current working directory.
# Later cells use its 'Odometer' and 'Price' columns.
usedcar1 = pd.read_csv('usedcar1.csv')
# A bare name on the last line of a notebook cell displays the DataFrame.
usedcar1

### 산점도

In [None]:
# Scatter plot of Price against Odometer; the two strings are resolved as
# column names of usedcar1 through the `data` argument.
plt.scatter('Odometer', 'Price', data=usedcar1)
plt.xlabel('Odometer')
plt.title('Scatter Plot')
plt.ylabel('Price')

### 선형회귀분석

In [None]:
# Simple linear regression of Price on Odometer via the formula interface.
# NOTE(review): the fitted result is stored as an ad-hoc attribute on the
# DataFrame object itself; the regression-line plot below reads it back as
# usedcar1.lm, so the attribute name has to stay in sync with that cell.
usedcar1.lm = (
    smf.ols(formula='Price ~ Odometer', data=usedcar1)
    .fit()
)
# Display the regression summary table as the cell output.
usedcar1.lm.summary()

### 산점도 (회귀직선 포함)

In [None]:
# Same scatter as before with smaller markers (s=5) ...
plt.scatter('Odometer', 'Price', data=usedcar1, s=5)
# ... overlaid with the in-sample fitted values of usedcar1.lm drawn as a
# thin red line over the observed Odometer values.
plt.plot(
    usedcar1['Odometer'],
    usedcar1.lm.fittedvalues,
    color='red',
    lw=1,
)
plt.xlabel('Odometer')
plt.title('Scatter Plot with Regression Line')
plt.ylabel('Price')

## X 변수 추가 발굴

In [None]:
# Load the extended used-car data set from the current working directory.
# The models below reference its columns Price, Odometer, Accident, Option,
# Clean, Height and Temp.
usedcar2 = pd.read_csv('usedcar2.csv')
# Display the DataFrame as the cell output.
usedcar2

### 다중 선형 회귀분석

In [None]:
# Full model: Price regressed on all six candidate predictors.
# The result is stored as an attribute on the usedcar2 DataFrame object.
usedcar2.lm1 = smf.ols(
    formula='Price ~ Odometer+Accident+Option+Clean+Height+Temp',
    data=usedcar2,
).fit()
usedcar2.lm1.summary()

In [None]:
# Reduced model: same as lm1 with the Height term removed.
usedcar2.lm2 = smf.ols(
    formula='Price ~ Odometer+Accident+Option+Clean+Temp',
    data=usedcar2,
).fit()
usedcar2.lm2.summary()

In [None]:
# Final model: Height and Temp both removed. The price-prediction cell
# further down calls predict() on this fit.
usedcar2.lm3 = smf.ols(
    formula='Price ~ Odometer+Accident+Option+Clean',
    data=usedcar2,
).fit()
usedcar2.lm3.summary()

### 중고차 가격 예측

In [None]:
# One-row DataFrame describing the car whose price is to be predicted.
# Column names match those of usedcar2 so the frame can be passed to predict().
smith = pd.DataFrame(
    {
        'Odometer': [60000],
        'Accident': [0],
        'Option': [3],
        'Clean': [2],
        'Height': [175],
        'Temp': [16],
    }
)
smith

In [None]:
# Predict the price for the single row in `smith` with the final model lm3.
# `smith` also carries Height and Temp columns, which lm3's formula
# ('Price ~ Odometer+Accident+Option+Clean') does not reference.
usedcar2.lm3.predict(smith)

# 로지스틱 회귀분석 예제 - 구매여부 구조
## 단순 로지스틱 회귀분석

In [None]:
# Load the first purchase data set from the current working directory.
# Later cells use its 'Buy' and 'Age' columns.
direct1 = pd.read_csv('direct1.csv')
# Display the DataFrame as the cell output.
direct1

### 결측치 확인

In [None]:
# Count missing values per column: isnull() gives an element-wise boolean
# mask and sum() adds it up column by column.
direct1.isnull().sum() # pd.isnull(direct1) yields the same element-wise mask

### 결측치 제거

In [None]:
# Drop every row that has a missing value in any column and rebind the name,
# so all following cells work on complete cases only.
direct1 = direct1.dropna()

### 구매자/비구매자 인원수

In [None]:
# Number of rows for each distinct value of Buy (later cells compare
# Buy == 0 against Buy == 1).
direct1['Buy'].value_counts()

### 구매자/비구매자 연령별 차이

In [None]:
import seaborn as sns
# Box plot of Age for each value of Buy.
sns.boxplot(data=direct1, x="Buy", y="Age")

In [None]:
# Overlaid Age histograms for the Buy == 0 and Buy == 1 groups.
buy_0 = direct1.loc[direct1['Buy'] == 0, 'Age']
buy_1 = direct1.loc[direct1['Buy'] == 1, 'Age']
# Compute one set of bin edges over all ages and use it for both groups.
# With matplotlib's default, each hist() call derives its own edges from its
# own data range, so the two overlaid histograms would not share bins and
# their bar heights could not be compared directly.
age_bins = np.histogram_bin_edges(direct1['Age'], bins=10)
plt.hist(buy_0, bins=age_bins, alpha=0.5, label='Buy = 0')
plt.hist(buy_1, bins=age_bins, alpha=0.5, label='Buy = 1')
# Label the axes and add a legend so the two groups can be told apart.
plt.xlabel('Age')
plt.ylabel('Count')
plt.legend()

### 로지스틱 회귀분석

In [None]:
# Simple logistic regression: GLM with a Binomial family, Buy explained by Age.
# The fitted result is stored as an attribute on the direct1 DataFrame object
# and is read back by the plotting and AUC cells below.
direct1.logit = smf.glm(
    formula='Buy ~ Age',
    family=sm.families.Binomial(),
    data=direct1,
).fit()
direct1.logit.summary()

### 시그모이드 함수 그래프

In [None]:
# Show the model's fitted values for the rows it was trained on; the next
# cell plots them against Age on an axis labelled 'Probability'.
direct1.logit.fittedvalues

In [None]:
# Fitted values of the training rows plotted against their Age.
plt.scatter(direct1['Age'], direct1.logit.fittedvalues)
# Evaluate the fitted model on an evenly spaced Age grid of 1000 points from
# -190 to 190 and draw the resulting curve in red.
# NOTE(review): the range goes far beyond realistic ages, presumably so the
# whole S-shaped curve is visible - confirm this is intentional.
grid = pd.DataFrame({'Age': np.linspace(-190, 190, 1000)})
prob = direct1.logit.predict(grid)
plt.plot(grid['Age'], prob, 'r-')
plt.xlabel('Age')
plt.ylabel('Probability')

### AUC 계산

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve of the simple model, computed on the same rows
# the model was fitted on (observed Buy vs. fitted values).
roc_auc = roc_auc_score(
    y_true=direct1['Buy'],
    y_score=direct1.logit.fittedvalues,
)
roc_auc

## X 변수 추가 발굴

In [None]:
# Load the extended purchase data set from the current working directory.
# The models below reference its columns Buy, Age, Gender, Married, Children,
# Income, Ccard, Recent, Climate and Urban.
direct2 = pd.read_csv('direct2.csv')
# Display the DataFrame as the cell output.
direct2

In [None]:
# Count missing values per column of direct2.
direct2.isnull().sum()

In [None]:
# Drop every row with a missing value in any column and rebind the name.
direct2 = direct2.dropna()
# Show (rows, columns) of what remains after dropping.
direct2.shape

### 다중 로지스틱 회귀분석

In [None]:
# Full logistic model: Buy explained by all nine candidate predictors.
# The result is stored as an attribute on the direct2 DataFrame object.
direct2.logit1 = smf.glm(
    formula='Buy ~ Age+Gender+Married+Children+Income+Ccard+Recent+Climate+Urban',
    family=sm.families.Binomial(),
    data=direct2,
).fit()
direct2.logit1.summary()

In [None]:
# Reduced logistic model: same as logit1 with the Climate term removed.
direct2.logit2 = smf.glm(
    formula='Buy ~ Age+Gender+Married+Children+Income+Ccard+Recent+Urban',
    family=sm.families.Binomial(),
    data=direct2,
).fit()
direct2.logit2.summary()

In [None]:
# Final logistic model: Climate and Ccard both removed. The AUC, odds-ratio
# and purchase-probability cells below all use this fit.
direct2.logit3 = smf.glm(
    formula='Buy ~ Age+Gender+Married+Children+Income+Recent+Urban',
    family=sm.families.Binomial(),
    data=direct2,
).fit()
direct2.logit3.summary()

### AUC 계산

In [None]:
# Area under the ROC curve of the final multiple logistic model, computed on
# the same rows the model was fitted on. Rebinds the notebook-level roc_auc.
roc_auc = roc_auc_score(
    y_true=direct2['Buy'],
    y_score=direct2.logit3.fittedvalues,
)
roc_auc

### 기울기 해석

In [None]:
# Exponentiate a single coefficient to read it as a multiplicative change.
# NOTE(review): -0.061 is hard-coded, presumably copied by hand from the
# logit3 summary - confirm which predictor it belongs to, and consider
# reading it from direct2.logit3.params so it tracks the fitted model.
np.exp(-0.061)

In [None]:
# Exponentiate every estimated parameter of logit3 at once.
np.exp(direct2.logit3.params)

In [None]:
# Exponentiate a coefficient scaled by 100, i.e. the multiplicative change
# for a 100-unit increase in that predictor.
# NOTE(review): 0.0021 is hard-coded, presumably the Income coefficient from
# the logit3 summary - confirm against direct2.logit3.params.
np.exp(0.0021*100)

### 구매확률 계산

In [None]:
# Predictor values for two hypothetical customers, listed in the column
# order given below. Note that `smith` is rebound here from the earlier
# one-row DataFrame to a plain list.
smith = [35, 1, 1, 1, 500, 1, 1]
johnson = [36, 0, 1, 2, 550, 0, 0]
# One row per person; the columns are the seven predictors of logit3.
people = pd.DataFrame(
    data=[smith, johnson],
    columns=['Age', 'Gender', 'Married', 'Children', 'Income', 'Recent', 'Urban'],
)
people

In [None]:
# Apply the final logistic model to the two rows in `people`; returns one
# predicted value per row.
direct2.logit3.predict(people)

In [None]:
# Share of each Buy value in direct2, as a baseline to compare the predicted
# purchase probabilities against. normalize=True makes value_counts return
# relative frequencies directly instead of dividing the counts by len().
# direct2 has already been through dropna(), so both forms give the same
# fractions.
direct2['Buy'].value_counts(normalize=True)