<a href="https://colab.research.google.com/github/sora3765/regression-analysis-sales-forecast/blob/main/1_2_%E9%87%8D%E5%9B%9E%E5%B8%B0%E5%88%86%E6%9E%90.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.formula.api as smf

%precision 3
%matplotlib inline


In [2]:
!git clone https://github.com/umacchi/python-regression-tutorial-data datasets


Cloning into 'datasets'...
remote: Enumerating objects: 31, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 31 (delta 3), reused 30 (delta 2), pack-reused 0[K
Unpacking objects: 100% (31/31), done.


In [3]:
# Load the sample data from the repository cloned into ./datasets.
# A path relative to the working directory is used (the same directory the
# clone command writes to) instead of the Colab-only absolute /content path,
# so the notebook also runs outside Colab.
df = pd.read_csv("datasets/test.csv")
n = len(df)  # number of observations (rows)
print(df.shape)
df.head()


(20, 4)


Unnamed: 0,monday_sales,week_sales,customer_traffic,weather
0,4.2,67,7.2,Sunny
1,7.2,71,7.9,Cloudy
2,2.0,30,5.3,Sunny
3,3.0,35,6.8,Rainy
4,1.5,35,7.5,Rainy


In [4]:
# Column notes:
#   monday_sales     - sales on Monday
#   customer_traffic - average foot traffic per 10 minutes in front of the
#                      stall during business hours
#   weather          - weather on the day (sunny, cloudy, ...)

# Start with a multiple regression on two explanatory variables.
formula = "week_sales ~ monday_sales + customer_traffic"
result = smf.ols(formula=formula, data=df).fit()
result.summary()


0,1,2,3
Dep. Variable:,week_sales,R-squared:,0.737
Model:,OLS,Adj. R-squared:,0.706
Method:,Least Squares,F-statistic:,23.78
Date:,"Fri, 02 Dec 2022",Prob (F-statistic):,1.19e-05
Time:,00:54:46,Log-Likelihood:,-73.542
No. Observations:,20,AIC:,153.1
Df Residuals:,17,BIC:,156.1
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2.2903,11.819,-0.194,0.849,-27.227,22.646
monday_sales,6.4340,1.005,6.402,0.000,4.314,8.554
customer_traffic,4.2412,1.780,2.383,0.029,0.486,7.997

0,1,2,3
Omnibus:,2.158,Durbin-Watson:,1.477
Prob(Omnibus):,0.34,Jarque-Bera (JB):,1.78
Skew:,0.675,Prob(JB):,0.411
Kurtosis:,2.442,Cond. No.,38.7


In [5]:
# Explanatory variables (x1, x2) and response (y) for the manual calculation.
x1, x2 = df["monday_sales"], df["customer_traffic"]
y = df["week_sales"]
p = 2  # number of explanatory variables


In [6]:
# Estimate beta0, beta1 and beta2 by least squares.
# Design matrix: a column of ones (intercept) followed by the two predictors.
X = np.array([np.ones_like(x1), x1, x2]).T

# rcond=None selects NumPy's current default singular-value cutoff explicitly;
# omitting the argument triggers a FutureWarning.
beta0_hat, beta1_hat, beta2_hat = np.linalg.lstsq(X, y, rcond=None)[0]
beta0_hat, beta1_hat, beta2_hat
# Expected output
# (-2.290, 6.434, 4.241)


  beta0_hat, beta1_hat, beta2_hat = np.linalg.lstsq(X, y)[0]


(-2.290285203435117, 6.433965003648682, 4.241179978200461)

In [7]:
# Fitted values from the estimated intercept and slopes.
y_hat = (
    beta0_hat
    + beta1_hat * x1
    + beta2_hat * x2
)
print(y_hat)

0     55.268864
1     77.539585
2     33.055899
3     45.851634
4     39.169512
5     35.733251
6     28.171322
7     38.041548
8     43.379121
9     64.261976
10    43.393560
11    66.278799
12    43.658849
13    80.199805
14    28.479928
15    63.295534
16    53.148274
17    64.700533
18    24.106104
19    40.265905
dtype: float64


In [8]:
# Residuals: observed response minus fitted value.
eps_hat = y.sub(y_hat)
print(eps_hat)


0     11.731136
1     -6.539585
2     -3.055899
3    -10.851634
4     -4.169512
5      4.266749
6     -5.171322
7     -1.041548
8     -4.379121
9     -9.261976
10    -3.393560
11     3.721201
12   -14.658849
13     7.800195
14    18.520072
15    13.704466
16    -1.148274
17    -9.700533
18    -6.106104
19    19.734095
dtype: float64


In [9]:
# Ingredients for the coefficient standard errors.
# Residual variance: sum of squared residuals divided by (n - p - 1).
s_var = np.sum(eps_hat ** 2) / (n - p - 1)

# Diagonal entries of the (pseudo-)inverse of X'X, one per coefficient.
C0, C1, C2 = np.diag(np.linalg.pinv(X.T @ X))
C0, C1, C2
# Expected output
# (1.298, 0.009, 0.029)


(1.2977724263376968, 0.00938243089119918, 0.029436486917620183)

In [10]:
# 95% confidence interval for beta2, using a t distribution with
# n - p - 1 degrees of freedom.
rv = stats.t(n - p - 1)
se_beta2 = np.sqrt(s_var * C2)  # standard error of beta2_hat

# isf(0.975) is the lower-tail quantile, so subtracting it yields the upper limit.
lcl = beta2_hat - rv.isf(0.025) * se_beta2
hcl = beta2_hat - rv.isf(0.975) * se_beta2
lcl, hcl
# Expected output
# (0.486, 7.997)


(0.48561064662117825, 7.996749309779743)

In [11]:
# =================================================
# Next: refit the model with the categorical `weather` column added.

In [12]:
# Refit with the categorical weather column added to the two numeric predictors.
formula = "week_sales ~ monday_sales + customer_traffic + weather"
result = smf.ols(formula=formula, data=df).fit()
result.summary()


0,1,2,3
Dep. Variable:,week_sales,R-squared:,0.764
Model:,OLS,Adj. R-squared:,0.702
Method:,Least Squares,F-statistic:,12.17
Date:,"Fri, 02 Dec 2022",Prob (F-statistic):,0.000131
Time:,00:56:57,Log-Likelihood:,-72.425
No. Observations:,20,AIC:,154.8
Df Residuals:,15,BIC:,159.8
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.5174,12.671,0.120,0.906,-25.491,28.526
weather[T.Rainy],-7.8439,6.477,-1.211,0.245,-21.650,5.962
weather[T.Sunny],-2.0630,6.286,-0.328,0.747,-15.461,11.335
monday_sales,5.9235,1.093,5.420,0.000,3.594,8.253
customer_traffic,4.5607,1.810,2.520,0.024,0.703,8.418

0,1,2,3
Omnibus:,1.563,Durbin-Watson:,1.409
Prob(Omnibus):,0.458,Jarque-Bera (JB):,0.92
Skew:,0.523,Prob(JB):,0.631
Kurtosis:,2.912,Cond. No.,42.6
