In [17]:
import pandas as pd # работа с таблицами 
import numpy as np # математические функции и матрицы
import matplotlib.pyplot as plt # графики
import seaborn as sns # еще более классные графики
import statsmodels.api as sm # стандартные регрессионные модели
import statsmodels.formula.api as smf # аналогичные модели с синтаксисом в стиле R
import statsmodels.graphics.gofplots as gf # визуализация моделей
import statsmodels.discrete.discrete_model # дискретные модели
from statsmodels.stats.outliers_influence import summary_table # работа с выбросами
from scipy.stats import shapiro # тест Шапиро – Уилка 

In [18]:
# Global plotting configuration for the notebook.
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and later
# removed; newer releases ship the same sheet as 'seaborn-v0_8'. Pick whichever
# name the installed matplotlib actually provides so the cell runs on both.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Font sizes: base text, figure-level title, and axes labels/titles.
plt.rc('font', size=14)
plt.rc('figure', titlesize=15)
plt.rc('axes', labelsize=15, titlesize=15)

In [19]:
# Read the flats workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
excel_path = '/Users/polinapogorelova/Desktop/econ_metrics/dataflats.xlsx'
data = pd.read_excel(excel_path)

In [20]:
# Bare last expression: display the freshly loaded DataFrame as the cell output.
data

Unnamed: 0,A,B,totsp,livesp,kitsp,dist,metrdist,walk,brick,floor
0,1,81.0,58,40,6.0,12.5,7,1,1,1
1,2,75.0,44,28,6.0,13.5,7,1,0,1
2,3,128.0,70,42,6.0,14.5,3,1,1,1
3,4,95.0,61,37,6.0,13.5,7,1,0,1
4,5,330.0,104,60,11.0,10.5,7,0,1,1
...,...,...,...,...,...,...,...,...,...,...
2035,2036,110.0,77,45,10.0,12.0,5,0,0,1
2036,2037,95.0,60,43,6.0,9.0,5,0,0,1
2037,2038,95.0,60,46,5.0,10.5,5,1,0,1
2038,2039,129.0,76,48,10.0,12.5,5,0,0,1


In [21]:
# Give the two generically named columns descriptive labels (modifies `data` in place).
column_labels = {'A': 'n', 'B': 'price'}
data.rename(column_labels, axis='columns', inplace=True)

In [22]:
# Discard every row that has at least one missing value (modifies `data` in place).
data.dropna(axis=0, how='any', inplace=True)

In [23]:
# New column: price divided by total area (`totsp`), element-wise per row.
data['price_sq'] = data['price'].div(data['totsp'])

In [24]:
# Display the DataFrame again; the output now includes the `price_sq` column.
data

Unnamed: 0,n,price,totsp,livesp,kitsp,dist,metrdist,walk,brick,floor,price_sq
0,1,81.0,58,40,6.0,12.5,7,1,1,1,1.396552
1,2,75.0,44,28,6.0,13.5,7,1,0,1,1.704545
2,3,128.0,70,42,6.0,14.5,3,1,1,1,1.828571
3,4,95.0,61,37,6.0,13.5,7,1,0,1,1.557377
4,5,330.0,104,60,11.0,10.5,7,0,1,1,3.173077
...,...,...,...,...,...,...,...,...,...,...,...
2035,2036,110.0,77,45,10.0,12.0,5,0,0,1,1.428571
2036,2037,95.0,60,43,6.0,9.0,5,0,0,1,1.583333
2037,2038,95.0,60,46,5.0,10.5,5,1,0,1,1.583333
2038,2039,129.0,76,48,10.0,12.5,5,0,0,1,1.697368


In [25]:
# Ordinary least squares of price_sq on livesp and dist, specified with the
# formula interface; the model is built first and then fitted.
ols_model = smf.ols(formula='price_sq ~ livesp + dist', data=data)
regr = ols_model.fit()

# Last expression of the cell: render the regression summary tables.
regr.summary()

0,1,2,3
Dep. Variable:,price_sq,R-squared:,0.279
Model:,OLS,Adj. R-squared:,0.278
Method:,Least Squares,F-statistic:,393.7
Date:,"Mon, 26 Sep 2022",Prob (F-statistic):,2.9400000000000002e-145
Time:,14:11:12,Log-Likelihood:,-717.45
No. Observations:,2038,AIC:,1441.0
Df Residuals:,2035,BIC:,1458.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.5395,0.057,26.994,0.000,1.428,1.651
livesp,0.0149,0.001,15.117,0.000,0.013,0.017
dist,-0.0465,0.002,-20.177,0.000,-0.051,-0.042

0,1,2,3
Omnibus:,806.418,Durbin-Watson:,2.02
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6015.853
Skew:,1.676,Prob(JB):,0.0
Kurtosis:,10.721,Cond. No.,361.0


In [26]:
# Store the model's in-sample fitted values next to the observed data.
fitted_price_sq = regr.fittedvalues
data['yhat'] = fitted_price_sq

In [27]:
# Display the DataFrame again; the output now includes the `yhat` column.
data

Unnamed: 0,n,price,totsp,livesp,kitsp,dist,metrdist,walk,brick,floor,price_sq,yhat
0,1,81.0,58,40,6.0,12.5,7,1,1,1,1.396552,1.554726
1,2,75.0,44,28,6.0,13.5,7,1,0,1,1.704545,1.329234
2,3,128.0,70,42,6.0,14.5,3,1,1,1,1.828571,1.491539
3,4,95.0,61,37,6.0,13.5,7,1,0,1,1.557377,1.463471
4,5,330.0,104,60,11.0,10.5,7,0,1,1,3.173077,1.946048
...,...,...,...,...,...,...,...,...,...,...,...,...
2035,2036,110.0,77,45,10.0,12.0,5,0,0,1,1.428571,1.652556
2036,2037,95.0,60,43,6.0,9.0,5,0,0,1,1.583333,1.762252
2037,2038,95.0,60,46,5.0,10.5,5,1,0,1,1.583333,1.737235
2038,2039,129.0,76,48,10.0,12.5,5,0,0,1,1.697368,1.674048


In [33]:
import random

# Seed BOTH generators. `random.seed` only affects Python's stdlib `random`
# module; the draw below comes from NumPy's global generator, which has its own
# state and must be seeded separately for the sample to be reproducible.
random.seed(7)
np.random.seed(7)

# Ten draws of three uncorrelated normal variables (diagonal covariance):
# means 40, 10, 5 and variances 25, 4, 4. With size=10 and a length-3 mean the
# result already has shape (10, 3), so no reshape is needed.
new_data = np.random.multivariate_normal(
    mean=[40, 10, 5],
    cov=[[25, 0, 0], [0, 4, 0], [0, 0, 4]],
    size=10,
)

In [34]:
# Bare last expression: display the simulated array as the cell output.
new_data

array([[37.76638617, 13.12923911,  4.04223147],
       [40.54938695,  9.52602437,  6.79562524],
       [43.62803337, 11.93992542,  4.91101284],
       [32.98798597,  7.426355  ,  3.56680531],
       [35.23654448, 13.04649837,  4.31555364],
       [42.9916648 ,  9.57101192,  6.51440457],
       [47.80034564,  9.58787767,  2.12936371],
       [38.89885573,  9.64148759,  2.65226637],
       [39.00040627,  9.42809488,  3.5691296 ],
       [45.88358757,  8.26178966,  3.72774094]])

In [35]:
# Preview the simulated array as a labelled table (displayed, not stored).
feature_names = ['livesp', 'dist', 'metrdist']
pd.DataFrame(new_data, columns=feature_names)

Unnamed: 0,livesp,dist,metrdist
0,37.766386,13.129239,4.042231
1,40.549387,9.526024,6.795625
2,43.628033,11.939925,4.911013
3,32.987986,7.426355,3.566805
4,35.236544,13.046498,4.315554
5,42.991665,9.571012,6.514405
6,47.800346,9.587878,2.129364
7,38.898856,9.641488,2.652266
8,39.000406,9.428095,3.56913
9,45.883588,8.26179,3.727741


In [37]:
# Wrap the simulated array in a labelled frame so the fitted model can pick
# its regressors from it by column name, then predict for those rows.
new_flats = pd.DataFrame(new_data, columns=['livesp', 'dist', 'metrdist'])
prediction = regr.predict(new_flats)

In [38]:
# Bare last expression: display the predicted values for the simulated rows.
prediction

0    1.492146
1    1.701236
2    1.634887
3    1.686109
4    1.458260
5    1.735571
6    1.806509
7    1.671248
8    1.682687
9    1.839595
dtype: float64