In [151]:
# Мультиколлинеарность

import math

import pandas as pd # работа с таблицами 
import numpy as np # математические функции и матрицы
import matplotlib.pyplot as plt # графики
import seaborn as sns # еще более классные графики
import statsmodels.api as sm # стандартные регрессионные модели
import statsmodels.formula.api as smf # аналогичные модели с синтаксисом в стиле R
import statsmodels.graphics.gofplots as gf # визуализация моделей
import statsmodels.discrete.discrete_model # дискретные модели
from statsmodels.stats.outliers_influence import summary_table # работа с выбросами
from statsmodels.stats.outliers_influence import variance_inflation_factor # VIF computation
from scipy.stats import shapiro # тест Шапиро – Уилка 

In [152]:
# Optionally tune the look of the figures: style sheet, font sizes and title sizes
# (https://tonysyu.github.io/raw_content/matplotlib-style-gallery/gallery.html)

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases removed the old names, so pick whichever name this install provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
plt.rc('font', size=14)
plt.rc('figure', titlesize=15)
plt.rc('axes', labelsize=15, titlesize=15)

In [153]:
import random

# Every random draw in this notebook goes through np.random, so NumPy's global
# generator has to be seeded as well; seeding only Python's `random` module
# leaves the simulated sample different on every run.
random.seed(100)
np.random.seed(100)

In [154]:
# Set the sample size (number of simulated observations)

n = 100

In [155]:
# Draw the regressors x2 and x3 and the random error eps, n observations each
data = pd.DataFrame({
    'x2': np.random.normal(loc=1, scale=3, size=n),
    'x3': np.random.normal(loc=-1, scale=2, size=n),
    'eps': np.random.normal(loc=0, scale=1.5, size=n),
})
print(data)

          x2        x3       eps
0   0.698249 -0.065282  0.351270
1   5.092480  0.599139  1.156175
2   4.457369  2.795556  0.088688
3   2.161805 -5.595338 -0.913313
4   0.318529 -0.017839  2.494882
..       ...       ...       ...
95  0.454767 -1.966213 -0.697032
96  4.385914  0.370559 -0.109083
97  4.853832  0.759523  0.806617
98  4.608714 -0.277106 -1.600812
99  3.223690 -2.440314  0.810715

[100 rows x 3 columns]


In [170]:
# Build x1 from x2: a scaled copy of x2 plus a small normal disturbance (sd 0.01)
data['x1'] = data['x2'].mul(0.7).add(np.random.normal(loc=0, scale=0.01, size=n))
data

Unnamed: 0,x2,x3,eps,x1,y
0,0.698249,-0.065282,0.351270,0.473959,-0.014559
1,5.092480,0.599139,1.156175,3.583290,-4.331555
2,4.457369,2.795556,0.088688,3.132471,-3.719756
3,2.161805,-5.595338,-0.913313,1.522819,-5.264130
4,0.318529,-0.017839,2.494882,0.215612,2.602238
...,...,...,...,...,...
95,0.454767,-1.966213,-0.697032,0.314513,-1.532159
96,4.385914,0.370559,-0.109083,3.061387,-4.802341
97,4.853832,0.759523,0.806617,3.407735,-4.301824
98,4.608714,-0.277106,-1.600812,3.233863,-6.842555


In [171]:
# Generate the dependent variable y: a linear combination of the regressors plus the error
deterministic_part = 0.5 + 1.2 * data['x1'] - 2.3 * data['x2'] + 0.4 * data['x3']
data['y'] = deterministic_part + data['eps']
data

Unnamed: 0,x2,x3,eps,x1,y
0,0.698249,-0.065282,0.351270,0.473959,-0.212064
1,5.092480,0.599139,1.156175,3.583290,-5.516925
2,4.457369,2.795556,0.088688,3.132471,-4.786074
3,2.161805,-5.595338,-0.913313,1.522819,-5.796218
4,0.318529,-0.017839,2.494882,0.215612,2.513865
...,...,...,...,...,...
95,0.454767,-1.966213,-0.697032,0.314513,-1.652066
96,4.385914,0.370559,-0.109083,3.061387,-5.874797
97,4.853832,0.759523,0.806617,3.407735,-5.464105
98,4.608714,-0.277106,-1.600812,3.233863,-7.931061


In [172]:
# Estimate y = beta0 + beta1*x1 + beta2*x2 + beta3*x3 + eps by OLS
full_spec = smf.ols(formula='y ~ x1 + x2 + x3', data=data)
model1 = full_spec.fit()
model1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.858
Model:,OLS,Adj. R-squared:,0.854
Method:,Least Squares,F-statistic:,193.3
Date:,"Sun, 13 Nov 2022",Prob (F-statistic):,1.51e-40
Time:,19:31:19,Log-Likelihood:,-183.38
No. Observations:,100,AIC:,374.8
Df Residuals:,96,BIC:,385.2
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3033,0.200,1.514,0.133,-0.094,0.701
x1,-17.4734,14.821,-1.179,0.241,-46.892,11.945
x2,10.7258,10.372,1.034,0.304,-9.862,31.314
x3,0.4234,0.090,4.690,0.000,0.244,0.603

0,1,2,3
Omnibus:,3.629,Durbin-Watson:,2.289
Prob(Omnibus):,0.163,Jarque-Bera (JB):,3.671
Skew:,-0.169,Prob(JB):,0.16
Kurtosis:,3.875,Cond. No.,401.0


In [177]:
# Compute the correlation matrix of the regressors x1, x2 and x3
data[["x1", "x2", "x3"]].corr()

Unnamed: 0,x1,x2,x3
x1,1.0,0.999982,0.186677
x2,0.999982,1.0,0.186349
x3,0.186677,0.186349,1.0


In [173]:
# Move on to the variance inflation factors (VIF)
# Rule of thumb: VIF(x) > 10 suggests that multicollinearity is probably a problem

# VIF(x1) = 1 / (1 - R^2) from the auxiliary regression of x1 on the other regressors
aux_x1 = smf.ols("x1 ~ 1 + x2 + x3", data=data).fit()
print(1 / (1 - aux_x1.rsquared))

28562.653985259632


In [138]:
# VIF(x2) = 1 / (1 - R^2) from the auxiliary regression of x2 on the other regressors
aux_x2 = smf.ols("x2 ~ 1 + x1 + x3", data=data).fit()
print(1 / (1 - aux_x2.rsquared))

3.2110902373367116


In [139]:
# VIF(x3) = 1 / (1 - R^2) from the auxiliary regression of x3 on the other regressors
aux_x3 = smf.ols("x3 ~ 1 + x1 + x2", data=data).fit()
print(1 / (1 - aux_x3.rsquared))

1.002735615762589


In [140]:
# Pull the regressor matrix and the matching column names out of the fitted model
design = model1.model
variables, var_names = design.exog, design.exog_names
print(variables)
print(var_names)

[[ 1.         -0.98147105  1.1511937  -0.15317428]
 [ 1.          4.45955644  5.64761709 -2.08406634]
 [ 1.         -1.6653349   0.41343929 -2.38837178]
 ...
 [ 1.         -4.01668462 -3.87262485 -5.84356842]
 [ 1.         -2.85958199 -1.4720997  -2.21107952]
 [ 1.         -0.01954002  3.48561869  0.60226791]]
['Intercept', 'x1', 'x2', 'x3']


In [141]:
# Compute the VIFs with statsmodels' variance_inflation_factor()
# (imported in the first cell from statsmodels.stats.outliers_influence).
# Column 0 of `variables` is the intercept, so both the names and the
# column indices start from 1.
vif = pd.DataFrame({
    "Variable": var_names[1:],
    "VIF Factor": [variance_inflation_factor(variables, i) for i in range(1, variables.shape[1])],
})
print(vif)

  Variable  VIF Factor
0       x1    3.215851
1       x2    3.211090
2       x3    1.002736


In [142]:
# Check how sensitive the estimated regression coefficients are to removing one observation:
# drop the row labelled 98 and keep the result as a separate frame
d = data.drop(labels=[98], axis='index')

In [143]:
# Display the sample with the row removed
d

Unnamed: 0,x2,x3,eps,x1,y
0,1.151194,-0.153174,-1.297223,-0.981471,-4.684003
1,5.647617,-2.084066,0.793853,4.459556,-7.177825
2,0.413439,-2.388372,-0.335830,-1.665335,-3.740491
3,2.110318,-0.936831,0.053908,0.876270,-3.623032
4,-0.976527,1.146501,-2.104863,-0.629762,0.344035
...,...,...,...,...,...
995,0.254455,-3.116883,1.975546,-0.014241,0.626457
996,1.202217,-2.959752,0.105237,1.217145,-1.883187
997,-3.872625,-5.843568,1.454419,-4.016685,3.704008
998,-1.472100,-2.211080,1.014440,-2.859582,0.584339


In [144]:
# Re-estimate the same model on the sample without the dropped observation
# and print both summaries (reduced sample first) for comparison
model2 = smf.ols(formula='y ~ x1 + x2 + x3', data=d).fit()
for fitted in (model2, model1):
    print(fitted.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.915
Model:                            OLS   Adj. R-squared:                  0.914
Method:                 Least Squares   F-statistic:                     3552.
Date:                Sun, 13 Nov 2022   Prob (F-statistic):               0.00
Time:                        19:22:13   Log-Likelihood:                -1801.6
No. Observations:                 999   AIC:                             3611.
Df Residuals:                     995   BIC:                             3631.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.5388      0.055      9.882      0.0

In [None]:
# Now try increasing the sample size n to 1000 observations and re-estimating the model.