In [1]:
# Мультиколлинеарность

import pandas as pd # работа с таблицами 
import numpy as np # математические функции и матрицы
import matplotlib.pyplot as plt # графики
import seaborn as sns # еще более классные графики
import statsmodels.api as sm # стандартные регрессионные модели
import statsmodels.formula.api as smf # аналогичные модели с синтаксисом в стиле R
import statsmodels.graphics.gofplots as gf # визуализация моделей
import statsmodels.discrete.discrete_model # дискретные модели
from statsmodels.stats.outliers_influence import summary_table # работа с выбросами
from scipy.stats import shapiro # тест Шапиро – Уилка 
import math

In [2]:
# Optional: tune the look of the plots (style sheet, font and label sizes).
# Style gallery: https://tonysyu.github.io/raw_content/matplotlib-style-gallery/gallery.html

# Matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# and later releases dropped the old name, so use whichever one is available.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
plt.rc('font', size=14)
plt.rc('figure', titlesize=15)
plt.rc('axes', labelsize=15, titlesize=15)

In [3]:
import random

# Seed both generators. All draws in this notebook come from NumPy's global
# RNG (np.random.normal), which random.seed() does not affect; without
# np.random.seed() the simulated sample changes on every run.
random.seed(100)
np.random.seed(100)

In [4]:
# Set the sample size (number of simulated observations)

n = 100

In [5]:
# Simulate the regressors x2, x3 and the random error eps:
# n normal draws each, given as (mean, standard deviation).

data = pd.DataFrame({
    'x2': np.random.normal(1, 3, n),
    'x3': np.random.normal(-1, 2, n),
    'eps': np.random.normal(0, 1.5, n),
})
print(data)

          x2        x3       eps
0   5.746017 -2.685823 -1.556536
1   2.381429  0.363473  0.090426
2   0.510905 -1.092830  0.219759
3   2.251326  0.695102 -1.719102
4  -2.066637 -3.082945  1.252571
..       ...       ...       ...
95 -3.104797 -0.511605  1.419869
96  7.056315 -0.250174 -0.880377
97  4.020893 -1.427391  1.106940
98  0.878109  0.467826 -1.748153
99  2.478585 -0.245780 -1.296118

[100 rows x 3 columns]


In [6]:
# Build x1 from x2: 0.7 * x2 plus a tiny normal disturbance (sd 0.01),
# which makes x1 and x2 almost perfectly collinear.

data['x1'] = data['x2'] * 0.7 + np.random.normal(loc=0, scale=0.01, size=n)
data

Unnamed: 0,x2,x3,eps,x1
0,5.746017,-2.685823,-1.556536,4.025280
1,2.381429,0.363473,0.090426,1.674630
2,0.510905,-1.092830,0.219759,0.347695
3,2.251326,0.695102,-1.719102,1.579050
4,-2.066637,-3.082945,1.252571,-1.457355
...,...,...,...,...
95,-3.104797,-0.511605,1.419869,-2.189858
96,7.056315,-0.250174,-0.880377,4.944334
97,4.020893,-1.427391,1.106940,2.826404
98,0.878109,0.467826,-1.748153,0.608771


In [7]:
# Generate the dependent variable y from the true model
# y = 0.5 + 1.2*x1 - 2.3*x2 + 0.4*x3 + eps

data['y'] = (
    0.5
    + 1.2 * data['x1']
    - 2.3 * data['x2']
    + 0.4 * data['x3']
    + data['eps']
)
data

Unnamed: 0,x2,x3,eps,x1,y
0,5.746017,-2.685823,-1.556536,4.025280,-10.516370
1,2.381429,0.363473,0.090426,1.674630,-2.731914
2,0.510905,-1.092830,0.219759,0.347695,-0.475220
3,2.251326,0.695102,-1.719102,1.579050,-4.224251
4,-2.066637,-3.082945,1.252571,-1.457355,3.523832
...,...,...,...,...,...
95,-3.104797,-0.511605,1.419869,-2.189858,6.228431
96,7.056315,-0.250174,-0.880377,4.944334,-10.776769
97,4.020893,-1.427391,1.106940,2.826404,-4.820386
98,0.878109,0.467826,-1.748153,0.608771,-2.350148


In [8]:
# Estimate y = beta0 + beta1*x1 + beta2*x2 + beta3*x3 + eps by OLS
# and show the regression summary as the cell output.
model1 = smf.ols(formula='y ~ x1 + x2 + x3', data=data).fit()
model1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.884
Model:,OLS,Adj. R-squared:,0.881
Method:,Least Squares,F-statistic:,244.8
Date:,"Mon, 14 Nov 2022",Prob (F-statistic):,7.79e-45
Time:,13:01:34,Log-Likelihood:,-187.15
No. Observations:,100,AIC:,382.3
Df Residuals:,96,BIC:,392.7
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5319,0.198,2.687,0.008,0.139,0.925
x1,-10.8156,14.607,-0.740,0.461,-39.809,18.178
x2,6.0815,10.232,0.594,0.554,-14.230,26.393
x3,0.3325,0.080,4.159,0.000,0.174,0.491

0,1,2,3
Omnibus:,0.118,Durbin-Watson:,1.926
Prob(Omnibus):,0.943,Jarque-Bera (JB):,0.079
Skew:,0.064,Prob(JB):,0.961
Kurtosis:,2.948,Cond. No.,437.0


In [9]:
# Compute the correlation matrix of the regressors x1, x2 and x3
data[["x1", "x2", "x3"]].corr()

Unnamed: 0,x1,x2,x3
x1,1.0,0.999986,0.134873
x2,0.999986,1.0,0.134924
x3,0.134873,0.134924,1.0


In [10]:
# Now compute the variance inflation factors (VIF).
# Rule of thumb: VIF(x) > 10 suggests a multicollinearity problem.

# VIF(x1) = 1 / (1 - R^2), with R^2 taken from the auxiliary regression
# of x1 on a constant and the remaining regressors.
aux_x1 = smf.ols("x1 ~ 1 + x2 + x3", data=data).fit()
print(1 / (1 - aux_x1.rsquared))

35003.40430031932


In [11]:
# VIF(x2) = 1 / (1 - R^2), with R^2 taken from the auxiliary regression
# of x2 on a constant, x1 and x3.
aux_x2 = smf.ols("x2 ~ 1 + x1 + x3", data=data).fit()
print(1 / (1 - aux_x2.rsquared))

35003.8992802682


In [12]:
# VIF(x3) = 1 / (1 - R^2), with R^2 taken from the auxiliary regression
# of x3 on a constant, x1 and x2.
aux_x3 = smf.ols("x3 ~ 1 + x1 + x2", data=data).fit()
print(1 / (1 - aux_x3.rsquared))

1.0186311725619002


In [13]:
# Pull the regressor matrix and the matching column names out of model1;
# both are reused by the VIF table further down.
variables, var_names = model1.model.exog, model1.model.exog_names
print(variables)
print(var_names)

[[ 1.          4.02527977  5.74601738 -2.68582282]
 [ 1.          1.67463009  2.38142852  0.36347288]
 [ 1.          0.34769493  0.51090486 -1.09283043]
 [ 1.          1.57904998  2.25132594  0.69510164]
 [ 1.         -1.45735531 -2.06663713 -3.08294529]
 [ 1.         -0.21092343 -0.27543063 -4.54025282]
 [ 1.          1.06585962  1.53043108 -0.83161305]
 [ 1.          3.98942014  5.66785384 -0.97506249]
 [ 1.          0.89231701  1.27577334  0.1935178 ]
 [ 1.         -0.24934801 -0.36504102 -2.2137497 ]
 [ 1.          2.2487727   3.21759265 -0.78721021]
 [ 1.          4.48215421  6.38748895 -2.15428819]
 [ 1.         -2.18659486 -3.09939298  1.90448749]
 [ 1.          0.66760884  0.9714999  -0.88708916]
 [ 1.          1.27828408  1.8361332  -3.59188421]
 [ 1.         -2.13602749 -3.05603758 -1.79005152]
 [ 1.          1.50651427  2.14316696 -4.2000046 ]
 [ 1.         -0.48743591 -0.6829536   1.98418829]
 [ 1.          0.36541068  0.52405361 -2.86091627]
 [ 1.          2.57227417  3.67

In [15]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Compute the VIFs with statsmodels' variance_inflation_factor().
# Column 0 of `variables` is skipped (range starts at 1), and so is the
# first entry of `var_names`, so names and values stay aligned.
vif = pd.DataFrame({
    "Variable": var_names[1:],
    "VIF Factor": [
        variance_inflation_factor(variables, col_idx)
        for col_idx in range(1, variables.shape[1])
    ],
})
print(vif)

  Variable    VIF Factor
0       x1  35003.404300
1       x2  35003.899280
2       x3      1.018631


In [16]:
# Check how sensitive the estimated regression coefficients are
# to removing a single observation: drop the row with index label 98.

d = data.drop(labels=[98], axis=0)

In [17]:
# Display the reduced sample `d`
d

Unnamed: 0,x2,x3,eps,x1,y
0,5.746017,-2.685823,-1.556536,4.025280,-10.516370
1,2.381429,0.363473,0.090426,1.674630,-2.731914
2,0.510905,-1.092830,0.219759,0.347695,-0.475220
3,2.251326,0.695102,-1.719102,1.579050,-4.224251
4,-2.066637,-3.082945,1.252571,-1.457355,3.523832
...,...,...,...,...,...
94,-1.161313,-1.728308,0.067307,-0.814966,1.569044
95,-3.104797,-0.511605,1.419869,-2.189858,6.228431
96,7.056315,-0.250174,-0.880377,4.944334,-10.776769
97,4.020893,-1.427391,1.106940,2.826404,-4.820386


In [18]:
# Re-estimate the same specification on the sample without the row labelled 98
# and print both summaries (reduced sample first) to compare the estimates.

model2 = smf.ols(formula='y ~ x1 + x2 + x3', data=d).fit()
for fitted_model in (model2, model1):
    print(fitted_model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.886
Model:                            OLS   Adj. R-squared:                  0.882
Method:                 Least Squares   F-statistic:                     245.9
Date:                Mon, 14 Nov 2022   Prob (F-statistic):           1.22e-44
Time:                        13:02:43   Log-Likelihood:                -185.11
No. Observations:                  99   AIC:                             378.2
Df Residuals:                      95   BIC:                             388.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.5577      0.199      2.803      0.0

In [None]:
# А теперь попробуйте увеличить объем выборки n до 100000 наблюдений и переоценить модель 1

In [26]:
import random

# Seed both generators. The draws below come from NumPy's global RNG, which
# random.seed() alone does not touch; without np.random.seed() the large
# sample would differ on every run.
random.seed(100)
np.random.seed(100)

# Same data-generating process as for the small sample, but with 100000 observations
n_new = 100000
data_new = pd.DataFrame({
    'x2': np.random.normal(1, 3, n_new),
    'x3': np.random.normal(-1, 2, n_new),
    'eps': np.random.normal(0, 1.5, n_new),
})
# x1 is 0.7 * x2 plus a tiny disturbance, i.e. almost collinear with x2
data_new['x1'] = 0.7*data_new['x2'] + np.random.normal(0, 0.01, n_new)
# True model: y = 0.5 + 1.2*x1 - 2.3*x2 + 0.4*x3 + eps
data_new['y'] = 0.5 + 1.2*data_new['x1'] - 2.3*data_new['x2'] + 0.4*data_new['x3'] + data_new['eps']
print(data_new)

             x2        x3       eps        x1          y
0      5.703355 -4.063013  1.195610  3.966453  -8.287569
1      2.181769  0.896560 -0.045628  1.522619  -2.377931
2      2.145997  1.515044  0.984606  1.502064  -1.042694
3     -4.127956 -1.291541 -1.703240 -2.879609   4.318911
4     -1.347414 -1.144425 -2.215380 -0.966407  -0.233786
...         ...       ...       ...       ...        ...
99995  3.550310  0.049225  0.753926  2.471502  -3.926296
99996  7.662156 -1.710741  0.673152  5.349437 -10.714779
99997  0.834432 -4.473755 -0.898698  0.573142  -3.419624
99998  2.460077 -2.053164  1.002517  1.713087  -2.921222
99999  7.441248  0.628952 -0.241124  5.210057 -10.352345

[100000 rows x 5 columns]


In [27]:
# Fit the same three-regressor OLS specification on the large sample
# and print its summary.
model1_new = smf.ols(formula='y ~ x1 + x2 + x3', data=data_new).fit()
print(model1_new.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.898
Model:                            OLS   Adj. R-squared:                  0.898
Method:                 Least Squares   F-statistic:                 2.927e+05
Date:                Mon, 14 Nov 2022   Prob (F-statistic):               0.00
Time:                        13:06:43   Log-Likelihood:            -1.8269e+05
No. Observations:              100000   AIC:                         3.654e+05
Df Residuals:                   99996   BIC:                         3.654e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.4991      0.006     89.939      0.0