In [12]:
#Tables and matrices
import numpy as np
import pandas as pd

#Stats
import scipy.stats as st
from scipy.optimize import fmin
from scipy import integrate
from scipy.stats.mstats import mquantiles
import statsmodels.formula.api as smf
import statsmodels.api as sm 
from statsmodels.stats.diagnostic import het_breuschpagan #Heteroskedasticity test
from statsmodels.stats.diagnostic import het_white #Heteroskedasticity test

#Probabilistic programs
#!pip install numpy mkl #only if you are on an Intel machine, i.e. not on Mac M-series (Apple Silicon) chips
#!pip install pymc
#!pip install pytensor
import pymc as pm
import pytensor.tensor as pt
#import aesara.tensor as at
print('Running on PyMC v{}'.format(pm.__version__))


#Graphs 
#IMPORTANT: properly install ipywidgets and nodejs for interactive graphs
#If you are in jupyterlab, activate the widget extension (it should be in the latest versions)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.gridspec import GridSpec
from matplotlib import animation, rc
from IPython.display import display, HTML, Markdown
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, HBox, VBox, Layout
from mpl_toolkits.mplot3d import axes3d
import arviz as az

Running on PyMC v5.10.3


In [18]:
def f(k, robust=False):
    """Simulate heteroskedastic data, fit an OLS line, and show a plot plus diagnostics.

    The function bundles the regression and the plotting commands. It reads the
    notebook-level globals ``n``, ``X_data`` and ``Y_raw``, which must be
    defined before the function is called.

    Parameters
    ----------
    k : float
        Heteroskedasticity parameter. The noise standard deviation is
        ``X_data**k``, so larger values produce more heteroskedasticity.
    robust : bool, default False
        When true, the model is fitted with ``cov_type='HC3'``
        (heteroskedasticity-robust standard errors); otherwise the default
        covariance estimator is used.

    Returns
    -------
    None
        The figure is shown and the regression summary and test results are
        printed.
    """
    # Named `fig` (not `f`) so the figure handle does not shadow this function.
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # The noise can be a complicated function of the level of the independent
    # variable X; here its standard deviation grows as X**k.
    noise = np.random.normal(loc=0, scale=X_data**k, size=n)
    sample = pd.DataFrame({"X": X_data, "Y": Y_raw + noise})

    # Regression
    model = smf.ols(formula='Y ~ X', data=sample)
    if robust:
        results = model.fit(cov_type='HC3')  # standard errors robust to heteroskedasticity
    else:
        results = model.fit()
    residuals = results.resid
    # In-sample predictions: the fitted values of the estimated model.
    sample['predictions'] = results.fittedvalues
    white_test = het_white(residuals, model.exog)
    bp_test = het_breuschpagan(residuals, model.exog)

    # Plots: scatter with the fitted line on the left, text panel on the right.
    sns.scatterplot(x='X', y='Y', data=sample, color='black', ax=axes[0])
    axes[0].set_title("Heteroskedasticity \n Variance in y depends x level")
    axes[0].plot(sample['X'], sample['predictions'], color='red')
    axes[1].axis('off')
    # `.iloc` makes the positional access explicit: plain `params[0]` on a
    # label-indexed Series is deprecated in recent pandas versions.
    const = str(round(results.params.iloc[0], 3))
    slope = str(round(results.params.iloc[1], 3))
    axes[1].text(0, 0.5, 'Truth: Intercept: 1.370; Coef. X: 2.097 \nRegres: Intercept: ' + const + '; Coef. X: ' + slope,
        color='black', fontsize=15)

    # Output
    plt.show()
    print(results.summary())
    print("Durbin Watson measures autocorrelation of residuals. Close to 2 is good, zero autocorrelations")
    print("Jarque Bera is a test of normality of the residuals for large samples (n>2000). Large values and p<0.05 not normal")
    # Order of the values in the test result tuples; only the F statistic and
    # its p-value (positions 2 and 3) are reported below.
    labels = ['LM-Statistic','LM-Test p-value', 'F-Statistic', 'F-Test p-value']
    print("Breusch Pagan test (p<0.05 there is Heteroskedasticity): ", dict(zip(labels[2:4], bp_test[2:4]))) # p<0.05 indicates heteroskedasticity
    print("White test (p<0.05 there is Heteroskedasticity): ", dict(zip(labels[2:4], white_test[2:4]))) # p<0.05 indicates heteroskedasticity



# Heteroskedasticity

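It does not bias the OLS coefficient estimates themselves, but it makes them less precise (the estimates from a single sample can land far from the truth) and it biases the usual standard errors, as shown below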

In [19]:
# Simulation settings; f() reads n, X_data and Y_raw as globals.
n = 2000
X_data = np.linspace(start=1, stop=100, num=n)
# The underlying truth: most of the time it is unknown, but here we know it
# for pedagogical reasons.
Y_raw = 2.097 * X_data + 1.37

# Interactive demo: larger k means more heteroskedasticity.
interact(
    f,
    k=widgets.FloatSlider(value=1.25, min=0, max=3, step=.2),
    robust=widgets.Checkbox(description='Std. err. robustos', value=False),
);


interactive(children=(FloatSlider(value=1.25, description='k', max=3.0, step=0.2), Checkbox(value=False, descr…