In [1]:
#Tables and matrices
import numpy as np
import pandas as pd

#Stats
import scipy.stats as st
from scipy.optimize import fmin
from scipy import integrate
from scipy.stats.mstats import mquantiles
import statsmodels.formula.api as smf
import statsmodels.api as sm 
from statsmodels.stats.diagnostic import het_breuschpagan #Heteroskedasticity test
from statsmodels.stats.diagnostic import het_white #Heteroskedasticity test

#Probabilistic programs
#!pip install numpy mkl #only on Intel machines (skip this on Apple M-series chips)
#!pip install pymc
#!pip install pytensor
import pymc as pm
import pytensor.tensor as pt
#import aesara.tensor as at
#Report which PyMC release this notebook is running against
print(f'Running on PyMC v{pm.__version__}')


#Graphs 
#IMPORTANT: properly install ipywidgets and nodejs for interactive graphs
#If you are in jupyterlab, activate the widget extension (it should be in the latest versions)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.gridspec import GridSpec
from matplotlib import animation, rc
from IPython.display import display, HTML, Markdown
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, HBox, VBox, Layout
from mpl_toolkits.mplot3d import axes3d
import arviz as az



Running on PyMC v5.10.3


In [2]:
def f(k, robust=False):
    """Simulate heteroskedastic data, fit an OLS line, and display diagnostics.

    The function reads the notebook-level globals ``X_data``, ``Y_raw`` and
    ``n``; they must be defined before the function is called.

    Parameters
    ----------
    k : float
        Heteroskedasticity parameter. The noise added to ``Y_raw`` is drawn
        from a normal distribution with standard deviation ``X_data**k``, so a
        larger ``k`` means more heteroskedasticity.
    robust : bool, default False
        If true, the model is fitted with ``cov_type='HC3'``
        (heteroskedasticity-robust standard errors); otherwise the default
        covariance is used.

    Returns
    -------
    None
        Everything is shown as a side effect: a scatter plot with the fitted
        line, the regression summary, the Breusch-Pagan and White test
        results, and the ``plot_regress_exog`` diagnostic figure for ``X``.
    """
    # Left panel: data and fitted line. Right panel: text only.
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # Noise here depends on the level of the variable X
    Y_noise = np.random.normal(loc=0, scale=X_data**k, size=n)
    Y = pd.DataFrame({"X": X_data, "Y": Y_raw + Y_noise})

    # Regression
    model = smf.ols(formula='Y ~ X', data=Y)
    if robust:
        results = model.fit(cov_type='HC3')  # robust standard errors (to heteroskedasticity)
    else:
        results = model.fit()
    residuals = results.resid
    # In-sample predictions are the fitted values of the model.
    Y['predictions'] = results.fittedvalues
    white_test = het_white(residuals, model.exog)
    bp_test = het_breuschpagan(residuals, model.exog)

    # Plots
    sns.scatterplot(x='X', y='Y', data=Y, color='black', ax=axes[0])
    axes[0].set_title("Heteroskedasticity \n Variance in y depends on x level")
    axes[0].plot(Y['X'], Y['predictions'], color='red')
    axes[1].axis('off')
    # Look the coefficients up by label: positional [0]/[1] access on a
    # string-labelled Series is deprecated in recent pandas versions.
    const = str(round(results.params['Intercept'], 3))
    slope = str(round(results.params['X'], 3))
    axes[1].text(0, 0.5, 'Truth: Intercept: 1.370; Coef. X: 2.097 \nRegres: Intercept: ' + const + '; Coef. X: ' + slope,
        color='black', fontsize=15)

    # Output
    plt.show()
    print(results.summary())
    print("Durbin Watson measures autocorrelation of residuals. Close to 2 is good, zero autocorrelations")
    print("Jarque Bera is a test of normality of the residuals for large samples (n>2000). Large values and p<0.05 not normal")
    # Both tests return (LM statistic, LM p-value, F statistic, F p-value);
    # only the F statistic and its p-value are reported.
    labels = ['LM-Statistic', 'LM-Test p-value', 'F-Statistic', 'F-Test p-value']
    print("Breusch Pagan test (p<0.05 there is Heteroskedasticity): ", dict(zip(labels[2:4], bp_test[2:4])))  # p<0.05: heteroskedasticity is present
    print("White test (p<0.05 there is Heteroskedasticity): ", dict(zip(labels[2:4], white_test[2:4])))  # p<0.05: heteroskedasticity is present

    # Hand the new figure to plot_regress_exog so it draws on it; calling
    # plt.figure() and then letting the helper create its own figure would
    # leave an empty extra figure in the output.
    diag_fig = plt.figure()
    sm.graphics.plot_regress_exog(results, "X", fig=diag_fig)
    diag_fig.tight_layout(pad=1.0)
    


# Residuals and heteroskedasticity visualization.

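Heteroskedasticity can distort the regression results, as shown below: the coefficient estimates become less precise and the conventional standard errors are no longer reliable. Change `k` and toggle the robust standard errors option. Note how the normality, autocorrelation, and heteroskedasticity tests change, and how the residual plots change as well.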

In [3]:
#Simulation inputs; f() reads n, X_data and Y_raw as notebook-level globals
n = 2000 #Number of simulated observations
X_data = np.linspace(start=1, stop=100, num=n)
#The underlying truth: most of the time it is unknown; here, for pedagogical reasons, we do know it
Y_raw = 2.097 * X_data + 1.37

#Interactive controls: larger k means more heteroskedasticity
interact(
    f,
    k=widgets.FloatSlider(value=1.25, min=0, max=3, step=.2),
    robust=widgets.Checkbox(description='Robust std. err.', value=False),
);


interactive(children=(FloatSlider(value=1.25, description='k', max=3.0, step=0.2), Checkbox(value=False, descr…

# Running regressions in Python (and R to some extent)

In [18]:
#Load data
#SOURCE: Harvard Business Publishing (SMU557)
#"Amazon: Facing low customer satisfaction in Singapore"
#By: Marcus Ang Teck and Chen Yongchang, Singapore Management University
#Both CSV files are read from the current working directory.
amazon = pd.read_csv("Amazon_Satisfaction_Singapore.csv")
amazon_var_dictionary = pd.read_csv("Amazon_Satisfaction_Singapore_Var_Dictionary.csv") #Description of variables

#Filter for only Amazon (rows whose company_v equals 'AMAZON')
amazon_r = amazon.query("company_v=='AMAZON'")

#Rename columns for something more informative
#One descriptive name per raw column, in the same order as the raw columns.
#NOTE(review): this cell only builds the list; presumably it is assigned to
#amazon_r.columns further down - confirm, since that step is not shown here.
cols = ["ID", #Consumer
        "Company", #Seller name
        "Prod_Quality", #Overall Product Quality: 1 Very low, 10 Very high
        "Service_Quality", #Overall Service Quality 
        "Price_Value", #Price given quality: 1 Very poor price, 10 Very good price
        "Satisfaction", #Customer Satisfaction: 1 Very dissatisfied, 10 Very satisfied
        "Repurchase", #Likelihood to Repurchase: 1 Very unlikely, 10 Very likely
        "Recommend", #Likelihood to recommend
        "Recommend_Site", #Recommended ecommerce site to family and friends in the last 3 months: 0 No, 1 Yes
        "Satisfaction_With", #Overall experiences SATISFACTION with (INSERT NAME): 1 Very dissatisfied, 10 Very satisfied
        "Variety_Prods", #Variety of products that interests me
        "Variety_Prods_Needs", #Variety of products that meet my needs
        "Ease_Navigation", #Ease of navigating the website or app
        "Ease_Prods_Needs", #Ease of finding the products you need
        "Availability_Prods", #Availability of products
        "Discounts", #Attractiveness of promotions and discounts offered
        "Info_Prods", #Sufficiency of Product information
        "Ease_Comparing", #Ease of comparing products
        "Ease_Special_Needs", #Ease of indicating special requests
        "Ease_Cart", #Ease of managing your shopping cart
        "Ease_Check_Out", #Check-out and payment process
        "Security", #Security of website
        "Delivery", #Clarity and usefulness of information on your delivery methods and fees
        "Delivery_Range", #Range of delivery options
        "Order_Tracking", #Ease of tracking your order
        "Delivery_Time", #Time taken to receive the product
        "Fidelity_Prods", #Products you received were as described on the website
        "Feedback", #Availability of feedback channels
        "Return_Policies", #Return and exchange policies
        "Channel_Pref", #Method used most frequently to shop at (INSERT NAME): 1 Mobile App, 2 Website (Using PC), 3 Website (Using a mobile device)
        "Pay_Pref", #Method of payment preferred most for shopping online at (INSERT NAME): 1 Credit Cards, 2 PayPal, 3 E-nets, 4 AXS machines, 5 Cash upon delivery, 6 Others
        "Pay_Pref_Other", #Method of payment preferred most for shopping online at (INSERT NAME): Other, specify
        "Read_Reviews", #Read the reviews during shopping experience: 1 Yes, 2 No
        "Interacted_Seller", #Interacted directly with the seller during shopping experience
        "Satisfaction_Reviews", #Satisfaction with the product reviews: 1 Very dissatisfied, 10 Very satisfied
        "Satisfaction_Channels_Seller", #Satisfaction with the channels available to communicate with the seller(s)
        "No_Purchases_6m", #No. of times purchased in the last 6 months
        "Amount_Spent_3m", #Average amount spent per visit last 3 months
        "Shop_Behavior", #General Shopping Behavior: usually make most purchases on physical store or an online store: 1 Physical store, 2 Online store, 3 Equal for both physical store & online store
        "Age", #Age
        "Race", #Race of the respondent: 1 Chinese, 2 Malay, 3 Indian, 4 Eurasian, 5 Others
        "Employment", #Employment status of the respondent: 1 Working full-time, 2 Working part-time, 3 Homemaker, 4 Retired, 5 Student, 6 Unemployed
        "Income_Personal", #Monthly Personal Income: 1 Under SGD 2K; 2 SGD 2K - Under SGD 3K; 3 SGD 3K - Under SGD 4K; 4 SGD 4K - Under SGD 6K; 5 SGD 6K - Under SGD 8K; 6 SGD 8K - Under SGD 10K; 7 SGD 10K - Under SGD 15K; 8 SGD 15K - Under SGD 20K; 9 SGD 20K or over
        "Income_House", #Monthly Household Income
        "Ed", #Education Qualification: 1 None; 2 PSLE & below; 3 GCE N Level; 4 GCE O Level; 5 GCE A Level / Post-Secondary; 6 ITE / Vocational Institute; 7 Polytechnic Diploma / Professional Cert; 8 University Degree; 9 University Post-Graduate Degree
        "No_Children", #No. of children dependent for financial support
        "Marital", #Marital Status: 1 Single; 2 Married; 3 Divorced; 4 Widowed; 5 Separated; 6 Domestic Partnership
        "Gender", #Gender: 1 Male; 2 Female
        "House_Type", #House type according to respondent: 1 HDB 1-2 room; 2 HDB 3 room; 3 HDB 4 room; 4 HDB 5 room / Executive Flat; 5 Condo / Ptd Apartment; 6 Landed Property
        "Date_Interview", #Date of Interview
       ]


In [16]:
#Show the column labels of the Amazon-only frame
#NOTE(review): this cell carries execution count 16 while the cell that defines
#amazon_r carries 18; restart the kernel and run all cells to confirm the order works.
amazon_r.columns

Index(['uid', 'company_v', 'poverq', 'soverq', 'pq', 'satis', 'repur',
       'recomm', 'Q19', 'VN_1009_Q20A', 'VN_1009_TP01', 'VN_1009_TP02',
       'VN_1009_TP03', 'VN_1009_TP04', 'VN_1009_TP05', 'VN_1009_TP06',
       'VN_1009_TP07', 'VN_1009_TP08', 'VN_1009_TP09', 'VN_1009_TP10',
       'VN_1009_TP11', 'VN_1009_TP12', 'VN_1009_TP13', 'VN_1009_TP14',
       'VN_1009_TP15', 'VN_1009_TP16', 'VN_1009_TP17', 'VN_1009_TP18',
       'VN_1009_TP19', 'VN_1009_TP20', 'VN_1009_TP21', 'VN_1009_TP21_6specify',
       'VN_1009_TP24_1', 'VN_1009_TP24_2', 'VN_1009_TP22', 'VN_1009_TP23',
       'Q9C_P', 'Q9D', 'VN_1009_TP25A', 'age', 'race', 'work', 'pincome',
       'income', 'educat', 'childsupp', 'marital', 'gender', 'house', 'DOI'],
      dtype='object')