In [1]:
### Python 
dir()                       # returns a list of objects in the namespace 
A = np.array([3, 5, 11])    
dir(A)                      # returns object 'A' namespace, including attributes, and any method associated with object

NameError: name 'np' is not defined

In [None]:
import numpy as np
 

In [None]:
### Stats Models 
import statsmodels.api as sm    # imports the stats models 
from statsmodels.stats.outliers_influence \
import variance_inflation_factor as VIF     # import only a few items, keep namespace clean 
from statsmodels.stats.anova import anova_lm 



In [None]:
### Building models using modelspec
import pandas as pd 
from ISLP import load_data      # function for loading data 
from ISLP.models import (ModelSpec as MS, 
                         summarize, 
                         poly)  # use ModelSpec to construct design matrices   
Boston  = load_data("Boston")
Boston?                         # find out more about the Boston dataset
Boston.columns                  # returns the column names 
X = pd.DataFrame({'intercept': np.ones(Boston.shape[0]), 
                  'lstat': Boston['lstat']})    # create model matrix with single predictor
y = Boston['medv']          # create response vector 
model = sm.OLS(y, X)        # specify OLS model using stats model 
results = model.fit()       # fit model using least squares? stats model 
summarize(results)          # return coef, std err, t, |P| . T, using ISLP function       
design = MS(['lstat'])      # Use model spec to create design matrix
design = design.fit(Boston) # checks if 'lstat' specified in design exists in dataset
X = design.transform(Boston)# construct model matrix with 2 cols: intercept and lstat
results.summary()           # returns exhaustive summary of the fit 
results.params              # returns the fitted coefficients 
new_df = pd.DataFrame({'lstat': [5, 10, 15]}) # new dataframe for predictions 
newX = design.transform(new_df) # new model matrix to make predictions 
new_predictions = results.get_prediction(newX)  # compute predictions using new model 
new_predictions.predicted_mean                  # return (3,) predicted mean 
new_predictions.conf_int(alpha=0.05)            # return (3,3) 95% confidence intervals for predicted values 
new_predictions.conf_int(obs=True, alpha=0.05)  # return (3,3) prediction intervals setting obs=True
infl = results.get_influence()  # returns influence matrix
infl.hat_matrix_diag            # returns leverage stats
np.argmax(infl.hat_matrix_diag) # identifies index of largest element/leverage stat 
terms = Boston.columns.drop('medv') # fit on all predictos and drop medv
X = MS(terms).fit_transform(Boston)
model = sm.OLS(y, X)            # multiple OLS model 
results = model.fit()
summarize(results)
results.rsquared()              # returns R^2 
np.sqrt(results.scale)          # returns RSE
vals = [VIF(X, i) 
         for i in range(1, X.shape[1])] 
vif = pd.DataFrame({'vif':vals}, 
                   index = X.columns[1:])   # calculate vif, assessing collinearity in model 
X = MS(['lstat', 
        'age',
        ('lstat', 'age')]).fit_transform(Boston)    # include interaction term predictor in modelspec

model2 = sm.OLS(y, X)
results2 = model2.fit()
summarize(results2)

X = MS([poly('lstat', degree=2)], 'age').fit_transform(Boston) # include polynomial function 
model3 = sm.OLS(y, X)
results3 = model3.fit()
summarize(results3)

anova_lm(results, results3)    # use anova_lm to quantify extent to which fit is better 
Carseats = load_data('Carseats')
Carseats.columns    # ModelSpec() will automatically generate dummy variable for categorical feature 
# Look at example for Carseats. 

SyntaxError: invalid syntax (2273610293.py, line 7)

In [None]:
### Matplotlib 
# ax - axis object for existing plot 
# b - intercept 
# m - slope of desired line 
# *args - allows for any num of args to ax.plot
# **kwargs - allows for any num of named args (like linewidth=3)
from matplotlib.pyplot import subplots 
def abline(ax, b, m, *args, **kwargs): 
    "Adds a line with slope m and intercept b to ax" 
    xlim = ax.get_xlim()
    ylim = [m * xlim[0] + b, m * xlim[1] + b]
    ax.plot(xlim, ylim, *args, **kwargs)

ax = Boston.plot.scatter('lstat', 'medv')
abline(ax, 
       results.params[0], 
       results.params[1], 
       'r--', 
       linewidth=3)
ax = subplots(figsize=(8, 8))[1]
ax.scatter(results.fittedvalues, results.resid) #fitted vals vs. residuls of fit
ax.axhline(0, c='k', ls='--')   # adds horizonatl line, color black, linestyle dashed

