In [7]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
from scipy import stats
from statsmodels.formula.api import ols

## Heights

We use the height data collected from the class.

In [10]:
# Read the class heights CSV and relabel its five columns with short names
# that can be used directly as terms in the regression formulas.
data = pd.read_csv(
    "https://raw.githubusercontent.com/stanford-mse-125-2025/mse-125-2025-public/refs/heads/main/data/class_heights.csv"
).set_axis(["Time", "Gender", "Father", "Mother", "Child"], axis="columns")
data

Unnamed: 0,Time,Gender,Father,Mother,Child
0,2025/05/08 3:30:46 PM MDT,Male,73.00,70.0,74.00
1,2025/05/08 3:30:47 PM MDT,Male,67.00,60.0,65.00
2,2025/05/08 3:30:47 PM MDT,Female,76.00,72.0,72.00
3,2025/05/08 3:30:50 PM MDT,Female,69.00,65.0,62.00
4,2025/05/08 3:30:51 PM MDT,Male,70.00,70.0,73.00
...,...,...,...,...,...
57,2025/05/08 3:31:39 PM MDT,Male,75.00,65.0,73.00
58,2025/05/08 3:31:41 PM MDT,Female,69.00,63.0,65.00
59,2025/05/08 3:31:45 PM MDT,Female,76.70,69.6,68.00
60,2025/05/08 3:31:51 PM MDT,Female,68.11,61.0,64.96


In [11]:
# Simple linear regression: Child as the response, Father as the only predictor.
model = (
    ols(formula="Child ~ Father", data=data)
    .fit()
)
model.summary()

0,1,2,3
Dep. Variable:,Child,R-squared:,0.204
Model:,OLS,Adj. R-squared:,0.191
Method:,Least Squares,F-statistic:,15.36
Date:,"Mon, 12 May 2025",Prob (F-statistic):,0.00023
Time:,14:43:27,Log-Likelihood:,-180.78
No. Observations:,62,AIC:,365.6
Df Residuals:,60,BIC:,369.8
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,18.5774,12.524,1.483,0.143,-6.474,43.629
Father,0.6968,0.178,3.920,0.000,0.341,1.052

0,1,2,3
Omnibus:,1.945,Durbin-Watson:,2.043
Prob(Omnibus):,0.378,Jarque-Bera (JB):,1.379
Skew:,-0.12,Prob(JB):,0.502
Kurtosis:,2.31,Cond. No.,1530.0


In [12]:
# Multiple regression: add Mother as a second predictor alongside Father.
model = (
    ols(formula="Child ~ Father + Mother", data=data)
    .fit()
)
model.summary()

0,1,2,3
Dep. Variable:,Child,R-squared:,0.415
Model:,OLS,Adj. R-squared:,0.395
Method:,Least Squares,F-statistic:,20.89
Date:,"Mon, 12 May 2025",Prob (F-statistic):,1.38e-07
Time:,14:43:45,Log-Likelihood:,-171.26
No. Observations:,62,AIC:,348.5
Df Residuals:,59,BIC:,354.9
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0439,11.560,-0.004,0.997,-23.175,23.087
Father,0.3102,0.175,1.771,0.082,-0.040,0.661
Mother,0.7162,0.155,4.608,0.000,0.405,1.027

0,1,2,3
Omnibus:,1.385,Durbin-Watson:,2.15
Prob(Omnibus):,0.5,Jarque-Bera (JB):,1.408
Skew:,0.295,Prob(JB):,0.495
Kurtosis:,2.555,Cond. No.,2210.0


In [13]:
# Get the in-sample MAE of both models
def _fit_and_mae(formula, frame, response="Child"):
    """Fit an OLS model and report its mean absolute error on the fitting data.

    Parameters
    ----------
    formula : str
        Model formula passed to ``ols`` (e.g. ``"Child ~ Father"``).
    frame : pandas.DataFrame
        Data used both to fit the model and to generate predictions.
    response : str, default "Child"
        Column of ``frame`` holding the observed response values.

    Returns
    -------
    tuple
        ``(fitted_model, mae)`` where ``mae`` is the mean of
        ``|frame[response] - fitted_model.predict(frame)|``.
    """
    fitted = ols(formula, data=frame).fit()
    # Predictions are made on the same rows the model was fit on, so this is
    # a training (in-sample) error, not an estimate of out-of-sample error.
    abs_errors = np.abs(frame[response] - fitted.predict(frame))
    return fitted, np.mean(abs_errors)


model, mae_simple = _fit_and_mae("Child ~ Father", data)
print(f"MAE of simple model: {mae_simple:.2f}")

# `model` ends up bound to the two-predictor fit, matching the original cell.
model, mae_multiple = _fit_and_mae("Child ~ Father + Mother", data)
print(f"MAE of multiple model: {mae_multiple:.2f}")

MAE of simple model: 3.66
MAE of multiple model: 3.15


## Ames housing dataset

Goal: predict the price of houses in Ames, Iowa. 

Description of features: https://www.openml.org/search?type=data&sort=runs&id=42165&status=active

In [4]:
# Load the Ames housing CSV. NOTE: this rebinds `data`, so the class heights
# frame loaded earlier is no longer reachable under that name.
data = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/stanford-mse-125-2025/mse-125-2025-public/refs/heads/main/data/ames_housing.csv"
)
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125
