In [2]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Load the housing dataset
housing = fetch_california_housing()

# Creates a DataFrame for the features and a Series for the target variable
X = pd.DataFrame(housing.data, columns=housing.feature_names) 
y = pd.Series(housing.target, name='med_house_value')

# Display first 5 rows of the dataset
display(X.head())
display(y.head())


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: med_house_value, dtype: float64

In [3]:
# Print the feature names and check for missing values
display(X.columns.dropna())

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude'],
      dtype='object')

In [4]:
# Generate summary statistics
X.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Split the raw data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Train a linear regression model on untrained data
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lin_reg.predict(X_test)
y_pred


array([0.71912284, 1.76401657, 2.70965883, ..., 4.46877017, 1.18751119,
       2.00940251])

In [6]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score
# Evaluate model performance
mse_lin = mean_squared_error(y_test, y_pred)
rmse_lin = root_mean_squared_error(y_test, y_pred)
r2_lin = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse_lin:.2f}")
print(f"Root Squared Error: {rmse_lin:.2f}")
print(f"R2 Score: {r2_lin:.2f}")

Mean Squared Error: 0.56
Root Squared Error: 0.75
R2 Score: 0.58


In [7]:
# View our model's coefficients
coef_series = pd.Series(lin_reg.coef_, index = X.columns)
intercept = pd.Series(lin_reg.intercept_)
coef_series

MedInc        0.448675
HouseAge      0.009724
AveRooms     -0.123323
AveBedrms     0.783145
Population   -0.000002
AveOccup     -0.003526
Latitude     -0.419792
Longitude    -0.433708
dtype: float64

The R2 Score tells us that 58% of the variance in the target variable can be explained by the predictor variables. However, this means our model doesn't account for 42% of the variance in the target variable.
The feature that has the strongest impact on predictions is AveBedrms, followed by MedInc, then Longitude, then Latitude.
On average, predictions deviate from the actual target by 0.75. An RMSE of 0.75 is relatively high, which means the model is not the best indicator of performance, and there is some error involved.

In [8]:
# Limiting X dataframe to just most impactful features
simplified_features = ["AveBedrms", "MedInc", "Longitude"]
simplified_X = X[simplified_features]

# Displaying simplified dataframe
simplified_X

Unnamed: 0,AveBedrms,MedInc,Longitude
0,1.023810,8.3252,-122.23
1,0.971880,8.3014,-122.22
2,1.073446,7.2574,-122.24
3,1.073059,5.6431,-122.25
4,1.081081,3.8462,-122.25
...,...,...,...
20635,1.133333,1.5603,-121.09
20636,1.315789,2.5568,-121.21
20637,1.120092,1.7000,-121.22
20638,1.171920,1.8672,-121.32


In [9]:
# Split the raw data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(simplified_X, y, test_size = 0.2, random_state = 42)

# Train a linear regression model on simplified data
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lin_reg.predict(X_test)
display(y_pred)

from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score
# Evaluate model performance
mse_lin = mean_squared_error(y_test, y_pred)
rmse_lin = root_mean_squared_error(y_test, y_pred)
r2_lin = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse_lin:.2f}")
print(f"Root Squared Error: {rmse_lin:.2f}")
print(f"R2 Score: {r2_lin:.2f}")

array([1.13917924, 1.50397142, 1.96088974, ..., 4.36240352, 1.61664104,
       1.91021185])

Mean Squared Error: 0.71
Root Squared Error: 0.84
R2 Score: 0.46


I chose the features AveBedrms, MedInc, and Longitude because these three features had the biggest impact on the model. That means these are the strongest indicators. 
This simplified model is significantly worse than the full model. For this model, 46% of error in the target veriable can be attributed to the predictor variables, compared to 58% for the full model. Additionally, on average, predictions deviate from the actual target by 0.84, which is more than the full model's which is 0.75. Due to the consideration of less variables, the model is less accurate. 
If given the option, I would not use this model in practice because it is less accurate. However, if faced with limitations on time or amount of data available, it could be more practical to use the simplified model. Additionally, it would be rational to use the simplified model if the use is more broad and general, and a precise prediction is not needed.

In [10]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler and apply it to the features
scaler = StandardScaler()
scaler.fit_transform(X)

X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns = X.columns)
X_scaled_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,2.344766,0.982143,0.628559,-0.153758,-0.974429,-0.049597,1.052548,-1.327835
1,2.332238,-0.607019,0.327041,-0.263336,0.861439,-0.092512,1.043185,-1.322844
2,1.782699,1.856182,1.15562,-0.049016,-0.820777,-0.025843,1.038503,-1.332827
3,0.932968,1.856182,0.156966,-0.049833,-0.766028,-0.050329,1.038503,-1.337818
4,-0.012881,1.856182,0.344711,-0.032906,-0.759847,-0.085616,1.038503,-1.337818


In [12]:
#Split the scaled data
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_scaled_df, y, test_state = 0.2, random_state = 42)
#Fit the scaled data
lin_reg_scaled = LinearRegression()
lin_reg_scaled.fit(X_train_scaled, y_train_scaled)
#Make predictions
y_pred_scaled = lin_reg_scaled.predict(X_test_scaled)
y_pred_scaled

TypeError: got an unexpected keyword argument 'test_state'

In [None]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score
# Evaluate model performance
mse_lin_scaled = mean_squared_error(y_test, y_pred)
rmse_lin_scaled = root_mean_squared_error(y_test, y_pred)
r2_lin_scaled = r2_score(y_test, y_pred)

print("Unscaled Data Model:")
print(f"Mean Squared Error: {mse_lin:.2f}")
print(f"Root Squared Error: {rmse_lin:.2f}")
print(f"R2 Score: {r2_lin:.2f}")

Unscaled Data Model:
Mean Squared Error: 0.71
Root Squared Error: 0.84
R2 Score: 0.46
