In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

df = pd.read_csv('homework2/housing.csv')

In [2]:
filtered_df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]
selected_columns = ['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']
filtered_df = filtered_df[selected_columns]

In [3]:
missing_values = filtered_df.isnull().sum()
columns_with_missing_values = missing_values[missing_values > 0]
print(columns_with_missing_values)

total_bedrooms    157
dtype: int64


In [4]:
population_median = filtered_df['population'].median()
print("Median population:", population_median)

Median population: 1195.0


In [5]:
shuffled_df = filtered_df.sample(frac=1, random_state=42)
shuffled_df['median_house_value_log'] = np.log1p(shuffled_df['median_house_value'])
# Split the data into train, val, and test sets (60%/20%/20%)
train_df, temp_df = train_test_split(shuffled_df, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Calculate the mean of 'total_bedrooms' in train_df
mean_total_bedrooms = train_df['total_bedrooms'].mean()

# Fill missing values with the mean and create new variables
train_df_filled = train_df.copy()
train_df_filled['total_bedrooms'] = train_df_filled['total_bedrooms'].fillna(mean_total_bedrooms)

val_df_filled = val_df.copy()
val_df_filled['total_bedrooms'] = val_df_filled['total_bedrooms'].fillna(mean_total_bedrooms)

test_df_filled = test_df.copy()
test_df_filled['total_bedrooms'] = test_df_filled['total_bedrooms'].fillna(mean_total_bedrooms)

# Create new variables with missing values replaced by 0
train_df_zero_filled = train_df.fillna(0)
val_df_zero_filled = val_df.fillna(0)
test_df_zero_filled = test_df.fillna(0)





In [6]:
def mape(y_true, y_pred):
    error = np.abs((y_true - y_pred) / y_true)
    mape_value = error.mean()
    return mape_value

X_train = train_df_filled[['total_bedrooms']]
y_train = train_df_filled['median_house_value']
model = LinearRegression()
model.fit(X_train, y_train)

X_val = val_df_filled[['total_bedrooms']]
y_val_true = val_df_filled['median_house_value']
y_val_pred = model.predict(X_val)

# Calculate MAPE for val_df_filled predictions
mape_val = mape(y_val_true, y_val_pred)

mape_val_rounded = round(mape_val, 2)

print('MAPE for val_df_filled predictions using total_bedrooms:', mape_val_rounded)



MAPE for val_df_filled predictions using total_bedrooms: 0.61


In [7]:
# Train a linear regression model using 'total_bedrooms' as the feature
X_train_zero_filled = train_df_zero_filled[['total_bedrooms']]
y_train_zero_filled = train_df_zero_filled['median_house_value']
model_zero_filled = LinearRegression()
model_zero_filled.fit(X_train_zero_filled, y_train_zero_filled)

# Predict median_house_value for val_df_zero_filled
X_val_zero_filled = val_df_zero_filled[['total_bedrooms']]
y_val_true_zero_filled = val_df_zero_filled['median_house_value']
y_val_pred_zero_filled = model_zero_filled.predict(X_val_zero_filled)

# Calculate MAPE for val_df_zero_filled predictions
mape_val_zero_filled = mape(y_val_true_zero_filled, y_val_pred_zero_filled)

mape_val_zero_filled_rounded = round(mape_val_zero_filled, 2)

print('MAPE for val_df_zero_filled predictions using total_bedrooms:', mape_val_zero_filled_rounded)


MAPE for val_df_zero_filled predictions using total_bedrooms: 0.61


In [11]:
from sklearn.linear_model import Ridge

def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

# List of regularization parameters
r_values = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

# Initialize variables to store the best MAPE and corresponding r
best_mape = float('inf')  # Initialize with a large value
best_r = None

# Train models with different regularization parameters and calculate MAPE
for r in r_values:
    # Train a regularized linear regression model
    w = train_linear_regression_reg(X_train, y_train, r=r)
    
    # Predict y for validation set
    y_val_pred = np.dot(np.column_stack((np.ones(X_val.shape[0]), X_val)), w)
    
    # Calculate MAPE for validation predictions
    mape_val = mape(y_val_true, y_val_pred)
    
    # Update best_mape and best_r if needed
    if mape_val < best_mape:
        best_mape = mape_val
        best_r = r

    print(f'MAPE for r = {r}: {round(mape_val, 6)}')

print(f'\nBest MAPE: {round(best_mape, 6)} for r = {best_r}')


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.

In [23]:
# Function to calculate Mean Absolute Percentage Error (MAPE)
def mape(y_true, y_pred):
    error = np.abs((y_true - y_pred) / y_true)
    mape_value = np.mean(error)
    return mape_value

# Initialize an empty list to store MAPE scores
mape_scores = []

for seed in seed_values:
    # Split the data into train, val, and test sets
    shuffled_df = filtered_df.sample(frac=1, random_state=seed)
    shuffled_df['median_house_value_log'] = np.log1p(shuffled_df['median_house_value'])

    # Fill missing values with 0
    shuffled_df_zero_filled = shuffled_df.fillna(0)

    train_df, temp_df = train_test_split(shuffled_df_zero_filled, test_size=0.4, random_state=seed)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed)

    # Train a model without regularization
    X_train = train_df.drop(['median_house_value', 'median_house_value_log'], axis=1)
    y_train = train_df['median_house_value_log']
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict on validation set
    X_val = val_df.drop(['median_house_value', 'median_house_value_log'], axis=1)
    y_val_true = val_df['median_house_value_log']

    # Fill missing values with 0 for validation set
    X_val_zero_filled = X_val.fillna(0)

    y_val_pred = model.predict(X_val_zero_filled)

    # Convert the predictions back to original scale
    y_val_pred_original_scale = np.expm1(y_val_pred)
    y_val_true_original_scale = np.expm1(y_val_true)

    # Calculate MAPE for validation predictions
    mape_val = mape(y_val_true_original_scale, y_val_pred_original_scale)
    mape_scores.append(mape_val)

    # Calculate the standard deviation of MAPE scores
std_mape = np.std(mape_scores)

# Round the standard deviation to 3 decimal places
std_mape_rounded = round(std_mape, 6)

print('Standard deviation of MAPE scores:', std_mape_rounded)


Standard deviation of MAPE scores: 0.004132


In [49]:
shuffled_df

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,median_house_value_log
19963,36.23,-119.14,22.0,2935.0,523.0,1927.0,530.0,2.5875,70400.0,11.161963
5929,34.12,-117.79,16.0,2426.0,426.0,1319.0,446.0,4.8125,224500.0,12.321635
11377,33.68,-117.97,26.0,3653.0,568.0,1930.0,585.0,5.7301,260900.0,12.471896
6443,34.10,-118.03,32.0,2668.0,609.0,1512.0,541.0,2.9422,233100.0,12.359227
17546,37.34,-121.87,39.0,2479.0,541.0,1990.0,506.0,2.4306,289100.0,12.574531
...,...,...,...,...,...,...,...,...,...,...
6651,34.15,-118.14,41.0,1256.0,407.0,855.0,383.0,1.9923,500001.0,13.122367
17810,37.40,-121.86,19.0,4043.0,764.0,2196.0,708.0,6.1504,268400.0,12.500237
6850,34.07,-118.15,52.0,1983.0,344.0,887.0,331.0,3.2875,234400.0,12.364789
2161,36.78,-119.81,37.0,1965.0,364.0,796.0,335.0,3.6250,83400.0,11.331416


In [25]:
from sklearn.metrics import mean_squared_error

# Set the seed for reproducibility
seed = 9

# Split the data into train, val, and test sets
shuffled_df = filtered_df.sample(frac=1, random_state=seed)
shuffled_df['median_house_value_log'] = np.log1p(shuffled_df['median_house_value'])

# Fill missing values with 0
shuffled_df_zero_filled = shuffled_df.fillna(0)

train_df, temp_df = train_test_split(shuffled_df_zero_filled, test_size=0.4, random_state=seed)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed)

# Combine train and validation datasets
train_val_df = pd.concat([train_df, val_df])

# Train a model without regularization
X_train_val = train_val_df.drop(['median_house_value', 'median_house_value_log'], axis=1)
y_train_val = train_val_df['median_house_value_log']
model = train_linear_regression_reg(X_train_val, y_train_val, r=0.001)

# Predict on test set
X_test = test_df.drop(['median_house_value', 'median_house_value_log'], axis=1)
y_test_true = test_df['median_house_value_log']

# Fill missing values with 0 for test set
X_test_zero_filled = X_test.fillna(0)

# Predictions on the test set
y_test_pred = model[0] + np.dot(X_test_zero_filled, model[1:])

# Convert the predictions back to original scale
y_test_pred_original_scale = np.expm1(y_test_pred)
y_test_true_original_scale = np.expm1(y_test_true)

# Calculate RMSE for test predictions
rmse_test = np.sqrt(mean_squared_error(y_test_true_original_scale, y_test_pred_original_scale))

print('RMSE on the test dataset:', round(rmse_test, 2))


ValueError: shapes (3138,8) and (1,8) not aligned: 8 (dim 1) != 1 (dim 0)

In [26]:
from sklearn.metrics import mean_squared_error

# Set the seed for reproducibility
seed = 9

# Split the data into train, val, and test sets
shuffled_df = filtered_df.sample(frac=1, random_state=seed)
shuffled_df['median_house_value_log'] = np.log1p(shuffled_df['median_house_value'])

# Fill missing values with 0
shuffled_df_zero_filled = shuffled_df.fillna(0)

train_df, temp_df = train_test_split(shuffled_df_zero_filled, test_size=0.4, random_state=seed)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=seed)

# Combine train and validation datasets
train_val_df = pd.concat([train_df, val_df])

# Train a model without regularization
X_train_val = train_val_df.drop(['median_house_value', 'median_house_value_log'], axis=1)
y_train_val = train_val_df['median_house_value_log']
intercept, weights = train_linear_regression_reg(X_train_val, y_train_val, r=0.001)

# Predict on test set
X_test = test_df.drop(['median_house_value', 'median_house_value_log'], axis=1)

# Fill missing values with 0 for test set
X_test_zero_filled = X_test.fillna(0)

# Predictions on the test set
y_test_pred = intercept + np.dot(X_test_zero_filled, weights)

# Convert the predictions back to original scale
y_test_pred_original_scale = np.expm1(y_test_pred)
y_test_true_original_scale = np.expm1(test_df['median_house_value_log'])

# Calculate RMSE for test predictions
rmse_test = np.sqrt(mean_squared_error(y_test_true_original_scale, y_test_pred_original_scale))

print('RMSE on the test dataset:', round(rmse_test, 2))


RMSE on the test dataset: 83593.16


In [27]:
# Function to calculate Mean Absolute Percentage Error (MAPE)
def mape(y_true, y_pred):
    error = np.abs((y_true - y_pred) / y_true)
    mape_value = np.mean(error)
    return mape_value

# Calculate MAPE for test predictions
mape_test = mape(y_test_true_original_scale, y_test_pred_original_scale)

print('MAPE on the test dataset:', round(mape_test, 3))


MAPE on the test dataset: 0.263
