In [51]:
import pandas as pd 
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
import joblib # for saving and loading the model


homework 2.2 
In predict_prices.py, load the Ames dataset from data/house-prices-data.csv, split the data into train/test sets using train_test_split from sklearn.model_selection, and fit a decision tree model to predict Sale Price. Use cross-validation (of your choice) to select the maximum depth of the decision tree. (You can continue to use validation to select other hyperparameters if you wish.) You will need to preprocess features appropriately (for example, handle categorical features, and possibly sandardize or normalize features). Print your test root mean squared error (RMSE). Save
your best model to a file tree.joblib (already done in the code; you just need to fill in the
train function)

In [52]:
def preprocess_data(df):
    # Split features from labels
    x = df.iloc[:,:-1]
    y = df.iloc[:,-1:].values.reshape(-1,1)
    # print(x,y)

    # Encode non-numeric features
    for col in x.columns: # iterate over all columns, col refers to column name
        if x[col].dtype == object:
            # call ordinal encoder
            enc = OrdinalEncoder()
            # rewrite x[col], reshape(-1,1) for a (n,1) array
            x[col] = enc.fit_transform(x[col].values.reshape(-1,1))

    # Normalize y labels
    enc = LabelEncoder()
    y = enc.fit_transform(y).reshape(-1,1)

    # Return processed x and y
    return x, y


In [53]:
def train_tree(x_train, y_train):

    # Set hyperparameter values
    depths = range(1,20)
    rmse = []

    # Fit DecisionTreeRegressor, iterate over tress with different depts. Store RMSE values to compare the best depth.
    for i in range(depths):
        tree = DecisionTreeRegressor(max_depth=i, random_state=42)
        rmse_arr = cross_val_score(tree, x_train, y_train, scoring = 'neg_rot_mean_squared_error')
        # return a scalar
        rmse_avg = rmse_arr.mean()
        rmse.append(rmse_avg)
    
    # Find best tree depth value with the smallest RMSE
    best_rmse = min(rmse)
    best_depth = rmse.index(best_rmse) + 1 # add 1 since the list indexes at zero

    # Return tree with best tree depth value 
    tree = DecisionTreeRegressor(max_depth = best_depth, random_state=42)
    tree.fit(x_train, y_train)
    # Return the tree
    return tree 



In [54]:
def evaluate_rmse(model, x_test, y_test):
    # Make predictions on the test set
    y_pred = model.predict(x_test)

    # Evaluate the model
    rmse = root_mean_squared_error(y_test, y_pred)

    # Return rmse result 
    return rmse

In [55]:
# RUN AS MAIN

# Load dataset
ames_housing = pd.read_csv("house-prices-data.csv")

# Split + preprocess data
x, y = preprocess_data(ames_housing)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.2, random_state = 42) # random_state is the seed

# Create decision tree with the best depth that has the smallest RMSE value 
model = train_tree(x_train, y_train)

# Save the trained model
joblib.dump(model, "tree.joblib")

# Test predict function on the first 5 samples in x_test
sample_x_test = x_test[0:5,:]
sample_y_pred = model.predict(sample_x_test)
print("Prediction for the first 5 samples in x_test are=", sample_y_pred)

# Evaluate root mean squared error
rmse_results = evaluate_rmse(model, x_test, y_test)
print("RMSE=", rmse_results)


  y = column_or_1d(y, warn=True)


TypeError: 'range' object cannot be interpreted as an integer

homework 2.3 n predict_grades.py, use the Student Performance dataset from data/student-mat-data.csv to predict the final grade (‘G3’ target column), using
linear, LASSO, and ridge regression. (You can go further, such as by using elastic net regularization, but this is not required.) Use cross-validation to select the regularization parameter. Save your best model to a file regression.joblib (see example at the end of the main function).

In [None]:
def linear_regression_train(x_train, y_train):
    
    # Create linear regression model 
    model = LinearRegression()
    model.fit(x_train, y_train)

    # Return model
    return model

In [None]:
def lasso_regression_train(X_train, y_train):

    # Create lasso regression model 
    model = LassoCV(alphas=np.logspace(-3, 2, 50), cv=5, random_state=42)
    model.fit(x_train, y_train)

    # Return model 
    return model 

In [None]:
def ridge_regression_train(x_train, y_train):

    # Create ridge regression model 
    model =  RidgeCV(alphas = np.logspace(-3, 2, 50), cv=5)
    model.fit(x_train, y_train)

    # Return model
    return model 

In [None]:
# RUN AS MAIN
# Load dataset
student_performance = pd.read_csv("student-mat-data.csv")

# Split + preprocess data
x, y = preprocess_data(student_performance)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.2, random_state = 42) # random_state is the seed

# Linear regression model
print("\n=== LINEAR REGRESSION ===")
lin_model = linear_regression_train(x_train, y_train)

# Predict y from test data 
y_pred_lin = lin_model.predict(x_test)

# Calculate MSE
train_mse_lin = mean_squared_error(y_train, lin_model.predict(x_train))
test_mse_lin = mean_squared_error(y_test, y_pred_lin)

# Select features where the coefficient is not zero, meaning those features ARE included
selected_features_lin = np.sum(lin_model.coef_ != 0)
avg_coefficient_lin = np.mean(np.abs(lin_model.coef_[lin_model.coef_ != 0])) if selected_features_lin > 0 else 0

baseline_results = {
'alpha': 0.0,
'train_mse': train_mse_lin,
'test_mse': test_mse_lin,
'selected_features': selected_features_lin,
'avg_coefficient': avg_coefficient_lin
}
print(f"Linear regression train MSE: {baseline_results['train_mse']}")
print(f"Linear regression test MSE: {baseline_results['test_mse']}")
print(f"Linear regression # selected features: {baseline_results['selected_features']}")
print(f"Linear regression average coefficient: {baseline_results['avg_coefficient']}")


# Lasso regression model 
print("\n=== LASSO REGRESSION ===")
lasso_model = lasso_regression_train(x_train, y_train)

# Predict y using lasso regularized model 
y_pred_lasso = lasso_model.predict(x_test)

# Calculate MSE
train_mse_lasso = mean_squared_error(y_train, lasso_model.predict(x_train))
test_mse_lasso = mean_squared_error(y_test, y_pred_lasso)

# Return best regularization parameter (lambda), the one that controls the strength of the penalty. 
# High alpha = stronger regularization, more coefficeints are exacly zero
best_lasso_alpha = lasso_model.alpha_

# Select features where the coefficient is not zero, meaning those features ARE included
selected_features_lasso = np.sum(lasso_model.coef_ != 0)
avg_coefficient_lasso = np.mean(np.abs(lasso_model.coef_[lasso_model.coef_ != 0])) if selected_features_lasso > 0 else 0

lasso_results = {
    'alpha': best_lasso_alpha,
    'train_mse': train_mse_lasso,
    'test_mse': test_mse_lasso,
    'selected_features': selected_features_lasso,
    'avg_coefficient': avg_coefficient_lasso
}
print(f"LASSO regression best alpha: {lasso_results['alpha']}")
print(f"LASSO regression train MSE: {lasso_results['train_mse']}")
print(f"LASSO regression test MSE: {lasso_results['test_mse']}")
print(f"LASSO regression # selected features: {lasso_results['selected_features']}")
print(f"LASSO regression average coefficient: {lasso_results['avg_coefficient']}") 


# Ridge regression 
print("\n=== RIDGE REGRESSION ===")
ridge_model = ridge_regression_train(x_train, y_train)

# Predict y from ridge regularized model 
y_pred_ridge = ridge_model.predict(x_test)

# Calculate RMSE
train_mse_ridge = mean_squared_error(y_train, ridge_model.predict(x_train))
test_mse_ridge = mean_squared_error(y_test, y_pred_ridge)

# Return best regularization parameter (lambda), the one that controls the strength of the penalty. 
# High alpha = stronger regularization, more coefficeints are near but not exactly zero
best_ridge_alpha = ridge_model.alpha_

# Select features where the coefficient is not zero, meaning those features ARE included
selected_features_ridge = np.sum(ridge_model.coef_ != 0)
avg_coefficient_ridge = np.mean(np.abs(ridge_model.coef_[ridge_model.coef_ != 0])) if selected_features_ridge > 0 else 0

ridge_results = {
    'alpha': best_ridge_alpha,
    'train_mse': train_mse_ridge,
    'test_mse': test_mse_ridge,
    'selected_features': selected_features_ridge,
    'avg_coefficient': avg_coefficient_ridge
}
print(f"Ridge regression best alpha: {ridge_results['alpha']}")
print(f"Ridge regression train MSE: {ridge_results['train_mse']}")
print(f"Ridge regression test MSE: {ridge_results['test_mse']}")
print(f"Ridge regression # selected features: {ridge_results['selected_features']}")
print(f"Ridge regression average coefficient: {ridge_results['avg_coefficient']}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)



=== LINEAR REGRESSION ===
Linear regression train MSE: 8.954936034030515
Linear regression test MSE: 20.093565973939526
Linear regression # selected features: 30
Linear regression average coefficient: 0.759710459427398

=== LASSO REGRESSION ===
LASSO regression best alpha: 0.5689866029018299
LASSO regression train MSE: 12.434683794208832
LASSO regression test MSE: 14.044915614723088
LASSO regression # selected features: 6
LASSO regression average coefficient: 0.39968631223830614

=== RIDGE REGRESSION ===
Ridge regression best alpha: 100.0
Ridge regression train MSE: 12.443570331090942
Ridge regression test MSE: 14.765692483952037
Ridge regression # selected features: 30
Ridge regression average coefficient: 0.12774392897932924


In [56]:
# Return number of features
total_features = x_train.shape[1]

summary_table = pd.DataFrame({
    'Metric': ['Test MSE', '# Selected Features', 'Avg Feature Coefficient', 'Regularization Strength (alpha)'],
    'Linear Regression': [
        f"{baseline_results['test_mse']:.4f}",
        f"{baseline_results['selected_features']}/{total_features}",
        f"{baseline_results['avg_coefficient']:.4f}",
        "None"
    ],
    'Lasso (L1)': [
        f"{lasso_results['test_mse']:.4f}",
        f"{lasso_results['selected_features']}/{total_features}",
        f"{lasso_results['avg_coefficient']:.4f}",
        f"{lasso_results['alpha']:.4f}"
    ],
    'Ridge (L2)': [
        f"{ridge_results['test_mse']:.4f}",
        f"{ridge_results['selected_features']}/{total_features}",
        f"{ridge_results['avg_coefficient']:.4f}",
        f"{ridge_results['alpha']:.4f}"
    ],
})

print(summary_table)

# Pick a model to save, e.g., linear regression
joblib.dump(lin_model, "regression.joblib")

                            Metric Linear Regression Lasso (L1) Ridge (L2)
0                         Test MSE           20.0936    14.0449    14.7657
1              # Selected Features             30/79       6/79      30/79
2          Avg Feature Coefficient            0.7597     0.3997     0.1277
3  Regularization Strength (alpha)              None     0.5690   100.0000


['regression.joblib']