## Data Importing

In [2]:
# Import packages
import os
import warnings
from itertools import product

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from kagglehub import KaggleDatasetAdapter
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBRegressor

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

print("Packages imported!")

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'shap'

In [None]:
# Directory and file used for the local snapshot of the raw data
dir_path = "data"
file_path = os.path.join(dir_path, "raw_data.csv")
# File to read from the Kaggle dataset
path = "manhattan.csv"

# Load the latest version
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "zohaib30/streeteasy-dataset",
    path=path,
)

# exist_ok=True replaces the separate existence check, so there is no window
# between checking for the directory and creating it
os.makedirs(dir_path, exist_ok=True)

# Save the raw frame without the DataFrame index
df.to_csv(file_path, index=False)

df.head()

: 

# Data Inspection

In [None]:
df.info()

: 

According to the above, there are 3,539 rows in the data, with no null entries.

In [None]:
df.describe()

: 

In [None]:
{column: len(df[column].unique()) for column in df.select_dtypes("object").columns }

: 

In [None]:
df['neighborhood'].value_counts()

: 

Looking at this, there are only two categorical columns, and one of them has only a single unique value. We'll drop that column in preprocessing.

In [None]:
df['neighborhood'].unique()

: 

Taking a look at the neighborhoods, there don't seem to be any disguised missing values (e.g. placeholder strings standing in for NA).

In [None]:
# Histogram (with KDE overlay) of the raw rent values
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['rent'], kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Rent')
ax.set_xlabel('Rent')
ax.set_ylabel('Frequency')
ax.grid(False)
plt.show()

# Quantify the asymmetry of the rent distribution
skewness = df['rent'].skew()
print(f"Skewness of rent: {skewness:.2f}")


: 

There is extremely high right-skew on the target variable, rent.

This could be because certain neighborhoods are more expensive than others. We will plot rent per neighborhood to investigate.

In [None]:
# Neighborhoods with fewer than 5 listings are left out: too few points for a proper plot
neighborhood_counts = df['neighborhood'].value_counts()

# Names of the neighborhoods that have at least 5 listings
has_enough_listings = neighborhood_counts >= 5
valid_neighborhoods = neighborhood_counts.loc[has_enough_listings].index

# Listings that belong to one of those neighborhoods, re-indexed from 0
in_valid_neighborhood = df['neighborhood'].isin(valid_neighborhoods)
df_filtered = df.loc[in_valid_neighborhood].reset_index(drop=True)

print(f"Remaining neighborhoods: {len(valid_neighborhoods)}")

: 

In [None]:
# Neighborhoods that survived the filtering step
neighborhoods = df_filtered['neighborhood'].unique()
num_neighborhoods = len(neighborhoods)

# Grid layout: 4 panels per row, with as many rows as needed (ceiling division)
cols = 4
rows = -(-num_neighborhoods // cols)

plt.figure(figsize=(20, rows * 5))
for panel, neighborhood in enumerate(neighborhoods, start=1):
    plt.subplot(rows, cols, panel)

    # Rents of the listings in this neighborhood only
    in_neighborhood = df_filtered['neighborhood'] == neighborhood
    rent_data = df_filtered.loc[in_neighborhood, 'rent']

    sns.histplot(rent_data, kde=True, color='blue')
    plt.title(neighborhood)
    plt.xlabel('Rent')
    plt.ylabel('Frequency')
    plt.grid(False)

plt.tight_layout()
plt.show()


: 

There seems to be right-skew in ALL neighborhoods, so we will add a new column, `rent_logged`, and later compare it against raw `rent` as the target variable (y) during modelling.

# Feature Engineering

## Logging Rent

In [None]:
# Add a log1p-transformed copy of rent; np.expm1 is its inverse when
# converting predictions back to the original scale
df['rent_logged'] = np.log1p(df['rent'])

# Show the raw and transformed target side by side
df[['rent','rent_logged']]

: 

In [None]:
# Distribution of log-transformed rent
plt.figure(figsize=(10, 6))
sns.histplot(df['rent_logged'], kde=True, color='blue')
# Labels name the quantity actually plotted (log rent, not raw rent)
plt.title('Distribution of Log(Rent)')
plt.xlabel('Log(Rent)')
plt.ylabel('Frequency')
plt.grid(False)
plt.show()

# Check skewness of the log-transformed rent
skewness = df['rent_logged'].skew()
print(f"Skewness of rent_logged: {skewness:.2f}")


: 

In [None]:
# Re-plot histograms of log-transformed rent by neighborhood.
# Only neighborhoods with at least 5 listings (valid_neighborhoods) are drawn,
# so this grid shows the same panels, in the same order, as the raw-rent grid
# and skips neighborhoods with too few points for a meaningful histogram.
df_plot = df[df['neighborhood'].isin(valid_neighborhoods)]
neighborhoods = df_plot['neighborhood'].unique()
num_neighborhoods = len(neighborhoods)

cols = 4
# Ceiling division: one extra row when the last row is only partly filled
rows = (num_neighborhoods // cols) + (num_neighborhoods % cols > 0)

plt.figure(figsize=(20, rows * 5))
for i, neighborhood in enumerate(neighborhoods):
    plt.subplot(rows, cols, i+1)

    # Filter data for the neighborhood
    log_rent_data = df_plot[df_plot['neighborhood'] == neighborhood]['rent_logged']

    # Plot histogram
    sns.histplot(log_rent_data, kde=True, color='green')
    plt.title(neighborhood)
    plt.xlabel('Log(Rent)')
    plt.ylabel('Frequency')
    plt.grid(False)

plt.tight_layout()
plt.show()


: 

In [None]:
def preprocess_data(df, target_var='rent'):
    """Turn the listings frame into a feature matrix and a target vector.

    Steps: drop ``rental_id`` and ``borough``, one-hot encode
    ``neighborhood``, save the encoded (still unscaled) frame to
    ``<dir_path>/processed_data.csv``, split into features and target, and
    standardize the leading block of feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``rental_id``, ``borough``, ``neighborhood``,
        ``building_age_yrs`` and the column named by ``target_var``.
        The frame is copied, never modified.
    target_var : str, default 'rent'
        Name of the column returned as ``y``.

    Returns
    -------
    X : pandas.DataFrame
        Features without ``rent``, ``rent_logged`` and ``target_var``.
        Columns up to and including ``building_age_yrs`` are standardized;
        the remaining columns are left as they are.
    y : pandas.Series
        The ``target_var`` column.

    Raises
    ------
    KeyError
        If ``target_var`` is not a column of ``df``.
    """
    if target_var not in df.columns:
        raise KeyError(f"target_var {target_var!r} is not a column of df")

    # Ensure that original data is not being manipulated inplace
    data = df.copy()

    # Drop borough and rental ID columns
    data = data.drop(['rental_id', 'borough'], axis=1)

    # One hot encode
    ## Usually drop_first would be True, but since we want to inspect and possibly group variables, we need all neighborhoods
    data = pd.get_dummies(data, columns=['neighborhood'], drop_first=False, dtype=int, prefix=None)

    # Save processed data; create the directory first so the save does not
    # depend on another cell having created it
    os.makedirs(dir_path, exist_ok=True)
    file_path = os.path.join(dir_path, "processed_data.csv")
    data.to_csv(file_path)

    # Split data into X, y. Every target-like column present is removed from
    # the features, so the function also works on a frame that has no
    # 'rent_logged' column yet and never leaks the target into X.
    target_like = {'rent', 'rent_logged', target_var}
    target_cols = [col for col in data.columns if col in target_like]
    X = data.drop(target_cols, axis=1).copy()
    y = data[target_var]

    # Columns up to and including 'building_age_yrs' are treated as the
    # numeric block; everything after it is left unscaled.
    # NOTE(review): this relies on the column order of the input frame.
    split_index = X.columns.get_loc('building_age_yrs') + 1
    numeric_cols = X.columns[:split_index]

    # Apply StandardScaler only to numeric columns (not categorical numeric ones).
    # NOTE(review): the scaler is fitted on all rows, i.e. before any
    # train/test split performed by the caller.
    X_scaler = StandardScaler()
    X[numeric_cols] = X_scaler.fit_transform(X[numeric_cols])

    return X, y

: 

In [None]:
# Feature matrix with log rent as the target
X, y = preprocess_data(df, 'rent_logged')
# Keep only the columns whose name does not contain 'neighborhood' (drops the one-hot dummies)
X_numerical = X.loc[:, ~X.columns.str.contains('neighborhood')]

: 

In [None]:
X

: 

## Variance Inflation Factor

In [None]:
# VARIANCE INFLATION FACTOR

# variance_inflation_factor does not add an intercept itself, so a constant
# column is appended to the design matrix. Without it, the columns that
# preprocess_data leaves unscaled (and therefore uncentered) get inflated VIFs.
vif_design = X_numerical.assign(const=1.0).astype(float)

# Calculate VIF for each feature; the constant sits in the last position and
# is not reported, so feature i of X_numerical is column i of the design
vif_data = pd.DataFrame({
    "Feature": X_numerical.columns,
    "VIF": [
        variance_inflation_factor(vif_design.values, i)
        for i in range(X_numerical.shape[1])
    ],
})

# Display VIF
print(vif_data)

: 

## Variable Influence on Price
Excluding CatVar (Neighborhood)

Using linear models with and without neighborhood, we can look at how much each neighborhood affects pricing, and thus decide how to utilise the variable.

In [None]:
def get_linear_coeffs(X, y):
    """Fit a linear regression and tabulate its coefficients.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names label the coefficients.
    y : array-like
        Target values.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Coefficient``, one row per column of ``X``
        in the original column order (the rows are not sorted).
    """
    fitted = LinearRegression().fit(X, y)

    return pd.DataFrame({
        'Feature': X.columns,
        'Coefficient': fitted.coef_,
    })

: 

In [None]:
# Coefficients of the linear model fitted without the neighborhood dummies, largest first
numerical_coeff_df = get_linear_coeffs(X_numerical, y)
numerical_coeff_df = numerical_coeff_df.sort_values(by='Coefficient', ascending=False)


# Plot. The target y is log rent, so the labels refer to log rent rather than
# to a sale price.
plt.figure(figsize=(10, 8))
sns.barplot(x='Coefficient', y='Feature', data=numerical_coeff_df, palette='Spectral')
plt.suptitle('Linear Regression Coefficients for Log(Rent) Prediction')
plt.title('(Not including neighborhoods)')
plt.xlabel('Change in Log(Rent)')
plt.ylabel('Feature')
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()

: 

## Neighborhood Influence on Price

In [None]:
# Coefficients of a linear model fitted on all columns of X (neighborhood dummies included); not sorted here
neighborhood_coeff_df = get_linear_coeffs(X, y)

: 

##### Visualization 1: Neighborhood Influences summed
(net neighborhood influence across all)

In [None]:
def plot_summed_effects(coeff_df):
    """Plot coefficients with all neighborhood dummies collapsed into one bar.

    Parameters
    ----------
    coeff_df : pandas.DataFrame
        Table with a ``Feature`` column (names) and a ``Coefficient`` column.
        Rows whose feature name contains ``'neighborhood'`` are summed into a
        single ``'Neighborhood (Aggregate)'`` row; all other rows are kept.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    # Use the frame that was passed in, not the notebook-level
    # neighborhood_coeff_df, so the function works for any coefficient table
    is_neighborhood = coeff_df['Feature'].str.contains('neighborhood')

    # Calculate the total neighborhood effect by summing all neighborhood coefficients
    summed_neighborhood_effect = coeff_df.loc[is_neighborhood, 'Coefficient'].sum()

    # Create a new DataFrame for the aggregated neighborhood effect
    neighborhood_aggregate = pd.DataFrame({
        'Feature': ['Neighborhood (Aggregate)'],
        'Coefficient': [summed_neighborhood_effect]
    })

    # Combine the aggregated effect with the non-neighborhood features
    aggregate_df = pd.concat(
        [coeff_df[~is_neighborhood], neighborhood_aggregate],
        ignore_index=True,
    )

    # Plot the aggregated effect along with other features, smallest first
    plt.figure(figsize=(10, 8))
    sns.barplot(x='Coefficient', y='Feature', data=aggregate_df.sort_values(by='Coefficient'), palette='Spectral')
    plt.title('Linear Regression Coefficients for Log(Rent) Prediction\n(Including Neighborhood Aggregate)')
    plt.xlabel('Change in Log(Rent)')
    plt.ylabel('Feature')
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.show()

    return None

plot_summed_effects(coeff_df= neighborhood_coeff_df)

: 

##### Visualization 2: Inspecting ONLY neighborhood influences on price

In [None]:
# Select all neighborhood variables
per_neighborhood_coeffs = neighborhood_coeff_df[neighborhood_coeff_df['Feature'].str.contains('neighborhood')]

# Plot the per-neighborhood coefficients, smallest first. The target is log
# rent, so the labels refer to log rent rather than to a sale price.
plt.figure(figsize=(10, 8))
sns.barplot(x='Coefficient', y='Feature', data=per_neighborhood_coeffs.sort_values(by='Coefficient'), palette='Spectral')
plt.title('Linear Regression Coefficients for Log(Rent) Prediction\n(Only looking at neighborhood)')
plt.xlabel('Change in Log(Rent)')
plt.ylabel('Feature')
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()


: 

### Counts

In [None]:
# NEIGHBORHOOD COUNTS
df['neighborhood'].value_counts()

: 

### Lists

In [None]:
# NEIGHBORHOOD LIST
## Saving this as a variable for potential future usage
neighborhoods = df['neighborhood'].unique().tolist()
neighborhoods

: 

### Mean Price per Neighborhood

In [None]:
# MEAN PRICE PER NEIGHBORHOOD
df.groupby('neighborhood', as_index=False)['rent'].agg(mean_price='mean').sort_values(by = 'mean_price')

: 

However, these values encompass all unit types (studio, 1br, 2br), so we will compute a per-bedroom price. While other factors will still influence the mean price, this allows for a lot more clarity.

#### Mean Price per Bedroom

In [None]:
# MEAN PRICE PER NEIGHBORHOOD (PER BEDROOM)
df_temp = df.copy()

# Rent divided by the bedroom count when there is more than one bedroom;
# listings with bedrooms <= 1 keep their full rent. Clipping the divisor at 1
# expresses exactly that rule without a row-wise apply.
df_temp['rent_per_bedroom'] = df_temp['rent'] / df_temp['bedrooms'].clip(lower=1)

# Average per neighborhood, cheapest first
neighborhood_means = (
    df_temp.groupby('neighborhood', as_index=False)['rent_per_bedroom']
    .agg(mean_price='mean')
    .sort_values(by='mean_price')
)
neighborhood_means

: 

In [None]:
# Bar chart of the per-bedroom mean rent, one bar per neighborhood
fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(
    data=neighborhood_means,
    x='neighborhood',
    y='mean_price',
    palette='PiYG',
    ax=ax,
)
fig.suptitle("Mean Rent Price per Neighborhood")
ax.set_title("(Per bedroom)")
ax.set_ylabel("Mean Rent Price ($)")
ax.set_xlabel("Neighborhood")
# Slanted, right-aligned labels keep the neighborhood names readable
plt.xticks(rotation=60, ha='right', fontsize=10)
plt.show()

: 

Neighborhood Frequency

In [None]:
# Histogram of listings-per-neighborhood, in bins of width 25 from 0 to 625
bins = np.arange(0, 650, 25)
listings_per_neighborhood = df['neighborhood'].value_counts()

fig, ax = plt.subplots(figsize=(15, 8))
ax.hist(listings_per_neighborhood, bins=bins)
ax.set_title("Distribution of neighborhood counts")
ax.set_ylabel("Count")
# One tick per bin edge
plt.xticks(bins, rotation=45)
ax.set_xlabel("Frequency")
plt.show()

: 

We'll zoom into the (0,100) range of the graph, as many of the data points seem to lie there.

In [None]:
# Same histogram zoomed in: bins of width 5 from 0 to 95
bins = np.arange(0, 100, 5)
listings_per_neighborhood = df['neighborhood'].value_counts()

fig, ax = plt.subplots(figsize=(15, 8))
ax.hist(listings_per_neighborhood, bins=bins)
ax.set_title("Distribution of neighborhood counts")
ax.set_ylabel("Count")
# One tick per bin edge
plt.xticks(bins, rotation=45)
ax.set_xlabel("Frequency")
plt.show()

: 

### Investigating Dropping Neighborhoods

To investigate how dropping low freq. neighborhoods affects modelling, we'll iterate over a thresholds list, model after dropping neighborhoods under the thresh., and compare

In [None]:
# Minimum-listing thresholds to test: 1, 2, ..., 19
thresholds = list(range(1, 20))

# Untuned parameters for baseline analysis
baseline_params = {
    'objective': 'reg:squarederror',
    'learning_rate': 0.0015,
    'max_depth': 3,
    'lambda': 0.001,
    'alpha': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
    'seed': 42
}

print("Total rows in original dataset:", len(df), "\n")

# Neither the neighborhood sizes nor the total row count depend on the
# threshold, so they are computed once outside the loop
neighborhood_counts = df['neighborhood'].value_counts()
total_rows = len(df)

# One record per threshold; the results frame is built once after the loop
# instead of being grown with pd.concat on every iteration
records = []

# Loop over each threshold
for thresh in thresholds:
    print(f"Testing threshold {thresh}")

    # Identify neighborhoods with fewer than `thresh` listings
    to_drop = neighborhood_counts[neighborhood_counts < thresh].index
    in_dropped = df['neighborhood'].isin(to_drop)

    # Count how many rows would be removed
    rows_removed = int(in_dropped.sum())
    pct_removed = (rows_removed / total_rows) * 100

    # Drop neighborhoods below the threshold (a new frame; df is untouched)
    df_temp = df[~in_dropped].reset_index(drop=True)

    # Preprocess data
    X, y_logged = preprocess_data(df_temp, target_var='rent_logged')

    # Split data into Train and Validation sets
    X_train, X_val, y_train_logged, y_val_logged = train_test_split(X, y_logged, test_size=0.2, random_state=42)

    dtrain_logged = xgb.DMatrix(X_train, label=y_train_logged)
    dval_logged = xgb.DMatrix(X_val, label=y_val_logged)

    # Train with early stopping monitored on the validation set.
    # NOTE(review): the same validation set is used for early stopping and
    # for the reported metrics, so the scores are slightly optimistic.
    model = xgb.train(baseline_params, dtrain_logged, num_boost_round=10000,
                      evals=[(dval_logged, 'eval')],
                      early_stopping_rounds=50,
                      verbose_eval=False)

    # Predict with the trees up to the best early-stopping iteration rather
    # than relying on the library default for the returned booster
    y_pred_logged = model.predict(dval_logged, iteration_range=(0, model.best_iteration + 1))

    # Go back to original scale
    y_pred = np.expm1(y_pred_logged)
    y_val = np.expm1(y_val_logged)

    # RMSE and MAPE on the original scale for reporting
    final_rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mape = mean_absolute_percentage_error(y_val, y_pred)

    records.append({
        'Threshold': thresh,
        'Final_RMSE': final_rmse,
        'MAPE': mape,
        'Rows_Removed': rows_removed,
        'Percent_Removed': pct_removed,
    })

    print(f"Threshold: {thresh}, Final RMSE: {final_rmse:.2f}, MAPE: {mape:.2%}, Percent Removed: {pct_removed:.2f}%")

# Store results in a DataFrame
results = pd.DataFrame(records, columns=['Threshold', 'Final_RMSE', 'MAPE', 'Rows_Removed', 'Percent_Removed'])
print(results)

# plot results
plt.plot(results['Threshold'], results['Final_RMSE'])
plt.xticks(thresholds)
plt.xlabel('Threshold')
plt.ylabel('RMSE')
plt.show()


: 

We see the biggest decrease in RMSE with dropping neighborhoods that occur <13 times, while only losing 1.3% of the data. So we will create a new dataframe dropping those neighborhoods.

In [None]:
# Neighborhoods with fewer than 13 listings are removed
neighborhood_counts = df['neighborhood'].value_counts()
to_drop = neighborhood_counts[neighborhood_counts < 13].index

# Keep only the listings whose neighborhood is not in the drop list;
# the result is a new, re-indexed frame and df itself is left unchanged
keep_mask = ~df['neighborhood'].isin(to_drop)
df_dropped = df.loc[keep_mask].reset_index(drop=True)

df_dropped['neighborhood'].value_counts()

: 

## Interaction Terms

### Correlation Heatmap

In [None]:
# CORRELATION HEATMAP
# Pairwise correlations between the non-neighborhood features
feature_corr = X_numerical.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap of Features')
plt.show()

: 

In [None]:
def get_high_corr(df=None, threshold=0.7):
    """List the pairs of numeric columns that are strongly correlated.

    The correlation matrix is computed from the numeric columns of ``df``;
    non-numeric columns are ignored. Each unordered pair is reported once and
    a column is never paired with itself.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Data to analyse. When omitted, the notebook-level ``df`` is used
        (looked up at call time).
    threshold : float, default 0.7
        A pair is reported when the absolute value of its correlation is
        strictly greater than this value, so strong negative correlations
        are included as well.

    Returns
    -------
    list of [str, str]
        ``[column_a, column_b]`` pairs, ordered by column position.
    """
    if df is None:
        # Fall back to the notebook-level frame, resolved when the function
        # is called rather than when it is defined
        df = globals()['df']

    corr_matrix = df.select_dtypes(include='number').corr()
    columns = corr_matrix.columns
    values = corr_matrix.to_numpy()

    high_correlations = []
    # Upper triangle only: skips the diagonal and mirrored duplicates
    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            # A NaN correlation (e.g. constant column) compares False here
            if abs(values[i, j]) > threshold:
                high_correlations.append([columns[i], columns[j]])

    return high_correlations

get_high_corr(df=df_dropped)

: 

## Polynomial Transformations

# Model Selection

Choosing a model from the following:
- Linear Regression
- XGBoost
- Decision Tree
- Random Forest

while also comparing Rent vs. Rent logged


In [None]:
def model_selection(df = df, target_var = 'rent'):
    """Fit four untuned baseline regressors and print their hold-out scores.

    The data is preprocessed with ``preprocess_data``, split 80/20
    (``random_state=42``), and each model is scored by RMSE and MAPE. When
    ``target_var`` is ``'rent_logged'``, predictions and targets are mapped
    back with ``np.expm1`` before scoring.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings data; the default is the ``df`` object that exists when this
        function is defined.
    target_var : str, default 'rent'
        Target column passed on to ``preprocess_data``.

    Returns
    -------
    None
        The score table is printed, not returned.
    """
    X, y = preprocess_data(df, target_var=target_var)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Baseline models (no tuning)
    baselines = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'Random Forest': RandomForestRegressor(random_state=42),
        'XGBoost': XGBRegressor(objective='reg:squarederror', random_state=42)
    }

    # A logged target is converted back to the original scale before scoring
    is_logged = target_var == 'rent_logged'
    y_true = np.expm1(y_test) if is_logged else y_test

    scores = []
    for name, estimator in baselines.items():
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        if is_logged:
            predictions = np.expm1(predictions)

        rmse = np.sqrt(mean_squared_error(y_true, predictions))
        mape = mean_absolute_percentage_error(y_true, predictions)
        scores.append({'Model': name, 'RMSE': rmse, 'MAPE (%)': mape*100})

    # One row per model for easy comparison
    results_df = pd.DataFrame(scores)
    print(f"Target Value: {target_var}\n",results_df)

    return None

: 

In [None]:
# Same baseline comparison on the filtered data, once per candidate target
## RENT
model_selection(df_dropped, 'rent')
print("\n\n")
## RENT LOGGED
model_selection(df_dropped, 'rent_logged')

: 

# Hyperparameter Tuning

In [None]:
# Convert data to DMatrices for optimal XGB use
X, y_logged = preprocess_data(df_dropped, target_var='rent_logged')

# 80% train / 20% test, like the other splits in this notebook.
# (train_size=0.2 would train on only 20% of the rows and test on 80%.)
X_train, X_test, y_train_logged, y_test_logged = train_test_split(X, y_logged, test_size=0.2, random_state=42)

dtrain = xgb.DMatrix(X_train, label=y_train_logged)
dtest = xgb.DMatrix(X_test, label=y_test_logged)

: 

In [None]:
# Expanded Parameter Grid
param_grid = {
    'learning_rate': [0.001, 0.005],
    'max_depth': [3, 5, 7],
    'lambda': [0.001, 1, 10],
    'alpha': [0, 0.1, 1],
    'subsample': [0.6,1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_weight': [1, 3]
}

# Track the best combination. Selection is based on the cross-validated RMSE
# computed on the training data only; choosing parameters by their hold-out
# score would make the hold-out set part of the tuning and bias the final
# test metrics.
best_cv_rmse = float("inf")
best_rmse = float("inf")      # hold-out RMSE (original scale) of the selected combination
best_params = None
best_iteration = None

# Hold-out targets on the original scale do not change between combinations
y_test = np.expm1(y_test_logged)

# Iterate through all parameter combinations (same order as the grid keys)
param_names = list(param_grid)
for values in product(*param_grid.values()):
    combo = dict(zip(param_names, values))
    print("Params: " + ", ".join(f"{name}={value}" for name, value in combo.items()))

    # Define parameters for this combination
    params = {
        'objective': 'reg:squarederror',
        **combo,
        'eval_metric': 'rmse',
        'tree_method': 'hist'
    }

    # Perform cross-validation
    cv_results = xgb.cv(params, dtrain, num_boost_round=15000, nfold=4,
                        metrics='rmse', early_stopping_rounds=15, seed=42)

    # Get best RMSE from CV
    best_rmse_logged = cv_results['test-rmse-mean'].min()
    # idxmin() is the 0-based row of the best round; the number of boosting
    # rounds needed to reach it is one more
    this_iteration = int(cv_results['test-rmse-mean'].idxmin()) + 1

    # Retrain on the full training set with that many rounds
    model = xgb.train(params, dtrain, num_boost_round=this_iteration)

    # Hold-out predictions, reported for information only
    y_pred_logged = model.predict(dtest)

    # Reverse log transformation
    y_pred = np.expm1(y_pred_logged)

    # Calculate RMSE on original scale
    rmse_original = np.sqrt(mean_squared_error(y_test, y_pred))

    # Print progress for this combination (its own round count, not the best so far)
    print(f"  ↳ Best iteration: {this_iteration}, RMSE (logged): {best_rmse_logged:.4f}, RMSE (original scale): {rmse_original:.2f}\n")

    # Update best parameters if a lower cross-validated RMSE is found
    if best_rmse_logged < best_cv_rmse:
        best_cv_rmse = best_rmse_logged
        best_rmse = rmse_original
        best_params = params
        best_iteration = this_iteration

# Display best results
print("Best parameters found:", best_params)
print(f"Optimal num_boost_round: {best_iteration}")
print(f"Best CV RMSE (logged): {best_cv_rmse:.4f}")
print(f"Hold-out RMSE of selected parameters (original scale): {best_rmse:.4f}")


: 

# Model Training

In [None]:
# 80% train / 20% test, like the other splits in this notebook.
# (train_size=0.2 would train on only 20% of the rows and test on 80%.)
X_train, X_test, y_train_logged, y_test_logged = train_test_split(X, y_logged, test_size=0.2, random_state=42)

dtrain = xgb.DMatrix(X_train, label=y_train_logged)
dtest = xgb.DMatrix(X_test, label=y_test_logged)

: 

In [None]:
# Use the best parameters found from tuning
# (params refers to the same dict object as best_params; no copy is made)
params = best_params
print("params:", params)

# Train final model using the optimal num_boost_round
# (best_iteration is whatever the tuning cell stored for the best combination)
final_model = xgb.train(params, dtrain, num_boost_round=best_iteration)

: 

In [None]:
# Hold-out predictions on the log scale, as produced by the model
y_pred_logged = final_model.predict(dtest)

# Back-transform predictions and targets to the original scale
y_pred, y_test = np.expm1(y_pred_logged), np.expm1(y_test_logged)

# RMSE on the log scale
rmse_logged = np.sqrt(mean_squared_error(y_true=y_test_logged, y_pred=y_pred_logged))
print(f"Test RMSE (logged ): {rmse_logged:.2f}")

# RMSE on the original scale
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(f"Test RMSE (original scale): {rmse:.2f}")

: 

In [None]:
# Make predictions on test set (log scale)
y_pred_logged = final_model.predict(dtest)

# RMSE on the log scale: truth and predictions must both be on the log scale,
# so the log-scale predictions are used here
rmse = np.sqrt(mean_squared_error(y_test_logged, y_pred_logged))
print(f"Test RMSE (logged ): {rmse:.2f}")

# RMSE on the original scale has to be computed from back-transformed values;
# expm1 of the log-scale RMSE is not the original-scale RMSE
rmse_original = np.sqrt(mean_squared_error(np.expm1(y_test_logged), np.expm1(y_pred_logged)))
print(f"Test RMSE (original scale): {rmse_original:.2f}")

: 

In [None]:
# Recompute both metrics from the log-scale predictions so each number is on
# the scale its label claims; expm1 of the log-scale RMSE is not the
# original-scale RMSE
rmse = np.sqrt(mean_squared_error(y_test_logged, y_pred_logged))
rmse_original = np.sqrt(mean_squared_error(np.expm1(y_test_logged), np.expm1(y_pred_logged)))
print(f"Test RMSE (logged ): {rmse:.2f}")
print(f"Test RMSE (original scale): {rmse_original:.2f}")

: 

# Model Evaluation

In [None]:
# Residual Analysis
# Raw model output: final_model was trained on the log-transformed rent
# labels, so y_pred is on the log scale from here on
y_pred = final_model.predict(dtest)

: 

In [None]:
# Residuals on the log scale. y_pred holds the model's log-scale predictions
# for the hold-out set, so it is compared with that set's log-scale targets
# (the same pair used for R-squared below), not with y_val, which is left
# over from the threshold sweep and refers to different rows.
residuals = y_test_logged - y_pred
plt.figure(figsize=(10, 6))
sns.histplot(residuals, kde=True, color='blue')
plt.title('Residual Distribution')
plt.xlabel('Residuals')
plt.grid(False)
plt.show()

# R-Squared (r2_score is imported with the other metrics at the top of the notebook)
r2 = r2_score(y_test_logged, y_pred)
print(f"R-Squared: {r2:.4f}")


: 

In [None]:
# Plot RMSE over Boosting Rounds for the selected hyperparameters.
# The cv_results variable left behind by the grid search belongs to the last
# combination that was tried, not to the best one, so cross-validation is
# re-run here with best_params for the chosen number of rounds.
best_cv_results = xgb.cv(best_params, dtrain, num_boost_round=max(int(best_iteration), 1),
                         nfold=4, metrics='rmse', seed=42)

plt.figure(figsize=(10, 6))
plt.plot(best_cv_results['train-rmse-mean'], label='Train RMSE')
plt.plot(best_cv_results['test-rmse-mean'], label='Test RMSE')
plt.xlabel('Boosting Rounds')
plt.ylabel('RMSE')
plt.title('Cross-Validation Performance')
plt.legend()
plt.grid(False)
plt.show()


: 