In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Import data

In [None]:
# Load the Boston housing data from a CSV path relative to the current working directory.
df = pd.read_csv("./data/boston.csv")

Input features in order:
1) CRIM: per capita crime rate by town
2) ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
3) INDUS: proportion of non-retail business acres per town
4) CHAS: Charles River dummy variable (1 if tract bounds river; 0 otherwise)
5) NOX: nitric oxides concentration (parts per 10 million) [parts/10M]
6) RM: average number of rooms per dwelling
7) AGE: proportion of owner-occupied units built prior to 1940
8) DIS: weighted distances to five Boston employment centres
9) RAD: index of accessibility to radial highways
10) TAX: full-value property-tax rate per $10,000 [$/10k]
11) PTRATIO: pupil-teacher ratio by town
12) B: The result of the equation B=1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
13) LSTAT: % lower status of the population

Output variable:
1) MEDV: Median value of owner-occupied homes in $1000's [k$]



In [None]:
# get familiar with the data, check the shape, the first 5 rows, df.describe(), df.info()

# Data cleaning

## Remove NaN values

In [None]:
df = 

## Handle outliers

In [None]:
# Did you notice anything weird in df.describe()? If not, check out MEDV's max value, the min value and the mean value. 
# What's going on? Is this a problem? How can you fix it?

## Remove duplicates

In [None]:
# HINT: there is a function in pandas made just for this purpose
# HINT 2: https://google.gprivate.com/search.php?search?q=pandas+remove+duplicates
df = 

# Model creation

In [None]:
# Create the XGBoost regression model. XGBoost stands for: eXtreme Gradient Boosting.
# This is a very popular algorithm, used in machine learning competitions and in the industry.
# We will use it for regression, but it can also be used for classification.

# No arguments are passed, so this baseline model starts with the default hyperparameters.
model = xgb.XGBRegressor()

In [None]:
# let y be the target column, and X be the rest of the df
X = 
y = 

In [None]:
# Split the data into train and test sets with the function train_test_split from sklearn. Use test_size=0.2 and random_state=42
# We use train_test_split to split the data into train and test sets. We will use the train set to train the model, and the test set to evaluate the model.
# The reason we need a test set is to be able to evaluate the model. If we train the model on the whole dataset, 
# it will learn the dataset perfectly, but we will not know how it performs on unseen data. 

X_train, X_test, y_train, y_test = 

In [None]:
# use the training set (X_train, y_train) to train the model by calling the .fit() method
model.fit(?, ?)

In [None]:
# Use the model to predict the target values for the test set (X_test)
preds = model.predict(?)

In [None]:
# Compute the mean squared error (MSE) of the predictions: a single number that measures how far off they are (lower is better).
# It is the average of the squared differences between y_test and preds.
mse = mean_squared_error(?, ?)

In [None]:
# print the mse to see how much, on average, your model is off (squared)

# Hyperparameter tuning

In [None]:
# These are some of the hyperparameters you can tune for XGBoost. 
# A hyperparameter is a parameter that is not learned by the model, but is set by the user.
# The parameters that are learned by the model are called model parameters.
# The model starts off with some default values for the hyperparameters, but you can change them to get potentially better results.
# This process is called hyperparameter tuning.

# If you want, you can adjust the hyperparameters and see if you can get a better result. You can also add more hyperparameters to the dictionary.
# List of hyperparameters: https://xgboost.readthedocs.io/en/latest/parameter.html
# Search space: each keyword is a hyperparameter name, each list holds the candidate values to sample from.
params = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
    n_estimators=[100, 200, 300, 400, 500, 900, 1100, 1500],
)

In [None]:
# Use RandomizedSearchCV to find the best hyperparameters for the model. There are other ways to do this, but random search will work for this purpose.
# Random search is a method for hyperparameter tuning that will try a given number of random combinations of hyperparameters.
# Use the training set (X_train, y_train) to run the random search by calling its .fit() method; keep the test set (X_test, y_test) aside for the final evaluation.
# HINT: n_iter is the number of iterations to run the random search, if this number is too high, it will take a long time to run, 
# but if it's too low, it will not find the best hyperparameters. You should try to find a happy medium.

# First, create a new, similar model, but with the default hyperparameters. Do not fit this model with the training set.
model2 = 

random_search = RandomizedSearchCV(?, param_distributions=params, n_iter=?, scoring="neg_mean_squared_error", n_jobs=-1, cv=5)

# Fit the model with x and y train sets
random_search.fit(?, ?)

In [None]:
# Retrieve the best model/estimator from the random search
model_new = ?

In [None]:
# Create new predictions with the new model
preds = 

In [None]:
# Get the new mean square error
mse_new = ?

mse_new

In [None]:
# Ratio of the new model's MSE to the first model's MSE; a value below 1 means the new model has the lower error.
error_ratio = mse_new / mse
print(f"relation between better error on the new model and the old error: {error_ratio}")

# If the new model did not perform better, this means that the default hyperparameters were better, but it is highly likely that even better ones exist.
# You can try to run the random search again, but with more iterations, or you can try to use GridSearchCV instead of RandomizedSearchCV to test _every_ combination of hyperparameters.
# You can also edit the hyperparameters in the dictionary to see if you can get better results.