# 1. Info

Notebook with all the code needed to solve the homework for the week six of the machine learning zoomcamp.

## Install the required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import pickle
import requests
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Getting the data

In this homework, we will use the California Housing Prices from Kaggle.

In [2]:
# !wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

# Preparing the dataset

* We are going to use all columns of the dataset.
* First, keep only the records where ocean_proximity is either '<1H OCEAN' or 'INLAND'
* Fill missing values with zeros.
* Apply the log tranform to median_house_value.
* Do train/validation/test split with 60%/20%/20% distribution.
* Use the train_test_split function and set the random_state parameter to 1.
* Use DictVectorizer(sparse=True) to turn the dataframes into matrices.

In [3]:
data = pd.read_csv('./housing.csv')

In [4]:
def data_preparation(data):

    df = data[(data["ocean_proximity"] == "<1H OCEAN") | (data["ocean_proximity"] == "INLAND")].copy()

    df.fillna(0, inplace=True)

    # adding some data cleaning for the xboost 
    df['ocean_proximity'] = df['ocean_proximity'].str.replace('<','under_')

    df["median_house_value"] = np.log1p(df["median_house_value"])
    df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
    df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

    df_train = df_train.reset_index(drop=True).copy()
    df_val = df_val.reset_index(drop=True).copy()
    df_test = df_test.reset_index(drop=True).copy()

    y_train = df_train["median_house_value"].values
    y_val = df_val["median_house_value"].values
    y_test = df_test["median_house_value"].values


    df_train.drop(labels=['median_house_value'], axis=1, inplace=True)
    df_val.drop(labels=['median_house_value'], axis=1, inplace=True)
    df_test.drop(labels=['median_house_value'], axis=1, inplace=True)

    dv = DictVectorizer(sparse=False)

    X_train = dv.fit_transform(df_train.to_dict(orient='records'))
    X_test = dv.fit_transform(df_test.to_dict(orient='records'))
    X_val = dv.fit_transform(df_val.to_dict(orient='records'))

    return X_train, X_val, X_test, y_train, y_val, y_test, dv

X_train, X_val, X_test, y_train, y_val, y_test, dv = data_preparation(data)

# Question 1

Let's train a decision tree regressor to predict the median_house_value variable.

Train a model with max_depth=1.
Which feature is used for splitting the data?

* ocean_proximity
* total_rooms
* latitude
* population

In [5]:

feature_names = [i for i in dv.get_feature_names_out()]

regressor = DecisionTreeRegressor(max_depth=1)
regressor.fit(X_train, y_train)

feature_index = regressor.tree_.feature[0]
feature_name = feature_names[feature_index]

print(f"The feature used for splitting the data is {feature_name}")
     

The feature used for splitting the data is ocean_proximity=under_1H OCEAN


# Question 2

Train a random forest model with these parameters:

* n_estimators=10
* random_state=1
* n_jobs=-1 (optional - to make training faster)

What's the RMSE of this model on validation?

* 0.045
* 0.245
* 0.545
* 0.845

In [6]:
model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print(f"RMSE = {round(rmse,3)}")

RMSE = 0.245


# Question 3

Now let's experiment with the n_estimators parameter

* Try different values of this parameter from 10 to 200 with step 10.
* Set random_state to 1.
* Evaluate the model on the validation dataset.

After which value of n_estimators does RMSE stop improving?

* 10
* 25
* 50
* 160

In [7]:
n_estimators_range = range(10, 201, 10)

estimators = []
rmse_list = []
# best_rmse = float('inf')
# best_n_estimators = 0

for estimator_value in n_estimators_range:
    model_rf = RandomForestRegressor(
        n_estimators=estimator_value, 
        random_state=1)
    model_rf.fit(X_train, y_train)
    y_pred = model_rf.predict(X_val)

    rmse = round(np.sqrt(mean_squared_error(y_val, y_pred)),3)
    estimators.append(estimator_value)
    rmse_list.append(rmse)

In [8]:
pd.DataFrame({'estimator':estimators,'rmse':rmse_list}).sort_values(by=['estimator'])

Unnamed: 0,estimator,rmse
0,10,0.245
1,20,0.239
2,30,0.237
3,40,0.235
4,50,0.235
5,60,0.234
6,70,0.234
7,80,0.235
8,90,0.235
9,100,0.234


# Question 4
Let's select the best max_depth:

* Try different values of max_depth: [10, 15, 20, 25]
* For each of these values, try different values of n_estimators from 10 till 200 (with step 10)
* Fix the random seed: random_state=1

What's the best max_depth:

* 10
* 15
* 20
* 25

In [9]:
scores = []

for max_depth in [10, 15, 20, 25]:
    for n_estimators in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n_estimators, 
            max_depth=max_depth,
            random_state=1
            )
        rf.fit(X_train, y_train)
        
        y_pred = rf.predict(X_val)
        rmse = round(np.sqrt(mean_squared_error(y_val, y_pred)), 3)
        scores.append((n_estimators, max_depth, rmse))

In [10]:
data_columns = ["n_estimators","max_depth", "rmse"]
df_scores = pd.DataFrame(scores,columns=data_columns)
df_scores.groupby(['max_depth'])['rmse'].mean()

max_depth
10    0.24525
15    0.23645
20    0.23510
25    0.23485
Name: rmse, dtype: float64

# Question 5

We can extract feature importance information from tree-based models.

At each step of the decision tree learning algorith, it finds the best split. When doint it, we can calculate "gain" - the reduction in impurity before and after the split. This gain is quite useful in understanding what are the imporatant features for tree-based models.

In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.

For this homework question, we'll find the most important feature:

* Train the model with these parametes:
    * n_estimators=10,
    * max_depth=20,
    * random_state=1,
    * n_jobs=-1 (optional)
* Get the feature importance information from this model

What's the most important feature (among these 4)?

* total_rooms
* median_income
* total_bedrooms
* longitude


In [11]:
rf_model = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)

In [12]:
importance_df = pd.DataFrame({'feature':dv.feature_names_,'importance':rf.feature_importances_})
importance_df.sort_values(by="importance", ascending=False)

Unnamed: 0,feature,importance
4,median_income,0.336561
6,ocean_proximity=under_1H OCEAN,0.23123
5,ocean_proximity=INLAND,0.133051
2,latitude,0.100664
3,longitude,0.08656
1,housing_median_age,0.032059
7,population,0.027506
9,total_rooms,0.021402
8,total_bedrooms,0.015813
0,households,0.015155


# Question 6

Now let's train an XGBoost model! For this question, we'll tune the eta parameter:

* Install XGBoost
* Create DMatrix for train and validation
* Create a watchlist
* Train a model with these parameters for 100 rounds:

```Python
    xgb_params = {
        'eta': 0.3, 
        'max_depth': 6,
        'min_child_weight': 1,
        
        'objective': 'reg:squarederror',
        'nthread': 8,
        
        'seed': 1,
        'verbosity': 1,
    }
```

Now change eta from 0.3 to 0.1.

Which eta leads to the best RMSE score on the validation dataset?

* 0.3
* 0.1
* Both give equal value

In [16]:
features = dv.feature_names_
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [17]:
watchlist = [(dtrain,'train'),(dval, 'validation')]

In [31]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}
model = xgb.train(xgb_params, dtrain, num_boost_round=100, verbose_eval=10, evals=watchlist)
y_pred = model.predict(dval)
initial_rmse = np.sqrt(np.mean((y_val - y_pred) ** 2))

[0]	train-rmse:8.07362	validation-rmse:8.07348
[10]	train-rmse:0.33195	validation-rmse:0.34802
[20]	train-rmse:0.20036	validation-rmse:0.24508
[30]	train-rmse:0.18204	validation-rmse:0.23833
[40]	train-rmse:0.16422	validation-rmse:0.23379
[50]	train-rmse:0.15210	validation-rmse:0.23262
[60]	train-rmse:0.14218	validation-rmse:0.23160
[70]	train-rmse:0.13471	validation-rmse:0.23108
[80]	train-rmse:0.12835	validation-rmse:0.23045
[90]	train-rmse:0.12174	validation-rmse:0.22957
[99]	train-rmse:0.11656	validation-rmse:0.22897


In [32]:
xgb_params['eta'] = 0.1
model = xgb.train(xgb_params, dtrain, num_boost_round=100, verbose_eval=10, evals=watchlist)
y_pred = model.predict(dval)
updated_rmse = np.sqrt(np.mean((y_val - y_pred) ** 2))

[0]	train-rmse:10.37456	validation-rmse:10.37545
[10]	train-rmse:3.63299	validation-rmse:3.62939
[20]	train-rmse:1.29412	validation-rmse:1.29329
[30]	train-rmse:0.50217	validation-rmse:0.51149
[40]	train-rmse:0.26743	validation-rmse:0.29345
[50]	train-rmse:0.21112	validation-rmse:0.24907
[60]	train-rmse:0.19724	validation-rmse:0.24107
[70]	train-rmse:0.18911	validation-rmse:0.23824
[80]	train-rmse:0.18145	validation-rmse:0.23594
[90]	train-rmse:0.17418	validation-rmse:0.23307
[99]	train-rmse:0.17000	validation-rmse:0.23234
