# Requirements

In [None]:
import pandas as pd

In [None]:
# Add as many imports as you need.

# Laboratory Exercise - Run Mode (8 points)

## Introduction
In this laboratory assignment, the focus is on time series forecasting, specifically targeting the prediction of the current **count of page loads**. Your task involves employing bagging and boosting methods to forecast the **count of page loads**. To accomplish this, you will use data from the preceding three days, consisting of **count of page loads**, **count of unique visits**, **count of first time-visits**, **count of returning visits**, and the current **day**.

**Note: You are required to perform this laboratory assignment on your local machine.**

## The Website Visitors Dataset

## Downloading the Website Visitors Dataset

In [None]:
!pip install gdown==4.6.3



In [None]:
!gdown 1h1P3NboXKbWNngqgCB1i4bHFRcv7uDJe # Download the dataset.

Downloading...
From: https://drive.google.com/uc?id=1h1P3NboXKbWNngqgCB1i4bHFRcv7uDJe
To: /content/website-visitors.csv
  0% 0.00/92.8k [00:00<?, ?B/s]100% 92.8k/92.8k [00:00<00:00, 44.7MB/s]


## Exploring the Website Visitors Dataset
This datatset consists of daily counts of page loads, unique visitors, first-time visitors, and returning visitors for an academic teaching notes website. The dataset contains 2167 rows spanning from September 14, 2014, to August 19, 2020. A visit is defined as a series of hits on one or more pages by the same user on a given day, identified by IP address. Shared IP addresses are considered a single user. A visit is "unique" if a hit from the same IP address hasn't occurred within the last 6 hours. Returning visitors are identified by cookies, and the rest are classified as first-time visitors. The unique visitor count is the sum of returning and first-time visitors.

The dataset consists of the following columns:
- date - date in mm/dd/yyyy format,
- day_of_week - day of the week in text form (Monday, Tuesday, etc.),
- page_loads - daily number of pages loaded,
- unique_visits - daily number of visitors from IP addresses with no hits in over 6 hours,
- first_time_visits - number of unique visitors without a identifying cookie, and
- returning_visits - number of unique visitors minus first-time visitors.

*Note: The dataset is complete, with no missing values in any of its entries.*

Load the dataset into a `pandas` data frame.

In [None]:
# Write your code here. Add as many boxes as you need.
df = pd.read_csv('/content/website-visitors.csv')
df

Unnamed: 0,date,day_of_week,page_loads,unique_visits,first_time_visits,returning_visits
0,9/14/2014,Sunday,2.146,1.582,1.430,152.0
1,9/15/2014,Monday,3.621,2.528,2.297,231.0
2,9/16/2014,Tuesday,3.698,2.630,2.352,278.0
3,9/17/2014,Wednesday,3.667,2.614,2.327,287.0
4,9/18/2014,Thursday,3.316,2.366,2.130,236.0
...,...,...,...,...,...,...
2162,8/15/2020,Saturday,2.221,1.696,1.373,323.0
2163,8/16/2020,Sunday,2.724,2.037,1.686,351.0
2164,8/17/2020,Monday,3.456,2.638,2.181,457.0
2165,8/18/2020,Tuesday,3.581,2.683,2.184,499.0


Explore the dataset using visualizations of your choice.

In [None]:
# Write your code here. Add as many boxes as you need.

Encode the categorical features.

In [None]:
# Write your code here. Add as many boxes as you need.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

df['day_of_week'] = encoder.fit_transform(df['day_of_week'])
df

Unnamed: 0,date,day_of_week,page_loads,unique_visits,first_time_visits,returning_visits
0,9/14/2014,3,2.146,1.582,1.430,152.0
1,9/15/2014,1,3.621,2.528,2.297,231.0
2,9/16/2014,5,3.698,2.630,2.352,278.0
3,9/17/2014,6,3.667,2.614,2.327,287.0
4,9/18/2014,4,3.316,2.366,2.130,236.0
...,...,...,...,...,...,...
2162,8/15/2020,2,2.221,1.696,1.373,323.0
2163,8/16/2020,3,2.724,2.037,1.686,351.0
2164,8/17/2020,1,3.456,2.638,2.181,457.0
2165,8/18/2020,5,3.581,2.683,2.184,499.0


# Feauture Extraction
Apply a lag of one, two, and three days to each chosen feature (except `day_of_week`), creating a set of features representing the statistics from the previous three days. To maintain dataset integrity, eliminate any resulting missing values at the beginning of the dataset.

Hint: Use `df['column_name'].shift(period)`. Check the documentation at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.shift.html.

In [None]:
# Write your code here. Add as many boxes as you need.
import numpy as np

to_shift = []

for c in df.columns:
  if c != 'date' and c != 'day_of_week':
    to_shift.append(c)

to_shift

['page_loads', 'unique_visits', 'first_time_visits', 'returning_visits']

In [None]:
lag = 4

features = []

for i in range(lag, 0, -1):
  for s in to_shift:
    df[f'{s}_prev_{i}'] = df[s].shift(i)
    # mozda ne treba ova
    # if s != 'page_loads':
    features.append(f'{s}_prev_{i}')

features

['page_loads_prev_4',
 'unique_visits_prev_4',
 'first_time_visits_prev_4',
 'returning_visits_prev_4',
 'page_loads_prev_3',
 'unique_visits_prev_3',
 'first_time_visits_prev_3',
 'returning_visits_prev_3',
 'page_loads_prev_2',
 'unique_visits_prev_2',
 'first_time_visits_prev_2',
 'returning_visits_prev_2',
 'page_loads_prev_1',
 'unique_visits_prev_1',
 'first_time_visits_prev_1',
 'returning_visits_prev_1']

In [None]:
df.drop('unique_visits', axis=1, inplace=True)
df.drop('first_time_visits', axis=1, inplace=True)
df.drop('returning_visits', axis=1, inplace=True)
df.drop('date', axis=1, inplace=True)

## Dataset Splitting
Partition the dataset into training and testing sets with an 80:20 ratio.

**WARNING: DO NOT SHUFFLE THE DATASET.**



In [None]:
# Write your code here. Add as many boxes as you need.
df.dropna(axis=0, inplace=True)
df.isnull().sum()

day_of_week                 0
page_loads                  0
page_loads_prev_4           0
unique_visits_prev_4        0
first_time_visits_prev_4    0
returning_visits_prev_4     0
page_loads_prev_3           0
unique_visits_prev_3        0
first_time_visits_prev_3    0
returning_visits_prev_3     0
page_loads_prev_2           0
unique_visits_prev_2        0
first_time_visits_prev_2    0
returning_visits_prev_2     0
page_loads_prev_1           0
unique_visits_prev_1        0
first_time_visits_prev_1    0
returning_visits_prev_1     0
dtype: int64

In [None]:
features

['page_loads_prev_4',
 'unique_visits_prev_4',
 'first_time_visits_prev_4',
 'returning_visits_prev_4',
 'page_loads_prev_3',
 'unique_visits_prev_3',
 'first_time_visits_prev_3',
 'returning_visits_prev_3',
 'page_loads_prev_2',
 'unique_visits_prev_2',
 'first_time_visits_prev_2',
 'returning_visits_prev_2',
 'page_loads_prev_1',
 'unique_visits_prev_1',
 'first_time_visits_prev_1',
 'returning_visits_prev_1']

In [None]:
X, Y = df[features], df['page_loads']

In [None]:
X

Unnamed: 0,page_loads_prev_4,unique_visits_prev_4,first_time_visits_prev_4,returning_visits_prev_4,page_loads_prev_3,unique_visits_prev_3,first_time_visits_prev_3,returning_visits_prev_3,page_loads_prev_2,unique_visits_prev_2,first_time_visits_prev_2,returning_visits_prev_2,page_loads_prev_1,unique_visits_prev_1,first_time_visits_prev_1,returning_visits_prev_1
4,2.146,1.582,1.430,152.0,3.621,2.528,2.297,231.0,3.698,2.630,2.352,278.0,3.667,2.614,2.327,287.0
5,3.621,2.528,2.297,231.0,3.698,2.630,2.352,278.0,3.667,2.614,2.327,287.0,3.316,2.366,2.130,236.0
6,3.698,2.630,2.352,278.0,3.667,2.614,2.327,287.0,3.316,2.366,2.130,236.0,2.815,1.863,1.622,241.0
7,3.667,2.614,2.327,287.0,3.316,2.366,2.130,236.0,2.815,1.863,1.622,241.0,1.658,1.118,985.000,133.0
8,3.316,2.366,2.130,236.0,2.815,1.863,1.622,241.0,1.658,1.118,985.000,133.0,2.288,1.656,1.481,175.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2162,3.740,2.742,2.258,484.0,3.767,2.904,2.477,427.0,3.621,2.780,2.322,458.0,2.971,2.308,1.922,386.0
2163,3.767,2.904,2.477,427.0,3.621,2.780,2.322,458.0,2.971,2.308,1.922,386.0,2.221,1.696,1.373,323.0
2164,3.621,2.780,2.322,458.0,2.971,2.308,1.922,386.0,2.221,1.696,1.373,323.0,2.724,2.037,1.686,351.0
2165,2.971,2.308,1.922,386.0,2.221,1.696,1.373,323.0,2.724,2.037,1.686,351.0,3.456,2.638,2.181,457.0


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)

## Ensemble Learning Methods

### Bagging

Create an instance of a Random Forest model and train it using the `fit` function.

In [None]:
# Write your code here. Add as many boxes as you need.
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=1500, criterion='squared_error', max_depth=10)
rf.fit(X_train, Y_train)

Use the trained model to make predictions for the test set.

In [None]:
# Write your code here. Add as many boxes as you need.
y_pred = rf.predict(X_test)

Assess the performance of the model by using different metrics provided by the `scikit-learn` library.

In [None]:
# Write your code here. Add as many boxes as you need.
from sklearn.metrics import mean_squared_error, r2_score

r2_score(Y_test, y_pred)

0.8315276566893826

### Boosting

Create an instance of an XGBoost model and train it using the `fit` function.

In [193]:
# Write your code here. Add as many boxes as you need.
from xgboost import XGBRegressor

# model = XGBRegressor(objective ='reg:linear')


# model.fit(X_train, Y_train)

model = XGBRegressor(objective ='reg:linear', learning_rate = 0.1, max_depth=3, n_estimators=100)
model.fit(X_train, Y_train)



Use the trained model to make predictions for the test set.

In [194]:
# Write your code here. Add as many boxes as you need.
y_pred = model.predict(X_test)


Assess the performance of the model by using different metrics provided by the `scikit-learn` library.

In [195]:
# Write your code here. Add as many boxes as you need.
r2_score(y_pred, Y_test)

0.7828771412684465

# Laboratory Exercise - Bonus Task (+ 2 points)

As part of the bonus task in this laboratory assignment, your objective is to fine-tune the max_depth (`max_depth`) for the Random Forest model using a cross-validation with grid search and time series split. This involves systematically experimenting with various values for `max_depth` and evaluating the model's performance using cross-validation. Upon determining the most suitable `max_depth` value, evaluate the model's performance on a test set for final assessment.

Hints:
- For grid search use the `GridCVSearch` from the `scikit-learn` library. Check the documentation at https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html.
- For cross-validation use the `TimeSeriesSplit` from the `scikit-learn` library. Check the documentation at https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html.

## Dataset Splitting
Partition the dataset into training and testing sets with an 90:10 ratio.

**WARNING: DO NOT SHUFFLE THE DATASET.**

In [198]:
# Write your code here. Add as many boxes as you need.
sdf = df.head(500)
sdf

Unnamed: 0,day_of_week,page_loads,page_loads_prev_4,unique_visits_prev_4,first_time_visits_prev_4,returning_visits_prev_4,page_loads_prev_3,unique_visits_prev_3,first_time_visits_prev_3,returning_visits_prev_3,page_loads_prev_2,unique_visits_prev_2,first_time_visits_prev_2,returning_visits_prev_2,page_loads_prev_1,unique_visits_prev_1,first_time_visits_prev_1,returning_visits_prev_1
4,4,3.316,2.146,1.582,1.430,152.0,3.621,2.528,2.297,231.0,3.698,2.630,2.352,278.0,3.667,2.614,2.327,287.0
5,0,2.815,3.621,2.528,2.297,231.0,3.698,2.630,2.352,278.0,3.667,2.614,2.327,287.0,3.316,2.366,2.130,236.0
6,2,1.658,3.698,2.630,2.352,278.0,3.667,2.614,2.327,287.0,3.316,2.366,2.130,236.0,2.815,1.863,1.622,241.0
7,3,2.288,3.667,2.614,2.327,287.0,3.316,2.366,2.130,236.0,2.815,1.863,1.622,241.0,1.658,1.118,985.000,133.0
8,1,3.638,3.316,2.366,2.130,236.0,2.815,1.863,1.622,241.0,1.658,1.118,985.000,133.0,2.288,1.656,1.481,175.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,5,4.791,3.340,2.455,1.990,465.0,2.344,1.639,1.338,301.0,2.973,2.094,1.736,358.0,4.638,3.205,2.649,556.0
500,6,5.104,2.344,1.639,1.338,301.0,2.973,2.094,1.736,358.0,4.638,3.205,2.649,556.0,4.791,3.409,2.834,575.0
501,4,4.621,2.973,2.094,1.736,358.0,4.638,3.205,2.649,556.0,4.791,3.409,2.834,575.0,5.104,3.768,3.197,571.0
502,0,3.831,4.638,3.205,2.649,556.0,4.791,3.409,2.834,575.0,5.104,3.768,3.197,571.0,4.621,3.439,2.876,563.0


In [200]:
X_train, X_test, y_train, y_test = train_test_split(sdf[features], sdf['page_loads'], test_size=0.2, random_state=42)

## Fine-tuning the Random Forest Hyperparameter
Experiment with various values for `max_depth` and evaluate the model's performance using cross-validation.

In [207]:
# Write your code here. Add as many boxes as you need.
from sklearn.model_selection import GridSearchCV

rf = RandomForestRegressor()

param_grid = {
    'max_depth': [None, 50, 100, 200, 500, 1000]  # Adjust the range based on your requirements
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

In [205]:
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)

Best Parameters: {'max_depth': 200}
Best Score: -0.8820899502035866


## Final Assessment of the Model Performance
Upon determining the most suitable `max_depth` value, evaluate the model's performance on a test set for final assessment.