# Imports 

In [17]:
from pathlib import Path
import warnings

warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils
from pathlib import Path
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

from sklearn.pipeline import make_pipeline, Pipeline

## Importing Data

In [10]:
# Training features and target come from the project's helper module.
X, y = utils.get_train_data()

# Features of the held-out set used for the final submission.
final_test_path = Path("data", "final_test.parquet")
X_final_test = pd.read_parquet(final_test_path)

## Importing Starter Kit Functions

In [53]:
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    """Split ``X`` and ``y`` chronologically using the ``date`` column of ``X``.

    The cutoff is the latest date in ``X`` minus ``delta_threshold``. Rows
    dated on or before the cutoff go to the training set; rows dated strictly
    after it go to the test set.

    Parameters
    ----------
    X : pandas.DataFrame
        Features; must contain a ``date`` column.
    y : array-like
        Targets, indexed with the same boolean mask that selects rows of ``X``.
    delta_threshold : str, default "30 days"
        Length of the test window, in any form ``pd.Timedelta`` accepts.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    newest = X["date"].max()
    in_train = X["date"] <= newest - pd.Timedelta(delta_threshold)
    in_test = ~in_train
    return X.loc[in_train], y[in_train], X.loc[in_test], y[in_test]

def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

def filter_columns(X):
    """Return only the ``date``, ``latitude`` and ``longitude`` columns of ``X``."""
    return X.loc[:, ['date', 'latitude', 'longitude']]

# Preprocessing

## Create X + Weather

In [45]:
def _calendar_day(timestamps):
    """Truncate a datetime Series to midnight as tz-naive ``datetime64[ns]``."""
    days = timestamps.dt.normalize()
    if days.dt.tz is not None:
        # Drop the timezone but keep the local wall-clock date.
        days = days.dt.tz_localize(None)
    # A single resolution on both sides keeps the merge keys comparable.
    return days.astype('datetime64[ns]')


def create_x_weather(X, weather_path='data/weather_data_paris_daily.csv'):
    """Join daily weather observations onto ``X`` by calendar day.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a datetime ``date`` column. It is not modified.
    weather_path : str or path-like, default 'data/weather_data_paris_daily.csv'
        CSV file with at least the columns ``datetime``, ``temp``, ``precip``,
        ``windspeed``, ``visibility`` and ``icon``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``X`` followed by ``temp``, ``precip``, ``windspeed``,
        ``visibility`` and ``icon_encoded``, one row per row of ``X`` in the
        same order. The result has a fresh ``RangeIndex`` because it comes out
        of ``DataFrame.merge``. Days missing from the weather file give NaN
        weather values.
    """
    icon_codes = {'snow': 0, 'rain': 1, 'cloudy': 2, 'partly-cloudy-day': 3, 'clear-day': 4}
    numeric_cols = ['temp', 'precip', 'windspeed', 'visibility']

    weather_raw = pd.read_csv(weather_path)
    weather = weather_raw[numeric_cols].copy()
    # Icons that are not in the mapping become NaN.
    weather['icon_encoded'] = weather_raw['icon'].map(icon_codes)
    weather['date_merge'] = _calendar_day(pd.to_datetime(weather_raw['datetime']))
    # Keep one observation per day so the left merge can never multiply rows
    # of X (which would misalign the features with the target).
    weather = weather.drop_duplicates(subset='date_merge')

    X_weather = X.copy()
    X_weather['date_merge'] = _calendar_day(X_weather['date'])
    merged = X_weather.merge(weather, how='left', on='date_merge')
    return merged.drop(columns=['date_merge'])

# Pipelines

In [55]:
# Hold out the most recent 30 days as a validation set.
X_train, y_train, X_test, y_test = train_test_split_temporal(X, y, delta_threshold="30 days")

model = xgb.XGBRegressor(
    colsample_bynode=0.6,
    colsample_bytree=0.6,
    colsample_bylevel=0.5,
    learning_rate=0.1,
    n_estimators=400,
    max_depth=10,
    subsample=0.8,
)

# Wrap the plain preprocessing functions so they can be used as Pipeline steps.
keep_specific_columns = FunctionTransformer(filter_columns)
add_weather = FunctionTransformer(create_x_weather)
date_cols = FunctionTransformer(_encode_dates)

pipe = Pipeline([
    ('filter_columns', keep_specific_columns),
    ('add_weather', add_weather),
    ("date_encode", date_cols),
    ('model', model)
])

pipe.fit(X_train, y_train)

predictions = pipe.predict(X_test)

# The square root of the MSE is reported, so label it as RMSE.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f'Root Mean Squared Error: {rmse}')


Mean Squared Error: 0.6827472066069165


# Preview of the preprocessed test features

In [65]:
# Preview the engineered features the model receives for the final test set.
# Slicing off the last step reuses the preprocessing steps of the fitted
# pipeline and leaves `pipe` and `model` bound to the trained objects, so the
# later `pipe.predict(X_final_test)` call still works on a top-to-bottom run.
pipe[:-1].transform(X_final_test).head(20)

Unnamed: 0,latitude,longitude,temp,precip,windspeed,visibility,icon_encoded,year,month,day,weekday,hour
0,48.846028,2.375429,19.9,0.931,14.3,20.8,1,2021,9,10,4,1
1,48.846028,2.375429,19.9,0.931,14.3,20.8,1,2021,9,10,4,13
2,48.846028,2.375429,19.9,0.931,14.3,20.8,1,2021,9,10,4,17
3,48.846028,2.375429,19.9,0.931,14.3,20.8,1,2021,9,10,4,19
4,48.846028,2.375429,19.9,0.931,14.3,20.8,1,2021,9,10,4,22
5,48.846028,2.375429,18.4,0.0,13.8,21.4,3,2021,9,11,5,0
6,48.846028,2.375429,18.4,0.0,13.8,21.4,3,2021,9,11,5,1
7,48.846028,2.375429,18.4,0.0,13.8,21.4,3,2021,9,11,5,3
8,48.846028,2.375429,18.4,0.0,13.8,21.4,3,2021,9,11,5,4
9,48.846028,2.375429,18.4,0.0,13.8,21.4,3,2021,9,11,5,6


# Format Output

In [58]:
# Predict on the final test set and write the submission file: an `Id` index
# column followed by the predicted `log_bike_count`.
submission = pipe.predict(X_final_test)
submission_frame = pd.DataFrame({'log_bike_count': submission})
submission_frame.rename_axis('Id').to_csv('submission5_61223.csv')