# Summary and initial notes
We are looking at a timeseries problem which is similar to the January TPS with some additional features \
We will start with a similar base of features as the [Jan TPS kernel](https://www.kaggle.com/slythe/tps-jan22-catboost-multi-step-seeding-recursive) 

#### Additional data to potentially include:
* US holidays :  less work = less traffic 
* weather : bad weather = more traffic

#### Time period and target
* Target column = Congestion 
* Training dataset consists of traffic congestion measurements across 65 roadways from April through September of 1991
* The prediction period is twelve hours (1991-09-30 12:00:00 ---> 1991-09-30 23:40:00)

# Features 
* We have directional features (e.g. EB = Eastbound) as well as midpoint coordinates (x, y)

#### Feature engineering: 
* Time series extraction (day,month, year etc) 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

from sklearn.ensemble import ExtraTreesRegressor
from catboost import CatBoostRegressor

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
import lightgbm as lgb

from sklearn.model_selection import StratifiedKFold, GroupKFold, TimeSeriesSplit, KFold

In [None]:
# Label of the scaler used inside cross-validation; the string "None" disables scaling.
SCALER_NAME = "None" 
# Scaler instance that is fitted per fold when SCALER_NAME is not "None".
SCALER = MinMaxScaler() 

# Upper bound on boosting rounds (passed to lgb.train as num_boost_round).
EPOCHS = 10000     
# Value handed to the lgb.early_stopping() callback.
EARLY_STOPPING = 30

# Number of cross-validation splits.
FOLDS = 10

# Train on (and predict) Monday rows only.
PRED_MONDAY = True

# Use sklearn's time-series splitter instead of KFold / GroupKFold.
# NOTE(review): this assignment rebinds the name of the TimeSeriesSplit class
# imported from sklearn.model_selection, so from here on the name is a bool.
TimeSeriesSplit = False

sns.set(font_scale = 1)

In [None]:
# Load the competition files; the first CSV column becomes the index of each frame.
train_original, test_original, sub = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../input/tabular-playground-series-mar-2022/train.csv",
        "../input/tabular-playground-series-mar-2022/test.csv",
        "../input/tabular-playground-series-mar-2022/sample_submission.csv",
    )
)

# Describe me 

In [None]:
# Show the last rows of the raw training data.
train_original.tail()

In [None]:
# Summary statistics of the numeric training columns.
train_original.describe()

In [None]:
# Parse the "time" column of both frames into datetime64 values.
for frame in (train_original, test_original):
    frame["time"] = pd.to_datetime(frame["time"])

# Report the time span covered by each data set.
train_times = train_original["time"]
test_times = test_original["time"]

print("Min train date" ,train_times.min())
print("Max train date:" , train_times.max())

print("\nMin test date" ,test_times.min())
print("Max test date:" , test_times.max())

In [None]:
print("\n   Info")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only added a stray "None" to the cell output.
train_original.info()

print("\n   Null values")
display(train_original.isnull().sum())

print("\n   Duplicates")
# Number of fully duplicated rows (shown as the cell result).
train_original.duplicated().sum()

# Feature Extraction 

In [None]:
# Column names of the raw training frame, before any engineered features are added.
features_original = train_original.columns 
features_original

In [None]:
def additional_features(df):
    """Return a copy of ``df`` with calendar and roadway features added.

    Args:
        df: DataFrame with a datetime64 ``time`` column plus ``x``, ``y`` and
            ``direction`` columns.

    Returns:
        A deep copy of ``df`` with the extra columns ``minute``, ``hour``,
        ``day``, ``month``, ``dayofweek``, ``x_y_direction``, ``afternoon``,
        ``dayofyear`` and ``inverse_dayofyear``. The input frame is not
        modified.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy(deep=True)
    stamp = df["time"].dt

    ## Temporal features
    df["minute"] = stamp.minute
    df["hour"] = stamp.hour
    df["day"] = stamp.day
    df["month"] = stamp.month
    df["dayofweek"] = stamp.weekday

    # New features
    #df["x+y"] = df["x"]+df["y"]
    #df['x_y'] = df['x'].astype('str') + df['y'].astype('str')
    # One identifier per roadway: coordinates and direction glued together.
    df['x_y_direction'] = df['x'].astype('str') + df['y'].astype('str') + df['direction']
    #df['hour_direction'] = df['hour'].astype('str') + df['direction'].astype('str')

    # Flag rows inside the afternoon window we have to predict. The window
    # starts at 12:00 sharp, so hour 12 must be included (``>=``, not ``>``);
    # otherwise the 12:00-12:40 rows are marked as non-afternoon.
    df['afternoon'] = (stamp.hour >= 12).astype('int')
    #df['is_weekend'] = np.where((df['dayofweek'] == 5) | (df['dayofweek'] == 6), 1, 0)

    # these features can cause noise - optional
    df['dayofyear'] = stamp.dayofyear
    df['inverse_dayofyear'] = 365 - stamp.dayofyear
    #df["weekofyear"]= df["time"].dt.isocalendar().week.astype('int')

    return df

# Build the engineered frames from the untouched originals (train first, then test).
train, test = (
    additional_features(raw_frame)
    for raw_frame in (train_original, test_original)
)
train.columns

# Resampling

In [None]:
# Move the timestamp into the index of both frames (modified in place).
for frame in (train, test):
    frame.set_index("time", drop=True, inplace=True)

In [None]:
# Put every roadway (x_y_direction) on a regular 20-minute grid, keeping the
# last observation of each bin. "20min" replaces the deprecated "20T" alias.
train_grp = train.groupby("x_y_direction").resample("20min").last()
# Keep only the timestamp level of the resulting (roadway, time) MultiIndex.
train_grp = train_grp.set_index(train_grp.index.get_level_values(1))
# Fill empty bins from the next available row; bfill() is the supported
# spelling of the deprecated DataFrame.backfill().
train_grp = train_grp.bfill()

# Drop holiday Mondays

In [None]:
# Remove the two Mondays 1991-05-27 and 1991-09-02 from the training data.
# The previous filter compared ``index.date`` (datetime.date objects) with a
# pandas Timestamp; recent pandas versions treat that pair as never equal,
# so no rows were dropped. Comparing normalized timestamps avoids the type
# mismatch.
holiday_mondays = pd.to_datetime(["1991-05-27", "1991-09-02"])
train = train[~train.index.normalize().isin(holiday_mondays)]
train

# Quick EDA 

In [None]:
# plt.figure(figsize=(25,8))
# sns.lineplot(data = train, x = train.index.date, y = "congestion")
# plt.title("Congestion full")
# plt.show()

In [None]:
# Number of training rows per travel direction.
plt.figure(figsize= (25,5))
sns.histplot(train["direction"])
plt.title("Direction count")
plt.show()

In [None]:
# fig,ax = plt.subplots(2,1,figsize= (25,12))
# sns.lineplot(ax= ax[0], data = train, x = "hour", y = "congestion")
# sns.lineplot(ax= ax[1], data = train, x = "hour", y = "congestion", hue = "direction")
# fig.suptitle("Congestion by hour")
# plt.show()

In [None]:
# fig,ax = plt.subplots(2,1,figsize= (25,12))
# sns.lineplot(ax= ax[0], data = train, x = "dayofweek", y = "congestion")
# sns.lineplot(ax= ax[1], data = train, x = "dayofweek", y = "congestion", hue = "direction")
# fig.suptitle("Congestion by dayofweek")
# plt.show()

In [None]:
# fig,ax = plt.subplots(2,1,figsize= (25,12))
# sns.lineplot(ax= ax[0], data = train, x = "day", y = "congestion")
# sns.lineplot(ax= ax[1], data = train, x = "day", y = "congestion", hue = "direction")
# fig.suptitle("Congestion by dayofmonth")
# plt.show()

In [None]:
# plt.figure(figsize = (20,8))
# sns.histplot(data =train["congestion"])
# plt.title("histogram of target")
# plt.show()

## Holidays
There are a few holidays that fall on a Monday, such as Labor Day, Easter Monday and Memorial Day.


# Monday Investigation 
We are only predicting Monday from 12pm->24:00

This makes me think: why include the other days at all? \
**Assumption**: the previous day's traffic should not affect the next day's (except perhaps after accidents)

In [None]:
# Keep only Monday rows (dayofweek == 0); the time index is discarded.
is_monday = train["dayofweek"].eq(0)
mondays = train.loc[is_monday].reset_index(drop=True)
mondays

In [None]:
# Total Monday congestion per day of month, one bar-chart panel per month
# (months are moved into columns by unstack(level=0)).
mondays.groupby(["month","day"])["congestion"].sum().unstack(level=0).plot(kind='bar', subplots=True, rot=0, figsize=(20, 8), layout=(2, 3), title="Sum of Congestion for Mondays by Month by Day")
plt.tight_layout()

In [None]:
# Total Monday congestion per hour of day, one bar-chart panel per month
# (months are moved into columns by unstack(level=0)).
mondays.groupby(["month","hour"])["congestion"].sum().unstack(level=0).plot(kind='bar', subplots=True, rot=0, figsize=(20, 8), layout=(2, 3), title="Sum of Congestion for Mondays by Month by Hour")
plt.tight_layout()

##### Note 
September isn't a full month - we are to predict its last 12 hours \
It would be interesting to add our predictions to September's histogram and see if it evens out

# Median Modelling
From the amazing [@ambrosm](https://www.kaggle.com/ambrosm/tpsmar22-without-machine-learning)

In [None]:
# Median Monday congestion for every roadway / direction / time-of-day slot.
groupby_cols = ["direction", "x", "y", "dayofweek", "hour", "minute"]
median_by_slot = mondays.groupby(groupby_cols)["congestion"].median()
mondays_median = median_by_slot.to_frame()
mondays_median

In [None]:
# Attach the slot medians to the training rows; after the merge "congestion_x"
# is the observed value and "congestion_y" the median (default suffixes).
val = train.merge(
    mondays_median,
    how="inner",
    left_on=groupby_cols,
    right_index=True,
    suffixes=("_x", "_y"),
)
median_mae = mean_absolute_error(val["congestion_x"], val["congestion_y"])
print("MAE", median_mae)

In [None]:
# Look up the slot median for every test row and keep only that column.
merged = test.merge(
    mondays_median,
    how="inner",
    left_on=groupby_cols,
    right_index=True,
)
merged = merged[["congestion"]]
merged.head()

In [None]:
# Write the rounded medians into a copy of the sample submission.
# Values are assigned by position, not by index.
sub_median = sub.copy(deep=True)
rounded_medians = np.round(merged["congestion"].to_numpy())
sub_median["congestion"] = rounded_medians
sub_median.to_csv("submission_median.csv")
sub_median

In [None]:
# Attach the median predictions to a copy of the test frame.
test_sub = test.copy(deep=True)
# ``test`` is indexed by time while ``sub_median`` keeps the submission's own
# index, so assigning the Series directly would align on index labels and
# leave every value NaN. Assign by position instead.
test_sub["congestion"] = sub_median["congestion"].to_numpy()

In [None]:
# Stack the Monday training rows on top of the test rows that carry the median
# predictions, then redraw the per-month hourly congestion totals.
full_df = pd.concat([mondays,test_sub], ignore_index = True)
full_df.groupby(["month","hour"])["congestion"].sum().unstack(level=0).plot(kind='bar', subplots=True, rot=0, figsize=(20, 8), layout=(2, 3), title="Sum of congestion with added Test predictions")
plt.tight_layout()

# ML Modelling

In [None]:
# One-hot encode train and test together so both end up with identical dummy
# columns; the temporary "ds" marker (1 = train, 0 = test) lets us split them
# apart again afterwards.
tagged_frames = [train.assign(ds=1), test.assign(ds=0)]
all_df = pd.get_dummies(pd.concat(tagged_frames, axis=0))

came_from_train = all_df["ds"] == 1
came_from_test = all_df["ds"] == 0
test = all_df[came_from_test].drop(["congestion", "ds"], axis=1)
train = all_df[came_from_train].drop(["ds"], axis=1)

# Monday-only training rows; the time index is discarded.
mondays_train = train[train["dayofweek"] == 0].reset_index(drop=True)

mondays_train

## Scaling & Splitting

In [None]:
if not PRED_MONDAY:
    # Use every training row.
    X = train.drop(columns="congestion")
    y = train["congestion"]
else:
    # Monday-only modelling: "dayofweek" is removed from both the features
    # and the test frame.
    y = mondays_train["congestion"]
    X = mondays_train.drop(columns=["congestion", "dayofweek"])
    test = test.drop(columns="dayofweek")

In [None]:
# Hold out 33% of the rows for validation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Run model 

In [None]:
# LightGBM training parameters shared by every fit in this notebook.
lgb_params = {
    "objective": "regression",
    "metric": "mae",
    "device_type": "cpu",
    "boosting": "gbdt",
    "learning_rate": 0.05,
    # Optional regularisation, currently disabled:
    # "lambda_l1": 0.03469015403439412,
    # "lambda_l2": 9.993162304351474,
    "num_leaves": 1000,
    # "max_depth": 10,
    "force_col_wise": True,
}
def fit_model(X_train,y_train,X_test,y_test,test_df):
    """Train a LightGBM booster and score it on a validation set.

    Training uses the module-level ``lgb_params``, runs for at most ``EPOCHS``
    rounds and is given an ``lgb.early_stopping(EARLY_STOPPING)`` callback
    that monitors the validation set.

    Args:
        X_train, y_train: features and target used for fitting.
        X_test, y_test: validation features and target.
        test_df: features of the rows to predict for the submission.

    Returns:
        Tuple ``(validation_predictions, test_predictions, mae, model)``
        where ``mae`` is the mean absolute error on the validation set.
    """
    fit_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_test, label=y_test)

    callbacks = [
        lgb.early_stopping(EARLY_STOPPING),
        lgb.log_evaluation(-1),
    ]
    booster = lgb.train(
        params=lgb_params,
        train_set=fit_set,
        valid_sets=[valid_set],
        num_boost_round=EPOCHS,
        callbacks=callbacks,
    )

    # Predict the validation rows first, then the submission rows.
    valid_preds = booster.predict(X_test)
    test_preds = booster.predict(test_df)

    mae = mean_absolute_error(y_test, valid_preds)
    print("\nMAE:", mae)
    print("r2: ", r2_score(y_test, valid_preds))

    return valid_preds, test_preds, mae, booster

# Fit once on the hold-out split; train_preds holds the predictions for X_test.
train_preds, test_preds, mae ,model= fit_model(X_train, y_train, X_test, y_test, test)

In [None]:
def plotImp(model, X , num = 20, fig_size = (20, 10)):
    """Draw a bar chart of the largest LightGBM feature importances.

    Args:
        model: fitted booster exposing ``feature_importance()``.
        X: feature frame whose columns name the importances.
        num: number of top features to show.
        fig_size: matplotlib figure size.

    The chart is also saved to ``lgbm_importances-01.png``.
    """
    importance = pd.DataFrame({'Value': model.feature_importance(), 'Feature': X.columns})
    top_features = importance.sort_values(by="Value", ascending=False).iloc[0:num]

    plt.figure(figsize=fig_size)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('LightGBM feature importance ')
    plt.tight_layout()
    plt.savefig('lgbm_importances-01.png')
    plt.show()
    
# Plot the importances of the booster fitted on the hold-out split.
plotImp(model, X)

In [None]:
# Zero value importances
feature_imp = pd.DataFrame({'Value':model.feature_importance(),'Feature':X.columns})
feature_imp[feature_imp["Value"]==0]

In [None]:
# plt.figure(figsize = (25,8))
# sns.lineplot(x = train.loc[y_test.index].index.date,y = train_preds, label = "predicted", ci=None)
# sns.lineplot(x = train.loc[y_test.index].index.dt.date,y = y_test, label = "actual",ci=None)
# plt.title("Validation vs Actual ")
# plt.show()

## Cross Validation - ML Model

In [None]:
# The configuration flag ``TimeSeriesSplit`` reuses the name of the
# scikit-learn class, so by this point the name refers to a bool and calling
# ``TimeSeriesSplit(n_splits=...)`` would raise "'bool' object is not
# callable". Import the class under a private alias and use the flag purely
# as an on/off switch.
from sklearn.model_selection import TimeSeriesSplit as _TimeSeriesSplitCV

# ``is True`` keeps the switch off unless the name really holds the boolean flag.
if TimeSeriesSplit is True:
    kfold = _TimeSeriesSplitCV(n_splits=FOLDS)
else:
    kfold = StratifiedKFold(n_splits=FOLDS)

In [None]:
# Display the current (one-hot encoded) training frame.
train

In [None]:
def cross_val(X,y, test):
    """Fit one model per fold of the module-level ``kfold`` splitter.

    When ``SCALER_NAME`` is not ``"None"`` the module-level ``SCALER`` is
    fitted on each fold's training part and applied to the validation part
    and to ``test``.

    Args:
        X: feature frame.
        y: target series aligned with ``X``.
        test: features of the rows to predict.

    Returns:
        List with one array of test predictions per fold.
    """
    fold_maes = []
    fold_test_preds = []
    scaling_enabled = SCALER_NAME != "None"

    for fold_no, (train_idx, val_idx) in enumerate(kfold.split(X, y), start=1):

        print("\n",10*"=", f"Fold={fold_no}", 10*"=")

        X_train, X_valid = X.iloc[train_idx, :], X.iloc[val_idx, :]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Without scaling the test features are used as they are.
        test_scaled = test
        if scaling_enabled:
            scaler = SCALER
            print(f"Scaling with {SCALER_NAME}")
            X_train = scaler.fit_transform(X_train)
            X_valid = scaler.transform(X_valid)
            test_scaled = scaler.transform(test)

        _, test_preds, mae, _ = fit_model(X_train, y_train, X_valid, y_val, test_scaled)

        fold_maes.append(mae)
        fold_test_preds.append(test_preds)

        # Drop the fold's validation data before the next iteration.
        del X_valid, y_val

    print("Mean Validation MAE :", np.mean(fold_maes))
    return fold_test_preds

# One array of test predictions per fold.
test_predictions = cross_val(X,y, test)

In [None]:
# Average the per-fold test predictions, round them and write the submission.
sub_CV = sub.copy(deep=True)
fold_average = np.mean(test_predictions, axis=0)
sub_CV["congestion"] = np.round(fold_average)
sub_CV.to_csv("submission_ml.csv")
sub_CV

# Predicted & Residual Analysis 

In [None]:
# plt.figure(figsize = (20,8))
# sns.lineplot(data = train.iloc[-8000:], x = train.iloc[-8000:].index,  y = "congestion", label = "actual")
# sns.lineplot(x = test.index,  y = sub_CV["congestion"], label = "predicted")
# plt.show()

In [None]:
# Residuals (actual - predicted) of the hold-out split, plotted over time.
# In Monday-only mode ``y_test.index`` holds row positions within the Monday
# subset of ``train`` (its time index was reset), so the timestamps must be
# looked up in that subset rather than with ``train.iloc`` on the full frame.
# Otherwise ``y_test`` is still indexed by time and its index is used directly.
if PRED_MONDAY:
    monday_times = train[train["dayofweek"] == 0].index
    residual_times = monday_times[y_test.index.to_numpy()]
else:
    residual_times = y_test.index

plt.figure(figsize = (20,8))
# Plain arrays are passed so seaborn pairs the values by position.
# ``ci`` was dropped: it has no effect on a scatter plot.
sns.scatterplot(x = residual_times, y = (y_test - train_preds).to_numpy())
plt.title("Scatter plot of ML residuals")
plt.show()

In [None]:
# Distribution of the hold-out residuals (actual minus predicted).
plt.figure(figsize = (20,8))
sns.histplot(y_test - train_preds)
plt.title("Histogram of ML residuals")
plt.show()

Very nice distribution of residuals \
This shows the model is predicting consistently 

# Median Vs LightGBM: model comparison 

In [None]:
# Last 4000 training rows ("actual") followed by the median and LightGBM
# submissions drawn against the test timestamps.
plt.figure(figsize = (20,8))
sns.lineplot(data = train.iloc[-4000:], x = train.iloc[-4000:].index,  y = "congestion", label = "actual", ci=None)
sns.lineplot(x = test.index,  y = sub_median["congestion"] , label = "Median", ci=None)
sns.lineplot(x = test.index,  y = sub_CV["congestion"], label = "ML", ci=None)
plt.title("Median vs ML model ")
plt.show()

##### Results of parameter changes
* Mondays - kfold -  5 fold - no scaling  =5.8863860103463015
* Mondays - stratifiedKfold -  5 fold - no scaling  = 5.807200301183459
* Mondays - Timeseries - 3 fold - no scaling = 5.807200301183459
* Mondays - GroupKfold - 5 fold - no scaling = +-12

##### Scaling
* Mondays - stratifiedKfold -  5 fold - MinMax  = 6.657225777334406
* Mondays - stratifiedKfold -  5 fold - StandardScaler = 6.656892457739071

##### Full data 
Full data - Kfold -  5 fold - no scaling = 5.8863860103463015

5.836967808836048