## To do 
* Optimize LightGBM 
* Feature engineering 
* different shift and rolling timeframe
* different shift and rolling measures (mean, median, std)  --remove rolling/shift
* Resample! 

### Done 

1. recursive CSV

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler

import lightgbm as lgb
from catboost import CatBoostRegressor

from optuna.samplers import TPESampler
import optuna

from sklearn.model_selection import StratifiedKFold, GroupKFold, TimeSeriesSplit, KFold, train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

import math

In [None]:
# Feature-scaling experiment (currently disabled):
# SCALER_NAME = "None"
# SCALER = MinMaxScaler()

# --- Training budget ---
EPOCHS = 10000       # upper bound on boosting rounds (num_boost_round)
EARLY_STOPPING = 30  # rounds handed to lgb.early_stopping
FOLDS = 3            # number of KFold splits

# --- Experiment switches ---
PRED_MONDAY = False  # predict only Monday data
LAG_FEATURES = True  # build the shift / rolling features
OPTUNA = False       # run the hyperparameter search
TASK = "CPU"         # alternative value: "GPU"

# --- Plot styling ---
sns.set(font_scale=1)

In [None]:
# Competition files; the first CSV column becomes the row index of each frame.
train_original = pd.read_csv(
    "../input/tabular-playground-series-mar-2022/train.csv",
    index_col=0,
)
test_original = pd.read_csv(
    "../input/tabular-playground-series-mar-2022/test.csv",
    index_col=0,
)
submission = pd.read_csv(
    "../input/tabular-playground-series-mar-2022/sample_submission.csv",
    index_col=0,
)

In [None]:
# Angle (in degrees) assigned to each direction label; read by basic_feats().
degree = {
    'WB': 0,
    'NW': 45,
    'NB': 90,
    'NE': 135,
    'EB': 180,
    'SE': 225,
    'SB': 270,
    'SW': 315,
}

Fantastic feature engineering from [ALEXANDER RYZHKOV](https://www.kaggle.com/alexryzhkov/lightautoml-with-fe-tps-mar-22)

In [None]:
# Direction label -> [x component, y component].
# In the code shown here it is only referenced by the disabled features
# listed inside basic_feats().
dir_mapper = {
    # axis-aligned directions
    'EB': [1, 0],
    'NB': [0, 1],
    'SB': [0, -1],
    'WB': [-1, 0],
    # diagonal directions
    'NE': [1, 1],
    'SW': [-1, -1],
    'NW': [-1, 1],
    'SE': [1, -1],
}


def basic_feats(df):
    """Return a copy of ``df`` extended with calendar and roadway features.

    Reads the ``time``, ``x``, ``y`` and ``direction`` columns and uses the
    module-level ``degree`` mapping. The input frame itself is not modified.
    """
    out = df.copy(deep=True)

    out["time"] = pd.to_datetime(out["time"])
    stamp = out["time"].dt
    x_text = out["x"].astype("str")
    y_text = out["y"].astype("str")

    # Roadway identifier: both coordinates followed by the direction label.
    out["dir_x_y"] = x_text + y_text + out["direction"]

    # Calendar components of the timestamp.
    out["minute"] = stamp.minute
    out["hour"] = stamp.hour
    out["day"] = stamp.day
    out["month"] = stamp.month
    out["dayofweek"] = stamp.weekday
    hour_text = out["hour"].astype("str")

    # Location-based combinations.
    out["x+y"] = out["x"] + out["y"]
    out["location"] = x_text + y_text
    out["hour_direction"] = hour_text + out["direction"].astype("str")

    # Direction as an angle taken from the ``degree`` mapping.
    out["degree"] = out["direction"].map(degree).astype("int32")
    # out['rad'] = math.pi * out['degree'] / 180
    out["afternoon"] = out["hour"].ge(12)

    # Features from ALEXANDER RYZHKOV's notebook that are currently disabled:
    #     df['converted_direction_coord_0'] = df['direction'].map(lambda x: dir_mapper[x][0])
    #     df['converted_direction_coord_1'] = df['direction'].map(lambda x: dir_mapper[x][1])
    #     df['is_month_start'] = df['time'].dt.is_month_start.astype('int')
    #     df['is_month_end'] = df['time'].dt.is_month_end.astype('int')
    #     df['x+y+direction0'] = df['x'].astype('str') + df['y'].astype('str') + df['converted_direction_coord_0'].astype('str')
    #     df['x+y+direction1'] = df['x'].astype('str') + df['y'].astype('str') + df['converted_direction_coord_1'].astype('str')
    #     df['hour+x+y'] = df['hour'].astype('str') + df['x'].astype('str') + df['y'].astype('str')
    #     df['hour+direction+y'] = df['hour'].astype('str') + df['direction'].astype('str') + df['y'].astype('str')
    #     df['hour+direction+x+y'] = df['hour'].astype('str') + df['direction'].astype('str') + df['x'].astype('str') + df['y'].astype('str')
    out["hour+x"] = hour_text + x_text
    out["hour+y"] = hour_text + y_text

    return out


train, test = basic_feats(train_original), basic_feats(test_original)

From this amazing notebook by [@MARTYNOV ANDREY ](https://www.kaggle.com/martynovandrey/tps-mar-22-don-t-forget-special-values)

In [None]:
# (location, direction, value) triples from the referenced notebook. Only the
# first two fields are used here; the third is presumably the "special"
# congestion value reported there -- TODO confirm against that notebook.
location_anomalies = [('21', 'NE', 15), ('22', 'SE', 20), ('22', 'NW', 21), ('21', 'NW', 29), ('21', 'SE', 34)]

# (location, direction) pairs derived from the list above, so the list is the
# single source of truth for which roadways are flagged.
_SPECIAL_ROADWAYS = frozenset(
    (location, direction) for location, direction, _ in location_anomalies
)


def special_apply(x):
    """Return 1 when a row belongs to a roadway in ``location_anomalies``.

    Args:
        x: Sequence-like whose first two values are the location string and
            the direction label, e.g. one row of
            ``df[["location", "direction"]]`` passed by ``apply(axis=1)``.

    Returns:
        1 if the (location, direction) pair is listed, otherwise 0.
    """
    # Unpack by iteration rather than ``x[0]`` / ``x[1]``: integer keys on a
    # label-indexed Series are deprecated positional access in pandas.
    location, direction, *_ = x
    return int((location, direction) in _SPECIAL_ROADWAYS)
# Flag the special roadways in BOTH frames: "special" ends up in the model's
# feature list, so leaving it out of `test` would make it missing (NaN) for
# every test row once train and test are concatenated for encoding.
train["special"] = train[["location", "direction"]].apply(special_apply, axis=1)
test["special"] = test[["location", "direction"]].apply(special_apply, axis=1)
train["special"].value_counts()

In [None]:
if PRED_MONDAY:
    # Keep only Monday rows (dayofweek == 0). Boolean indexing returns a new
    # frame, so the result has to be bound back to `train` to take effect.
    print("Predicting Monday only")
    train = train[train["dayofweek"] == 0].reset_index(drop=True)

In [None]:
# Display the training frame after the feature engineering steps above.
train

## Rolling & Lag Features 

In [None]:
# Lags (in rows within a roadway group) used for the shift_* features.
shift_list = [1, 2, 3, 4]
# Window lengths (in rows) used for the rolling_* statistics.
roll_window = [4, 8, 12]

In [None]:
def shift_vals(df, shift_list, groupby):
    """Return a copy of ``df`` with one lagged ``congestion`` column per lag.

    For every ``lag`` in ``shift_list`` a ``shift_<lag>`` column is added that
    holds ``congestion`` shifted by ``lag`` rows inside each ``groupby``
    group. Positions without a predecessor are filled with 0. Rows are
    shifted in their existing order; no sorting is performed here.
    """
    out = df.copy(deep=True)
    congestion_by_group = out.groupby(groupby)["congestion"]

    for lag in shift_list:
        out[f"shift_{lag}"] = congestion_by_group.transform(
            lambda series: series.shift(lag, fill_value=0)
        )
    return out

def rolling(df, roll_window, groupby):
    """Return a copy of ``df`` with rolling statistics of ``shift_1``.

    For every window length in ``roll_window`` the per-group rolling mean,
    median and standard deviation of the ``shift_1`` column are added as
    ``rolling_mean_<w>``, ``rolling_median_<w>`` and ``rolling_std_<w>``.
    ``shift_1`` must already exist (see ``shift_vals``).

    Note that the final ``fillna(0)`` is applied to the whole frame, not only
    to the new columns, so any NaN in other columns is replaced by 0 as well.
    """
    out = df.copy(deep=True)
    lag_by_group = out.groupby(groupby)["shift_1"]

    for window in roll_window:
        for stat in ("mean", "median", "std"):
            out[f"rolling_{stat}_{window}"] = lag_by_group.transform(
                lambda series: getattr(series.rolling(window, min_periods=1), stat)()
            )
    return out.fillna(0)

if LAG_FEATURES:
    # Lags first: rolling() reads the shift_1 column produced by shift_vals().
    train = rolling(
        shift_vals(train, shift_list, groupby='dir_x_y'),
        roll_window,
        groupby='dir_x_y',
    )

In [None]:
# Sanity check: first rows of a single roadway ("00EB").
train.loc[train["dir_x_y"] == "00EB"].head(5)

# Encoding 
### Assumption 
There is a relationship in the direction of the highways \
i.e. EB highway becomes NB at some point therefore NB is affected by EB's congestion 

I can't find a way to identify this relationship at this time, so I will apply **2 types of encodings** to the data: 
* LabelEncoder - to identify any sort of relationship (albeit a poor one) 
* OneHotEncoder - categorical engineering 

In [None]:
# Label encoding: one integer id per roadway, fitted on train and reused on test.
encoder = LabelEncoder()
train["dir_x_y_LE"] = encoder.fit_transform(train["dir_x_y"])
test["dir_x_y_LE"] = encoder.transform(test["dir_x_y"])

# One-hot encoding: train and test are stacked first because test does not
# include all the feature columns that train has; `ds` marks the origin of
# each row so the frames can be separated again afterwards.
all_df = pd.get_dummies(
    pd.concat([train.assign(ds=1), test.assign(ds=0)], axis=0)
)
train = all_df.loc[all_df["ds"] == 1].drop(columns=["ds"])
test = all_df.loc[all_df["ds"] == 0].drop(columns=["congestion", "ds"])

del all_df

In [None]:
def _fit_dtype(low, high, candidates, limits_of, fallback):
    """Return the first candidate dtype whose range strictly contains [low, high]."""
    for candidate in candidates:
        limits = limits_of(candidate)
        if low > limits.min and high < limits.max:
            return candidate
    return fallback


def downcast(df):
    """Shrink column dtypes of ``df`` in place and return the same frame.

    * Integer columns become the smallest of int8 / int16 / int32 whose range
      strictly contains the column's min and max, otherwise int64.
    * Float columns become float16 / float32 under the same rule, otherwise
      float64. float16 keeps only about three significant decimal digits, so
      this step can round the stored values.
    * Object columns become ``category``, except a column named ``date``,
      which is parsed with the ``%Y-%m-%d`` format.
    * Every other dtype (bool, datetime, category, pandas extension dtypes)
      is left untouched.

    Args:
        df: Frame to compact; it is modified in place.

    Returns:
        The same ``df`` object.
    """
    # Snapshot the dtypes first because columns are reassigned inside the loop.
    for name, dtype in list(df.dtypes.items()):
        # Dispatch on the NumPy dtype kind. The removed ``np.object`` alias is
        # avoided entirely, and pandas extension dtypes are skipped.
        if not isinstance(dtype, np.dtype):
            continue

        if dtype.kind in "iu":
            column = df[name]
            target = _fit_dtype(
                column.min(), column.max(),
                (np.int8, np.int16, np.int32), np.iinfo, np.int64,
            )
            df[name] = column.astype(target)
        elif dtype.kind == "f":
            column = df[name]
            # An all-NaN (or empty) column fails every range comparison and
            # therefore falls through to float64.
            target = _fit_dtype(
                column.min(), column.max(),
                (np.float16, np.float32), np.finfo, np.float64,
            )
            df[name] = column.astype(target)
        elif dtype.kind == "O":
            if name == 'date':
                df[name] = pd.to_datetime(df[name], format='%Y-%m-%d')
            else:
                df[name] = df[name].astype('category')
    return df

# Compact both frames; downcast() works in place and returns its argument.
train, test = downcast(train), downcast(test)

# Split 

In [None]:
# Columns handed to the scaler: every non-object column except the target
# and the timestamp.
num_col = [
    col
    for col in train.columns
    if train[col].dtypes != "object" and col not in ("congestion", "time")
]

In [None]:
# Feature matrix and target.
X = train.drop(columns=["congestion", "time"])
y = train["congestion"]

# Random 67/33 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Fit the scaler on the training part only, then apply it to both parts.
scaler = StandardScaler().fit(X_train[num_col])
X_train[num_col] = scaler.transform(X_train[num_col])
X_test[num_col] = scaler.transform(X_test[num_col])

In [None]:
# Model inputs: every column except the timestamp and the target.
features = train.drop(columns=["time", "congestion"]).columns
print(len(features))
print(list(features))

# Hyperparameter Tuning

In [None]:
def objective(trial):
    """Optuna objective: train LightGBM on the hold-out split and return MAE.

    Uses the module-level ``X_train`` / ``y_train`` for fitting and
    ``X_test`` / ``y_test`` for early stopping, pruning and scoring. Early
    stopping and pruning follow the ``rmse`` metric, while the value returned
    to Optuna is the mean absolute error on the hold-out part.
    """
    # Settings shared by every trial.
    params = {
        'objective': 'regression',
        'metric': "rmse",
        'verbosity': -100,
        'num_iterations': EPOCHS,
        "num_threads": -1,
        "force_col_wise": True,
    }

    # Hyperparameters sampled for this trial.
    params["learning_rate"] = trial.suggest_float('learning_rate', 0.01, 0.2)
    params['boosting_type'] = trial.suggest_categorical('boosting', ["gbdt"])
    params['lambda_l1'] = trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True)
    params['lambda_l2'] = trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True)
    params['num_leaves'] = trial.suggest_int('num_leaves', 2, 256)
    # params['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 1000, 10000)
    params['max_depth'] = trial.suggest_int('max_depth', 4, 15)

    callbacks = [
        lgb.early_stopping(EARLY_STOPPING),
        optuna.integration.LightGBMPruningCallback(trial, metric="rmse"),
    ]

    booster = lgb.train(
        params=params,
        train_set=lgb.Dataset(X_train, label=y_train),
        valid_sets=[lgb.Dataset(X_test, label=y_test)],
        num_boost_round=EPOCHS,
        callbacks=callbacks,
    )

    mae = mean_absolute_error(y_test, booster.predict(X_test))
    print(f"MAE score of {mae}")

    return mae

In [None]:
%%time
if OPTUNA:
    # Search for the hyperparameters that minimise the objective's MAE.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=200)

    best_params = study.best_params
    trial = study.best_trial

    # Report the outcome of the search.
    print(f"Number of finished trials: {len(study.trials)}")
    print("Best trial:")
    print(f" MAE Value: {trial.value}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

# Model

In [None]:
if not OPTUNA:
    # Hand-picked parameters used when no search is run. The trailing
    # comments keep the alternative values that were noted in the notebook.
    lgb_params = {
        "objective": "regression",
        "metric": "mae",
        "device_type": "cpu",
        'boosting': "gbdt",
        "learning_rate": 0.1959206443930991,    # 0.05
        "lambda_l1": 3.0671246702097357e-06,    # 5, 0.03469015403439412
        "lambda_l2": 0.0005623715885069156,     # 9.993162304351474
        "num_leaves": 231,                      # 20, 100
        "max_depth": 14,                        # 7
        "force_col_wise": True,
        "num_threads": -1,
    }
else:
    # Reuse the parameters found by the study (same dict object) and add the
    # fixed runtime settings.
    lgb_params = best_params
    lgb_params.update({"force_col_wise": True, "num_threads": -1})

In [None]:
def fit_model(X_train, y_train, EPOCHS, X_test=None, y_test=None, test_df=None):
    """Train a LightGBM booster with the module-level ``lgb_params``.

    Args:
        X_train, y_train: Training features and target.
        EPOCHS: Number of boosting rounds (an upper bound when a validation
            set is given, because early stopping is active then).
        X_test, y_test: Optional validation set. When given, it drives early
            stopping and is scored with MAE and r2.
        test_df: Optional frame to predict after training.

    Returns:
        ``(train_preds, test_preds, mae, model)``. ``train_preds`` holds the
        predictions for ``X_test`` and ``mae`` their mean absolute error; both
        are empty lists when no validation set is given. ``test_preds`` is an
        empty list when ``test_df`` is None.
    """
    train_set = lgb.Dataset(X_train, label=y_train)

    if X_test is None:
        # Full-data fit: no validation set, so train for exactly EPOCHS rounds.
        print("Full data prediction")
        print("Using best iterations: ", EPOCHS)
        model = lgb.train(
            params=lgb_params,
            train_set=train_set,
            num_boost_round=EPOCHS,
        )
        train_preds, mae = [], []
    else:
        # Validation fit: stop early on the validation set and score it.
        print("Validation prediction")
        model = lgb.train(
            params=lgb_params,
            train_set=train_set,
            valid_sets=[lgb.Dataset(X_test, label=y_test)],
            num_boost_round=EPOCHS,
            callbacks=[lgb.early_stopping(EARLY_STOPPING)],
        )
        train_preds = model.predict(X_test)
        mae = mean_absolute_error(y_test, train_preds)
        print("\nMAE:", mae)
        print("r2: ", r2_score(y_test, train_preds))

    test_preds = [] if test_df is None else model.predict(test_df)

    return train_preds, test_preds, mae, model


# Hold-out run; its best iteration is reused for the full-data fit below.
train_preds, test_preds, mae, model = fit_model(
    X_train, y_train, EPOCHS, X_test=X_test, y_test=y_test, test_df=None
)
best_iter = model.best_iteration
train_preds

In [None]:
if not LAG_FEATURES:
    # Without lag features the test set can be predicted in a single pass:
    # refit on all rows for `best_iter` rounds and write the baseline file.
    print("Full data prediction")
    train_preds, test_preds, mae, model = fit_model(
        X[features], y, best_iter, X_test=None, y_test=None, test_df=test[features]
    )

    sub_base = submission.copy(deep=True)
    sub_base["congestion"] = np.round(test_preds)
    sub_base.to_csv("sub_base.csv")
    sub_base

In [None]:
def plotImp(model, X, num=30, fig_size=(20, 10)):
    """Draw a horizontal bar chart of the ``num`` largest feature importances.

    ``model`` must provide ``feature_importance()`` and ``X`` supplies the
    matching feature names through its columns.
    """
    importance = pd.DataFrame(
        {'Value': model.feature_importance(), 'Feature': X.columns}
    )
    top = importance.sort_values(by="Value", ascending=False).iloc[:num]

    plt.figure(figsize=fig_size)
    sns.barplot(x="Value", y="Feature", data=top)
    plt.title('LightGBM feature importance ')
    plt.tight_layout()
    plt.show()


plotImp(model, X)

In [None]:
# Features whose importance is exactly zero (first 20 shown).
feature_imp = pd.DataFrame(
    {'Value': model.feature_importance(), 'Feature': X.columns}
)
feature_imp.loc[feature_imp["Value"] == 0].head(20)

## CV 

In [None]:
# Unshuffled folds shared by every cross_val() call.
kfold = KFold(n_splits=FOLDS, shuffle=False)

In [None]:
def cross_val(X, y, test):
    """Run K-fold training with per-fold scaling and collect test predictions.

    For each fold of the module-level ``kfold`` a ``StandardScaler`` is fitted
    on the training part and applied to the validation part and, if given, to
    a copy of ``test``. The model is then trained through ``fit_model``.

    Args:
        X: Feature frame.
        y: Target series aligned with ``X``.
        test: Optional frame to predict in every fold. It is never modified.

    Returns:
        A list with one entry per fold: the predictions for ``test``, or an
        empty list when ``test`` is None.
    """
    test_predictions = []
    lgb_scores = []

    for idx, (train_idx, val_idx) in enumerate(kfold.split(X, y)):

        print("\n",10*"=", f"Fold={idx+1}", 10*"=")

        # Work on copies so the scaled values are never written back into X.
        X_train, y_train = X.iloc[train_idx].copy(), y.iloc[train_idx]
        X_valid, y_val = X.iloc[val_idx].copy(), y.iloc[val_idx]

        # Fit the scaler on the training part and apply it to THIS fold's
        # validation part (not to the global hold-out frame).
        scaler = StandardScaler()
        X_train[num_col] = scaler.fit_transform(X_train[num_col])
        X_valid[num_col] = scaler.transform(X_valid[num_col])

        # Scale a fresh copy of the test frame per fold; scaling `test` in
        # place would re-scale already scaled values in every later fold.
        fold_test = None
        if test is not None:
            fold_test = test.copy()
            fold_test[num_col] = scaler.transform(fold_test[num_col])

        _, test_preds, mae, _ = fit_model(
            X_train, y_train, EPOCHS, X_valid, y_val, fold_test
        )

        lgb_scores.append(mae)
        test_predictions.append(test_preds)

    print("Mean Validation MAE :", np.mean(lgb_scores))
    return test_predictions


test_predictions = cross_val(X,y, test= None)

In [None]:
# submission["congestion"] = np.round(np.mean(test_predictions, axis =0))
# submission.to_csv("submission_ml.csv")

# plt.figure(figsize = (20,8))
# sns.lineplot(data = train.iloc[-8000:], x = train.iloc[-8000:].index.date,  y = "congestion", label = "actual")
# sns.lineplot(x = test.index.date,  y = submission["congestion"], label = "predicted")
# plt.show()

# Recursive 

In [None]:
# Prediction interval in minutes.
FREQUENCY = 20

# Time span covered by the test set.
start_date, end_date = min(test["time"]), max(test["time"])
print(start_date, end_date, sep="\n")

In [None]:
def multi_step_recursive(start_date, end_date, freq, sub, train_i, test_i):
    """Predict the test period one time step at a time, feeding results back.

    Train and test rows are stacked into one frame. For every window of
    ``freq`` minutes between ``start_date`` and ``end_date`` the rows before
    the window are used to train through ``cross_val``. The fold-averaged
    prediction is written into the stacked frame and into ``sub``, and the
    lag features are rebuilt so the next window sees the new values.

    Args:
        start_date, end_date: First and last timestamp to predict (inclusive).
        freq: Length of one prediction window in minutes.
        sub: Submission frame; its ``congestion`` column is filled in place.
        train_i, test_i: Encoded training and test frames.

    Returns:
        ``sub`` with the unrounded predictions.
    """
    step = pd.DateOffset(minutes=freq)
    all_df = pd.concat([train_i.assign(ds="a"), test_i.assign(ds="b")], axis=0)

    def refresh_lags(frame):
        # Rebuild shift and rolling features; a no-op when lags are disabled.
        if not LAG_FEATURES:
            return frame
        frame = shift_vals(frame, shift_list, groupby="dir_x_y_LE")
        return rolling(frame, roll_window, groupby="dir_x_y_LE")

    all_df = refresh_lags(all_df)

    current = start_date
    while current <= end_date:

        print("\n############ Start date" , current, " ############")

        # Rows that fall inside the window being predicted.
        in_window = (all_df["time"] >= current) & (all_df["time"] < current + step)
        test_split = all_df[in_window][features]

        # Everything before the window is available for training.
        history = all_df[all_df["time"] < current]
        X, y = history[features], history["congestion"]

        # One prediction per fold, averaged across folds.
        one_period_preds = cross_val(X, y, test_split)
        period_mean = np.mean(one_period_preds, axis=0)
        print(period_mean)

        # Feed the predictions back so later windows can build lags on them.
        test_split["congestion"] = period_mean
        all_df.loc[test_split.index, "congestion"] = test_split["congestion"]

        # Record the same values in the submission frame.
        sub.loc[test_split.index, "congestion"] = period_mean

        all_df = refresh_lags(all_df)
        current += step

    return sub

In [None]:
# Run the recursive forecast on a copy of the sample submission.
sub_recursive = multi_step_recursive(
    start_date,
    end_date,
    FREQUENCY,
    sub=submission.copy(deep=True),
    train_i=train,
    test_i=test,
)
sub_recursive

In [None]:
# Round the predictions to whole numbers before writing the file.
sub_recursive["congestion"] = sub_recursive["congestion"].round()
sub_recursive.to_csv("sub_recursive.csv")
sub_recursive

In [None]:
# Last 8000 training rows ("actual") next to the recursive forecast ("predicted").
plt.figure(figsize=(20, 8))
sns.lineplot(
    data=train.iloc[-8000:],
    x=train.iloc[-8000:]["time"],
    y="congestion",
    label="actual",
)
sns.lineplot(
    x=test["time"],
    y=sub_recursive["congestion"],
    label="predicted",
)
plt.show()