### To Do
1. Fit on the full dataset - with the best iteration
1. Hyperparameter tuning - CatBoost and LightGBM
1. Missing values
1. Null fill - currently set to backfill - try median/mean
1. NB - add grouped median columns for each period (['x', 'y', 'direction', 'weekday', 'hour', 'minute']) 

### DONE
1. Rolling median/mean 
1. Scaling --- scaling is on
1. Label encoding / OHE --- label encoding only (TBC)
1. CV with or without shuffle --- no shuffle
1. added features 

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import mean_absolute_error, r2_score

import lightgbm as lgb
from catboost import CatBoostRegressor

In [None]:
# Read the three competition CSVs from the shared input folder.
# index_col = 0 makes the first column of each file the DataFrame index.
train_original = pd.read_csv("../input/tabular-playground-series-mar-2022/train.csv", index_col = 0)
test_original = pd.read_csv("../input/tabular-playground-series-mar-2022/test.csv",index_col = 0)
# Submission template; its "congestion" column is overwritten with predictions at the end of the notebook.
sub = pd.read_csv("../input/tabular-playground-series-mar-2022/sample_submission.csv",index_col = 0)

In [None]:
# Apply seaborn's default theme to the figures drawn below.
sns.set_theme()
# Absolute correlation cut-off used when highlighting / selecting correlated roadway pairs.
THRESHOLD = 0.40

# When True, features are standardised with StandardScaler before fitting.
SCALING = True

# Upper bound on boosting rounds (passed as num_boost_round).
EPOCHS = 10000
# Early-stopping patience in rounds (passed to lgb.early_stopping).
EARLY_STOPPING = 30

# NOTE(review): not referenced anywhere in the visible notebook — confirm whether it is still needed.
PRED_MONDAY = False

In [None]:
def feature_engineering(df):
    """Return a copy of ``df`` with a parsed ``time`` column and a roadway key.

    The input frame is left untouched. ``time`` is converted with
    ``pd.to_datetime`` and ``x_y_direction`` is the string concatenation of
    the ``x``, ``y`` and ``direction`` columns (no separator).
    """
    out = df.copy(deep=True)
    out["time"] = pd.to_datetime(out["time"])

    # Build the key from the three location columns, in this fixed order.
    x_text, y_text, direction_text = (out[name].astype("str") for name in ("x", "y", "direction"))
    out["x_y_direction"] = x_text + y_text + direction_text

    return out

# Engineered copies of the raw frames; train_original / test_original stay unchanged
# because feature_engineering works on a deep copy.
train = feature_engineering(train_original)
test = feature_engineering(test_original)
train

In [None]:
# Total congestion per (timestamp, roadway).
# Only the target is aggregated: it is the sole column the pivot below reads, and summing
# x / y / direction is meaningless (string columns are not dropped automatically by
# groupby().sum() on pandas >= 2.0).
train_groupby = train.groupby(["time", "x_y_direction"])[["congestion"]].sum()
train_groupby

In [None]:
# Wide view of the target: one row per timestamp, one column per roadway.
# The string alias "sum" is used instead of np.sum: recent pandas emits a FutureWarning
# when a NumPy reduction callable is passed as aggfunc, and the alias keeps the same result.
train_pvt = train_groupby.pivot_table(
        values='congestion',
        index='time',
        columns='x_y_direction',
        aggfunc="sum")
train_pvt

## Null values 

In [None]:
#Fill null values
# Per-roadway count of missing cells; not rendered, because only the last expression of a cell is displayed.
train_pvt.isnull().sum()
# Missing (timestamp, roadway) cells become 0 in place before the correlation analysis below.
train_pvt.fillna(0,inplace = True)

# Correlated Columns analysis

In [None]:
def _corr_cell_styles(row):
    """Return one CSS string per cell: red above THRESHOLD, green below -THRESHOLD, otherwise no style."""
    styles = []
    for value in row:
        if value > THRESHOLD:
            styles.append("background: red")
        elif value < -THRESHOLD:
            styles.append("background: green")
        else:
            styles.append("")
    return styles

# Pairwise correlation between roadway columns, shown with strong cells highlighted row by row.
train_corr = train_pvt.corr()
train_corr.style.apply(_corr_cell_styles, axis = 1)

Let's try to visualise this better.

In [None]:
# Heatmap of the roadway correlation matrix; the colour scale is pinned to the full [-1, 1] range.
plt.figure(figsize = (25,10))
sns.heatmap(train_corr, vmin=-1, vmax = 1, annot=False, cmap= "Spectral")
plt.show()

## Correlated features above Threshold

In [None]:
# Create a dictionary of highest correlated columns:
# roadway -> list of OTHER roadways whose correlation with it exceeds THRESHOLD in absolute value.
corr_dict= {}
for col in train_corr.columns:
    strongly_related = train_corr.index[train_corr[col].abs() > THRESHOLD]
    # Drop the roadway itself by label. Slicing off the first match ([1:]) only removed the
    # self-correlation for the very first column; for every other column it discarded a
    # genuine partner and kept the column itself in its own list.
    values = [other for other in strongly_related if other != col]
    if values:
        corr_dict[col] = values
print(corr_dict)

In [None]:
# Long format: one row per (roadway, other roadway) pair with its correlation.
train_stack = train_corr.stack()
# Self pairs are recognised by label rather than by comparing the correlation to 1: float
# equality is fragile and would also hide two distinct roadways that are perfectly correlated.
is_self_pair = train_stack.index.get_level_values(0) == train_stack.index.get_level_values(1)
train_stack = train_stack[(train_stack.abs() >= THRESHOLD) & ~is_self_pair]
train_stack.head(20)

In [None]:
# One bar chart per roadway showing its strongly correlated partners.
# zip() stops at the 32 available panels, so roadways beyond that are not drawn.
fig, axs = plt.subplots(nrows=16, sharex= True, ncols=2, figsize=(25, 60))

source_roads = train_stack.index.get_level_values(0)
for road, ax in zip(source_roads.unique(), axs.ravel()):
    # Compute the selection once and reuse it for both the x labels and the bar heights.
    partners = train_stack[(train_stack != 1) & (source_roads == road)]
    sns.barplot(ax = ax, x= partners.index.get_level_values(1),
                y= partners.values, palette="deep").set(title=road)
    # Rotate labels on every panel; outside the loop this only affected the last axis used.
    ax.tick_params(axis='x', rotation=90)

fig.tight_layout()
plt.show()

# Additional Feature Engineering
Now that the data is in the correct pivot format, \
we will create new columns for prediction.

In [None]:
# Move "time" from a column to the index of both frames (the column itself is dropped).
train = train.set_index("time", drop=True)
test = test.set_index("time", drop=True)

In [None]:
# Angle in degrees assigned to each direction label.
degree = {'EB':180, 'NB':90, 'SB':270, 'WB':0, 'NE':135, 'SW':315, 'NW': 45, 'SE':225}

def basic_feats(df):
    """Return a copy of ``df`` enriched with calendar and location features.

    ``df`` must be indexed by timestamps and contain ``x``, ``y`` and
    ``direction`` columns. Added columns, in order: ``minute``, ``hour``,
    ``day``, ``month``, ``dayofweek``, ``moment`` (index of the 20-minute
    slot within the day), ``x+y``, ``x_y``, ``hour_direction`` and ``degree``.
    """
    out = df.copy(deep=True)
    stamps = out.index

    # Calendar parts read straight from the datetime index.
    calendar_parts = {
        "minute": stamps.minute,
        "hour": stamps.hour,
        "day": stamps.day,
        "month": stamps.month,
        "dayofweek": stamps.weekday,
    }
    for name, values in calendar_parts.items():
        out[name] = values

    # Three 20-minute slots per hour.
    out["moment"] = stamps.hour * 3 + stamps.minute // 20

    # Location combinations: numeric sum and string concatenations (no separator).
    out["x+y"] = out["x"] + out["y"]
    x_text = out["x"].astype("str")
    y_text = out["y"].astype("str")
    out["x_y"] = x_text + y_text
    out["hour_direction"] = out["hour"].astype("str") + out["direction"].astype("str")

    # Direction label -> angle; the int32 cast fails if a label is missing from `degree`.
    out["degree"] = out["direction"].map(degree).astype("int32")

    # Disabled experiments from earlier iterations: weekofmonth, rad, afternoon, weekend,
    # is_morning / is_afternoon / is_evening / is_night flags.
    return out

# Rebind train / test to their feature-enriched copies (basic_feats does not modify its input).
train= basic_feats(train)
test= basic_feats(test)
train

In [None]:
# (x_y, direction, third field) triples; special_apply only compares the first two fields.
location_anomalies = [('21', 'NE', 15), ('22', 'SE', 20), ('22', 'NW', 21), ('21', 'NW', 29), ('21', 'SE', 34)]
def special_apply(x):
    """Return 1 when (x[0], x[1]) equals the first two fields of a ``location_anomalies`` entry, else 0."""
    for location, heading, _ in location_anomalies:
        if x[0] == location and x[1] == heading:
            return 1
    return 0
# train["special"] = train[["x_y","direction"]].apply(special_apply ,axis =1)
# train["special"].value_counts()

In [None]:
# Columns carried through the shift/pivot step: every current train column plus the "ds" origin tag.
features = [*train.columns, "ds"]
print(features)

# Encoding

In [None]:
#Label Encoding
encoder = LabelEncoder()
def lencoder(df, col , encoder, fit = True):
    """Return a copy of ``df`` whose column ``col`` is replaced by int32 label codes.

    Parameters
    ----------
    df : DataFrame holding the column to encode (not modified).
    col : name of the column to encode.
    encoder : a LabelEncoder instance.
    fit : when True (default, the previous behaviour) the encoder is re-fitted on
        ``df[col]`` before encoding; when False the mapping already stored in
        ``encoder`` is reused, so codes stay consistent with the frame it was fitted on.
    """
    df = df.copy(deep = True)
    if fit:
        codes = encoder.fit_transform(df[col])
    else:
        codes = encoder.transform(df[col])
    df[col] = codes.astype("int32")
    return df

for col in ["x_y_direction", "direction", "x_y", "hour_direction"]:
    # Learn the label -> code mapping on train and reuse it for test. Re-fitting on test
    # gave the same label a different code whenever test holds only a subset of the train
    # labels (e.g. "hour_direction" when test does not cover every hour).
    # transform() raises on a label never seen in train instead of silently mis-coding it.
    train = lencoder(train, col , encoder)
    test = lencoder(test, col , encoder, fit = False)

# One-hot encoding (disabled alternative)
# all_df = pd.get_dummies(all_df,columns= [col+"_e" for col in encode_cols],drop_first=True)
#all_df.drop(encode_cols,axis = 1, inplace = True)
train.head()

# Correlation Shifting  
As we have seen, the target is correlated between specific x_y_direction groupings. \
We therefore want to include these values in training; however, they are part of the target, so they cannot be used as they are.

We have the option of multi-step recursion, where we shift the values by 1 period (i.e. 20 minutes) and use the shifted values as features. \
The process is as follows:
* Concatenate train and test (needed for shifting train values into test )
* groupby 'x_y_direction' column 
* shift concatenation by 1 (20mins) 
* add any lost columns needed due to groupby 
* Apply pivot transformation as seen above
* Merge pivot to concatenated data 


**NOTE**: Null values are backfilled with the subsequent values; using the mean/median instead is an option.

In [None]:
# Stack train and test row-wise, tagging each row with its origin in "ds" so the two can be separated again later.
all_df = pd.concat([train.assign(ds="train"),test.assign(ds="test")],axis =0)

def pvt_shift_values(all_df, feature_cols=None):
    """Attach previous-period congestion features to the combined train/test frame.

    Parameters
    ----------
    all_df : DataFrame indexed by ``time`` with at least ``x``, ``y``, ``direction``,
        ``x_y_direction`` and ``congestion`` columns, plus every column in ``feature_cols``.
    feature_cols : columns of ``all_df`` to keep before the new features are merged in.
        Defaults to the notebook-level ``features`` list, which was the previous behaviour.

    Returns
    -------
    DataFrame with ``feature_cols``, one column per roadway holding that roadway's
    congestion from the previous row of its group, and ``rolling_median_4`` /
    ``rolling_median_8`` / ``rolling_median_12`` computed over each roadway's
    previous rows.
    """
    if feature_cols is None:
        feature_cols = features

    # Lag the target by one row within every roadway.
    all_shift = all_df.groupby(['x_y_direction'])[["congestion"]].shift(1)

    #ensure have the correct column for pivot
    all_shift["x_y_direction"]  =all_df["x"].astype("str") + all_df["y"].astype("str") +all_df["direction"].astype("str")

    #Pivot: one row per timestamp, one column per roadway.
    # "sum" is the string alias of the former np.sum aggfunc, which recent pandas deprecates.
    all_pvt = all_shift.pivot_table(
        values='congestion',
        index='time',
        columns='x_y_direction',
        aggfunc="sum")

    #backfill row 0 as this caused issues in the pvt due to shift
    # (guarded so a single-timestamp frame does not raise IndexError)
    if len(all_pvt) > 1:
        all_pvt.iloc[0] =all_pvt.iloc[1]

    #Merge pivot data to full data set
    all_merge = pd.merge(all_df[feature_cols], all_pvt,  how='left', right_index=True, left_index=True)

    # Rolling medians of the roadway's own history.
    # The series is shifted by one row first, so a row's feature never contains its own
    # target. Without the shift, training rows saw their own congestion while test rows
    # (congestion is NaN) only saw earlier values. The first row of each roadway is NaN.
    # The window now matches the column name (rolling_median_12 previously used 10).
    for window in (4, 8, 12):
        all_merge[f"rolling_median_{window}"] = (
            all_merge.groupby("x_y_direction")["congestion"]
            .transform(lambda s, w=window: s.shift(1).rolling(w, min_periods=1).median())
        )

    return all_merge

# Replace all_df with the version returned by pvt_shift_values (lagged pivot columns and rolling medians merged in).
all_df = pvt_shift_values(all_df)

#check 
# Inspect the rows belonging to the earliest test timestamp.
all_df[all_df.index == min(test.index)].head(20)

# Scaling and splitting 

In [None]:
# Separate the combined frame again by its origin tag.
# The tag is dropped from both parts; test additionally loses the target column.
train = all_df.loc[all_df["ds"].eq("train")].drop(columns="ds")
test = all_df.loc[all_df["ds"].eq("test")].drop(columns=["ds", "congestion"])

In [None]:
# Feature matrix and target taken from the training rows.
X = train.drop("congestion",axis =1 )
y = train["congestion"]
# Hold out 33% of the rows for validation with a fixed seed.
# NOTE(review): train_test_split shuffles by default, so the hold-out mixes timestamps with
# the training part — confirm this is intended for time-ordered data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Optional standardisation: the scaler is fitted on the training part only and then applied
# to the hold-out and test features. After this branch X_train / X_test / test_s are the
# arrays returned by the scaler rather than DataFrames.
if SCALING:
    print("Applying Scaling")
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    test_s = scaler.transform(test)
else: 
    # No scaling: `scaler` is never created in this branch; test_s is a plain copy of test.
    test_s = test.copy()

# Fit model 

In [None]:
# Earlier LightGBM parameter sets, kept commented out for reference.
# lgb_params = {
#     "objective" : "regression",
#     "metric": "mae",
#     "device_type": "cpu",
#     'boosting': "gbdt",  
#     "learning_rate": 0.05,
#     "lambda_l1": 5, #0.03469015403439412,
#     #"lambda_l2":  9.993162304351474,
#     "num_leaves": 20, #100
#     "max_depth": 7,
#     "force_col_wise" : True,
#                    }

# lgb_params= {"force_col_wise" : True ,
#            "objective" : "regression",
#            "metric": "mae",
#            "device_type": "cpu",
#            'learning_rate': 0.13498645651250302, 
#            'boosting': 'gbdt', 
#            'lambda_l1': 3.190603236750105e-07,
#            'lambda_l2': 0.06581234211397913, 
#            'num_leaves': 249, 
#            'max_depth': 15}

# Active parameter set, read by fit_model: regression objective evaluated with MAE,
# with device_type set to "gpu" on platform 0 / device 0.
# NOTE(review): the long decimal values look like output of an automated search — confirm their origin.
lgb_params = {"force_col_wise" : True ,
              "objective" : "regression",
              "num_threads": -1,
              "metric": "mae",
              'learning_rate': 0.03386223199544998,
              'boosting': 'gbdt', 
              'lambda_l1': 2.989505976417424e-07, 
              'lambda_l2': 1.6651524609127486e-06, 
              'num_leaves': 236,
              'max_depth': 12,
              "device_type": "gpu",
              "gpu_platform_id" : 0,
              "gpu_device_id" : 0}

In [None]:
%%time
def fit_model(X_train,y_train,X_test, y_test):
    """Train a LightGBM model with early stopping and report its validation MAE.

    The model is trained on (X_train, y_train) with the notebook-level ``lgb_params``
    for at most ``EPOCHS`` rounds; (X_test, y_test) is the validation set monitored by
    the early-stopping callback with patience ``EARLY_STOPPING``.

    Returns
    -------
    (model, mae) : the trained booster and its mean absolute error on (X_test, y_test).
    """
    #lightgbm
    train_set = lgb.Dataset(X_train, y_train, params={'verbose': -1})
    valid_set = lgb.Dataset(X_test, y_test, params={'verbose': -1})
    stopper = lgb.early_stopping(EARLY_STOPPING)
    model = lgb.train(
        params=lgb_params,
        train_set=train_set,
        valid_sets=[valid_set],
        num_boost_round=EPOCHS,
        callbacks=[stopper],
    )

    #catboost (disabled alternative)
#     model = CatBoostRegressor(verbose=False, eval_metric='MAE',
#                               iterations = EPOCHS,
#                               thread_count= -1,
#                              task_type="GPU")

#     model.fit(X_train,y_train,early_stopping_rounds=EARLY_STOPPING,eval_set=(X_test,y_test))

    holdout_preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, holdout_preds)
    print("\nMAE lgb:", mae)

    return model, mae

# Baseline model fitted on the single hold-out split.
model , mae  = fit_model(X_train,y_train,X_test, y_test)

## Intrinsic residuals & analysis 

In [None]:
# intrinsic: score the fitted model on every training row.
# The features go through the fitted scaler first when scaling is enabled.
X_for_model = scaler.transform( X ) if SCALING else X
train_preds = model.predict(X_for_model)
print("Intrinsic MAE", mean_absolute_error(y, train_preds))
train_preds

In [None]:
# Distribution of the in-sample residuals (prediction minus actual).
plt.figure(figsize = (25,5))
plt.hist((train_preds- y), bins = 100)
plt.title("Residuals")
plt.show()

In [None]:
# Residuals against row position, to expose patterns along the order of the training rows.
plt.figure(figsize = (25,5))
sns.scatterplot(x = X.reset_index().index , y= (train_preds- y))
plt.title("Residuals")
plt.show()

In [None]:
def plotImp(model, X , num = 30, fig_size = (20, 10)):
    """Draw a horizontal bar chart of the ``num`` largest feature importances.

    ``model`` must expose ``feature_importance()`` and ``X`` must expose ``columns``
    in the same order as the features the model was trained on.
    """
    importances = pd.DataFrame({'Value':model.feature_importance(),'Feature':X.columns})
    ranked = importances.sort_values(by="Value", ascending=False)
    top = ranked[0:num]

    plt.figure(figsize=fig_size)
    sns.barplot(x="Value", y="Feature", data=top)
    plt.title('LightGBM feature importance ')
    plt.tight_layout()
    plt.show()

#plotImp(model, X)

In [None]:
#Zero value importances
# feature_imp = pd.DataFrame({'Value':model.feature_importance(),'Feature':X.columns})
# feature_imp[feature_imp["Value"]==0].head(20)

In [None]:
# predict the 1 shifted section of test for confirmation
first_period = test[test.index == min(test.index)]
# `scaler` only exists when SCALING is enabled; feed the raw features otherwise so this
# cell runs under either setting instead of raising NameError.
test_1pred = model.predict(scaler.transform(first_period) if SCALING else first_period)
test_1pred

# CV

In [None]:
# Three folds; no shuffle argument is passed. This splitter is read as a global by cross_validation.
kfold = KFold(n_splits = 3)

In [None]:
def cross_validation(X,y,test):
    """Fit one model per fold of the notebook-level ``kfold`` and average their test predictions.

    Parameters
    ----------
    X : DataFrame of training features.
    y : training target aligned row-for-row with ``X`` (pandas Series or array-like).
    test : features to predict, with the same columns as ``X``.

    Returns
    -------
    Array holding, for each test row, the mean of the per-fold predictions.
    The mean of the per-fold validation MAEs is printed.
    """
    mae_list = []
    test_preds = []

    def take_rows(rows):
        # KFold yields integer positions, so the target must be selected by position.
        # y[rows] on a Series with a datetime index relies on a deprecated positional fallback.
        return y.iloc[rows] if hasattr(y, "iloc") else y[rows]

    for idx, (train_idx, val_idx) in enumerate(kfold.split(X,y)):

        print(f"\n######## Fold {idx+1} ########")
        X_train, y_train = X.iloc[train_idx,:], take_rows(train_idx)
        X_test, y_test = X.iloc[val_idx,:], take_rows(val_idx)

        if SCALING:
            print("Applying Scaling")
            # A fresh scaler per fold, fitted on that fold's training part only.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            test_s = scaler.transform(test)
        else:
            test_s = test.copy()

        #lightgbm/ catboost
        model , mae  = fit_model(X_train,y_train,X_test, y_test)
        test_preds.append(model.predict(test_s))
        mae_list.append(mae)

    print("Ensemble MAE",np.mean(mae_list))
    return np.mean(test_preds,axis =0)

#train on X,y , predict on only 1 period of test 
#test_preds = cross_validation(X,y, test[test.index == min(test.index)] )

In [None]:
# plt.figure(figsize = (20,8))

# sns.lineplot(x = test[test.index == min(test.index)].reset_index().index,  y = np.mean(test_preds,axis =0), label = "CV model prediction")
# sns.lineplot(x = test[test.index == min(test.index)].reset_index().index,  y = test_1pred, label = "Base model prediction")
# plt.title("CrossVal vs base model: one step prediction ")
# plt.show()

# Recursive 

In [None]:
PERIOD = 20 #prediction interval in minutes

# First and last timestamps of the test set: the bounds of the recursive forecast.
start_date = test.index.min()
end_date = test.index.max()
print(start_date, end_date, sep="\n")

In [None]:
def multi_step_CV(start_date, end_date, period, train_i, test_i):
    """Forecast the test window recursively, one period at a time, using cross_validation.

    For every step the lag/rolling features are rebuilt with ``pvt_shift_values``, models
    are fitted on all rows before the step's start, the rows inside the step are predicted
    and the predictions are written back as ``congestion`` so later steps can use them.

    Parameters
    ----------
    start_date, end_date : first and last timestamp to predict (inclusive).
    period : step length in minutes.
    train_i, test_i : time-indexed train and test frames; they are concatenated and
        tagged with ``ds`` internally.

    Returns
    -------
    list of predictions in the order the test rows were visited.
    """
    # Honour the `period` argument; the step was previously hard-coded to 20 minutes.
    delta = pd.DateOffset(minutes = period)
    preds = []

    all_df = pd.concat([train_i.assign(ds="train"),test_i.assign(ds="test")],axis =0)

    current = start_date
    while current <= end_date:

        print("\n############ Start date" , current, " ############")

        #shift and pivot
        all_df = pvt_shift_values(all_df)

        #prediction period and test slice
        test_timeframe = (all_df.index>= current ) & (all_df.index< current+delta)
        if not test_timeframe.any():
            # Nothing to predict in this window (gap in the timestamps): move on.
            current += delta
            continue

        test_split = all_df [  test_timeframe ].drop(["ds","congestion"],axis =1)

        # Everything before the window is training data, including earlier predictions.
        history = all_df.index< current
        X = all_df[ history ].drop(["ds","congestion"],axis =1)
        y = all_df[ history ]["congestion"]

        #predict 1 timeframe
        one_period_preds = cross_validation(X,y, test_split )

        #Add predicted test data back to all_df
        all_df.loc[ test_timeframe ,"congestion"] = one_period_preds

        print( one_period_preds)

        #testing
        preds.extend(list(one_period_preds))

        current += delta
    return preds

#test_preds  = multi_step_CV(start_date, end_date, PERIOD, train, test)

In [None]:
def multi_step_fitmodel(start_date, end_date, period, train_i, test_i):
    """Forecast the test window recursively, one period at a time, with a single model per step.

    For every step the lag/rolling features are rebuilt with ``pvt_shift_values``, one model
    is fitted via ``fit_model`` on a 67/33 split of all rows before the step's start, the
    rows inside the step are predicted and the predictions are written back as
    ``congestion`` so later steps can use them.

    Parameters
    ----------
    start_date, end_date : first and last timestamp to predict (inclusive).
    period : step length in minutes.
    train_i, test_i : time-indexed train and test frames; they are concatenated and
        tagged with ``ds`` internally.

    Returns
    -------
    list of predictions in the order the test rows were visited.
    """
    # Honour the `period` argument; the step was previously hard-coded to 20 minutes.
    delta = pd.DateOffset(minutes = period)
    preds = []

    all_df = pd.concat([train_i.assign(ds="train"),test_i.assign(ds="test")],axis =0)

    current = start_date
    while current <= end_date:

        print("\n############ Start date" , current, " ############")

        #shift and pivot
        all_df = pvt_shift_values(all_df)

        #prediction period and test slice
        test_timeframe = (all_df.index>= current ) & (all_df.index< current+delta)
        if not test_timeframe.any():
            # Nothing to predict in this window (gap in the timestamps): move on.
            current += delta
            continue

        test_split = all_df [  test_timeframe ].drop(["ds","congestion"],axis =1)

        # Everything before the window is training data, including earlier predictions.
        history = all_df.index< current
        X = all_df[ history ].drop(["ds","congestion"],axis =1)
        y = all_df[ history ]["congestion"]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

        if SCALING:
            print("Applying Scaling")
            # A fresh scaler per step, fitted on that step's training part only.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            test_split = scaler.transform(test_split)

        #predict 1 timeframe
        model , mae  = fit_model(X_train,y_train,X_test, y_test)
        one_period_preds = model.predict(test_split)

        #Add predicted test data back to all_df
        all_df.loc[ test_timeframe ,"congestion"] = one_period_preds

        print( one_period_preds)
        #testing
        preds.extend(list(one_period_preds))

        current += delta
    return preds

test_preds  = multi_step_fitmodel(start_date, end_date, PERIOD, train, test)

In [None]:
# Two submission files built from the template: one with predictions rounded to whole
# numbers, one with the raw predictions. `sub` itself is left unchanged.
sub_multistep_rounded = sub.assign(congestion=np.round(test_preds))
sub_multistep_rounded.to_csv("sub_multistep_rounded.csv")

sub_multistep = sub.assign(congestion=test_preds)
sub_multistep.to_csv("sub_multistep.csv")
sub_multistep

In [None]:
# plt.figure(figsize = (25,8))
# sns.lineplot(data = sub_multistep , x= sub_multistep.index, y = "congestion", hue = test["x_y_direction"].values, palette = "bright")
# #sns.lineplot(data = train_original, x= train_original.index, y = "congestion", hue = train["x_y_direction"].values, palette = "bright")
# plt.show

In [None]:
# Target of the last 5000 training rows, with the multi-step predictions drawn at the test timestamps on the same axes.
train.iloc[-5000:]["congestion"].plot(figsize = (25,10))
plt.plot(test.index, sub_multistep["congestion"])
plt.title("Actual vs predicted")
plt.show()