### To Do
1. hyperparameter tuning- catboost and LGB
1. different residual model 
1. different main model - random forests 
1. Missing values / backfill - currently set as backfill - try median /mean
1. NB - add grouped median columns for each period (['x', 'y', 'direction', 'weekday', 'hour', 'minute']) 

### DONE
1. Rolling median/mean 
1. Scaling   --- Scaling on 
1. label encoding/ ohe --- only labelencoding (TBC) 
1. CV with or without shuffle -- no shuffle
1. added features 
1. Fit on full dataset
1. residual perturbing

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler,LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
import lightgbm as lgb
from catboost import CatBoostRegressor

In [None]:
# Load the competition files; the first CSV column becomes the row index of each frame.
train_original = pd.read_csv("../input/tabular-playground-series-mar-2022/train.csv", index_col = 0)
test_original = pd.read_csv("../input/tabular-playground-series-mar-2022/test.csv",index_col = 0)
# Submission template; its "congestion" column is overwritten with predictions at the end of the notebook.
sub = pd.read_csv("../input/tabular-playground-series-mar-2022/sample_submission.csv",index_col = 0)

In [None]:
sns.set_theme()
# Not referenced anywhere else in the visible part of this notebook.
THRESHOLD = 0.40

# When True, features are standardised before fitting (scaling cells and the recursive loop).
SCALING = False
SCALER = StandardScaler()

# Number of boosting rounds / iterations passed to the LightGBM and CatBoost fits.
EPOCHS = 1000
# Only referenced by a commented-out LightGBM early-stopping callback.
EARLY_STOPPING = 30

# When True, the shift_* and rolling_* lag features are added to the data.
LAG_FEATURES = True

In [None]:
def feature_engineering(df):
    """Return a copy of ``df`` with a parsed ``time`` column and a roadway key.

    ``x_y_direction`` is the string concatenation of the ``x``, ``y`` and
    ``direction`` columns. The input frame is left untouched.
    """
    out = df.copy(deep=True)
    out["time"] = pd.to_datetime(out["time"])

    # Build the key from the three location columns, each rendered as text.
    roadway_parts = [out[name].astype("str") for name in ("x", "y", "direction")]
    out["x_y_direction"] = roadway_parts[0] + roadway_parts[1] + roadway_parts[2]

    return out

# Apply the base feature engineering to copies of both raw frames.
train = feature_engineering(train_original)
test = feature_engineering(test_original)
train

# Additional Feature Engineering
Now that we have the data in a correct pivot format \ 
we will create new columns for prediction 

In [None]:
# Index both frames by timestamp (in place); basic_feats reads hour/minute/day from this index.
train.set_index("time", drop= True, inplace = True)
test.set_index("time", drop= True, inplace = True)

In [None]:
# Numeric code assigned to each direction label (used for the "degree" feature).
degree = {
    'EB': 180, 'NB': 90, 'SB': 270, 'WB': 0,
    'NE': 135, 'SW': 315, 'NW': 45, 'SE': 225,
}

def basic_feats(df):
    """Return a copy of ``df`` with calendar and location features added.

    ``df`` must be indexed by timestamps and contain ``x``, ``y`` and
    ``direction`` columns. Added columns, in order: ``minute``, ``hour``,
    ``day``, ``month``, ``dayofweek``, ``moment``, ``x+y``, ``x_y``,
    ``hour_direction`` and ``degree`` (int32).
    """
    out = df.copy(deep=True)
    stamps = out.index

    # Calendar / clock components read straight from the datetime index.
    for part in ("minute", "hour", "day", "month"):
        out[part] = getattr(stamps, part)
    out["dayofweek"] = stamps.weekday

    # Three slots per hour: position of the 20-minute slot within the day.
    out["moment"] = stamps.hour * 3 + stamps.minute // 20

    # Location combinations, numeric and as concatenated text.
    x_text = out["x"].astype("str")
    y_text = out["y"].astype("str")
    out["x+y"] = out["x"] + out["y"]
    out["x_y"] = x_text + y_text
    out["hour_direction"] = out["hour"].astype("str") + out["direction"].astype("str")

    # Direction label -> numeric code via the module-level ``degree`` mapping.
    out["degree"] = out["direction"].map(degree).astype("int32")

    # Disabled experiments: week-of-month, radians, afternoon/weekend/
    # saturday/sunday flags, minutes-since-midnight and day-of-year.
    return out

# Add the calendar and location features to both frames.
train= basic_feats(train)
test= basic_feats(test)
train

In [None]:
# Not read by any code in view; the first two fields of each tuple match the pairs tested in special_apply.
location_anomalies = [('21', 'NE', 15), ('22', 'SE', 20), ('22', 'NW', 21), ('21', 'NW', 29), ('21', 'SE', 34)]
def special_apply(x):
    """Return 1 when ``(x[0], x[1])`` is one of the flagged location/direction pairs, else 0."""
    flagged_pairs = (
        ("21", "NE"),
        ("22", "SE"),
        ("22", "NW"),
        ("21", "NW"),
        ("21", "SE"),
    )
    is_flagged = any(x[0] == place and x[1] == heading for place, heading in flagged_pairs)
    return 1 if is_flagged else 0
# train["special"] = train[["x_y","direction"]].apply(special_apply ,axis =1)
# test["special"] = test[["x_y","direction"]].apply(special_apply ,axis =1)
# train["special"].value_counts()

# Encoding
As there is correlation between direction, x,y, my assumption is to:
1. flatten the data (done further in kernel)
1. label encode features

I'm unsure whether to one-hot encode the categorical features or label-encode them to show correlation, so I'm using both.

In [None]:
#Label Encoding
encoder = LabelEncoder()
def lencoder(df, col , encoder, fit=True):
    """Return a copy of ``df`` with ``col`` replaced by int32 label codes.

    Parameters
    ----------
    df : DataFrame containing ``col``.
    col : name of the column to encode.
    encoder : a LabelEncoder-like object.
    fit : when True (default, the original behaviour) the encoder is re-fit
        on ``df[col]`` before transforming; when False the encoder must
        already be fitted and is only applied.
    """
    df = df.copy(deep = True)
    if fit:
        codes = encoder.fit_transform(df[col])
    else:
        codes = encoder.transform(df[col])
    df[col] = codes.astype("int32")
    return df

for col in ["x_y_direction", "direction", "x_y", "hour_direction"]:
    # Fit once on the values of BOTH frames so a label gets the same code in
    # train and test. Re-fitting on each frame separately yields different
    # codes whenever the two frames do not contain the same set of labels.
    encoder.fit(pd.concat([train[col], test[col]]))
    train = lencoder(train, col , encoder, fit=False)
    test = lencoder(test, col , encoder, fit=False)

# OneHotEncoding
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version before changing this argument.
ohe = OneHotEncoder(sparse = False,drop="first")

def OHE(df, cols, fit=True):
    """Return a copy of ``df`` with one-hot columns for ``cols`` appended.

    Uses the module-level ``ohe`` encoder. With ``fit=True`` (default, the
    original behaviour) the encoder is re-fit on ``df[cols]``; with
    ``fit=False`` the already-fitted encoder is only applied, so the output
    columns match those produced for the frame it was fitted on.
    """
    df = df.copy(deep = True)
    values = ohe.fit_transform(df[cols]) if fit else ohe.transform(df[cols])
    encoded = pd.DataFrame(values, columns =ohe.get_feature_names_out(), index = df.index)
    df = pd.concat([df, encoded], axis =1 )
    return df 

# Fit on train only and reuse the fitted encoder for test so both frames get
# the same one-hot columns in the same order.
train = OHE(train, ["x_y_direction"])
test = OHE(test, ["x_y_direction"], fit=False)

train.head()

# Correlation Shifting  
As we have seen, there is a correlation in the target when looking at specific x_y_direction groupings. \
As such, we want to include these values in our training; however, these values are part of the target.\

We have an option of multistep recursion, where we shift the values by 1 period (i.e. 20mins) and use these values \
The process is as follow: 
* Concatenate train and test (needed for shifting train values into test )
* groupby 'x_y_direction' column 
* shift concatenation by 1 (20mins) 
* add any lost columns needed due to groupby 
* Apply pivot transformation as seen above
* Merge pivot to concatenated data 


**NOTE**: I backfilled null values with the subsequent values; an alternative would be to use the mean/median.

In [None]:
# Columns kept when pvt_shift_values re-merges the pivoted lag columns;
# "ds" is the train/test marker added when the two frames are concatenated.
features = list(train.columns)
features.append("ds")
print(features)

In [None]:
# Stack train on top of test, tagging each row with its origin in "ds".
all_df = pd.concat([train.assign(ds="train"),test.assign(ds="test")],axis =0)

def pvt_shift_values(all_df, feature_cols=None):
    """Attach one previous-period congestion column per roadway to every row.

    Steps: shift ``congestion`` by one row within each ``x_y_direction``
    group, pivot the shifted values into one column per roadway (indexed by
    the ``time`` index level), and left-merge that table back onto
    ``all_df[feature_cols]`` by index.

    Parameters
    ----------
    all_df : DataFrame indexed by ``time`` with ``x``, ``y``, ``direction``,
        ``x_y_direction`` and ``congestion`` columns.
    feature_cols : columns of ``all_df`` to keep in the result. Defaults to
        the module-level ``features`` list.

    Returns
    -------
    A new DataFrame; ``all_df`` itself is not modified.
    """
    if feature_cols is None:
        # Fall back to the notebook-level column list.
        feature_cols = features

    # Lag-1 congestion per roadway. The NaNs created by the shift are
    # back-filled in frame order (``bfill`` replaces the deprecated
    # ``fillna(method='bfill')``).
    all_shift = all_df.groupby(['x_y_direction'])[["congestion"]].shift(1).bfill()

    #ensure have the correct column for pivot
    all_shift["x_y_direction"] = (
        all_df["x"].astype("str") + all_df["y"].astype("str") + all_df["direction"].astype("str")
    )

    #Pivot: one row per timestamp, one column per roadway key
    all_pvt = all_shift.pivot_table(
        values='congestion',
        index='time',
        columns='x_y_direction',
        aggfunc="sum")

    #backfill row 0 as this caused issues in the pvt due to shift
    # (skipped when there is no second row to copy from)
    if len(all_pvt) > 1:
        all_pvt.iloc[0] = all_pvt.iloc[1]

    #Merge pivot data to full data set
    all_merge = pd.merge(all_df[feature_cols], all_pvt, how='left', right_index=True, left_index=True)

    # Rolling median/std features on ``congestion`` were tried here and are disabled.

    return all_merge

# Rebind all_df to the merged frame that carries the per-roadway lag columns.
all_df = pvt_shift_values(all_df)

#check 
# Display the rows at the first test timestamp.
all_df[all_df.index == min(test.index)].head(20)

In [None]:
# all_df = pd.concat([train.assign(ds="train"),test.assign(ds="test")],axis =0)

# def pvt_shift_values(all_df):
#     all_df["congestion_1"] = all_df.groupby(['x_y_direction'])[["congestion"]].shift(1)
#     all_df["congestion_2"] = all_df.groupby(['x_y_direction'])[["congestion"]].shift(2)

#     #Pivot 
#     all_pvt_1 = all_df.pivot_table(
#         values='congestion_1', 
#         index='time', 
#         columns='x_y_direction', 
#         aggfunc=np.sum)
#     all_pvt_2 = all_df.pivot_table(
#         values='congestion_2', 
#         index='time', 
#         columns='x_y_direction', 
#         aggfunc=np.sum)

#     all_pvt= pd.merge(all_pvt_1,all_pvt_2, left_index=True, right_index= True)
#     all_merge = pd.merge(all_df[features], all_pvt,  how='left', right_index=True, left_index=True)
#     return all_merge

# all_df = pvt_shift_values(all_df)

# #check 
# all_df[all_df.index == min(test.index)].head(20)

# Target shifting (and Rolling)
I removed the rolling mean/median/std, as they seem to cause a lot of incorrect validation, so it's tricky to analyse the results.

In [None]:
# Lags (in rows within each group) turned into shift_<lag> columns by shift_target.
shift_list = [1, 3, 6, 9
             ]
# Window lengths turned into rolling_mean/median/std_<window> columns by rolling.
roll_window = [4,8, 12]

In [None]:
def shift_target(df, shift_list, groupby):
    """Return a copy of ``df`` with lagged ``congestion`` columns added.

    For every ``lag`` in ``shift_list`` a ``shift_<lag>`` column is created
    holding ``congestion`` shifted by ``lag`` rows within each ``groupby``
    group. The leading NaNs produced by the shift are back-filled within
    the same group.

    Parameters
    ----------
    df : DataFrame with a ``congestion`` column and the grouping column(s).
    shift_list : iterable of integer lags.
    groupby : column name (or list of names) to group by.
    """
    df = df.copy(deep = True)

    for lag in shift_list:
        name = f"shift_{lag}"
        df[name] = df.groupby(groupby)['congestion'].shift(lag)
        # Back-fill with the SAME grouping the caller asked for (previously
        # hard-coded to "x_y_direction"); ``bfill`` replaces the deprecated
        # ``GroupBy.backfill`` alias.
        df[name] = df.groupby(groupby)[name].bfill()
    return df

def rolling(df, roll_window, groupby):
    """Return a copy of ``df`` with rolling statistics of ``shift_1`` added.

    For each window length ``w`` in ``roll_window`` the columns
    ``rolling_mean_w``, ``rolling_median_w`` and ``rolling_std_w`` are
    computed per ``groupby`` group with ``min_periods=1``. Every NaN in the
    returned frame (not only in the new columns) is replaced by 0.
    """
    out = df.copy(deep=True)

    for window in roll_window:
        for stat in ("mean", "median", "std"):
            # Bind the loop values as defaults so each lambda keeps its own.
            out[f"rolling_{stat}_{window}"] = out.groupby(groupby)['shift_1'].transform(
                lambda s, w=window, fn=stat: getattr(s.rolling(w, min_periods=1), fn)()
            )

    return out.fillna(0)

# Add the shift_* and rolling_* columns per roadway when lag features are enabled.
if LAG_FEATURES:
    print("Lag Target")
    all_df = shift_target(all_df,shift_list,groupby= 'x_y_direction')
    all_df = rolling(all_df,roll_window, groupby= 'x_y_direction')
all_df

#### shift by a week

In [None]:
# Per-roadway congestion shifted 7 days along the time index.
# NOTE(review): this relies on transform re-aligning the shifted series to each
# group's original timestamps - confirm on the installed pandas version.
all_df["shift_week"] = all_df.groupby("x_y_direction")['congestion'].transform(lambda s: s.shift(7, freq = "D"))
# Fill the leading gaps within each roadway; ``bfill`` replaces the deprecated
# ``GroupBy.backfill`` alias.
all_df['shift_week'] =all_df.groupby("x_y_direction")['shift_week'].bfill()
all_df 

In [None]:
# Check: display the rows at 1991-04-08 00:00:00 (e.g. to eyeball the shift_week values).
all_df[all_df.index ==pd.to_datetime("1991-04-08 00:00:00")]

# Downcasting

In [None]:
#reduce memory 

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place and return the same frame.

    Integer columns are cast to the first of int8/int16/int32/int64 whose
    range strictly contains the column's min and max. Other numeric columns
    become float32 when their values lie strictly inside the float32 range,
    otherwise float64. Non-numeric columns are skipped. When ``verbose`` is
    true the resulting memory usage and percentage reduction are printed.
    """
    numeric_names = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_limits = np.finfo(np.float32)
    mem_before = df.memory_usage().sum() / 1024**2

    for name in df.columns:
        dtype = df[name].dtypes
        if dtype not in numeric_names:
            continue

        lowest, highest = df[name].min(), df[name].max()

        if str(dtype)[:3] == 'int':
            # Smallest integer type whose limits lie strictly outside the data.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[name] = df[name].astype(candidate)
                    break
        elif lowest > float32_limits.min and highest < float32_limits.max:
            df[name] = df[name].astype(np.float32)
        else:
            df[name] = df[name].astype(np.float64)

    mem_after = df.memory_usage().sum() / 1024**2

    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(mem_after, 100 * (mem_before - mem_after) / mem_before))

    return df

# Downcasts all_df in place; the return value (the same frame) is only displayed.
reduce_mem_usage(all_df)

# Scaling and splitting 

In [None]:
# Split the combined frame back into train and test using the "ds" marker;
# the test frame additionally drops the target column.
train = all_df[all_df["ds"] =="train"].drop("ds",axis =1)
test = all_df[all_df["ds"] =="test"].drop(["ds","congestion"],axis =1)
#Check
test[test.index ==pd.to_datetime("1991-09-30 13:00:00")]

In [None]:
#full fit
# Features and target taken from the whole training frame.
X = train.drop("congestion",axis =1 )
y = train["congestion"]
# Hold-out split; in the visible code only X_train/X_test are used (by the scaling branch below).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

scaler = SCALER

if SCALING:
    print("Applying Scaling")
    # A fresh scaler replaces the one taken from SCALER above.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    test_s = scaler.transform(test)
else: 
    test_s = test.copy()

# Fit model: Train data
Lightgbm

In [None]:
# Integer-typed feature columns (target excluded); these are the columns scaled
# inside the recursive loop when SCALING is on. Any integer width is accepted
# because reduce_mem_usage downcasts integer columns before this point, so an
# exact 'int64' comparison would typically select nothing.
num_cols = [col for col in train.columns if pd.api.types.is_integer_dtype(train[col]) and col not in ["congestion"]]

In [None]:
# Feature matrix for the full-data fit: scaled (scaler re-fit on all of X) or a plain copy.
if SCALING:
    X_s = scaler.fit_transform(X)
else:
    X_s = X.copy()

In [None]:
# LightGBM parameters passed to lgb.train in fit_model_lgb.
lgb_params = {"force_col_wise" : True ,
              "objective" : "regression",
              "num_threads": -1,
              "metric": "mae",
              'learning_rate': 0.03386223199544998,
              'boosting': 'gbdt', 
              'lambda_l1': 2.989505976417424e-07, 
              'lambda_l2': 1.6651524609127486e-06, 
              'num_leaves': 236,
              'max_depth': 12,
              # Training runs on CPU; the gpu_* ids below are presumably ignored in that case - TODO confirm.
              "device_type": "cpu",
              "gpu_platform_id" : 0,
              "gpu_device_id" : 0
             }

In [None]:
def fit_model_lgb(X,y):
    """Train LightGBM on all of ``(X, y)`` and report the in-sample fit.

    Uses the module-level ``lgb_params`` and ``EPOCHS`` boosting rounds; no
    validation set or early stopping is used. Prints the in-sample MAE and
    R2 and returns ``(booster, in_sample_predictions)``.
    """
    booster = lgb.train(
        params=lgb_params,
        train_set=lgb.Dataset(X, y, params={'verbose': -1}),
        num_boost_round=EPOCHS,
    )

    fitted = booster.predict(X)
    print("\nIntrinsic MAE", mean_absolute_error(y, fitted))
    print("Intrinsic R2", r2_score(y, fitted))
    return booster, fitted

#predict
# Main model fitted on the full training data; train_preds are its in-sample predictions.
model, train_preds   = fit_model_lgb(X_s,y)

# Fit Model: Residual Data 
I wanted a different model to predict the residuals, as I assumed a different model would identify aspects that the original model didn't. \
**Option**: use a NN or linear model for the residuals.

In [None]:
def fit_model_ET(X,y):
    """Fit an ExtraTreesRegressor on ``(X, y)`` and report the in-sample fit.

    Prints the in-sample MAE and R2 and returns
    ``(fitted_model, in_sample_predictions)``.
    """
    # The unused LightGBM Dataset that was built here has been removed.
    model = ExtraTreesRegressor(n_estimators = 20, min_samples_split = 101,n_jobs=-1)
    model.fit(X,y)
    train_preds = model.predict(X)
    print("\nIntrinsic MAE", mean_absolute_error(y, train_preds))
    print("Intrinsic R2", r2_score(y, train_preds))
    return model, train_preds

# fit ET model on residuals 
# Target is the error left by the main model (actual minus its in-sample prediction).
model_res, train_res   = fit_model_ET(X_s,  (y - train_preds) )

In [None]:
def fit_model_cat(X,y):
    """Fit a CatBoost regressor (MAE loss) on ``(X, y)`` and report the in-sample fit.

    Prints the in-sample MAE and R2 and returns
    ``(fitted_model, in_sample_predictions)``.
    """
    cat_settings = dict(
        verbose=1000,
        early_stopping_rounds=10,
        random_seed=2022,
        max_depth=12,
        task_type='CPU',
        learning_rate=0.035,
        iterations=EPOCHS,
        loss_function='MAE',
        eval_metric='MAE',
    )
    model = CatBoostRegressor(**cat_settings)
    model.fit(X, y)

    fitted = model.predict(X)

    print("\nIntrinsic MAE", mean_absolute_error(y, fitted))
    print("Intrinsic R2", r2_score(y, fitted))

    return model, fitted

#model, train_preds   = fit_model_cat(X_s,y)

## Analysis 

In [None]:
# Last 10,000 training rows: actual target vs. main-model fit vs. fit plus predicted residuals.
train.iloc[-10000:]["congestion"].plot(figsize = (25,10), label = "actual")
plt.plot(train.iloc[-10000:].index, train_preds[-10000:], label = "prediction")
plt.plot(train.iloc[-10000:].index, (train_preds+train_res)[-10000:], label = "prediction + residuals")
plt.title("Predicted vs Residuals + Predicted vs Actual")
plt.legend()
plt.show()

In [None]:
# Overlay the main model's residuals and the residuals predicted by the ExtraTrees model.
plt.figure(figsize = (25,5))
plt.hist((y-train_preds), bins = 100, label = "LGB Residuals " , alpha =0.5)
plt.hist(train_res, bins = 100 , label = "ET Predicted Residuals ", alpha =0.5)
plt.title("Residuals")
# Without a legend the two labelled histograms cannot be told apart.
plt.legend()
plt.show()

In [None]:
def plotImp(model, X , num = 30, fig_size = (20, 10)):
    """Draw a horizontal bar chart of the ``num`` largest feature importances.

    ``model`` must expose ``feature_importance()`` and ``X`` supplies the
    matching feature names through ``X.columns``.
    """
    importances = pd.DataFrame({'Value':model.feature_importance(),'Feature':X.columns})
    top_features = importances.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=fig_size)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('Top LightGBM feature importance ')
    plt.tight_layout()
    plt.show()
    
# Importances of the main model, labelled with the columns of X.
plotImp(model, X)

In [None]:
#Zero value importances
# List up to 20 features whose importance reported by the main model is exactly 0.
feature_imp = pd.DataFrame({'Value':model.feature_importance(),'Feature':X.columns})
feature_imp[feature_imp["Value"]==0].head(20)

# Recursive + Perturbing 
## Multi-Step Recursive Time Series  Forecasting 
The concept of multi-step recursive forecasting is a "walk forward" model that predicts one period of the test data at a time (i.e. 20-minute intervals). \
The predicted data is added to the training data and the next period (or window) is predicted. \
This continues until the full test data is predicted.

This concept is done in a previous notebook  

## Perturbing the predictions 

This concept comes from this [paper](https://www.researchgate.net/publication/220764883_Recursive_Multi-step_Time_Series_Forecasting_by_Perturbing_Data) 
This addresses the inherent issue with multi-step forecasts, which suffer from the accumulation of errors (i.e. a poor initial prediction is projected into the subsequent predictions).  

The process uses a second model to predict the residuals of each time period (window); these residuals are added to the test predictions. \
I will try 2 approaches: 
1. Save the residuals and add them to the full test predictions after the full multi-step process is completed 
1. Add the residuals after each step 

In [None]:
PERIOD = 20 #prediction interval in minutes

# First and last timestamps of the test frame bound the recursive forecast.
start_date = min(test.index) 
end_date = max(test.index)
print(start_date)
print(end_date)

# Submit

In [None]:
def multi_step_fitmodel(start_date, end_date, period, train_i, test_i):
    """Recursively forecast ``congestion`` one period at a time.

    For every window ``[start_date, start_date + period)`` up to ``end_date``:
    the lag features are rebuilt on the combined frame, a LightGBM model is
    fitted on all rows before the window, an ExtraTrees model is fitted on
    that model's in-sample residuals, and the sum of both predictions is
    written back as the window's ``congestion`` so later windows can use it.

    Parameters
    ----------
    start_date, end_date : first and last timestamps to predict (inclusive).
    period : window length in minutes.
    train_i, test_i : training and test frames indexed by time.

    Returns
    -------
    (preds, residuals) : two flat lists in prediction order - the corrected
        predictions and the residual-model part of each prediction.
    """
    preds = []
    residuals = []
    # Step by the caller-supplied period (previously the global PERIOD was used).
    delta = pd.DateOffset(minutes = period)
    all_df = pd.concat([train_i.assign(ds="train"),test_i.assign(ds="test")],axis =0)

    while start_date <= end_date:

        print("\n############ Start date" , start_date, " ############")

        #shift and pivot
        all_df = pvt_shift_values(all_df)

        if LAG_FEATURES:
            # This message was a bare string expression and was never shown.
            print("Applying lag features")
            all_df = shift_target(all_df,shift_list,groupby= 'x_y_direction')
            all_df = rolling(all_df,roll_window, groupby= 'x_y_direction')

        #prediction period and test slice
        test_timeframe = (all_df.index>= start_date ) & (all_df.index< start_date+delta)
        test_split = all_df [  test_timeframe ].drop(["ds","congestion"],axis =1)

        # Everything before the window: training rows plus earlier predictions.
        history = all_df.index< start_date
        X = all_df[ history ].drop(["ds","congestion"],axis =1)
        y = all_df[ history ]["congestion"]

        if SCALING:
            print("Applying Scaling")
            scaler = StandardScaler()
            X[num_cols] = scaler.fit_transform(X[num_cols])
            test_split[num_cols] = scaler.transform(test_split[num_cols])

        #Fit training data
        model, train_preds   = fit_model_lgb(X,y)
        one_period_preds = model.predict(test_split)

        # fit new model on residuals 
        model_res, train_res   = fit_model_ET(X, y - train_preds)
        residuals_preds = model_res.predict(test_split)

        #Add predicted test data back to all_df
        corrected_preds = one_period_preds + residuals_preds
        all_df.loc[ test_timeframe ,"congestion"] = corrected_preds
        preds.extend(list(corrected_preds))
        residuals.extend(list(residuals_preds))

        start_date += delta
    return preds, residuals

# Run the recursive forecast over the whole test range.
test_preds, residuals  = multi_step_fitmodel(start_date, end_date, PERIOD, train, test)

In [None]:
# First 20 predictions (main model plus residual model).
test_preds[:20]

In [None]:
# First 20 residual-model predictions.
residuals[:20]

In [None]:
## residuals added in model

# Variant 1: predictions rounded to the nearest integer.
sub_rec_rounded = sub.copy(deep = True)
sub_rec_rounded["congestion"] =test_preds
sub_rec_rounded["congestion"] =np.round(sub_rec_rounded["congestion"])
sub_rec_rounded.to_csv("sub_rec_rounded.csv")

# Variant 2: raw (unrounded) predictions.
sub_rec = sub.copy(deep = True)
sub_rec["congestion"] =test_preds
sub_rec.to_csv("sub_rec.csv")
sub_rec_rounded

In [None]:
# Last 5,000 training targets followed by the recursive test predictions
# (test_preds already include the residual-model correction).
train.iloc[-5000:]["congestion"].plot(figsize = (25,10))
plt.plot(test.index, test_preds)
plt.title("Predicted Only vs Actual")
plt.show()

In [None]:
# train.iloc[-5000:]["congestion"].plot(figsize = (25,10))
# plt.plot(test.index, sub_rec_rounded["congestion"])
# plt.title("Predicted + Residuals vs Actual")
# plt.show()