In [None]:
import numpy as np
import pandas as pd
import datetime
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from matplotlib.ticker import PercentFormatter
import statsmodels.api as sm
import statsmodels.tsa.stattools as ts
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Note: some of the calculations here are fairly rough; apologies for the laziness.

# Helper functions

In [None]:
def add_time_related_cols(df):
    """
    Derive calendar columns from the ``time`` column, in place.

    Parses ``df["time"]`` (format ``%Y-%m-%d %H:%M:%S``) into ``df["datetime"]``
    and then adds ``hour``, ``minute``, ``day``, ``month`` and ``date`` columns
    read from that parsed timestamp. The frame is modified in place and
    nothing is returned.
    """
    parsed = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S')
    df["datetime"] = parsed
    # Each name below is both the new column and the ``.dt`` accessor attribute.
    for part in ("hour", "minute", "day", "month", "date"):
        df[part] = getattr(parsed.dt, part)
    
def add_holidays_weekends(df, holidays):
    """
    Add 0/1 ``holiday`` and ``weekend`` flag columns to ``df`` in place.

    ``holiday`` is 1 where ``df["date"]`` is one of ``holidays``; ``weekend``
    is 1 where ``df["datetime"]`` has a day-of-week index above 4.
    Returns None.
    """
    is_holiday = df["date"].isin(holidays)
    is_weekend = df["datetime"].dt.dayofweek > 4
    df["holiday"] = np.where(is_holiday, 1, 0)
    df['weekend'] = np.where(is_weekend, 1, 0)
    
def _rows_ready_for_forecast(df):
    """Return a copy of the rows whose congestion is missing but whose three lags are all known."""
    ready = (df["congestion"].isna()
             & df["congestion_m20"].notnull()
             & df["congestion_m40"].notnull()
             & df["congestion_m60"].notnull())
    return df[ready].copy()


def generate_prediction_for_ols(ols_estimator, df):
    """
    Fill missing ``congestion`` values step by step with a fitted lag model.

    On every pass the rows that miss ``congestion`` but already have
    ``congestion_m20``, ``congestion_m40`` and ``congestion_m60`` are
    predicted. Each forecast is written into ``congestion`` of its own row and
    into the still-empty lag columns of the rows of the same ``node_id`` that
    lie 20, 40 and 60 minutes later, which makes further rows predictable on
    the next pass.

    Parameters
    ----------
    ols_estimator : object with a ``predict(exog)`` method
        The fitted model used for forecasting.
    df : pandas.DataFrame
        Must contain ``row_id``, ``node_id``, ``datetime``, ``congestion`` and
        the three lag columns. The regressors are taken by position as
        ``df.columns[3:-1]``, so the first three columns and the last column
        are not passed to the model.

    Returns
    -------
    pandas.DataFrame
        A new frame (built through merges) with the forecasts filled in.
    """
    to_predict = _rows_ready_for_forecast(df)

    while len(to_predict) > 0:
        missing_before = df["congestion"].isna().sum()

        # Use the estimator handed in by the caller, not a notebook global.
        traffic = ols_estimator.predict(to_predict[to_predict.columns[3:-1]])

        temp_df = pd.DataFrame({
            "row_id": to_predict["row_id"],
            "node_id": to_predict["node_id"],
            "datetime": to_predict["datetime"],
            "datetime_m20": to_predict["datetime"] + datetime.timedelta(minutes=20),
            "datetime_m40": to_predict["datetime"] + datetime.timedelta(minutes=40),
            "datetime_m60": to_predict["datetime"] + datetime.timedelta(minutes=60),
            "congestion_forecast": traffic
        })

        # Write the forecast into the row it was made for.
        df = df.merge(temp_df[["row_id", "congestion_forecast"]], how="left", on="row_id")
        df["congestion"] = np.where(
            df["congestion"].isnull(),
            df["congestion_forecast"],
            df["congestion"])

        # Propagate the forecast as a lag to the rows 20/40/60 minutes later.
        for col in ["_m20", "_m40", "_m60"]:
            df = df.merge(
                temp_df[["node_id", "datetime" + col, "congestion_forecast"]].copy().rename(
                    columns={"datetime" + col: "datetime",
                             "congestion_forecast": "congestion" + col + "_forecast"}),
                how="left",
                on=["node_id", "datetime"])
            df["congestion" + col] = np.where(df["congestion" + col].isnull(),
                                              df["congestion" + col + "_forecast"],
                                              df["congestion" + col])
            df.drop(columns=["congestion" + col + "_forecast"], inplace=True)

        df.drop(columns=["congestion_forecast"], inplace=True)
        to_predict = _rows_ready_for_forecast(df)

        # If a pass filled nothing (e.g. the model returned NaN) the same rows
        # would be selected forever; stop instead of looping endlessly.
        if df["congestion"].isna().sum() >= missing_before:
            break
    return df

def grab_final_edges(target,edges,add_outgoing=False):
    """
    Build the regressor column names contributed by the edges around ``target``.

    An edge id has the form ``"<x>-<y>-<direction>"``. For each of the nine
    grid offsets (-1, 0, 1) x (-1, 0, 1) around the target position, the edge
    at that position carrying the direction listed in ``opposite_dir`` is
    collected when it exists in ``edges``.
    (Presumably that is the direction heading toward the target - confirm.)

    Parameters
    ----------
    target : str
        Edge id of the roadway being modelled.
    edges : iterable of str
        All known edge ids.
    add_outgoing : bool, default False
        When True, also include every other edge located at the target's own
        ``x-y`` position.

    Returns
    -------
    list of str
        For every selected edge, in selection order, the names
        ``<edge>_m20``, ``<edge>_m40``, ``<edge>_m60``, ``<edge>_delta1`` and
        ``<edge>_delta2``.
    """
    # Keyed by "<dx>-<dy>"; the value is the direction suffix looked up at that offset.
    # "0-0" maps to a value that matches no single direction, so the target's
    # own position never contributes here.
    opposite_dir={
        '1-1':"SW",
        '1-0':"WB",
        '1--1':"NW",
        '0-1':"SB",
        '0-0':"SWSE",
        '0--1':"NB",
        '-1-1':"SE",
        '-1-0':'EB',
        '-1--1':'NE'
    }
    pos_list=target.split("-")
    x_pos=int(pos_list[0])
    y_pos=int(pos_list[1])
    known_edges=set(edges)

    incoming_edge=list()
    for i in range(-1,2):
        for j in range(-1,2):
            dirpos=str(i)+"-"+str(j)
            dirv=str(x_pos+i)+"-"+str(y_pos+j)+"-"+opposite_dir[dirpos]
            if dirv in known_edges:
                incoming_edge.append(dirv)

    if add_outgoing:
        # Match on the full "x-y-" prefix: slicing the first three characters
        # only works for single-digit coordinates ("1-1" would also match "1-10-NB").
        node_prefix=pos_list[0]+"-"+pos_list[1]+"-"
        out_going_edge=[edge for edge in edges if edge.startswith(node_prefix) and edge!=target]
        incoming_edge=incoming_edge+out_going_edge

    suffixes=("_m20","_m40","_m60","_delta1","_delta2")
    return [e+suffix for e in incoming_edge for suffix in suffixes]

class RegressionResult():
    """
    Container for the regressions fitted for one target edge.

    Three dictionaries, keyed by model name (``base``, ``ecm_base``,
    ``step_1``, ``ecm_step_1``, ``ecm_step_1r``), hold the fitted statsmodels
    results, the regressor names each model was fitted on, and the
    ``adfuller`` output computed on each model's residuals.
    """

    def __init__(self,target):
        self._target=target
        self._forecast_regression_results={}
        self._variable_list={}
        self._adf_results={}

    @staticmethod
    def _ensure_present(names, required):
        """Return ``names`` with each missing entry of ``required`` appended (new list only if needed)."""
        for name in required:
            if name not in names:
                names=names+[name]
        return names

    def _fit_and_record(self, key, datatable, response, regressors, stored_regressors):
        """Fit OLS (HC1 covariance) of ``response`` on ``regressors`` and store everything under ``key``."""
        fitted=sm.OLS(datatable[response], datatable[regressors]).fit(cov_type="HC1")
        self._forecast_regression_results[key]=fitted
        self._variable_list[key]=stored_regressors
        self._adf_results[key]=ts.adfuller(fitted.resid)
        return fitted

    def fit(self,datatable,
            var_lists,
            delta_lists,
            cont_vars,
            step_1=0.2, step_2=0.05):
        """
        Fit the level regressions and their error-correction counterparts.

        The level models regress ``<target>_m20`` and the correction models
        regress ``<target>_delta0``. ``datatable`` is modified: its ``resid``
        column is overwritten first with the base-model residuals and later
        with the ``step_1`` residuals. ``step_1`` is the p-value threshold
        used to keep regressors; ``cont_vars`` and ``step_2`` are accepted but
        not used here.
        """
        level_response=self._target+"_m20"
        change_response=self._target+"_delta0"

        # base model
        base=self._fit_and_record("base", datatable, level_response, var_lists, var_lists)
        print("R2 initial : ", round(base.rsquared,4))

        # error-correction model on the base residuals
        datatable["resid"]=base.resid
        delta_lists=self._ensure_present(delta_lists, ("const","resid"))
        ecm_base=self._fit_and_record("ecm_base", datatable, change_response, delta_lists, delta_lists)
        print("R2 DELTA initial : ", round(ecm_base.rsquared,4))

        # step 1 model: keep only regressors whose base p-value is below the threshold
        base_pvalues=base.pvalues
        var_restrict=self._ensure_present(list(base_pvalues[base_pvalues<step_1].index), ("const",))
        step_one=self._fit_and_record("step_1", datatable, level_response, var_restrict, var_restrict.copy())

        # step 1 ecm: same regressors as ecm_base, residuals now from step_1
        datatable["resid"]=step_one.resid
        ecm_step_one=self._fit_and_record(
            "ecm_step_1", datatable, change_response, delta_lists, delta_lists.copy())

        # restricted step 1 ecm: drop regressors above the threshold, always keep const and resid
        ecm_pvalues=ecm_step_one.pvalues
        delta_lists=self._ensure_present(list(ecm_pvalues[ecm_pvalues<step_1].index), ("const","resid"))
        self._fit_and_record("ecm_step_1r", datatable, change_response, delta_lists, delta_lists.copy())
        
def create_forecast_ecm(estimation_data, df, dateforc,method_lags="step_1", ecm_method="ecm_step_1r",
                        targets=None):
    """
    Roll the per-edge level + error-correction forecasts forward through ``dateforc``.

    For each timestamp in ``dateforc`` (in order) and each target edge, the
    level model named ``method_lags`` is evaluated on the row(s) of ``df`` at
    that timestamp, the correction model named ``ecm_method`` is evaluated on
    the same row(s) with ``resid`` set to ``prediction - <target>_m20``, and
    the sum of both is written into ``df[target]``. The value is also written
    into ``<target>_m20`` / ``_m40`` / ``_m60`` of the rows at the next one,
    two and three forecast timestamps, after which ``<target>_delta1`` and
    ``<target>_delta2`` are recomputed.

    Parameters
    ----------
    estimation_data : dict
        Maps an edge id to an object exposing ``_variable_list`` and
        ``_forecast_regression_results`` dictionaries keyed by model name.
    df : pandas.DataFrame
        Wide table with a ``datetime`` column and the edge columns; it is
        modified in place and also returned.
    dateforc : sequence
        Timestamps to forecast, in chronological order.
    method_lags, ecm_method : str
        Names of the level model and of the correction model to use.
    targets : iterable of str, optional
        Edges to forecast, in order. Defaults to the keys of
        ``estimation_data`` in insertion order, so the function no longer
        depends on a notebook-level ``edges`` variable.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the forecasts filled in.
    """
    if targets is None:
        targets=list(estimation_data)

    horizon=len(dateforc)
    lag_offsets=((1,"_m20"),(2,"_m40"),(3,"_m60"))

    for i in range(0,horizon):
        # The datetime column never changes, so one mask serves every target of this step.
        at_step=df["datetime"]==dateforc[i]

        for target in targets:
            fitted=estimation_data[target]

            variables=fitted._variable_list[method_lags]
            predicted_value=fitted._forecast_regression_results[method_lags].predict(
                df[at_step][variables])

            variables=fitted._variable_list[ecm_method]
            temp_df=df[at_step].copy()
            temp_df["resid"]=-temp_df[target+"_m20"]+predicted_value
            correction=fitted._forecast_regression_results[ecm_method].predict(temp_df[variables])
            # The correction is applied at every step (the former i<2 branches were identical).
            predicted_value=predicted_value+correction

            df[target]=np.where(at_step,predicted_value,df[target])

            # Feed the forecast forward as a lag for the following forecast timestamps.
            for offset,suffix in lag_offsets:
                if (i+offset)<horizon:
                    df[target+suffix]=np.where(
                        df["datetime"]==dateforc[i+offset],
                        predicted_value,
                        df[target+suffix])

            df[target+"_delta2"]=df[target+"_m40"]-df[target+"_m60"]
            df[target+"_delta1"]=df[target+"_m20"]-df[target+"_m40"]
    return df

# 1. Load and view

train.csv - the training set, 
comprising measurements of traffic congestion across 65 roadways from April through September of 1991.
<li> row_id - a unique identifier for this instance </li>
<li> time - the 20-minute period in which each measurement was taken  </li>
<li> x - the east-west midpoint coordinate of the roadway </li>
<li> y - the north-south midpoint coordinate of the roadway </li>
<li> direction - the direction of travel of the roadway. EB indicates "eastbound" travel, for example, while SW indicates a "southwest" direction of travel.  </li>
<li> congestion - congestion levels for the roadway during each hour; the target. The congestion measurements have been normalized to the range 0 to 100. </li>

In [None]:
# Read the three competition files.
train = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2022/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2022/test.csv")
sample_submission = pd.read_csv("/kaggle/input/tabular-playground-series-mar-2022/sample_submission.csv")

# add_time_related_cols works in place: it adds datetime/hour/minute/day/month/date.
for _frame in (train, test):
    add_time_related_cols(_frame)

In [None]:
# First and last rows of the training set.
display(train.head())
display(train.tail())
# DataFrame.info() prints its report and returns None; wrapping it in
# display() only added a stray "None" to the output.
train.info()
# Distinct directions and grid coordinates present in the data.
display(train["direction"].unique())
display(train["x"].unique())
display(train["y"].unique())
# "time" is still the raw string column here, so min/max compare as text.
print(" MIN date : "+ train["time"].min())
print(" MAX date : "+ train["time"].max())

In [None]:
display(test.head())
# DataFrame.info() prints its report and returns None; wrapping it in
# display() only added a stray "None" to the output.
test.info()
# "time" is still the raw string column here, so min/max compare as text.
print(" MIN date : "+ test["time"].min())
print(" MAX date : "+ test["time"].max())

In [None]:
# Preview the layout of the sample submission file.
display(sample_submission.head())

## 1.1 Adding holidays and identifying weekends

In [None]:
# Public holidays used for the 0/1 "holiday" flag.
holidays = [
    datetime.date(1991, 5, 27),  # Memorial day 1991.05.27
    datetime.date(1991, 7, 4),   # Independence day 1991.07.04
    datetime.date(1991, 9, 2),   # Labor day 1991.09.02
]

# add_holidays_weekends works in place: it adds the "holiday" and "weekend" columns.
for _frame in (train, test):
    add_holidays_weekends(_frame, holidays)


## 1.2 Adding ad hoc test days (monday)

In [None]:
# Original set of dates used to look for good parameter settings of the
# forecast models and to blend the models where possible. A cut-off of 0.28
# was used with this set. It is kept under its own name: it used to be
# assigned to `my_test_dates` and was then immediately overwritten below.
# NOTE(review): 1991-05-12 is a Sunday while the other entries are Mondays -
# possibly 1991-05-13 was intended; confirm before reusing this list.
my_test_dates_mondays=[ datetime.date(1991,9,23),
        datetime.date(1991,9,16),
        datetime.date(1991,9,9),
        datetime.date(1991,9,2),
        datetime.date(1991,4,1),
        datetime.date(1991,4,8),
        datetime.date(1991,4,15),
        datetime.date(1991,4,22),
        datetime.date(1991,4,29),
        datetime.date(1991,5,6),
        datetime.date(1991,5,12),
        datetime.date(1991,5,20),
         ]

# Active set: dates obtained by picking days from the data that are similar to
# the target day, e.g. days that predict the 1991.09.30 morning congestion well.
# A cut-off of 0.20 was used with this set.
my_test_dates=[
datetime.date(1991, 8, 27),
datetime.date(1991, 8, 21),
datetime.date(1991, 4, 23),
datetime.date(1991, 6, 19),
datetime.date(1991, 6, 18),
datetime.date(1991, 4, 10),
datetime.date(1991, 4, 8),
datetime.date(1991, 9, 23),
datetime.date(1991, 5, 8),
datetime.date(1991, 4, 24),
datetime.date(1991, 5, 29),
datetime.date(1991, 5, 6),
datetime.date(1991, 4, 2),
datetime.date(1991, 4, 9),
datetime.date(1991, 4, 25),
datetime.date(1991, 6, 12),
datetime.date(1991, 4, 26),
datetime.date(1991, 6, 5),
datetime.date(1991, 5, 20),
datetime.date(1991, 4, 11)]

# Solution 1.0, naive predict with averages

In [None]:
# Mean congestion per roadway and time-of-day slot over the whole training set.
avg_gen = ["x", "y", "hour", "minute", "direction"]
avg_congestion = (
    train
    .groupby(avg_gen)
    .agg({"congestion": [np.nanmean]})
    .reset_index()
)
# The list-valued agg spec yields two-level column labels; flatten them.
avg_congestion.columns = avg_gen + ["congestion"]

test_result_avg = test.merge(avg_congestion, how="left", on=avg_gen)
_submission = test_result_avg[["row_id", "congestion"]]
_submission.to_csv("naive_avg_submission_v10.csv", index=False)

<font size="5"><b> Result: Score = 5.077 </b></font>

# Solution 2.0, naive predict with some cyclicality

In [None]:
# Assume September is different (start of school days): average September rows only.
avg_gen = ["x", "y", "hour", "minute", "direction"]
_september = train[train["month"] == 9]
avg_congestion = (
    _september
    .groupby(avg_gen)
    .agg({"congestion": [np.nanmean]})
    .reset_index()
)
# The list-valued agg spec yields two-level column labels; flatten them.
avg_congestion.columns = avg_gen + ["congestion"]

test_result = test.merge(avg_congestion, how="left", on=avg_gen)
_submission = test_result[["row_id", "congestion"]]
_submission.to_csv("naive_avg_submission_v20.csv", index=False)

<font size="5"><b> Result: Score = 5.162 </b></font>

# Solution 3.0, naive predict with holiday and weekend data

In [None]:
avg_gen = ["x", "y", "hour", "minute", "direction"]
# Note: 1991.09.30 is a Monday and not a holiday, so average over
# rows that are neither weekend nor holiday.
condition = (train["weekend"] < 1) & (train["holiday"] < 1)
avg_congestion3 = (
    train[condition]
    .groupby(avg_gen)
    .agg({"congestion": [np.nanmean]})
    .reset_index()
)
# The list-valued agg spec yields two-level column labels; flatten them.
avg_congestion3.columns = avg_gen + ["congestion"]

test_result_avg3 = test.merge(avg_congestion3, how="left", on=avg_gen)
_submission = test_result_avg3[["row_id", "congestion"]]
_submission.to_csv("naive_avg_submission_v30.csv", index=False)

<font size="5"><b> Result: Score = 5.136 </b></font>

# Solution 4.0, naive prediction using the fact that the test day is a Monday

In [None]:
avg_gen = ["x", "y", "hour", "minute", "direction"]
# Note: 1991.09.30 is a Monday and not a holiday, so average over
# non-holiday Mondays only (dayofweek == 0).
condition = (train["datetime"].dt.dayofweek == 0) & (train["holiday"] < 1)
avg_congestion4 = (
    train[condition]
    .groupby(avg_gen)
    .agg({"congestion": [np.nanmean]})
    .reset_index()
)
# The list-valued agg spec yields two-level column labels; flatten them.
avg_congestion4.columns = avg_gen + ["congestion"]

test_result_avg4 = test.merge(avg_congestion4, how="left", on=avg_gen)
_submission = test_result_avg4[["row_id", "congestion"]]
_submission.to_csv("naive_avg_submission_v40.csv", index=False)

<font size="5"><b> Result: Score = 5.062 </b></font>

# Solution 5.0, same Monday-only approach but using the median

In [None]:
avg_gen = ["x", "y", "hour", "minute", "direction"]
# Note: 1991.09.30 is a Monday and not a holiday, so take the median over
# non-holiday Mondays only (dayofweek == 0).
condition = (train["datetime"].dt.dayofweek == 0) & (train["holiday"] < 1)
avg_congestion5 = (
    train[condition]
    .groupby(avg_gen)
    .agg({"congestion": [np.median]})
    .reset_index()
)
# The list-valued agg spec yields two-level column labels; flatten them.
avg_congestion5.columns = avg_gen + ["congestion"]

test_result_avg5 = test.merge(avg_congestion5, how="left", on=avg_gen)
_submission = test_result_avg5[["row_id", "congestion"]]
_submission.to_csv("naive_avg_submission_v50.csv", index=False)

<font size="5"><b> Result: Score = 5.030 </b></font>

In [None]:
# These medians are reused further down to fill gaps in the training data.
avg_gen = ["x", "y", "hour", "minute", "direction"]
# Note: 1991.09.30 is a Monday and not a holiday.
# Mondays only, and only rows with a strictly positive congestion reading.
condition = (train["datetime"].dt.dayofweek == 0) & (train["congestion"] > 0)
avg_congestion5b = (
    train[condition]
    .groupby(avg_gen)
    .agg({"congestion": [np.median]})
    .reset_index()
)
# The list-valued agg spec yields two-level column labels; flatten them.
avg_congestion5b.columns = avg_gen + ["congestion"]
avg_congestion5b["congestion"] = avg_congestion5b["congestion"].round()

test_result_avg5b = test.merge(avg_congestion5b, how="left", on=avg_gen)
_submission = test_result_avg5b[["row_id", "congestion"]]
_submission.to_csv("naive_avg_submission_v50_nomiss.csv", index=False)

<font size="5"><b> Result: Score = 5.008 </b></font>

In [None]:
# Use the Monday medians to replace training readings below 1.
_estimates = avg_congestion5b.rename(columns={"congestion": "congestion_estim"})
train = train.merge(_estimates, how="left", on=avg_gen)
train["congestion"] = np.where(
    train["congestion"] < 1,
    train["congestion_estim"],
    train["congestion"])

# Hold-out frame: the training rows that fall on the chosen evaluation dates,
# carrying both the realized value and the median estimate.
_on_eval_date = train["datetime"].dt.date.isin(my_test_dates)
in_sample_test_dates = train[_on_eval_date].copy().rename(
    columns={"congestion_estim": "congestion_mean", "congestion": "congestion_realized"})
in_sample_test_dates["node"] = (
    in_sample_test_dates["x"].astype(str)
    + "_" + in_sample_test_dates["y"].astype(str)
    + "_" + in_sample_test_dates["direction"])

# Same node key on the real test set, plus the median-based prediction.
test_decision = test.copy()
test_decision["node"] = (
    test_decision["x"].astype(str)
    + "_" + test_decision["y"].astype(str)
    + "_" + test_decision["direction"])
_median_prediction = test_result_avg5b[["row_id", "congestion"]].rename(
    columns={"congestion": "congestion_mean"})
test_decision = test_decision.merge(_median_prediction, how="left", on="row_id")

# The helper column is no longer needed on the training set.
train = train.drop(columns=["congestion_estim"])

In [None]:
# Number of training rows that fall on the selected evaluation dates.
len(in_sample_test_dates)

# Solution 6.0 using regression based techniques

## 6.1 OLS/Panel estimation

In [None]:
# Create the base data: training rows followed by test rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# like append, it keeps the original row labels of both frames.
regression_data = pd.concat([train, test])

# String versions of the time-of-day and position fields, used for dummies and keys.
regression_data["hour"] = regression_data["hour"].astype(str)
regression_data["minute"] = regression_data["minute"].astype(str)
regression_data["hour-minute"] = regression_data["hour"] + "-" + regression_data["minute"]
regression_data["x"] = regression_data["x"].astype(str)
regression_data["y"] = regression_data["y"].astype(str)
regression_data["node_id"] = regression_data["x"] + "-" + regression_data["y"] + "-" + regression_data["direction"]

# Shifted timestamps used as join keys when the lagged values are attached.
# They must remain the last three columns: the next cell strips them with columns[:-3].
for _minutes in (20, 40, 60):
    regression_data["datetime_m" + str(_minutes)] = (
        regression_data["datetime"] + datetime.timedelta(minutes=_minutes))

In [None]:
# Start from everything except the three trailing shifted-timestamp columns.
regression_data_enriched = regression_data[regression_data.columns[:-3]]

# Attach the congestion observed 20, 40 and 60 minutes earlier on the same node:
# a row stamped t+lag in the shifted column supplies the value for the row at t+lag.
for _suffix in ("m20", "m40", "m60"):
    _lagged = regression_data[["node_id", "datetime_" + _suffix, "congestion"]].copy().rename(
        columns={"datetime_" + _suffix: "datetime", "congestion": "congestion_" + _suffix})
    regression_data_enriched = regression_data_enriched.merge(
        _lagged,
        how="left",
        on=["node_id", "datetime"])

# Weekday dummies; dayofweek 0 gets no column of its own.
_weekday = regression_data_enriched["datetime"].dt.dayofweek
for _code, _day_name in enumerate(['Tuesday', 'Wednesday', 'Thursday', 'Friday'], start=1):
    regression_data_enriched[_day_name] = np.where(_weekday == _code, 1, 0)
regression_data_enriched['const'] = 1

# Weekend rows are left out of the estimation sample.
regression_data_enriched = regression_data_enriched[regression_data_enriched["weekend"] < 1].copy()

variables = [
    "const",
    "congestion_m20", "congestion_m40", "congestion_m60",
    "holiday",
    'Tuesday', "Wednesday", "Thursday", "Friday",
]

target = "congestion"

# One dummy per node and per time-of-day slot (first level dropped in each).
nodes = pd.get_dummies(data=regression_data_enriched["node_id"], drop_first=True)
hourminute = pd.get_dummies(data=regression_data_enriched["hour-minute"], drop_first=True)

In [None]:
# Column layout matters: three id columns first, the target last. Later code
# selects the regressors by position as columns[3:-1].
_id_columns = regression_data_enriched[["row_id", "node_id", "datetime"]]
X_variables = pd.concat(
    [
        _id_columns,
        regression_data_enriched[variables],
        nodes,
        hourminute,
        regression_data_enriched[target],
    ],
    axis=1)

# Keep the last 6000 rows (taken before incomplete rows are removed) for forecasting.
X_variables_nan = X_variables.tail(6000).copy()

# The estimation sample contains complete rows only.
X_variables = X_variables.dropna(axis=0)

In [None]:
# We observe how the 23rd of September looked like...
_sep_23 = datetime.date(1991, 9, 23)
X_variables_nan_23 = X_variables[X_variables["datetime"].dt.date == _sep_23].copy()

# Keep the realized values aside before blanking them out.
congestion_realized_23 = X_variables_nan_23[["congestion"]].copy().rename(
    columns={"congestion": "congestion_realized"})

# Artificial test: hide the congestion from 12:00 onwards on that day.
artificial_test_cond = (
    (X_variables_nan_23["datetime"].dt.date == _sep_23)
    & (X_variables_nan_23["datetime"].dt.hour > 11))
X_variables_nan_23.loc[artificial_test_cond, "congestion"] = np.nan

In [None]:
# Pooled OLS: last column is the response, columns 3..-2 are the regressors
# (the first three columns are row_id, node_id and datetime).
_response = X_variables[X_variables.columns[-1]]
_regressors = X_variables[X_variables.columns[3:-1]]
est = sm.OLS(_response, _regressors).fit()
print(est.summary())

In [None]:
# Rows of the estimation sample that fall on the evaluation dates, without their lags:
# the lags are rebuilt below from the partially blanked congestion.
_on_eval_date = X_variables["datetime"].dt.date.isin(my_test_dates)
X_variables_my_dates = X_variables[_on_eval_date].copy().drop(
    columns=["congestion_m20", "congestion_m40", "congestion_m60"])

# Hide the congestion from 12:00 onwards so it has to be forecast.
X_variables_my_dates["congestion"] = X_variables_my_dates["congestion"].mask(
    X_variables_my_dates["datetime"].dt.hour > 11)

# Shifted timestamps used as join keys for the lagged values.
_lag_minutes = {"m20": 20, "m40": 40, "m60": 60}
for _suffix, _minutes in _lag_minutes.items():
    X_variables_my_dates["datetime_" + _suffix] = (
        X_variables_my_dates["datetime"] + datetime.timedelta(minutes=_minutes))

# Attach the congestion observed 20, 40 and 60 minutes earlier on the same node.
for _suffix in _lag_minutes:
    _lagged = X_variables_my_dates[["node_id", "datetime_" + _suffix, "congestion"]].copy().rename(
        columns={"datetime_" + _suffix: "datetime", "congestion": "congestion_" + _suffix})
    X_variables_my_dates = X_variables_my_dates.merge(
        _lagged,
        how="left",
        on=["node_id", "datetime"])

# Restore the exact column layout of the estimation sample.
X_variables_my_dates = X_variables_my_dates[X_variables.columns]

In [None]:
# Forecast the blanked afternoons of the evaluation dates with the panel model.
X_variables_my_dates = generate_prediction_for_ols(ols_estimator=est, df=X_variables_my_dates)
X_variables_my_dates = X_variables_my_dates.rename(columns={"congestion": "congestion_panel"})

# Drop a forecast column left over from a previous run of this cell, if any.
in_sample_test_dates = in_sample_test_dates.drop(columns=["congestion_panel"], errors="ignore")
in_sample_test_dates = in_sample_test_dates.merge(
    X_variables_my_dates[["row_id", "congestion_panel"]], how="left", on="row_id")

# Absolute errors of both predictors against the realized congestion.
_realized = in_sample_test_dates["congestion_realized"]
in_sample_test_dates["error_mean"] = (in_sample_test_dates["congestion_mean"] - _realized).abs()
in_sample_test_dates["error_panel"] = (in_sample_test_dates["congestion_panel"] - _realized).abs()

In [None]:
# Row count of the hold-out frame after the panel forecast was merged in.
len(in_sample_test_dates)

In [None]:
# Raw panel predict
X_variables_nan = generate_prediction_for_ols(ols_estimator=est, df=X_variables_nan)

# Keep only the test rows, in test order.
_panel_forecast = X_variables_nan[["row_id", "congestion"]]
regression_output = test[["row_id"]].merge(_panel_forecast, how="left", on="row_id")
regression_output[["row_id", "congestion"]].to_csv("regression_submission_v60.csv", index=False)

<font size="5"><b> Result: Score = 5.703 </b></font>

In [None]:
# Combine forecasts: put the panel prediction next to the median-based one.
_panel_prediction = regression_output[["row_id", "congestion"]].rename(
    columns={"congestion": "congestion_panel"})
test_decision = test_decision.merge(_panel_prediction, on="row_id", how="left")

In [None]:
# There are differences in forecast accuracy between nodes: compare the mean
# absolute afternoon error of both predictors per node.
_afternoon = in_sample_test_dates[in_sample_test_dates["datetime"].dt.hour > 11]
predict_blend = (
    _afternoon
    .groupby("node")
    .agg({"error_mean": [np.nanmean], "error_panel": [np.nanmean]})
    .reset_index()
)
predict_blend.columns = ["node", "error_mean", "error_panel"]

# I trust in the panel estim a bit more: the mean only wins by a margin above 0.5.
_mean_clearly_better = predict_blend["error_mean"] < (predict_blend["error_panel"] - 0.5)
predict_blend["recommend"] = np.where(_mean_clearly_better, "mean", "panel")

In [None]:
test_decision["congestion_mean"] = test_decision["congestion_mean"].astype(np.float64)

# Look up each row's recommendation once instead of rescanning the whole
# frame for every node. Nodes without a recommendation keep the panel forecast.
_recommend_by_node = predict_blend.set_index("node")["recommend"]
_use_mean = test_decision["node"].map(_recommend_by_node).eq("mean")
test_decision["panel_and_mean"] = np.where(
    _use_mean,
    test_decision["congestion_mean"],
    test_decision["congestion_panel"])

In [None]:
# Write the blended forecast under the column name "congestion".
test_decision[["row_id","panel_and_mean"]].rename(columns={"panel_and_mean":"congestion"}).to_csv("regression_submission_panel_mean.csv",index=False)

<font size="5"><b> Result: Score = 5.137 </b></font>

In [None]:
# Forecast the blanked afternoon of 23 September and put the realized values back next to it.
X_variables_nan_23=generate_prediction_for_ols(ols_estimator=est, df=X_variables_nan_23)
# Assigned by position: this relies on the forecast helper keeping the row order.
X_variables_nan_23["congestion_realized"]=list(congestion_realized_23["congestion_realized"])

regression_output23=train[["row_id","datetime","x","y","direction"]].merge(
    X_variables_nan_23[["row_id","congestion","congestion_realized"]],how="right",on="row_id")
regression_output23=regression_output23[(regression_output23["datetime"].dt.date==datetime.date(1991,9,23))&(regression_output23["datetime"].dt.hour>11)]

# Collect one record per roadway and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row is quadratic.
_accuracy_records=[]
plt.figure(figsize=(20,50))
index=0
for i in range(0,3): # x
    for j in range(0,4): # y
        for direction_value in ['EB', 'NB', 'SB', 'WB', 'NE', 'SW', 'NW', 'SE']:
            select_forc=regression_output23[(regression_output23["x"]==i)&(regression_output23["y"]==j)&(regression_output23["direction"]==direction_value)]
            if len(select_forc)>0:

                _accuracy_records.append({
                    "target":str(i) + "-"+str(j)+"-"+direction_value,
                    "MAE":np.nanmean(abs(select_forc["congestion"]-select_forc["congestion_realized"]))
                })
                plt.subplot(20,4,index+1)
                # No palette argument: seaborn ignores it when no hue variable is given.
                sns.lineplot(data=select_forc, x="datetime", y="congestion");
                sns.lineplot(data=select_forc, x="datetime", y="congestion_realized");
                plt.legend(["regression forecast", "realized traffic"], fontsize=6)
                plt.xticks(rotation=90);
                plt.title("x=" + str(i) + "  y="+str(j)+"  dir="+direction_value, fontsize=10);
                index+=1

# Explicit columns keep the frame well-formed even when no roadway matched.
accuracy_panel=pd.DataFrame(_accuracy_records, columns=["target","MAE"])


In [None]:
# Plot the distribution of the test predictions
# compared to the other Monday afternoons
_bins = np.linspace(-0.5, 100.5, 102)
_monday_afternoon = ((train.datetime.dt.weekday == 0) &
                     (train.datetime.dt.hour >= 12)).values

fig, ax = plt.subplots(figsize=(20, 3))
ax.hist(train.congestion[_monday_afternoon],
        bins=_bins,
        density=True, label='Train',
        color='#ffd700')
# NOTE(review): the series labelled 'Test predictions' is avg_congestion5b
# (the per-slot Monday medians), not a merged test frame - confirm this is intended.
ax.hist(avg_congestion5b['congestion'], _bins,
        density=True, rwidth=0.5, label='Test predictions',
        color='r')
ax.set_xlabel('Congestion')
ax.set_ylabel('Frequency')
ax.set_title('Congestion on Monday afternoons')
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=1))
ax.legend()
plt.show()

In [None]:
# Average MAE across roadways. numeric_only=True skips the string "target"
# column, which would otherwise make mean() raise on pandas 2.x.
accuracy_panel.mean(numeric_only=True)

# Multiple VAR models

In [None]:
# Release the large design matrix if it exists; do nothing when it is already gone.
globals().pop("X_variables", None)

In [None]:
# Working days only: no weekends, no holidays.
train=train[(train.datetime.dt.weekday < 5) & (train.holiday < 1)].copy()
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# like append, it keeps the original row labels of both frames.
regression_data=pd.concat([train, test])

# 5% / 95% congestion quantiles per time-of-day slot and roadway, used as clipping bounds.
sep_ecm = train[(train.datetime.dt.weekday < 5)]
_band_keys=['hour', 'minute', 'x', 'y', 'direction']
_congestion_by_slot=sep_ecm.groupby(_band_keys).congestion
lower = _congestion_by_slot.quantile(0.05).reset_index()
lower.columns=_band_keys+["lower"]
upper = _congestion_by_slot.quantile(0.95).reset_index()
upper.columns=_band_keys+["upper"]

# Remove bounds left over from an earlier run before merging fresh ones.
if "lower" in regression_data.columns:
    regression_data.drop(columns=["upper","lower"],inplace=True)
regression_data=regression_data.merge(lower,how="left",on=_band_keys)
regression_data=regression_data.merge(upper,how="left",on=_band_keys)
# Row-wise clipping to the slot-specific band.
regression_data['congestion'] = regression_data["congestion"].clip(regression_data["lower"], regression_data["upper"])
if "lower" in regression_data.columns:
    regression_data.drop(columns=["upper","lower"],inplace=True)

# String versions of the time-of-day and position fields, used for dummies and keys.
regression_data["hour"]=regression_data["hour"].astype(str)
regression_data["minute"]=regression_data["minute"].astype(str)
regression_data["hour-minute"]=regression_data["hour"]+"-"+regression_data["minute"]
regression_data["x"]=regression_data["x"].astype(str)
regression_data["y"]=regression_data["y"].astype(str)
regression_data["node_id"]=regression_data["x"]+"-"+regression_data["y"]+"-"+regression_data["direction"]
regression_data.sort_values(by=["node_id","datetime","hour","minute"], inplace=True)

# Edge ids in sorted order; this order also fixes the column order of the wide table.
edges=list(regression_data["node_id"].unique())
regression_data=regression_data[regression_data["weekend"]<1].copy()

In [None]:
# Wide table: one row per timestamp, one congestion column per edge.
# The first edge also brings the calendar columns along.
_first_edge = edges[0]
datatable = regression_data[regression_data["node_id"] == _first_edge][
    ["time", "datetime", "date", "hour", "minute", "holiday", "congestion", "hour-minute"]
].rename(columns={"congestion": _first_edge})

# Remaining edges are attached one by one on the raw "time" string.
for _edge in edges[1:]:
    _edge_values = regression_data[regression_data["node_id"] == _edge][
        ["time", "congestion"]
    ].rename(columns={"congestion": _edge})
    datatable = datatable.merge(_edge_values, how="left", on="time")

# Shifted timestamps used as join keys when the lags are attached.
for _minutes in (20, 40, 60):
    datatable["datetime_m" + str(_minutes)] = datatable["datetime"] + datetime.timedelta(minutes=_minutes)

# Weekday dummies; dayofweek 0 gets no column of its own.
_weekday = datatable["datetime"].dt.dayofweek
for _code, _day_name in enumerate(['Tuesday', 'Wednesday', 'Thursday', 'Friday'], start=1):
    datatable[_day_name] = np.where(_weekday == _code, 1, 0)

In [None]:
# Adding lags: one rename map per lag. The first key is the shifted timestamp
# column (renamed to "datetime"); the remaining keys are the edges, in order.
map_20 = {"datetime_m20": "datetime"}
map_40 = {"datetime_m40": "datetime"}
map_60 = {"datetime_m60": "datetime"}
map_20.update((edge, edge + "_m20") for edge in edges)
map_40.update((edge, edge + "_m40") for edge in edges)
map_60.update((edge, edge + "_m60") for edge in edges)

In [None]:
# Attach the 20/40/60-minute lags of every edge: take the edge columns together
# with the shifted timestamp, rename them, and join them back on "datetime".
for _shift_col, _renames in (("datetime_m20", map_20),
                             ("datetime_m40", map_40),
                             ("datetime_m60", map_60)):
    _edge_cols = list(_renames.keys())[1:]  # skip the timestamp key
    _lagged = datatable[[_shift_col] + _edge_cols].copy().rename(columns=_renames)
    datatable = datatable.merge(_lagged, how="left", on=["datetime"])

delta_edge_cols = list()
# First differences between consecutive 20-minute observations of each edge.
for edge in edges:
    _now = datatable[edge]
    _lag1 = datatable[edge + "_m20"]
    _lag2 = datatable[edge + "_m40"]
    _lag3 = datatable[edge + "_m60"]
    datatable[edge + "_delta0"] = _now - _lag1
    datatable[edge + "_delta1"] = _lag1 - _lag2
    datatable[edge + "_delta2"] = _lag2 - _lag3

In [None]:
# Run a full garbage-collection pass.
# NOTE(review): presumably meant to release the intermediate frames created by
# the merges above before the next memory-heavy step — not verified.
import gc
gc.collect()

In [None]:
# One dummy column per "hour-minute" slot (first level dropped as baseline).
hourminute=pd.get_dummies(datatable["hour-minute"], drop_first=True)
dummies=hourminute.columns.tolist()

# Append the dummies and a constant term for the regressions.
datatable=pd.concat([datatable, hourminute], axis=1).assign(const=1)

# Keep a full copy (including rows with missing values) for forecasting;
# the estimation table itself only keeps complete rows.
datatable_predict=datatable.copy()
datatable=datatable.dropna()
del hourminute

In [None]:
# Another garbage-collection pass after the dummy frame was deleted above.
gc.collect()

In [None]:
# Fit one RegressionResult per edge and keep them keyed by edge id.
estimation_data={}

for target in edges:
    print("Target : "+ target)

    # Candidate regressors for this target as selected by grab_final_edges.
    final_edges=grab_final_edges(target,edges,True)
    cont_vars=[target+"_m60"]+final_edges

    # Split the candidates by suffix: names ending in "0" vs. the two delta kinds.
    lags=[name for name in cont_vars if name.endswith("0")]
    delta_lists1=[name for name in final_edges if name.endswith("delta1")]
    delta_lists2=[name for name in final_edges if name.endswith("delta2")]

    day_related=["holiday","Tuesday", "Wednesday","Thursday","Friday","const"]
    var_lists=lags+day_related+dummies
    delta_lists=(day_related+delta_lists1+delta_lists2+dummies
                 +[target+"_delta1",target+"_delta2"])

    estimation_data[target]=RegressionResult(target=target)
    estimation_data[target].fit(
        datatable,
        var_lists,
        delta_lists=delta_lists,
        cont_vars=cont_vars,
        step_1=0.1,
        step_2=0.05,
    )


In [None]:
# Back-test setup for 1991-09-23: take that day's rows from the estimation
# table, keep an untouched copy as the "realized" reference, and blank out the
# congestion of every edge after 08:59 so it has to be forecast.
datatable_predict_23=datatable[
    datatable["datetime"].dt.date==datetime.date(1991,9,23)].copy()
datatable_predict_23_filled=datatable_predict_23.copy()

# The list of time stamps to forecast (dateforc) is built in the next cell,
# after the masking below. Computing it here would rely on a loop variable
# leaked from an earlier cell and would run before any value is blanked.
to_blank=datatable_predict_23["datetime"].dt.hour>8
for edge in edges:
    datatable_predict_23[edge]=np.where(to_blank, np.nan, datatable_predict_23[edge])

In [None]:
# Time stamps to forecast = rows whose congestion was blanked in the previous
# cell. Every edge is blanked for the same rows, so any edge identifies them;
# an explicit edge is used instead of whatever `target` an earlier loop left
# behind, so the cell no longer depends on execution order.
dateforc=list(datatable_predict_23[datatable_predict_23[edges[-1]].isnull()]["datetime"])
method_lags="step_1"
ecm_method="ecm_step_1r"
# NOTE(review): `df` is not used below; kept in case a helper reads it as a global.
df=datatable_predict_23

datatable_predict_23=create_forecast_ecm(
    estimation_data=estimation_data,
    df=datatable_predict_23,
    dateforc=dateforc,
    method_lags=method_lags,
    ecm_method=ecm_method,
)
   

In [None]:
# Forecast vs. realized congestion on the back-test day for three sample edges.
plt.figure(figsize=(20,10))
for index, target in enumerate(["2-1-NW","2-1-SE","2-2-NW"]):
    plt.subplot(3,1,index+1)
    # After the merge, <target>_x comes from the forecast frame and
    # <target>_y from the untouched copy.
    fig_data=datatable_predict_23[["datetime",target]].merge(
        datatable_predict_23_filled[["datetime",target]],how="left",on="datetime")
    fig_data=fig_data[fig_data["datetime"].dt.hour>5]
    # Label each line explicitly: a positional label list passed to
    # plt.legend can be mis-assigned when seaborn adds extra artists
    # (e.g. error bands). `palette` is dropped because it has no effect
    # without `hue`.
    sns.lineplot(data=fig_data, x="datetime", y=target+"_x", label="regression forecast")
    sns.lineplot(data=fig_data, x="datetime", y=target+"_y", label="realized traffic")
    plt.legend(fontsize=6)
    plt.xticks(rotation=90)
    plt.title(target, fontsize=10)

In [None]:
# Forecast vs. realized congestion for every edge (from 12:00 on), plus the
# mean absolute error of the rounded forecast per edge.
plt.figure(figsize=(20,40))
accuracy_records=[]
for index, target in enumerate(edges):
    plt.subplot(20,4,index+1)
    # <target>_x = forecast frame, <target>_y = untouched copy.
    fig_data=datatable_predict_23[["datetime",target]].merge(
        datatable_predict_23_filled[["datetime",target]],how="left",on="datetime")
    fig_data=fig_data[fig_data["datetime"].dt.hour>11]
    accuracy_records.append({
        "target":target,
        "MAE":np.nanmean(abs(round(fig_data[target+"_x"])-fig_data[target+"_y"])),
    })
    # Explicit labels keep the legend correct even if seaborn adds extra
    # artists; `palette` is dropped because it has no effect without `hue`.
    sns.lineplot(data=fig_data, x="datetime", y=target+"_x", label="regression forecast")
    sns.lineplot(data=fig_data, x="datetime", y=target+"_y", label="realized traffic")
    plt.legend(fontsize=6)
    plt.xticks(rotation=90)
    plt.title(target, fontsize=10)

# Build the table once. DataFrame.append was removed in pandas 2.0 and was
# quadratic when called inside a loop. Rows now carry a 0..n-1 index.
accuracy=pd.DataFrame(accuracy_records, columns=["target","MAE"])

In [None]:
# Final forecast: every time stamp where edge "0-0-NB" has no congestion value
# is handed to the ECM forecaster.
missing_rows=datatable_predict["0-0-NB"].isnull()
dateforc=list(datatable_predict[missing_rows]["datetime"])
method_lags="step_1"
ecm_method="ecm_step_1r"

datatable_predict=create_forecast_ecm(
    estimation_data=estimation_data,
    df=datatable_predict,
    dateforc=dateforc,
    method_lags=method_lags,
    ecm_method=ecm_method,
)
   

In [None]:
# Reshape the wide forecast table into long form (datetime, node,
# congestion_ecm) for 1991-09-30 from 12:00 on.
in_forecast_window=(
    (datatable_predict["datetime"].dt.date==datetime.date(1991,9,30))
    & (datatable_predict["datetime"].dt.hour>11)
)
forecast_rows=datatable_predict[in_forecast_window]

edge_frames=[]
for edge in edges:
    tempdf=pd.DataFrame({
        "congestion_ecm":forecast_rows[edge],
        "datetime":forecast_rows["datetime"],
    })
    # Edge ids use "-" in the wide table, while the node column uses "_".
    tempdf["node"]=edge.replace("-", "_")
    edge_frames.append(tempdf)

# One concat instead of DataFrame.append in a loop: append was removed in
# pandas 2.0 and copied the accumulated frame on every call.
all_ecm_final_predict=pd.concat(edge_frames) if edge_frames else pd.DataFrame()

In [None]:
# Attach the ECM forecast to the decision table. A column left over from an
# earlier run is dropped first so the cell can be re-executed.
test_decision=(
    test_decision
    .drop(columns=["congestion_ecm"], errors="ignore")
    .merge(all_ecm_final_predict[["datetime","node","congestion_ecm"]],
           how="left",
           on=["node","datetime"])
)

In [None]:
# Submission file with the raw ECM forecast.
pure_ecm_submission=test_decision[["row_id","congestion_ecm"]].rename(
    columns={"congestion_ecm":"congestion"})
pure_ecm_submission.to_csv("pure_ecm.csv",index=False)

<font size="5"><b> Result: Score = 5.386 </b></font>

In [None]:
# Reference sample: weekday afternoons (hour >= 12) from day-of-year 246 on.
train_dt=train.datetime.dt
sep=train[(train_dt.hour >= 12) & (train_dt.weekday < 5) & (train_dt.dayofyear >= 246)]

# 25% / 75% congestion quantiles per (hour, minute, x, y, direction) slot.
slot_congestion=sep.groupby(['hour', 'minute', 'x', 'y', 'direction']).congestion
lower=slot_congestion.quantile(0.25).values
upper=slot_congestion.quantile(0.75).values

# NOTE(review): lower/upper are bare arrays, so the clip is positional. This
# assumes test_decision has one row per slot in the same order as the sorted
# groupby keys — confirm.
test_decision['congestion_ecm_clip']=test_decision.congestion_ecm.clip(lower, upper)

ecm_clip_submission=test_decision[["row_id","congestion_ecm_clip"]].rename(
    columns={"congestion_ecm_clip":"congestion"})
ecm_clip_submission.to_csv("pure_ecm_cliped.csv",index=False)

<font size="5"><b> Result: Score = 4.985 </b></font>

In [None]:
# Apply the same quantile clipping (current `lower` / `upper`) to the panel forecast.
test_decision['congestion_panel_clip']=test_decision.congestion_panel.clip(lower, upper)

panel_clip_submission=test_decision[["row_id","congestion_panel_clip"]].rename(
    columns={"congestion_panel_clip":"congestion"})
panel_clip_submission.to_csv("panel_cliped.csv",index=False)

<font size="5"><b> Result: Score = 5.179 </b></font>

In [None]:
# Preview the decision table with the forecast columns added so far.
test_decision.head()

# Combining the final forecasts

In [None]:
# In-sample evaluation: for every date in my_test_dates, blank out the
# congestion from 12:00 on, forecast it with the ECM models, and collect the
# result in long form (datetime, node, congestion_ecm).
method_lags="step_1"
ecm_method="ecm_step_1r"

per_date_frames=[]
for datet in my_test_dates:
    datatable_predict_temp=datatable[
        datatable["datetime"].dt.date==datet].copy()
    datatable_predict_temp_filled=datatable_predict_temp.copy()

    to_blank=datatable_predict_temp["datetime"].dt.hour>11
    for edge in edges:
        datatable_predict_temp[edge]=np.where(to_blank, np.nan, datatable_predict_temp[edge])

    # Every edge is blanked for the same rows, so any edge identifies the
    # time stamps to forecast. An explicit edge replaces the `target` variable
    # leaked from an earlier loop.
    dateforc=list(datatable_predict_temp[datatable_predict_temp[edges[-1]].isnull()]["datetime"])

    datatable_predict_temp=create_forecast_ecm(
        estimation_data=estimation_data,
        df=datatable_predict_temp,
        dateforc=dateforc,
        method_lags=method_lags,
        ecm_method=ecm_method,
    )

    # Both columns now come from the same frame. The datetime column used to be
    # read from datatable_predict, which only matched through index alignment
    # and could add rows with no forecast value.
    day_rows=datatable_predict_temp[datatable_predict_temp["datetime"].dt.date==datet]
    edge_frames=[]
    for edge in edges:
        tempdf=pd.DataFrame({
            "congestion_ecm":day_rows[edge],
            "datetime":day_rows["datetime"],
        })
        tempdf["node"]=edge.replace("-", "_")
        edge_frames.append(tempdf)
    all_ecm_final_predict_temp=pd.concat(edge_frames) if edge_frames else pd.DataFrame()
    per_date_frames.append(all_ecm_final_predict_temp)

# One concat at the end: DataFrame.append was removed in pandas 2.0.
all_ecm_predicts=pd.concat(per_date_frames) if per_date_frames else pd.DataFrame()

In [None]:
# Join the in-sample ECM forecasts onto the evaluation table, keep complete
# rows from 12:00 on, and compute the absolute error of each model.
in_sample_tests_final=in_sample_test_dates.merge(
    all_ecm_predicts, how="left", on=["node","datetime"]).dropna()
in_sample_tests_final=in_sample_tests_final[
    in_sample_tests_final["datetime"].dt.hour>11].copy()

for model in ("ecm","mean","panel"):
    in_sample_tests_final["error_"+model]=(
        in_sample_tests_final["congestion_"+model]
        -in_sample_tests_final["congestion_realized"]
    ).abs()

In [None]:
# Mean absolute error per node, expressed relative to the mean-based forecast:
# negative values mean the model beats the mean forecast on that node.
error_columns=["error_mean","error_panel","error_ecm"]
errors_start=(
    in_sample_tests_final[["node"]+error_columns]
    .groupby("node").mean().reset_index()
)
errors_start.columns=["node"]+error_columns
for model_error in ("error_panel","error_ecm"):
    errors_start[model_error]=errors_start[model_error]-errors_start["error_mean"]
errors_start["error_mean"]=0

In [None]:
# Per-node error of each model relative to the mean forecast (unclipped).
plt.figure(figsize=(20,6))
for error_column, series_label in (("error_mean","Mean"),
                                   ("error_panel","Panel"),
                                   ("error_ecm","ECM")):
    plt.scatter(errors_start["node"], errors_start[error_column], alpha=0.5, label=series_label)
plt.legend()
plt.xticks(rotation=90)
plt.title("Error comparison without cliping");

In [None]:
def neg_improve(x):
    """
    Mean of the negative entries of ``x`` divided by the total number of entries.

    ``x`` must support boolean-mask indexing (numpy array or pandas Series).
    The result is NaN when ``x`` has no negative entry.
    """
    negatives = x[x < 0]
    return np.mean(negatives) / len(x)
# Reference sample for the clipping bounds: non-holiday weekday afternoons
# (hour >= 12) from day-of-year 246 on.
train_dt=train.datetime.dt
is_reference_row=(
    (train_dt.hour >= 12)
    & (train_dt.weekday < 5)
    & (train_dt.dayofyear >= 246)
    & (train["holiday"] < 1)
)
sep=train[is_reference_row]

In [None]:
# Disabled experiment, kept for reference: a loop over candidate cut-offs that
# re-runs the clipping evaluation and prints neg_improve for the ECM errors.
# NOTE(review): as written, `cot` is only printed; the quantiles inside the
# loop are hard-coded (0.07 and 1-0.18), so every iteration would give the
# same result.
# for cot in np.arange(0.01,0.4,0.01):
#     if "lower" in in_sample_tests_final:
#         in_sample_tests_final.drop(columns=["lower","upper"],inplace=True)
    
    
#     lower = sep.groupby(['hour', 'minute', 'x', 'y', 'direction']).congestion.quantile(0.07).reset_index()
#     lower.columns=['hour', 'minute', 'x', 'y', 'direction',"lower"]
#     upper = sep.groupby(['hour', 'minute', 'x', 'y', 'direction']).congestion.quantile(1-0.18).reset_index()
#     upper.columns=['hour', 'minute', 'x', 'y', 'direction',"upper"]
#     in_sample_tests_final=in_sample_tests_final.merge(lower,how="left",on=['hour', 'minute', 'x', 'y', 'direction'])
#     in_sample_tests_final=in_sample_tests_final.merge(upper,how="left",on=['hour', 'minute', 'x', 'y', 'direction'])
#     in_sample_tests_final['congestion_mean_clip'] = in_sample_tests_final["congestion_mean"].clip(in_sample_tests_final["lower"], in_sample_tests_final["upper"])
#     in_sample_tests_final['congestion_panel_clip'] = in_sample_tests_final["congestion_panel"].clip(in_sample_tests_final["lower"], in_sample_tests_final["upper"])
#     in_sample_tests_final['congestion_ecm_clip'] = in_sample_tests_final["congestion_ecm"].clip(in_sample_tests_final["lower"], in_sample_tests_final["upper"])

#     in_sample_tests_final["error_ecm_clip"]=abs(in_sample_tests_final["congestion_ecm_clip"]-in_sample_tests_final["congestion_realized"])
#     in_sample_tests_final["error_mean_clip"]=abs(in_sample_tests_final["congestion_mean_clip"]-in_sample_tests_final["congestion_realized"])
#     in_sample_tests_final["error_panel_clip"]=abs(in_sample_tests_final["congestion_panel_clip"]-in_sample_tests_final["congestion_realized"])

#     errors_start_clip=in_sample_tests_final[["node","error_mean","error_panel_clip","error_ecm_clip"]].groupby("node").mean().reset_index()
#     errors_start_clip.columns=["node","error_mean","error_panel","error_ecm"]
#     errors_start_clip["error_panel"]=errors_start_clip["error_panel"]-errors_start_clip["error_mean"]
#     errors_start_clip["error_ecm"]=errors_start_clip["error_ecm"]-errors_start_clip["error_mean"]
#     errors_start_clip["error_mean"]=0
#     print("cut "+str(cot))
#     print(neg_improve(errors_start_clip["error_ecm"]))

In [None]:
# Re-run the in-sample comparison with every forecast clipped into the
# per-slot [7%, 82%] congestion quantile band of the reference sample.
slot_keys=['hour', 'minute', 'x', 'y', 'direction']

# Bounds left over from an earlier run are removed so the merges below do not
# produce suffixed duplicates.
if "lower" in in_sample_tests_final:
    in_sample_tests_final=in_sample_tests_final.drop(columns=["lower","upper"])

slot_congestion=sep.groupby(slot_keys).congestion
lower=slot_congestion.quantile(0.07).reset_index()
lower.columns=slot_keys+["lower"]
upper=slot_congestion.quantile(0.82).reset_index()
upper.columns=slot_keys+["upper"]

for bounds in (lower, upper):
    in_sample_tests_final=in_sample_tests_final.merge(bounds, how="left", on=slot_keys)

for model in ("mean","panel","ecm"):
    in_sample_tests_final["congestion_"+model+"_clip"]=in_sample_tests_final["congestion_"+model].clip(
        in_sample_tests_final["lower"], in_sample_tests_final["upper"])

for model in ("ecm","mean","panel"):
    in_sample_tests_final["error_"+model+"_clip"]=(
        in_sample_tests_final["congestion_"+model+"_clip"]
        -in_sample_tests_final["congestion_realized"]
    ).abs()

# Per-node mean errors relative to the UNCLIPPED mean forecast (error_mean).
errors_start_clip=(
    in_sample_tests_final[["node","error_mean","error_panel_clip","error_ecm_clip"]]
    .groupby("node").mean().reset_index()
)
errors_start_clip.columns=["node","error_mean","error_panel","error_ecm"]
for model_error in ("error_panel","error_ecm"):
    errors_start_clip[model_error]=errors_start_clip[model_error]-errors_start_clip["error_mean"]
errors_start_clip["error_mean"]=0
print(neg_improve(errors_start_clip["error_ecm"]))

In [None]:
# Per-node error of each clipped model relative to the mean forecast.
plt.figure(figsize=(20,6))
for error_column, series_label in (("error_mean","Mean"),
                                   ("error_panel","Panel"),
                                   ("error_ecm","ECM")):
    plt.scatter(errors_start_clip["node"], errors_start_clip[error_column],
                alpha=0.5, label=series_label)
plt.xticks(rotation=90)
plt.legend()
plt.title("Error comparison", fontsize=15);

In [None]:
# Per-node model choice: "ecm" where the clipped ECM error is below the mean
# forecast's error (negative relative error), "mea" otherwise.
ecm_is_better=errors_start_clip["error_ecm"]<0.0
errors_start_clip["recommend"]=np.where(ecm_is_better, "ecm", "mea")

In [None]:
# Per-slot [7%, 82%] congestion quantile bounds from the reference sample,
# merged onto the decision table and used to clip the ECM and mean forecasts.
slot_keys=['hour', 'minute', 'x', 'y', 'direction']
slot_congestion=sep.groupby(slot_keys).congestion
lower=slot_congestion.quantile(0.07).reset_index()
lower.columns=slot_keys+["lower"]
upper=slot_congestion.quantile(0.82).reset_index()
upper.columns=slot_keys+["upper"]

# Remove bounds from an earlier run so the cell can be re-executed.
if "lower" in test_decision.columns:
    test_decision=test_decision.drop(columns=["upper","lower"])

for bounds in (lower, upper):
    test_decision=test_decision.merge(bounds, how="left", on=slot_keys)

for source_column, clipped_column in (("congestion_ecm","congestion_ecm_clip_blended"),
                                      ("congestion_mean","congestion_mean_clip_blended")):
    test_decision[clipped_column]=test_decision[source_column].clip(
        test_decision["lower"], test_decision["upper"])

In [None]:
# Blend per node: use the clipped ECM forecast unless the in-sample comparison
# recommended something other than "ecm" for that node, in which case the
# clipped mean forecast is used. Nodes without a recommendation keep ECM.
test_decision["congestion_mean_clip_blended"]=test_decision["congestion_mean_clip_blended"].astype(np.float64)

# node -> recommendation (first row wins if a node were listed more than once)
recommendation_by_node=(
    errors_start_clip.drop_duplicates("node").set_index("node")["recommend"]
)
node_recommendation=test_decision["node"].map(recommendation_by_node)
use_mean=node_recommendation.notna() & (node_recommendation!="ecm")

test_decision["ecm_and_mean"]=np.where(
    use_mean,
    test_decision["congestion_mean_clip_blended"],
    test_decision["congestion_ecm_clip_blended"],
)

In [None]:
# Submission file with the per-node ECM / mean blend.
ecm_and_mean_submission=test_decision[["row_id","ecm_and_mean"]].rename(
    columns={"ecm_and_mean":"congestion"})
ecm_and_mean_submission.to_csv("ecm_and_mean.csv",index=False)

<font size="5"><b> Result: Score = 4.873 </b></font>

In [None]:
# Same blend, rounded to whole numbers.
test_decision["ecm_and_mean_round"]=test_decision["ecm_and_mean"].round(0)

ecm_and_mean_round_submission=test_decision[["row_id","ecm_and_mean_round"]].rename(
    columns={"ecm_and_mean_round":"congestion"})
ecm_and_mean_round_submission.to_csv("ecm_and_mean_round.csv",index=False)

<font size="5"><b> Result: Score = 4.864 </b></font>

In [None]:
# Variant: before 15:00 use the rounded clipped mean forecast; from 15:00 on
# use the rounded blend.
before_three_pm=test_decision["hour"]<15
rounded_mean=np.round(test_decision["congestion_mean_clip_blended"],0)
test_decision["ecm_and_mean_round_afternoon"]=np.where(
    before_three_pm, rounded_mean, test_decision["ecm_and_mean_round"])

afternoon_submission=test_decision[["row_id","ecm_and_mean_round_afternoon"]].rename(
    columns={"ecm_and_mean_round_afternoon":"congestion"})
afternoon_submission.to_csv("ecm_and_mean_afternoon_afternoon.csv",index=False)

<font size="5"><b> Result: Score = 4.941 </b></font>

In [None]:
# Submission file with the clipped mean forecast.
# NOTE(review): the file is named "ecm_opt_clip" but the column written is the
# clipped MEAN forecast — confirm this is intended.
mean_clip_submission=test_decision[["row_id","congestion_mean_clip_blended"]].rename(
    columns={"congestion_mean_clip_blended":"congestion"})
mean_clip_submission.to_csv("ecm_opt_clip.csv",index=False)

<font size="5"><b> Result: Score = 4.923 </b></font>

# Plot various forecasts

In [None]:
# Preview the decision table before plotting the forecast columns.
test_decision.head()

In [None]:
# Forecast columns of test_decision selected for the comparison plots.
forecasts=[
    'congestion_mean',
    'congestion_panel',
    'panel_and_mean',
    'congestion_ecm',
    'congestion_ecm_clip',
    'congestion_panel_clip',
    'congestion_ecm_clip_blended',
    'ecm_and_mean',
]

In [None]:
# One panel per node comparing the competing forecasts from 12:00 on.
plt.figure(figsize=(20,50))
# NOTE(review): `accuracy` is reset here but never filled in this cell; this
# discards any table of the same name built earlier — confirm it is intended.
accuracy=pd.DataFrame()
plotted_forecasts=(
    ("congestion_mean","based on mean"),
    ("congestion_ecm","ecm"),
    ("congestion_panel","panel"),
    ("ecm_and_mean","ecm_and_mean"),
)
for index, target in enumerate(list(test_decision["node"].unique())):
    plt.subplot(25,3,index+1)
    fig_data=test_decision[test_decision["node"]==target][["datetime"]+forecasts]
    fig_data=fig_data[fig_data["datetime"].dt.hour>11]

    # Explicit labels keep the legend correct even if seaborn adds extra
    # artists (e.g. error bands); `palette` is dropped because it has no
    # effect without `hue`.
    for forecast_column, forecast_label in plotted_forecasts:
        sns.lineplot(data=fig_data, x="datetime", y=forecast_column, label=forecast_label)
    plt.legend(fontsize=6)
    plt.xticks(rotation=90)
    plt.title(target, fontsize=10)