In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import IsolationForest

# Apply seaborn's "whitegrid" plotting style.
sns.set_style(style="whitegrid")

In [None]:
# Load the training data, keeping only the columns used in this notebook,
# with the `time` column as the row index.
df = pd.read_csv("../input/tabular-playground-series-mar-2022/train.csv",
                 index_col='time',
                 usecols=['x', 'y', 'direction', 'congestion', 'time'])
# Convert the index to datetimes so time-based resampling works later on.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where consistent format inference became the default.
df.index = pd.to_datetime(df.index)
df

In [None]:
# Features divided among categorical and numerical groups.

CATEGORICAL_FEATURES = ["direction"]
NUMERIC_FEATURES = ['x', 'y']

# The target is the congestion column; every other column is a feature.
y = df["congestion"]
X = df.drop(columns="congestion")
X_numerical = X[NUMERIC_FEATURES]
X_categorical = X[CATEGORICAL_FEATURES]

# One-hot encode the categorical features. drop='first' omits the first
# category of each feature; .toarray() turns the sparse result into a dense
# array so it can be concatenated with the numeric columns.
encoder = OneHotEncoder(drop='first').fit(X_categorical.values)
X_categorical_encoded = encoder.transform(X_categorical.values).toarray()

# Concatenate numerical and encoded categorical features (numeric columns first).
X_encoded = np.concatenate((X_numerical.values, X_categorical_encoded), axis=1)

# Standardize all the features (including the one-hot columns).
scaler = StandardScaler().fit(X_encoded)
X_encoded_standardized = scaler.transform(X_encoded)

# Rebuild a DataFrame with readable column names and the original time index.
# The encoder was fitted on a bare array, so the categorical feature names are
# passed explicitly to get column names derived from them.
X_engineered = pd.DataFrame(
    X_encoded_standardized,
    columns=np.hstack((X_numerical.columns.values,
                       encoder.get_feature_names_out(input_features=CATEGORICAL_FEATURES))),
    index=X.index,
)
X_engineered.head()

In [None]:
# Basic visualization of the congestion time series, averaged hourly and daily.

# Lower-case "h" is used for the hourly rule: the upper-case "H" alias is
# deprecated since pandas 2.2. Both aggregations use resample() so the two
# series are built the same way.
hourly = y.resample("h").mean()
daily = y.resample("1D").mean()

# Two stacked panels sharing the time axis: hourly means on top, daily below.
fig, ax = plt.subplots(nrows=2,
                       ncols=1,
                       figsize=(20, 7),
                       sharex=True,
                       gridspec_kw={"hspace": 0.25})

ax[0].plot(hourly.index, hourly.values)
ax[0].set_ylabel("Mean congestion")
ax[0].set_title("Hourly")
ax[1].plot(daily.index, daily.values)
ax[1].set_ylabel("Mean congestion")
ax[1].set_title("daily")

In [None]:
# Outlier removal with Isolation Forest.

# Build the estimator first, then fit it on the engineered feature matrix.
# random_state is fixed so repeated runs give the same labels.
isf = IsolationForest(bootstrap=True, contamination=0.1, random_state=1)
isf.fit(X_engineered.values)

# Label every row with the fitted forest, then keep the rows labelled 1
# (per the scikit-learn docs, predict() returns 1 for inliers, -1 for outliers).
predictions_outliers = isf.predict(X_engineered.values)

# np.where with a single condition returns a tuple holding one array of
# positions; it is passed unchanged to .iloc to select those rows.
non_outlier_index = np.where(predictions_outliers == 1)
X_engineered_outlier_treated = X_engineered.iloc[non_outlier_index]
X_engineered_outlier_treated

In [None]:
# Show the distinct labels in predictions_outliers and how many rows got each.
np.unique(predictions_outliers, return_counts=True)