# Introduction

This is my first attempt at this competition. I always try to approach these problems with a completely naive mindset the first time, so that I look at the problem with fresh ideas and the fun of the puzzle is maintained. 

In [None]:
!pip install Pycaret

from pycaret.regression import *

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the competition data. The former `infer_datetime_format=True` argument
# was dropped: it has no effect unless `parse_dates` is also given, and it is
# deprecated in pandas 2.x. The `time` column is therefore still read as text
# here, exactly as before.
df = pd.read_csv("../input/tabular-playground-series-mar-2022/train.csv")
test_df = pd.read_csv("../input/tabular-playground-series-mar-2022/test.csv")
df.head()

In [None]:
# Show the dtype pandas assigned to each training column.
df.dtypes

# Look for NaNs

In [None]:
def detect_NaNs(df_temp, verbose=0):
    """Print a summary of missing values and return the affected columns.

    Parameters
    ----------
    df_temp : pandas.DataFrame
        Frame to inspect.
    verbose : int, optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    list
        Labels of the columns that hold at least one missing value, in the
        order they appear in ``df_temp.columns``.
    """
    total_missing = df_temp.isnull().sum().sum()
    print('NaNs in data: ', total_missing)
    affected = []
    if total_missing > 0:
        print('******')
        for name in df_temp.columns:
            missing_here = df_temp[name].isnull().sum().sum()
            if missing_here <= 0:
                continue
            affected.append(name)
            print('NaNs in', name + ": ", missing_here)
        print('******')
    # Trailing blank line separates consecutive reports.
    print('')
    return affected

# Report missing values for the training frame, then the test frame. A `for`
# statement produces no cell output, so the returned column lists are not
# echoed; that is all the former no-argument `display()` call achieved.
for frame in (df, test_df):
    detect_NaNs(frame)

# Target Distribution

In [None]:
# Histogram of the target column, split into 20 bins.
sns.displot(data=df, x="congestion", bins=20)

Looks at least somewhat Gaussian. Worth testing for normality.

# Normality Test

Let's test the target variable for normality.

In [None]:
from statsmodels.api import qqplot
from scipy.stats import shapiro

def test_normality(series, column_name, plot = 0, silent = False):
    # create right number of samples
    if len(series) > 5000:
        if type(series) is pd.Series:
            series = series.sample(frac=1)[:5000]
        else:
            series = series[:5000]
    # normality test
    stat, p = shapiro(series)
    if not silent:
        print('Statistics=%.3f, p=%.3f' % (stat, p))
    # interpret
    alpha = 0.05
    normal = False
    if p > alpha:
        normal = True
        if not silent:
            print(column_name ,'looks Gaussian (fail to reject H0)')
    else:
        if not silent:
            print(column_name ,'does not look Gaussian (reject H0)')
    # q-q plot
    if plot > 0:
        qqplot(series, line='s')
        plt.show()
    return(normal)

# Store the verdict rather than discarding it. The former no-argument
# `display()` call did nothing except stop the boolean from being echoed as
# the cell output; an assignment achieves the same and keeps the result.
congestion_is_normal = test_normality(df["congestion"], "congestion", 1)

Not quite Gaussian, but you can see from the way it follows the red line that it is pretty close.

# Parse Dates

In [None]:
from dateutil.parser import parse

# Convert the `time` column of both frames to pandas datetimes.
for frame in (df, test_df):
    frame['time'] = pd.to_datetime(frame['time'])

# Show Directions

I just want to check that there is nothing weird or incorrect in the `direction` column.

In [None]:
# Distinct direction labels, in order of first appearance.
df["direction"].unique().tolist()

In [None]:
# Number of training rows per direction label.
sns.countplot(data=df, x="direction")
plt.show()

# Create Location Ids

I want to encode the location ids so I can use them later.

In [None]:
# Build a location key by concatenating x, y and direction.
# Each frame must be keyed from its OWN columns: the test key was previously
# built from `df`, which assigned training locations to test rows by index
# alignment.
# NOTE(review): x and y are joined without a separator, so the key is only
# unambiguous if the coordinates are single-digit; confirm against the data.
df["location"] = df["x"].astype(str) + df["y"].astype(str) + df["direction"]
test_df["location"] = test_df["x"].astype(str) + test_df["y"].astype(str) + test_df["direction"]

Now to encode them. 

In [None]:
from sklearn import preprocessing

def encode_column(df, column, test_df = None):
    """Label-encode ``column`` in place, fitting on the training frame only.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame; ``df[column]`` is cast to ``str``, used to fit the
        encoder, and overwritten with the integer codes.
    column : str
        Name of the column to encode.
    test_df : pandas.DataFrame, optional
        Frame encoded with the encoder fitted on ``df``. May be omitted; the
        declared ``None`` default previously crashed when indexing ``None``.

    Returns
    -------
    tuple
        ``(df, test_df)``, the same objects that were passed in
        (``test_df`` is ``None`` when it was not supplied).

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when ``test_df[column]`` holds a
        value that never occurs in ``df[column]``.
    """
    le = preprocessing.LabelEncoder()
    train_labels = df[column].astype(str)
    # The encoder de-duplicates its input, so fitting on the full column is
    # equivalent to fitting on the list of unique values.
    le.fit(train_labels)
    df[column] = le.transform(train_labels)
    if test_df is not None:
        test_df[column] = le.transform(test_df[column].astype(str))
    return df, test_df

# Replace the string location key with integer codes in both frames.
df, test_df = encode_column(df, column="location", test_df=test_df)

# Encode Direction and Engineer It

It might be nice to take the first and second letter of direction.

In [None]:
# Split the direction label into its first and second characters and encode
# each as its own feature. Labels with a single character yield a missing
# second letter, which `encode_column` turns into a string before encoding.
for position, feature in enumerate(("first_letter", "second_letter")):
    df[feature] = df["direction"].str[position]
    test_df[feature] = test_df["direction"].str[position]
    df, test_df = encode_column(df, feature, test_df)

# The raw direction is encoded last because the letter features read it above.
df, test_df = encode_column(df, "direction", test_df)

Check out the balance of location.

In [None]:
# Number of training rows per encoded location.
sns.countplot(data=df, x="location")
plt.show()

Nice to see it's uniform.



# Create Date Information

In [None]:
# Derive calendar and clock features from `time`, identically for both frames.
for frame in (df, test_df):
    stamp = frame['time'].dt
    frame['day_of_year'] = stamp.dayofyear
    frame['week_of_year'] = stamp.isocalendar().week
    frame['month_of_year'] = stamp.month
    frame['hour_of_day'] = stamp.hour
    # Despite the column name, this is the minute within the hour.
    frame['minute_of_day'] = stamp.minute
    # Hour of day as a fraction, e.g. 13:20 -> 13.333...
    frame['rectified_hour'] = frame['hour_of_day'] + (stamp.minute / 60)

In [None]:
import math

# Project the whole hour of day onto a 24-hour circle so that 23:00 and 00:00
# end up next to each other.
for frame in (df, test_df):
    angle = (frame["hour_of_day"])/24 * 2 * math.pi
    frame["sine"] = np.sin(angle)
    frame["cos"] = np.cos(angle)

In [None]:
# Add location and rectified hour

# Key every row by "<fractional hour>_<location code>", then label-encode it.
for frame in (df, test_df):
    hour_text = frame['rectified_hour'].astype(str)
    location_text = frame['location'].astype(str)
    frame["location_hour"] = hour_text + "_" + location_text
df, test_df = encode_column(df, "location_hour", test_df)

# Plot what we have so far

In [None]:
# Plot the first 500 rows of (at most) the first 30 columns, one subplot each.
pltdf = df.copy()
preview = pltdf.iloc[:500, :30]
preview.plot(subplots=True, layout=(5,6), figsize=(15,10))

plt.show()

# Average by Time and Location

This could help us work on the test set. We will record the average for each time and location.

In [None]:
# Mean congestion for every encoded location/time-of-day key, computed on the
# training frame only.
location_hour_df = df[['location_hour','congestion']].groupby(['location_hour']).mean()
# Build the lookup directly from the named column. This replaces a row-by-row
# `iterrows()` loop that used positional `row[0]` access on a labelled Series,
# which pandas has deprecated.
location_hour_dict = location_hour_df['congestion'].to_dict()
# Keys missing from the lookup map to NaN.
# NOTE(review): the training-side feature is derived from each row's own
# target, so scores measured on `df` with this column may be optimistic.
df["location_hour_mean"] = df["location_hour"].map(location_hour_dict)
test_df["location_hour_mean"] = test_df["location_hour"].map(location_hour_dict)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit

# Target vector and feature matrix; the raw timestamp and row id are excluded.
y = df["congestion"].values
X = df.drop(columns=["congestion","time","row_id"]).values

# Forward-chaining splits: every fold trains on earlier rows and scores on the
# rows that follow them.
# NOTE(review): this presumes the rows of `df` are in chronological order.
tscv = TimeSeriesSplit()

for fold, (train_index, test_index) in enumerate(tscv.split(X), start=1):
    reg = LinearRegression().fit(X[train_index], y[train_index])
    print(f"split: {fold}")
    # `score` reports R^2 on the held-out rows.
    print(reg.score(X[test_index], y[test_index]))

# Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Same forward-chaining evaluation with a default Lasso model. The estimator
# is created inside the loop only; the extra `Lasso()` previously built before
# the loop was overwritten on the first iteration and never used.
tscv = TimeSeriesSplit()

for fold, (train_index, test_index) in enumerate(tscv.split(X), start=1):
    clf = Lasso().fit(X[train_index], y[train_index])
    print(f"split: {fold}")
    # `score` reports R^2 on the held-out rows.
    print(clf.score(X[test_index], y[test_index]))

# Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor


# Forward-chaining evaluation of a decision tree, reusing the `tscv` splitter
# created in an earlier cell. `random_state=0` keeps the tree reproducible.
for fold, (train_index, test_index) in enumerate(tscv.split(X), start=1):
    tree = DecisionTreeRegressor(random_state=0)
    tree = tree.fit(X[train_index], y[train_index])
    print(f"split: {fold}")
    print(tree.score(X[test_index], y[test_index]))

# Pycaret

In [None]:
# Give PyCaret every column except the raw timestamp and the row id.
# `drop` returns a new frame, so `df` itself is left untouched.
# NOTE(review): whether `silent` is accepted depends on the installed PyCaret
# version; confirm against the version pulled in by the install cell.
pycaret_data = df.drop(columns=["time","row_id"])
setup(
    data=pycaret_data,
    target="congestion",
    silent=True,
    session_id=1,
    normalize=False,
    remove_perfect_collinearity=False,
)
display()

In [None]:
# Train the model PyCaret registers under the id "lr" (presumably linear
# regression; confirm against the PyCaret model list) on the setup above.
lr = create_model("lr")

# Create Submission

In [None]:
# Select the test features by the training feature names, so the column order
# is guaranteed to match the matrix `reg` was fitted on instead of relying on
# both frames having been built in the same order. A column missing from
# `test_df` now fails loudly with a KeyError.
feature_columns = df.drop(columns=["congestion","time","row_id"]).columns
test_X = test_df[feature_columns].values
# `reg` is the LinearRegression fitted in the last cross-validation split.
predictions = reg.predict(test_X)

In [None]:
# Fill the submission template with the per-location/time mean congestion,
# rounded to the nearest whole number. The assignment aligns on the index.
sample_sub = pd.read_csv("../input/tabular-playground-series-mar-2022/sample_submission.csv")
rounded_means = test_df["location_hour_mean"].round(0)
sample_sub["congestion"] = rounded_means

In [None]:
# Write the submission file without the DataFrame index column.
sample_sub.to_csv("submission.csv", index=False)