In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Both CSVs are read the same way: first column as index, `time` parsed as datetime.
read_options = dict(index_col=0, parse_dates=['time'])
train = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv', **read_options)
test = pd.read_csv('../input/tabular-playground-series-mar-2022/test.csv', **read_options)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the columns of ``df`` to smaller dtypes to save memory.

    Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
    whose range contains the column's min and max. Float columns are cast to
    float16 or float32 when their min and max fit, otherwise to float64.
    Every other column is converted to ``category``.

    The function is idempotent: columns that are already int8 are recognised
    as integers, so running it twice (e.g. re-executing the notebook cell)
    does not turn them into categoricals.

    Note that only the value *range* is checked for floats; casting to
    float16/float32 can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        If true, print the final memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy dtypes are downcast; pandas extension dtypes
        # (including an existing ``category``) take the fallback branch.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind == 'i':
            c_min = df[col].min()
            c_max = df[col].max()
            # Inclusive bounds: a value exactly at a dtype's limit still fits.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN: keep full precision.
                df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast both frames (train first, then test).
train, test = map(reduce_mem_usage, (train, test))

In [None]:
from datetime import datetime
from math import sin, cos, pi


from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split


import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import torch.optim as optim

import matplotlib.pyplot as plt

# Acknowledgements
* AMBROSM--[TPSMAR22 Random Forest](https://www.kaggle.com/ambrosm/tpsmar22-random-forest)
* CHECHE--[TPS_2022_03_pytorch_score(4.923)](https://www.kaggle.com/zhangcheche/tps-2022-03-pytorch-score-4-923)
* AMBROSM--[TPSMAR22 EDA which makes sense ⭐️⭐️⭐️⭐️⭐️](https://www.kaggle.com/ambrosm/tpsmar22-eda-which-makes-sense)
* MARTYNOV ANDREY--[TPS Mar 22, Step 0.1, Special values](https://www.kaggle.com/martynovandrey/tps-mar-22-step-0-1-special-values/notebook)
* AMBROSM--[TPSMAR22 Generalizing the Special Values](https://www.kaggle.com/ambrosm/tpsmar22-generalizing-the-special-values?scriptVersionId=90141945)

# Drop Holidays

In [None]:
# Holidays to exclude from training, as (month, day):
# Memorial Day, July 4 and Labor Day.
holidays = [(5, 27), (7, 4), (9, 2)]
for month, day in holidays:
    # Drop a row only when both its month and its day match the holiday.
    on_holiday = (train.time.dt.month == month) & (train.time.dt.day == day)
    train = train[~on_holiday]

# Feature Engineering
## 1. Convert raw data to location, direction, data, and time

In [None]:
def fe0(data):
    """Replace the ``time`` column with calendar/time-of-day features.

    The features are derived from ``data['time']`` through the ``.dt``
    accessor: ``weekday``, ``hour``, ``timeofday``, ``saturday``
    (weekday == 5), ``sunday`` (weekday == 6) and ``minute``.

    The input frame is not modified; a new frame is built and returned. This
    avoids writing new columns into a frame that is itself a filtered slice.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a datetime-like ``time`` column.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``time`` and with the derived columns appended in the
        order listed above.
    """
    timestamps = data['time']
    weekday = timestamps.dt.weekday
    return data.drop(columns='time').assign(
        weekday=weekday,
        hour=timestamps.dt.hour,
        timeofday=timestamps.dt.time,
        saturday=(weekday == 5),
        sunday=(weekday == 6),
        minute=timestamps.dt.minute,
    )

In [None]:
# Derive the time features for both frames.
train, test = fe0(train), fe0(test)

### 1. Histogram for Training Data

In [None]:
# Row counts of the training data per weekday (left) and per hour (right).
bin_weekday = list(range(8))
bin_hour = list(range(25))
fig, axs = plt.subplots(1, 2, figsize=(20,8))

panels = [('weekday', bin_weekday, 0.5), ('hour', bin_hour, 0.7)]
for ax, (column, bins, bar_width) in zip(axs, panels):
    ax.hist(train[column], bins, rwidth=bar_width)
    ax.set_title(column)
    ax.set_xticks(bins)
    ax.set_ylabel('count')

plt.show()

In [None]:
# Mean congestion per weekday, drawn as a bar chart with one-letter day labels.
weekday_mean = train.groupby(train.weekday).congestion.mean()
fig, ax = plt.subplots(figsize=(18, 6))
ax.bar(weekday_mean.index, weekday_mean)
ax.set_xticks(weekday_mean.index)
ax.set_xticklabels('MTWTFSS')
ax.set_xlabel('Days of the week')
ax.set_ylabel('Mean of Congestion')
plt.show()

### 2. Test Data

In [None]:
# Row counts of the test data per weekday (left) and per hour (right).
bin_weekday = list(range(8))
bin_hour = list(range(25))
fig, axs = plt.subplots(1, 2, figsize=(20,8))

panels = [('weekday', bin_weekday, 0.5), ('hour', bin_hour, 0.7)]
for ax, (column, bins, bar_width) in zip(axs, panels):
    ax.hist(test[column], bins, rwidth=bar_width)
    ax.set_title(column)
    ax.set_xticks(bins)
    ax.set_ylabel('count')

plt.show()

### Use only the training data from 12:00 onward.

In [None]:
# Keep only the rows whose hour is 12 or later.
train = train.loc[train['hour'] >= 12]

In [None]:
# Separate the label column from the feature columns.
target = train['congestion']
train = train.drop(columns=['congestion'])

In [None]:
# Preview the first five rows of the feature frame.
train.head(n=5)

## 2. One-hot encode

In [None]:
# Dense one-hot encoding of every column, dropping the first level of each.
# scikit-learn renamed the `sparse` keyword to `sparse_output` and later
# removed the old name, so try the new spelling first and fall back to the
# old one on releases that do not know it yet.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(drop='first', sparse=False)
ohe.fit(train)

In [None]:
def fe1(data):
    """Encode ``data`` with the fitted module-level ``ohe`` encoder.

    Returns a DataFrame holding the encoder output, with the same index as
    ``data`` and default integer column labels.
    """
    return pd.DataFrame(ohe.transform(data), index=data.index)

In [None]:
# One-hot encode the training and test features with the same fitted encoder.
x_data, x_test = fe1(train), fe1(test)

In [None]:
# Number of encoded columns; used as the input width of the network.
num_feature = len(x_data.columns)
num_feature

# PyTorch
## Send data to CUDA

In [None]:
# Unwrap the pandas objects into NumPy arrays; the target becomes a column vector.
x_data, x_test = x_data.to_numpy(), x_test.to_numpy()
target = target.to_numpy().reshape(-1, 1)

In [None]:
# Copy each NumPy array into a torch tensor.
x_data, x_test, target = (torch.tensor(array) for array in (x_data, x_test, target))

In [None]:
# Prefer the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device} device')

In [None]:
# Cast every tensor to float64 and move it to the selected device.
x_data, x_test, target = (
    tensor.double().to(device) for tensor in (x_data, x_test, target)
)

## Define Model

In [None]:
# Hold out part of the data for validation. A fixed random_state makes the
# split, and therefore the reported validation loss, reproducible when the
# notebook is re-run from a fresh kernel.
x_train, x_val, y_train, y_val = train_test_split(x_data, target, random_state=42)

In [None]:
class Net(nn.Module):
    """Fully connected regression network: Linear/ReLU blocks plus a 1-unit head.

    Parameters
    ----------
    in_features : int, optional
        Width of the input layer. When omitted, the notebook-level
        ``num_feature`` is used, so ``Net()`` behaves as before.
    hidden_sizes : sequence of int, default (300, 100, 50)
        Output width of each hidden ``Linear`` layer, in order. Each hidden
        layer is followed by a ``ReLU``.

    The layers are stored in ``self.linear_relu_stack`` (an ``nn.Sequential``).
    """

    def __init__(self, in_features=None, hidden_sizes=(300, 100, 50)):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: read the global set after encoding.
            in_features = num_feature
        widths = [in_features, *hidden_sizes]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Final projection to a single regression output.
        layers.append(nn.Linear(widths[-1], 1))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stack and return the raw one-column output."""
        return self.linear_relu_stack(x)
    
# Build the network in float64 (the input tensors are cast with .double() as well)
# and move it to the selected device.
model = Net().double().to(device)

In [None]:
def init_weight(layer):
    """Re-initialise the weight of an ``nn.Linear`` layer with Xavier-normal values.

    Modules of any other type are left untouched, and so is the bias.
    Intended to be passed to ``Module.apply``.
    """
    if type(layer) is not nn.Linear:
        return
    nn.init.xavier_normal_(layer.weight.data)
# Run init_weight on every submodule of the model; the returned module is shown as the cell output.
model.apply(init_weight)

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Parameters
    ----------
    dataloader : iterable of (X, y) batches
    model : torch.nn.Module
    loss_fn : callable
        Called as ``loss_fn(pred, y)``; must return a scalar tensor.
    optimizer : torch.optim.Optimizer

    Returns
    -------
    torch.Tensor
        The detached loss of the *last* batch (not an epoch average).

    Raises
    ------
    ValueError
        If ``dataloader`` yields no batches.
    """
    # Make sure train-time behaviour is active even if the model was left in eval mode.
    model.train()
    last_loss = None
    for X, y in dataloader:
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Detach so the stored value does not keep the autograd graph alive.
        last_loss = loss.detach()

    if last_loss is None:
        raise ValueError("train_loop received a dataloader that yielded no batches")
    return last_loss


def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss = 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()

    test_loss /= num_batches
    return test_loss

## Train Model

In [None]:
# Hyperparameters
learning_rate = 1e-3
batch_size = 128
epochs = 1000

# Mean absolute error, optimised with plain SGD.
loss_fn = nn.L1Loss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Batched, shuffled loaders for the training and validation splits.
train_dataset = TensorDataset(x_train, y_train)
val_dataset = TensorDataset(x_val, y_val)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, shuffle=True, batch_size=batch_size)

# One training pass and one validation pass per epoch; both losses are recorded.
loss_list_train, loss_list_test = [], []
for epoch in range(epochs):
    loss_train = train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loss = test_loop(val_dataloader, model, loss_fn)
    loss_list_train.append(loss_train)
    loss_list_test.append(test_loss)
    if not epoch % 50:
        print(f'epoch {epoch} loss {test_loss}')
print("Done!")

## Plot Validation Loss

In [None]:
# Per-epoch loss recorded from the validation loader during training.
fig, ax = plt.subplots()
ax.plot(range(epochs), loss_list_test)
ax.set_xlabel("Number of Epochs")
ax.set_ylabel("Loss")
plt.show()

In [None]:
# Inference only: predict on the test tensor without tracking gradients,
# then round the predictions to the nearest integer value.
with torch.no_grad():
    y_test = torch.round(model(x_test))
y_test

In [None]:
sub = pd.read_csv('../input/tabular-playground-series-mar-2022/sample_submission.csv')
# The model output has shape (N, 1) and may live on the GPU. Bring it to the
# CPU and flatten it to a 1-D NumPy array, so the column assignment does not
# depend on pandas coercing a 2-D torch tensor. Item assignment (not attribute
# assignment) is the explicit way to set a column.
sub['congestion'] = y_test.cpu().numpy().ravel()

# Special Values

In [None]:
# Overlay the precomputed "special" congestion values from the external CSV on
# the model predictions; rows without a special value keep the model prediction.
# The result is rounded and stored as int.
special = pd.read_csv('../input/tps-mar-22-special-values/special v2.csv', index_col="row_id")
special = special[['congestion']].rename(columns={'congestion':'special'})
# NOTE(review): this is an index-on-index left join. `sub` is read without
# index_col, so its index is presumably the default 0..N-1 range, while
# `special` is indexed by row_id. Confirm that the two indexes actually line
# up; if they do not, every row silently falls back to the model prediction.
sub = sub.merge(special, left_index=True, right_index=True, how='left')
sub['special'] = sub['special'].fillna(sub['congestion']).round().astype(int)
submission_in = sub.drop(['congestion'], axis=1).rename(columns={'special':'congestion'})

# Generalizing the Special Values

In [None]:
# Read and prepare the training data
train = pd.read_csv('../input/tabular-playground-series-mar-2022/train.csv', parse_dates=['time'])
train = train.assign(hour=train['time'].dt.hour, minute=train['time'].dt.minute)

# Compute the quantiles of workday afternoons in September except Labor Day
is_afternoon = train.time.dt.hour >= 12
is_workday = train.time.dt.weekday < 5
is_late_season = train.time.dt.dayofyear >= 246
sep = train[is_afternoon & is_workday & is_late_season]

# One group per (hour, minute, x, y, direction); both bounds come from the same grouping.
congestion_by_slot = sep.groupby(['hour', 'minute', 'x', 'y', 'direction']).congestion
lower = congestion_by_slot.quantile(0.15).values
upper = congestion_by_slot.quantile(0.7).values

# Clip the submission data to the quantiles
clipped = submission_in.congestion.clip(lower, upper)
submission_out = submission_in.copy()
submission_out['congestion'] = clipped.round().astype(int)

In [None]:
# Write the final predictions to submission.csv without the DataFrame index.
submission_out.to_csv('submission.csv', index=False)