In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import torch
from torch import nn
import torch.nn.functional as F

### read data

In [None]:
# Load the competition train/test CSV files.
# NOTE: the paths are relative to the notebook's working directory, so the
# `../input/...` folder must exist next to it.
train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')

In [None]:
# Preview the first rows of the raw training frame.
train.head()

### data preprocessing

In [None]:
def create_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create features based on the ``date`` column by extracting its calendar
    components.

    The ``date`` column is parsed with ``pd.to_datetime`` and the new columns
    are written onto ``df`` itself: the input frame is modified in place and
    the same object is returned.

    Args
        df: Input data; must contain a ``date`` column that
            ``pd.to_datetime`` can parse.
    Returns
        df: The same DataFrame with the new time-based feature columns
            ``year``, ``quarter``, ``month``, ``day``, ``dayofweek``,
            ``dayofmonth``, ``dayofyear``, ``weekofyear``, ``weekday`` and
            ``is_weekend``.
    """

    df['date'] = pd.to_datetime(df['date'])  # Convert the date to datetime.
    dates = df['date'].dt

    # Start the feature-creation process.
    df['year'] = dates.year
    df['quarter'] = dates.quarter
    df['month'] = dates.month
    df['day'] = dates.day
    df['dayofweek'] = dates.dayofweek
    # NOTE: despite its name, this column holds the number of days in the
    # month (28-31); the day of the month itself is the `day` column.
    df['dayofmonth'] = dates.days_in_month
    df['dayofyear'] = dates.dayofyear

    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
    # `isocalendar().week` yields the same ISO week number. It comes back as a
    # nullable UInt32, so cast to a plain NumPy dtype like the other columns
    # (float when missing dates are present, so they become NaN).
    iso_week = dates.isocalendar().week
    df['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    # `weekday` duplicates `dayofweek` (Monday=0 ... Sunday=6).
    df['weekday'] = dates.weekday
    df['is_weekend'] = np.where(df['weekday'].isin([5, 6]), 1, 0)

    return df

# Derive the calendar features for both splits with the same helper.
train, test = [create_time_features(frame) for frame in (train, test)]

In [None]:
# Use `row_id` as the index and discard the raw `date` column, whose
# components were already extracted into separate feature columns.
train = train.set_index('row_id').drop(columns='date')
test = test.set_index('row_id').drop(columns='date')

In [None]:
# Check the frame after feature creation: `row_id` should now be the index
# and `date` should be gone.
train.head()

In [None]:
# Column names presumably holding the categorical inputs.
# NOTE(review): this constant is not referenced by any other visible cell;
# the one-hot encoding cell calls `pd.get_dummies` without a `columns`
# argument instead.
CATEGORICAL = ['country', 'store', 'product']

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Separate the regression target from the predictor columns.
y_data = train['num_sold']
x_data = train.drop(columns='num_sold')

In [None]:
# Encode train and test together so both end up with the same dummy columns.
data = pd.get_dummies(pd.concat([x_data, test]))

# Split back by position: the first len(train) rows are the training rows,
# the remainder are the test rows.
x_data, test = data.iloc[:len(train)], data.iloc[len(train):]

In [None]:
# Inspect the encoded training features.
x_data.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean/std on the training features only, then apply
# the same transformation to both splits.
scaler = StandardScaler().fit(x_data)
x_data, test = scaler.transform(x_data), scaler.transform(test)

In [None]:
# Summary statistics of the scaled training features (wrapped in a DataFrame
# only to get the `describe()` table).
pd.DataFrame(x_data).describe()

In [None]:
# Convert the feature matrices and the target to float32 torch tensors.
x_data, test = (torch.tensor(matrix, dtype=torch.float32) for matrix in (x_data, test))

y_data = torch.tensor(y_data.to_numpy(), dtype=torch.float32)

In [None]:
# x_data = torch.randn(1000, 18)
# y_data = torch.randn(1000, 1) * 100

### model train

In [None]:
# Hold out part of the training rows for validation (train_test_split's
# default test size). A fixed seed keeps the split, and therefore the
# validation score, reproducible across kernel restarts.
# NOTE(review): the rows are shuffled at random, so this is not a
# forward-in-time hold-out; confirm that is intended for date-derived data.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, random_state=SPLIT_SEED)

In [None]:
def SMAPE(y_true, y_pred):
    """
    Symmetric mean absolute percentage error on a 0-200 scale.

    Computes ``mean(200 * |y_true - y_pred| / (|y_true| + |y_pred|))``.
    Elements where both values are zero contribute zero error.

    Args
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, broadcastable to ``y_true``.
    Returns
        The mean SMAPE as a NumPy float.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Both magnitudes go through abs() so negative targets cannot shrink,
    # cancel or flip the sign of the denominator.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining entries
    # keep the pre-filled 0.0, so no divide-by-zero warning is raised.
    diff = np.zeros_like(denominator)
    np.divide(abs_error, denominator, out=diff, where=denominator != 0)
    return np.mean(diff)


In [None]:
from xgboost import XGBRegressor

# Convert the split tensors to NumPy arrays for XGBoost. `np.asarray` accepts
# both CPU tensors and arrays, so re-running this cell does not fail the way
# `.numpy()` does on an already-converted array.
x_train, y_train = np.asarray(x_train), np.asarray(y_train)
x_test, y_test = np.asarray(x_test), np.asarray(y_test)

# Baseline model: XGBoost regressor with the library's default settings.
model = XGBRegressor()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Score the hold-out predictions with the SMAPE helper defined earlier.
test_score = SMAPE(y_test, y_pred)

print('test_score', test_score)

In [None]:
# Predict on the competition test set. `test` is a torch tensor at this
# point, so convert it to a NumPy array explicitly, matching the array inputs
# the model was fitted and validated on, rather than relying on implicit
# tensor handling inside `predict`.
test_pred = model.predict(np.asarray(test))


In [None]:
# Fill the sample submission with the test predictions and write it out.
submission = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')
# Bracket assignment always targets a column; attribute-style assignment
# would only set a Python attribute if `num_sold` were ever missing from the
# sample file, leaving the written CSV without predictions.
submission['num_sold'] = test_pred
submission.to_csv('submission.csv', index=False)

In [None]:
# from torch.utils.data import TensorDataset
# from torch.utils.data import DataLoader

In [None]:
# dataset_train = TensorDataset(x_train, y_train)
# loader_train = DataLoader(dataset_train, batch_size=256, shuffle=True)

# dataset_test = TensorDataset(x_test, y_test)
# loader_test = DataLoader(dataset_test, batch_size=256, shuffle=True)


In [None]:
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# print(f'Using {device} device')

In [None]:
# class NeuralNetwork(nn.Module):
#     def __init__(self):
#         super(NeuralNetwork, self).__init__()
#         self.fc1 = nn.Linear(18, 300)
#         self.bn1 = nn.BatchNorm1d(300)
#         self.fc2 = nn.Linear(300, 150)
#         self.bn2 = nn.BatchNorm1d(150)
#         self.fc3 = nn.Linear(150, 100)
#         self.bn3 = nn.BatchNorm1d(100)
#         self.fc5 = nn.Linear(100, 50)
#         self.bn5 = nn.BatchNorm1d(50)  
#         self.fc6 = nn.Linear(50, 1)

#     def forward(self, x):
#         x = self.fc1(x)
#         x = self.bn1(x)
#         x = F.relu(x)
#         x = self.fc2(x)
#         x = self.bn2(x)
#         x = F.relu(x)
#         x = self.fc3(x)
#         x = self.bn3(x)
#         x = F.relu(x)
#         x = self.fc5(x)
#         x = self.bn5(x)
#         x = F.relu(x)
#         x = self.fc6(x)
#         return x

# def init_weights(layer):
#     if isinstance(layer, nn.Linear):
#         nn.init.xavier_normal_(layer.weight.data)    

In [None]:
# model = NeuralNetwork().to(device)
# model.apply(init_weights)
# print(model)

In [None]:
# learning_rate = 1e-3
# batch_size = 256
# epochs = 100

In [None]:
# def train_loop(dataloader, model, loss_fn, optimizer, epoch):
#     num_batches = len(dataloader)
#     loss_all = 0
#     for batch, (X, y) in enumerate(dataloader):
#         # Compute prediction and loss
        
#         X = X.to(device)
#         y = y.to(device)
        
#         pred = model(X)
#         loss = loss_fn(pred, y)

#         # Backpropagation
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
    
#         loss = loss.item()
#         loss_all += loss ** 1
#     if epoch % 30 == 0:
#         print(f"train loss: {loss_all / num_batches:>7f}")
#         return loss_all / num_batches


# def test_loop(dataloader, model, loss_fn):
#     size = len(dataloader.dataset)
#     num_batches = len(dataloader)
#     test_loss, correct = 0, 0

#     with torch.no_grad():
#         for X, y in dataloader:
            
#             X = X.to(device)
#             y = y.to(device)
#             pred = model(X)
#             test_loss += loss_fn(pred, y).item()

#     test_loss /= num_batches
#     test_loss = test_loss ** 1
#     print(f"Test Error: \n Avg loss: {test_loss:>8f} \n")
#     return test_loss

In [None]:
# loss_fn = nn.L1Loss()
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9,  weight_decay=0.00001)

# epochs = 1

# train_loss = []
# test_loss = []
# for t in range(epochs):
#     if t % 30 == 0:
#         print(f"Epoch {t}\n-------------------------------")
#     loss=train_loop(loader_train, model, loss_fn, optimizer, t)
#     if t % 30 == 0:
#         train_loss.append(loss)
#         test_loss.append(test_loop(loader_test, model, loss_fn))
# print("Done!")

In [None]:
# import matplotlib.pyplot as plt
# plt.plot(range(len(train_loss)), train_loss, label='train')
# plt.plot(range(len(test_loss)), test_loss, label='test')
# plt.legend()
# plt.show()

In [None]:
# def SMAPE(y_true, y_pred):
#     denominator = (y_true + np.abs(y_pred)) / 200.0
#     diff = np.abs(y_true - y_pred) / denominator
#     diff[denominator == 0] = 0.0
#     return np.mean(diff)


In [None]:
# x_train = x_train.to(device)
# x_test = x_test.to(device)

# train_pred = model(x_train)
# test_pred = model(x_test)

# train_score = SMAPE(y_train.cpu().detach().numpy(), train_pred.cpu().detach().numpy())
# test_score = SMAPE(y_test.cpu().detach().numpy(), test_pred.cpu().detach().numpy())

# print('train_score ', train_score, 'test_score', test_score)

In [None]:
# test = test.to(device)
# test_pred = model(test)

In [None]:
# test_pred

In [None]:
# submission = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')
# submission.num_sold = test_pred.cpu().detach().numpy().reshape(-1)
# submission.to_csv('submission.csv', index=False)