In [None]:
import pandas as pd 
import numpy as np
import torch
import os
import random
import torch.nn as nn
import sklearn
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from fastprogress import progress_bar
from sklearn.preprocessing import RobustScaler
from transformers import get_linear_schedule_with_warmup

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes and return the frame.

    The frame is modified in place (columns are reassigned) and also returned.
    Memory usage before and after, and the relative saving, are printed.

    Conversion rules:
      * object (and pandas string-dtype) columns become ``category``;
      * signed / unsigned NumPy integer columns become the narrowest signed /
        unsigned integer type whose inclusive range holds the column's values;
      * NumPy float columns become float16, float32 or float64 depending on
        their min/max (all-NaN columns end up as float64);
      * every other dtype (bool, datetime, category, extension dtypes, ...)
        is left untouched.

    NOTE(review): float16 keeps only about three significant decimal digits,
    so downcasting measurements to it can lose precision - confirm that this
    is acceptable for the data being loaded.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast. Anything else
        # (bool, datetime, category, nullable extension dtypes) has no
        # meaningful integer/float range check and is kept as it is.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind in ('i', 'u'):
            if pd.isna(c_min) or pd.isna(c_max):
                # Empty column: there is no value range to base a choice on.
                continue
            # Plain Python ints make the bound comparisons exact.
            c_min, c_max = int(c_min), int(c_max)
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                bounds = np.iinfo(candidate)
                # Bounds are inclusive: a value equal to the type's limit fits.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a frame that reports zero bytes before optimisation.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and downcast its columns.

    Parameters
    ----------
    file : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    The DataFrame returned by ``reduce_mem_usage`` for the parsed file.
    """
    # ``keep_date_col`` only matters when ``parse_dates`` combines several
    # columns into one, which is not requested here, and the option is
    # deprecated in recent pandas releases - so it is simply not passed.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)


In [None]:
# All competition files live in the same input directory; each one is read
# through import_data so it is downcast on load.
DATA_DIR = '../input/tabular-playground-series-apr-2022'

df_train = import_data(DATA_DIR + '/train.csv')
df_test = import_data(DATA_DIR + '/test.csv')
df_train_labels = import_data(DATA_DIR + '/train_labels.csv')
sub = import_data(DATA_DIR + '/sample_submission.csv')

In [None]:
# Every test column except the identifier/bookkeeping columns is a model input.
non_feature_columns = {"sequence", "step", "subject"}
features = [name for name in df_test.columns if name not in non_feature_columns]

In [None]:
# Keep the sequence ids of the training rows, then strip the identifier
# columns from both frames so only feature columns remain.
groups = df_train["sequence"]
id_columns = ["sequence", "subject", "step"]
df_train = df_train.drop(columns=id_columns)
df_test = df_test.drop(columns=id_columns)

In [None]:
# Hold out the last 15% of the sequences for validation.
#
# The split is decided once, in whole sequences, and the row split is derived
# from it. Computing the two cut points independently with floating-point
# arithmetic can leave features and labels one sequence out of step.
rows_per_sequence = 60  # every labelled sequence spans 60 consecutive rows
length = len(df_train)
length_y = len(df_train_labels)
assert length == length_y * rows_per_sequence, (
    "expected exactly %d feature rows per label row" % rows_per_sequence
)

train_size_y = int(length_y * 0.85)
test_size_y = length_y - train_size_y
train_size = train_size_y * rows_per_sequence
test_size = length - train_size

# Positional slicing: first block for training, remainder for validation.
X_train, X_test = df_train.iloc[:train_size], df_train.iloc[train_size:]
state = df_train_labels['state']
y_train, y_test = state.iloc[:train_size_y], state.iloc[train_size_y:]
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
# Plain NumPy copies of the label series, indexable by position.
y_train, y_valid = np.array(y_train), np.array(y_test)

In [None]:
# Fit the scaling statistics on the training rows only, then apply the same
# fitted transform to the validation and test rows. Re-fitting on each split
# would scale every split differently and leak held-out statistics.
scaler = RobustScaler()
train = scaler.fit_transform(X_train)
valid = scaler.transform(X_test)
test = scaler.transform(df_test)

In [None]:
class Custom_dataset(Dataset):
    """Dataset over a 2-D row array that is read one fixed-length window at a time.

    Parameters
    ----------
    X : array indexed by row; one window is ``sequence_len`` consecutive rows.
    sequence_len : number of rows per window.
    y : per-window targets, indexed by window number (needed in 'train' mode).
    mode : 'train' returns ``(window, label)``; any other value returns only
        the window.

    ``__getitem__`` expects ``idx`` to be an array of row indices for one
    window (as produced by the sampler used with this dataset); the label is
    looked up from the window's first row index.
    """

    def __init__(self, X, sequence_len, y=None, mode='train'):
        self.data = X
        self.target = y
        self.sequence_len = sequence_len
        self.mode = mode

    def __len__(self):
        # Number of complete windows. Uses the instance's own window length
        # rather than a module-level variable of the same name.
        return self.data.shape[0] // self.sequence_len

    def __getitem__(self, idx):
        out_data = self.data[idx]
        if self.mode == 'train':
            # The first row index of the window identifies which window it is.
            out_label = self.target[idx[0] // self.sequence_len]
            return out_data, out_label
        return out_data

In [None]:
# Rows per window handed to the model.
sequence_len = 60

# One dataset per split; the test split carries no labels.
train_dataset = Custom_dataset(X=train, sequence_len=sequence_len, y=y_train, mode='train')
valid_dataset = Custom_dataset(X=valid, sequence_len=sequence_len, y=y_valid, mode='train')
test_dataset = Custom_dataset(X=test, sequence_len=sequence_len, y=None, mode='test')

In [None]:
class _SequenceWindowSampler:
    """Yield one array of consecutive row indices per fixed-length window."""

    def __init__(self, num_windows, sequence_len, shuffle):
        self.num_windows = num_windows
        self.sequence_len = sequence_len
        self.shuffle = shuffle

    def __len__(self):
        return self.num_windows

    def __iter__(self):
        # A new permutation is drawn on every pass, so each epoch sees the
        # windows in a different order when shuffling is enabled.
        if self.shuffle:
            order = np.random.permutation(self.num_windows)
        else:
            order = np.arange(self.num_windows)
        for window in order:
            start = int(window) * self.sequence_len
            yield np.arange(start, start + self.sequence_len)


def Custom_dataloader(dataset, dataset_num, sequence_len, input_size, batch_size, shuffle=False):
    """Build a DataLoader whose samples are whole windows of ``sequence_len`` rows.

    Parameters
    ----------
    dataset : object indexable by an array of row indices.
    dataset_num : total number of rows; trailing rows that do not fill a
        complete window are ignored.
    sequence_len : rows per window.
    input_size : unused; kept so existing calls keep working.
    batch_size : number of windows per batch.
    shuffle : if true, the window order is re-drawn (via ``np.random``) every
        time the loader is iterated; otherwise windows come in row order.

    Returns
    -------
    torch.utils.data.DataLoader
    """
    num_windows = dataset_num // sequence_len
    sampler = _SequenceWindowSampler(num_windows, sequence_len, bool(shuffle))
    return DataLoader(dataset, batch_size, sampler=sampler)



In [None]:
# Only the training loader is shuffled, so consecutive batches are not made
# of neighbouring sequences. Validation and test keep row order, which keeps
# the test predictions aligned with the submission rows.
batch_size = 512
train_dataloader = Custom_dataloader(train_dataset, train.shape[0], sequence_len, train.shape[1], batch_size, shuffle=True)
valid_dataloader = Custom_dataloader(valid_dataset, valid.shape[0], sequence_len, valid.shape[1], batch_size)
test_dataloader = Custom_dataloader(test_dataset, test.shape[0], sequence_len, test.shape[1], batch_size)

In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
class LSTM(nn.Module):
    """Three stacked LSTM blocks followed by a linear head over all time steps.

    Parameters
    ----------
    input_size : number of features per time step.
    num_classes : size of the output vector.
    hidden_size : hidden size of the first block; the second and third blocks
        use ``2 * hidden_size`` and ``4 * hidden_size``.
    num_layers : number of layers inside each ``nn.LSTM`` block.
    sequence_len : number of time steps per input. The head flattens every
        time step, so its input size depends on it. Defaults to 60, the value
        that was previously hard-coded.

    ``forward`` takes a batch-first tensor ``(batch, sequence_len, input_size)``
    and returns ``(batch, num_classes)``.
    """

    def __init__(self, input_size, num_classes, hidden_size, num_layers, sequence_len=60):
        super().__init__()
        self.sequence_len = sequence_len

        # Bidirectional: output width is 2 * hidden_size.
        self.lstm1 = nn.LSTM(input_size, hidden_size, num_layers,
                             batch_first=True, dropout=0, bidirectional=True)

        # Bidirectional with hidden 2 * hidden_size: output width is 4 * hidden_size.
        self.lstm2 = nn.LSTM(hidden_size * 2, hidden_size * 2, num_layers,
                             batch_first=True, dropout=0, bidirectional=True)

        # Unidirectional: output width stays 4 * hidden_size.
        self.lstm3 = nn.LSTM(hidden_size * 4, hidden_size * 4, num_layers,
                             batch_first=True, dropout=0, bidirectional=False)

        self.final = nn.Sequential(
            nn.Linear(hidden_size * 4 * sequence_len, num_classes),
        )

    def forward(self, x):
        out1, _ = self.lstm1(x)
        out2, _ = self.lstm2(out1)
        out3, _ = self.lstm3(out2)

        # Flatten (time, features) so the head sees every time step at once.
        flat = out3.reshape(out3.shape[0], -1)
        return self.final(flat)

In [None]:
# Display how many feature columns there are.
len(features)

In [None]:
# Model and optimisation settings.
input_size = len(features)  # one model input per feature column
hidden_size = 64  # base hidden width handed to the LSTM model
num_layers = 2  # meant for the LSTM model's num_layers argument
num_classes = 1  # width of the model's output
learning_rate = 1e-3  # learning rate for the optimizer
num_epochs = 500  # number of passes over the training data

In [None]:
# The fourth argument is the number of layers per LSTM block. It has to be
# num_layers, not num_classes, otherwise the num_layers setting is ignored.
model = LSTM(input_size, num_classes, hidden_size, num_layers)
model.to(device)

In [None]:
# Mean-squared-error loss optimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# The scheduler is stepped once per batch, so its horizon is counted in
# batches: the first 10% of all steps are warm-up.
steps_per_epoch = len(train_dataloader)
num_warmup_steps = int(0.1 * num_epochs * steps_per_epoch)
num_training_steps = int(num_epochs * steps_per_epoch)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)


In [None]:
torch.cuda.empty_cache()
for epoch in progress_bar(range(num_epochs)):

    # ---- training pass ----
    model.train()
    train_loss_sum, train_count = 0.0, 0
    for trainX, train_y in train_dataloader:
        trainX = trainX.to(device, dtype=torch.float32)
        train_y = train_y.to(device, dtype=torch.float32)

        optimizer.zero_grad()
        outputs = model(trainX).squeeze(-1)
        loss = criterion(outputs, train_y)
        loss.backward()
        # Clip after backward(): before it there are no fresh gradients to
        # clip, so the call would have no effect on the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

        optimizer.step()
        scheduler.step()

        batch_n = train_y.size(0)
        train_loss_sum += loss.item() * batch_n
        train_count += batch_n

    # ---- validation pass ----
    model.eval()
    valid_loss_sum, valid_count = 0.0, 0
    with torch.no_grad():
        for validX, valid_y in valid_dataloader:
            validX = validX.to(device, dtype=torch.float32)
            valid_y = valid_y.to(device, dtype=torch.float32)
            val_out = model(validX).squeeze(-1)
            vall_loss = criterion(val_out, valid_y)

            batch_n = valid_y.size(0)
            valid_loss_sum += vall_loss.item() * batch_n
            valid_count += batch_n

    if epoch % 10 == 0:
        # Report sample-weighted means over the whole epoch rather than the
        # loss of whichever batch happened to come last.
        train_loss_mean = train_loss_sum / max(train_count, 1)
        valid_loss_mean = valid_loss_sum / max(valid_count, 1)
        print("Epoch: %d, loss: %1.5f valid loss:  %1.5f " % (epoch, train_loss_mean, valid_loss_mean))


In [None]:
predictions = []

# Evaluation mode and gradient-free execution are set once for the whole
# pass instead of once per batch.
model.eval()
with torch.no_grad():
    for testX in test_dataloader:
        testX = testX.to(device, dtype=torch.float32)
        test_out = model(testX).squeeze(-1)
        # Move each batch of outputs to the CPU so the full prediction list
        # does not stay resident in device memory.
        predictions.append(test_out.cpu())

In [None]:
# Join the per-batch outputs into one tensor along the batch dimension.
real_preds = torch.cat(predictions, dim=0)

In [None]:
# Rebuild from the batch list so this cell can be re-run safely: converting
# real_preds in place would fail the second time, when it is already an
# ndarray without a .cpu() method.
real_preds = torch.cat(predictions).detach().cpu().numpy()

In [None]:
# Display how many predictions were produced.
len(real_preds)

In [None]:
# Peek at the first three predictions.
real_preds[:3]

In [None]:
# Pair each sequence id from the sample submission with its prediction.
submission_columns = {
    'sequence': sub['sequence'],
    'state': real_preds,
}
submission = pd.DataFrame(submission_columns)

In [None]:
# Write the submission without the DataFrame index column.
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)