In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the training sensor readings and the labels file that is later merged onto them.
df = pd.read_csv('../input/tabular-playground-series-apr-2022/train.csv')
label = pd.read_csv('../input/tabular-playground-series-apr-2022/train_labels.csv')

In [None]:
# Print the column names, dtypes and non-null counts of the raw training frame.
df.info()

# EDA

Inspired by [Early EDA and insights](https://www.kaggle.com/code/abdulravoofshaik/early-eda-and-insights)

In [None]:
def each_sensor_value(seq = 0):
    """Print the state of one training sequence and plot its 13 sensor traces.

    The training readings and labels are re-read from disk and merged on every
    call, then the rows whose ``sequence`` column equals ``seq`` are selected.
    Rows are selected by sequence id and columns by name, so the result does
    not depend on the file's row order or column positions.

    Parameters
    ----------
    seq : int, default 0
        Value of the ``sequence`` column to display.

    Raises
    ------
    ValueError
        If ``seq`` does not occur in train.csv, or if the selected rows do not
        share exactly one ``state`` value.
    """
    df = pd.read_csv('../input/tabular-playground-series-apr-2022/train.csv')
    label = pd.read_csv('../input/tabular-playground-series-apr-2022/train_labels.csv')
    df = pd.merge(left = df, right = label, how = 'left')

    # Select by sequence id rather than by row-block position.
    rows = df[df['sequence'] == seq]
    if rows.empty:
        raise ValueError(f'sequence {seq} not found in train.csv')

    # Address the sensor channels by name instead of relying on column positions.
    sensor_columns = [f'sensor_{sensor:02d}' for sensor in range(13)]
    sensor_data = rows[sensor_columns]
    # `.item()` raises ValueError unless the sequence carries exactly one state.
    state = rows['state'].unique().item()

    print(f'-----sequence:{seq}-----')
    print(f'-----state:{state}-----')
    sensor_data.plot(subplots=True, sharex=True, figsize=(18, 1.5*13))



In [None]:
# Show the sensor traces and state for sequence 0.
each_sensor_value(seq = 0)

# Feature Engineering

Inspired by [stats + XGBoost = score 83%](https://www.kaggle.com/code/desitancheva/stats-xgboost-score-83)

In [None]:
def feat_eng(train = True):
    """Build per-sequence summary statistics for every sensor channel.

    For each of the 13 ``sensor_XX`` columns the max, min, mean, std and median
    over all rows of a sequence are computed, giving 65 feature columns named
    ``sensor_XX_<stat>`` (ordered sensor by sensor, stats in the order above).

    Both branches share one ``groupby('sequence').agg(...)`` call. The training
    features are joined to the labels on the ``sequence`` key, so features and
    targets stay aligned whatever the sequence ids are.

    Parameters
    ----------
    train : bool, default True
        ``True`` reads train.csv and train_labels.csv; ``False`` reads test.csv.

    Returns
    -------
    (train_x, train_y) when ``train`` is True
        ``train_x`` is the feature frame (one row per sequence, default integer
        index) and ``train_y`` the matching ``state`` column.
    test_data when ``train`` is False
        The feature frame indexed by ``sequence``.
    """
    data_dir = '../input/tabular-playground-series-apr-2022'
    sensor_names = [f"sensor_{sensor:02d}" for sensor in range(13)]
    stat_names = ['max', 'min', 'mean', 'std', 'median']

    file_name = 'train.csv' if train else 'test.csv'
    df = pd.read_csv(f'{data_dir}/{file_name}')

    # One pass over the groups; the result has (sensor, stat) MultiIndex columns.
    features = df.groupby('sequence')[sensor_names].agg(stat_names)
    features.columns = [f'{sensor_name}_{stat}' for sensor_name, stat in features.columns]

    if not train:
        return features

    label = pd.read_csv(f'{data_dir}/train_labels.csv')
    # Join on the sequence id explicitly so every feature row meets its own label.
    train_data = pd.merge(left = features.reset_index(), right = label, how = 'left', on = 'sequence')
    train_x = train_data.drop(['sequence', 'state'], axis = 1)
    train_y = train_data['state']
    return train_x, train_y

In [None]:
# Aggregate features for both splits; the training call also returns the targets.
train_x, train_y = feat_eng(train = True)
test_x = feat_eng(train = False)

In [None]:
# from sklearn.metrics import roc_auc_score
# from sklearn.model_selection import KFold
# from catboost import CatBoostClassifier


# model_list = []
# mae_list = []

# # fold5
# kf = KFold(n_splits = 5, shuffle = True, random_state = 70)

# # modeling and training
# for fold, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
#     print(f'--------fold:{fold+1}--------')
#     fold+=1
#     tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
#     tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
    
#     params = {
#         'loss_function' : 'Logloss',
#         'task_type' : 'GPU', 
#         'grow_policy' : 'SymmetricTree',
#         'learning_rate': 0.3,
#         'l2_leaf_reg' : 0.2,
#         'random_state': 0
#      }
                  
#     model = CatBoostClassifier(**params)
#     # Training the model
    
#     model.fit(tr_x,
#               tr_y,
#               eval_set=[(va_x, va_y)])
    
#     val_pred = model.predict(va_x)
#     print(f' ROC: {roc_auc_score(va_y, val_pred)}')

In [None]:
# Drop the tabular training features and the raw frame; `test_x` is kept.
del train_x, train_y, df

# Submission

In [None]:
# The cell that would train `model` is commented out, so on a fresh kernel the
# name does not exist. Only predict when a fitted model is actually available,
# so Restart & Run All does not stop here with a NameError.
pred_1 = model.predict(test_x) if 'model' in globals() else None

# Neural network (bidirectional LSTM)


In [None]:
import torch

In [None]:
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda
from torchvision.io import read_image
import torchvision.transforms as transforms

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")


class TPSAprDataset(Dataset):
    """Training sequences as (60-row sensor window, state label) pairs.

    train.csv is merged with train_labels.csv and item ``idx`` covers rows
    ``60*idx`` to ``60*(idx+1) - 1`` of the merged frame. This assumes the file
    stores each sequence as 60 consecutive rows.

    Parameters
    ----------
    transform : callable, optional
        Applied to the ``(60, n_sensors)`` NumPy window before it is returned.

    Attributes
    ----------
    data, label, df : pandas.DataFrame
        Raw readings, raw labels and their left merge. ``label`` is never
        modified by item access.
    """

    # Number of consecutive rows that make up one sequence.
    SEQ_LEN = 60

    def __init__(self, transform = None):
        self.data = pd.read_csv('../input/tabular-playground-series-apr-2022/train.csv')
        self.label = pd.read_csv('../input/tabular-playground-series-apr-2022/train_labels.csv')
        self.df = pd.merge(left = self.data, right = self.label, how = 'left')
        self.transform = transform

        # Extract the arrays once so that __getitem__ is a cheap slice instead of
        # a pandas iloc + drop on every sample.
        self._inputs = self.df.drop(['sequence', 'subject', 'step', 'state'], axis = 1).values
        self._states = self.df['state'].values

    def __len__(self):
        """Number of complete 60-row windows in the merged frame."""
        return len(self.df)//self.SEQ_LEN

    def __getitem__(self, idx):
        """Return ``(inp, label)`` for window ``idx``.

        ``inp`` is a NumPy array holding the window's sensor columns (after
        ``transform`` when one was given); ``label`` is a scalar tensor with the
        window's single state.

        Raises
        ------
        IndexError
            If ``idx`` is outside ``[-len(self), len(self))``. This is what lets
            plain iteration over the dataset terminate.
        ValueError
            If the rows of the window do not share exactly one state.
        """
        count = len(self)
        if idx < 0:
            idx += count
        if not 0 <= idx < count:
            raise IndexError(f'index {idx} is out of range for {count} sequences')

        start = self.SEQ_LEN * idx
        stop = start + self.SEQ_LEN

        # Copy so that a transform working in place cannot corrupt the cache.
        inp = self._inputs[start:stop].copy()

        states = pd.unique(self._states[start:stop])
        if len(states) != 1:
            raise ValueError(f'window {idx} does not have exactly one state')
        label = torch.tensor(states[0].item())

        if self.transform:
            inp = self.transform(inp)

        return inp, label

In [None]:
from torch.utils.data import DataLoader
# Wrap the training set in a shuffling loader that yields batches of 64 windows.
train_data = TPSAprDataset()
trainloader = DataLoader(train_data, batch_size=64, shuffle=True)

In [None]:
import os

# CUDA_LAUNCH_BLOCKING is an environment variable; a plain Python assignment
# never reaches the CUDA runtime. Export it so kernel launches run
# synchronously (a debugging aid that makes CUDA errors surface at the
# offending call). NOTE(review): presumably this has to be set before the first
# CUDA call in the process to take effect; confirm the cell order.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
CUDA_LAUNCH_BLOCKING=1  # kept so any existing reference to the Python name still resolves

In [None]:
from torch import nn
import torch.nn.functional as F


class TPSNet(nn.Module):
    """Three chained bidirectional LSTMs with a linear head on the flattened output.

    The head expects ``seq_num * 2 * lstm_dim`` features per sample, so inputs
    must be ``(batch, seq_num, input_dim)``.
    """

    def __init__(self,
         seq_num = 60,
         input_dim = 13,
         lstm_dim = 512,
         num_layers = 1,
         num_classes = 1
    ):
        super().__init__()

        def bi_lstm(in_features):
            # All three recurrent blocks differ only in their input width.
            return nn.LSTM(
                input_size=in_features,
                hidden_size=lstm_dim,
                num_layers=num_layers,
                batch_first=True,
                bidirectional=True,
            )

        # A bidirectional LSTM emits forward and backward states side by side.
        bi_width = 2 * lstm_dim

        self.lstm = bi_lstm(input_dim)
        self.lstm1 = bi_lstm(bi_width)
        self.lstm2 = bi_lstm(bi_width)

        self.logits = nn.Sequential(
            nn.ReLU(),
            nn.Linear(in_features=seq_num * bi_width, out_features=num_classes),
        )

    def forward(self, x):
        """Run ``x`` through the LSTM stack and map the flattened states to ``num_classes`` outputs."""
        hidden = x
        for recurrent in (self.lstm, self.lstm1, self.lstm2):
            hidden, _ = recurrent(hidden)
        # Collapse the (time, feature) axes into one vector per sample.
        flat = hidden.reshape(hidden.shape[0], -1)
        return self.logits(flat)


# Build the network with its default sizes, move it to the selected device and show its layers.
net = TPSNet().to(device)
print(net)

In [None]:
import torch.optim as optim

# Mean-squared error between the raw network output and the state label.
criterion = nn.MSELoss()
lr = 1e-4
# Optimise the parameters of the LSTM network `net`. The previous `model` name
# is not defined in this section and is not the network being trained.
optimizer = torch.optim.Adam(net.parameters(), lr = lr)

In [None]:
  for epoch in range(5): 
    print(f'----{epoch+1}---')
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(trainloader, 0):
        inputs = inputs.to(device)
        inputs = inputs.to(torch.float32)
        labels = labels.to(device)
        labels = labels.to(torch.float32)
        labels = labels.unsqueeze(1)

        optimizer.zero_grad()
        

        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 200 == 199:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 200))
            running_loss = 0.0

print('Finished Training')

In [None]:
# Raw test readings, plus a copy with the identifier columns removed.
test_df = pd.read_csv('../input/tabular-playground-series-apr-2022/test.csv')
data = test_df.drop(columns = ['sequence', 'subject', 'step'])

In [None]:
class Testset(Dataset):
    """Test sequences as 60-row sensor windows (no labels).

    test.csv is read, its ``sequence``/``subject``/``step`` columns are dropped,
    and item ``idx`` covers rows ``60*idx`` to ``60*(idx+1) - 1``. This assumes
    the file stores each sequence as 60 consecutive rows.

    Parameters
    ----------
    transform : callable, optional
        Applied to the NumPy window before it is converted to a tensor.
        Previously this argument was accepted but silently discarded.
    """

    # Number of consecutive rows that make up one sequence.
    SEQ_LEN = 60

    def __init__(self, transform = None):
        self.df = pd.read_csv('../input/tabular-playground-series-apr-2022/test.csv')
        self.df = self.df.drop(['sequence','subject', 'step'], axis = 1)
        self.transform = transform
        # Extract the array once so that __getitem__ is a cheap slice.
        self._values = self.df.values

    def __len__(self):
        """Number of complete 60-row windows in the test frame."""
        return len(self.df)//self.SEQ_LEN

    def __getitem__(self, idx):
        """Return window ``idx`` as a tensor.

        Raises
        ------
        IndexError
            If ``idx`` is outside ``[-len(self), len(self))``. This is what lets
            plain iteration over the dataset terminate.
        """
        count = len(self)
        if idx < 0:
            idx += count
        if not 0 <= idx < count:
            raise IndexError(f'index {idx} is out of range for {count} sequences')

        start = self.SEQ_LEN * idx
        window = self._values[start:start + self.SEQ_LEN]

        if self.transform:
            # Hand the transform its own copy so it cannot alter the cached array.
            return torch.as_tensor(self.transform(window.copy()))
        # torch.tensor copies, so the result never aliases the cached array.
        return torch.tensor(window)

In [None]:
# Build the windowed test dataset. This rebinds `data`, which until now held the stripped test frame.
data = Testset()

In [None]:
# Batches of 64 windows; no shuffle argument is passed, so the loader's default ordering applies.
loader = DataLoader(data, batch_size=64)

In [None]:
# Score every test window with the trained network.
net.eval()
preds = []
with torch.no_grad():
    # Use a dedicated loop name. Iterating with `data` would overwrite the
    # dataset object, so re-running the DataLoader cell afterwards would wrap a
    # tensor instead of the test set.
    for batch in loader:
        batch = batch.to(device)
        pred = net(batch.float())
        # No detach needed: gradients are not tracked inside no_grad().
        preds.append(pred.cpu().numpy())

# Stack the per-batch outputs along the sample axis.
preds = np.concatenate(preds, 0)


In [None]:
sub = pd.read_csv('../input/tabular-playground-series-apr-2022/sample_submission.csv')
# Drop the size-1 second axis of the stacked network outputs. The scores are
# written as they are; no 0.5 thresholding into 0/1 is applied.
pred_2 = np.squeeze(preds, axis = 1)
ans = pred_2
sub['state'] = ans

In [None]:
# Sanity check: number of predictions written to the submission.
len(ans)

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv', index = False)