In [None]:
import pandas as pd
import numpy as np

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

import torch.optim as optim

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Load the competition's training and test tables from the Kaggle input directory.
train = pd.read_csv('../input/tabular-playground-series-feb-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-feb-2022/test.csv')

In [None]:
# Move row_id into the index so it is not compared or used as a feature, then drop
# rows whose remaining columns (features and target) are exact duplicates.
# NOTE(review): this cell is not re-runnable — after the first run 'row_id' is already
# the index, so a second set_index('row_id') raises KeyError.
train = train.set_index('row_id').drop_duplicates()

In [None]:
# Separate the label column from the feature columns of the training frame.
target = train['target']
x_data = train.drop(columns=['target'])

# The test frame keeps every column except the row identifier.
x_test = test.drop(columns='row_id')

In [None]:
# Map the target labels to integer class ids. `enc` is kept around because the
# predictions are decoded back to the original labels with enc.inverse_transform later.
enc = LabelEncoder()
target = enc.fit_transform(target)

In [None]:
# Feature column names; each name is parsed by bias_of() when the counts are rebuilt.
elements = x_data.columns

In [None]:
from math import factorial

In [None]:
def bias(w, x, y, z):
    """Return 10! / (w! * x! * y! * z! * 4**10).

    This is the multinomial coefficient of the counts (w, x, y, z) divided by
    4**10, i.e. the probability of observing exactly those counts in 10 uniform
    draws over four categories (when w + x + y + z == 10).

    Args:
        w, x, y, z: non-negative integer counts.

    Returns:
        The value as a float.
    """
    return factorial(10) / (factorial(w) * factorial(x) * factorial(y) * factorial(z) * 4**10)


def bias_of(s):
    """Return ``bias(w, x, y, z)`` for a column name shaped like 'A<w>T<x>G<y>C<z>'.

    The first character is skipped, and the integers found before 'T', between
    'T' and 'G', between 'G' and 'C', and after 'C' are used as w, x, y, z.

    Args:
        s: column name, e.g. 'A1T2G3C4'.

    Returns:
        The float returned by ``bias`` for the parsed counts.

    Raises:
        ValueError: if one of the markers 'T', 'G', 'C' is missing or a count
            is not a valid integer.
    """
    # Locate each marker once instead of re-scanning the string for every slice.
    t_pos, g_pos, c_pos = s.index('T'), s.index('G'), s.index('C')
    w = int(s[1:t_pos])
    x = int(s[t_pos + 1:g_pos])
    y = int(s[g_pos + 1:c_pos])
    z = int(s[c_pos + 1:])
    # Delegate to bias() so the formula lives in exactly one place.
    return bias(w, x, y, z)

def _restore_counts(frame):
    """Rebuild integer columns from `frame`, one per name in `elements`.

    Each column is shifted by bias_of(<column name>), multiplied by 1000000,
    rounded and cast to int.
    """
    return pd.DataFrame({
        name: frame[name].add(bias_of(name)).mul(1000000).round().astype(int)
        for name in elements
    })


# Apply the identical transformation to the training and the test features.
x_data = _restore_counts(x_data)
x_test = _restore_counts(x_test)

In [None]:
# Drop the pandas wrappers and keep only the underlying NumPy matrices.
x_test = x_test.to_numpy()
x_data = x_data.to_numpy()

In [None]:
# Features become float32 tensors (torch.float is an alias of torch.float32).
x_data, x_test = (torch.tensor(matrix, dtype=torch.float32) for matrix in (x_data, x_test))

# Labels keep whatever integer dtype torch infers from the encoded array.
target = torch.tensor(target)

In [None]:
# Hold out part of the data for validation (default split sizes). A fixed random_state
# keeps the same rows in the validation set on every Restart & Run All, so scores
# from different runs are comparable.
x_train, x_val, y_train, y_val = train_test_split(x_data, target, random_state=42)

In [None]:
# Display the training feature tensor as a quick sanity check of the split.
x_train

In [None]:
# Training batches of 100 samples, reshuffled at the start of every epoch.
train_dataset = TensorDataset(x_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=100, shuffle=True)

# Validation batches are only read for evaluation, so they are served in a fixed
# order: shuffling adds nothing there and makes batch-level results non-repeatable.
val_dataset = TensorDataset(x_val, y_val)
val_dataloader = DataLoader(val_dataset, batch_size=100, shuffle=False)

In [None]:
class Net(nn.Module):
    """Fully connected classifier.

    Architecture: three ``Linear -> BatchNorm1d -> ReLU`` stages with widths
    100, 50 and 30, followed by a final ``Linear`` layer that outputs one raw
    logit per class (no softmax is applied here).

    Args:
        in_features: width of each input vector (default 286).
        num_classes: number of output logits (default 10).
    """

    def __init__(self, in_features=286, num_classes=10):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 100)
        self.bn1 = nn.BatchNorm1d(100)
        # drop1..drop3 are registered but forward() does not call them; they are
        # kept so the module's attributes and printed summary stay unchanged.
        self.drop1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(100, 50)
        self.bn2 = nn.BatchNorm1d(50)
        self.drop2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(50, 30)
        self.bn3 = nn.BatchNorm1d(30)
        self.drop3 = nn.Dropout(0.5)
        self.fc5 = nn.Linear(30, num_classes)

    def forward(self, x):
        """Return the logits for a batch ``x`` whose last dimension is ``in_features``."""
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = F.relu(self.bn3(self.fc3(x)))
        return self.fc5(x)
    
# Instantiate the model with its default layer sizes and print the layer summary.
net = Net()
print(net)
        
    
def init_weights(layer):
    """Give ``nn.Linear`` layers Xavier (Glorot) normal weights, in place.

    Modules of any other type are ignored, and Linear biases are left as they
    are. Meant to be passed to ``nn.Module.apply``.
    """
    if not isinstance(layer, nn.Linear):
        return
    nn.init.xavier_normal_(layer.weight)
        
# Run init_weights on every sub-module of the network (and on the network itself).
net.apply(init_weights)

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'device {device}')

# Move the model's parameters and buffers to the selected device.
net = net.to(device)

In [None]:
# Multi-class cross-entropy computed on the network's raw logits, optimised with
# plain SGD (learning rate 1e-3, no momentum or weight decay configured).
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=net.parameters(), lr=1e-3)

In [None]:
# Put the network in training mode (BatchNorm layers use per-batch statistics).
net.train()

In [None]:
# Loop settings, named here so they are easy to find and tune.
NUM_EPOCHS = 10000
LOG_EVERY = 50

acc = []        # per-epoch training accuracy (fraction of correctly classified samples)
loss_list = []  # per-epoch mean training loss per sample

# Enter training mode here as well, so this cell behaves the same when it is run on
# its own (for example after an evaluation cell has switched the net to eval mode).
net.train()

for epoch in range(NUM_EPOCHS):

    correct = 0       # correctly classified samples seen this epoch
    length = 0        # total samples seen this epoch
    train_loss = 0.0  # sum of per-sample losses seen this epoch

    for x, label in train_dataloader:
        x = x.to(device)
        label = label.to(device)

        predict = net(x)
        loss = criterion(predict, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_size = len(x)
        # .item() turns the results into plain Python numbers, so the running totals
        # do not keep references to the autograd graph of every batch.
        correct += (predict.argmax(dim=1) == label).sum().item()
        length += batch_size
        # criterion returns the mean over the batch; weight it by the batch size so a
        # shorter final batch contributes proportionally to the epoch average.
        train_loss += loss.item() * batch_size

    train_loss /= length
    accuracy = correct / length
    acc.append(accuracy)
    loss_list.append(train_loss)
    if epoch % LOG_EVERY == 0:
        print(f'epoch {epoch} loss {train_loss} accuracy {accuracy}')

In [None]:
# Training curves, one point per epoch. `plt` is already imported near the top of the
# notebook, so it is not imported again here.
fig, ax = plt.subplots()
ax.plot(range(len(loss_list)), loss_list, label='train loss')
ax.plot(range(len(acc)), acc, label='train accuracy')
ax.set_xlabel('epoch')
ax.set_title('Training loss and accuracy per epoch')
ax.legend()
plt.show()

In [None]:
# Switch to evaluation mode (BatchNorm layers use their running statistics) before
# scoring the validation and test data.
net.eval()

In [None]:
# Score the held-out split in one forward pass with gradient tracking switched off.
with torch.no_grad():
    y_val = y_val.to(device)
    predict = net(x_val.to(device))
    # Fraction of validation rows whose arg-max logit matches the label.
    accuracy_score = (predict.argmax(dim=1) == y_val).sum() / len(x_val)
    print(accuracy_score)

In [None]:
# Predict an encoded class id for every test row: the index of the largest logit.
with torch.no_grad():
    x_test = x_test.to(device)
    y_test = torch.argmax(net(x_test), dim=1)

In [None]:
# Copy the predicted class ids back to host memory as a NumPy array and decode them
# into the original target labels.
y_test = enc.inverse_transform(y_test.cpu().numpy())

In [None]:
# Start from the sample submission so the row ids and column layout match what is expected.
submission = pd.read_csv('../input/tabular-playground-series-feb-2022/sample_submission.csv')

# Item assignment always writes the 'target' column; attribute assignment
# (submission.target = ...) would only set a Python attribute if the column were
# missing, silently leaving the written file unchanged.
submission['target'] = y_test
submission.to_csv('submission.csv', index=False)

In [None]:
# (Stray placeholder removed: it was a bare undefined name and raised NameError on "Run All".)