In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [4]:
class SentencePairDataset(Dataset):
    def __init__(self, data, vectorizer, train=True):
        super().__init__()
        self.train = train
        self.data = data
        self.vectorizer = vectorizer
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sentence_pair = self.data.iloc[idx]
        sentences = sentence_pair['combined_sentences']
        sentences = self.vectorizer.transform([sentences]).toarray().flatten()
        if self.train:
            label = sentence_pair['label']
            return sentences, label
        else:
            return sentences, self.data.iloc[idx]['guid']

In [5]:
train_data = pd.read_json("./data/train.json")
test_data = pd.read_json("./data/test.json")
validation_data = pd.read_json("./data/validation.json")

In [63]:
# train_data = train_data.iloc[:len(train_data) // 20]
# len(train_data)

In [6]:
train_data['combined_sentences'] = train_data['sentence1'] + " " + train_data['sentence2']
test_data['combined_sentences'] = test_data['sentence1'] + " " + test_data['sentence2']
validation_data['combined_sentences'] = validation_data['sentence1'] + " " + validation_data['sentence2']

In [16]:
vectorizer_input = train_data['combined_sentences']
vectorizer_input.head()

0    Primul taragotist român a fost Nicolae Luță Io...
1    Lupta revoluționarilor este condusă de Avram I...
2    Locuitorii liberi au devenit „''iobagiones cas...
3    În anul 2002 are loc lansarea în domeniul turi...
4    Zillich a mijlocit, prin revista ''Klingsor'',...
Name: combined_sentences, dtype: object

In [17]:

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit(vectorizer_input)

In [None]:
test = vectorizer.transform(train_data['combined_sentences'][0])
vectorizer_size = test.shape[1]
vectorizer_size

In [96]:
train_dataset = SentencePairDataset(train_data, vectorizer)
test_dataset = SentencePairDataset(test_data, vectorizer, train=False)
validation_dataset = SentencePairDataset(validation_data, vectorizer)

In [80]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
validation_loader = DataLoader(validation_dataset, batch_size=16, shuffle=False)

In [41]:
class NN(nn.Module):
    def __init__(self, input_dim):
        super(NN, self).__init__()
        # self.fc1 = nn.Linear(input_dim, 1024)
        # self.dropout1 = nn.Dropout(0.5)
        # self.fc2 = nn.Linear(1024, 256)
        # self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(input_dim, 4)
        self.softmax = nn.Softmax()
        
    def forward(self, x):
        # x = torch.relu(self.fc1(x))
        # x = self.dropout1(x)
        # x = torch.relu(self.fc2(x))
        # x = self.dropout2(x)
        x = self.fc3(x)
        x = self.softmax(x)
        return x

In [42]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

In [43]:
import time

model = NN(input_dim=vectorizer_size)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
optimizer.zero_grad()
num_epochs = 5
for epoch in range(num_epochs):
    start = time.time()
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(inputs.float())
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
    model.eval()
    val_losses = 0.0
    with torch.no_grad():
        for inputs, labels in validation_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            output = model(inputs.float())
            val_loss = criterion(output, labels)
            val_losses += val_loss.item()
    end = time.time()
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {running_loss/len(train_loader):.4f} - "
          f"Val Loss: {val_losses/len(validation_loader):.4f} - "
          f"Time = {end-start:.1f}s")

  return self._call_impl(*args, **kwargs)


Epoch 1/5 - Loss: 1.2133 - Val Loss: 1.1391 - Time = 186.7s
Epoch 2/5 - Loss: 1.1132 - Val Loss: 1.1122 - Time = 189.7s
Epoch 3/5 - Loss: 1.0764 - Val Loss: 1.0991 - Time = 206.4s
Epoch 4/5 - Loss: 1.0528 - Val Loss: 1.0922 - Time = 204.4s
Epoch 5/5 - Loss: 1.0349 - Val Loss: 1.0858 - Time = 204.6s


In [121]:
model.eval()
model.cpu()
val_correct = 0
total_val = 0
with torch.no_grad():
    for inputs, labels in validation_loader:
        # inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs.float())
        _, predicted = torch.max(outputs, 1)
        val_correct += (predicted == labels).sum()
        total_val += labels.size(0)
        
accuracy = val_correct / total_val
print(f"Validation Accuracy: {accuracy}")

  return self._call_impl(*args, **kwargs)


Validation Accuracy: 0.6681922078132629


In [45]:
torch.save(model.state_dict(), "./models/nn2epochs")

In [46]:
# Loading the saved model
loaded_model = NN(input_dim=vectorizer_size)  # Create an instance of the model
loaded_model.load_state_dict(torch.load('./models/nn2epochs'))  # Load the model state dictionary
loaded_model.eval()

NN(
  (fc3): Linear(in_features=183708, out_features=4, bias=True)
  (softmax): Softmax(dim=None)
)

In [114]:
model.eval()
model.cpu()
val_correct = 0
total_val = 0
df = pd.DataFrame(columns=["guid", "label"])

with torch.no_grad():
    for inputs, guids in test_loader:
        # inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs.float())
        _, predicted = torch.max(outputs, 1)
        df_new = pd.DataFrame({'guid': guids, 'label': predicted.numpy()})
        df = pd.concat([df, df_new], ignore_index=True)


  return self._call_impl(*args, **kwargs)


                                      guid label
0     831cf870-c8e8-47f9-9318-7954706f08e3     2
1     4aabdcb9-9caa-4853-a0ec-d2767e365142     2
2     db383040-957a-475c-b8f7-5de7a36282f2     3
3     19d0dedc-04e1-4b7f-8061-2710f3bcbf86     2
4     3d293138-1740-4260-bd7e-64ab18d71e8d     3
...                                    ...   ...
2995  74ca470e-d28a-42dc-bfcc-7f1e57ff042b     3
2996  fc0b744e-c69c-4e8f-9ec8-1495e7a3725c     3
2997  538041fd-c1b7-481a-8a40-272b851ff3c7     2
2998  9b8dcd8c-fef9-4a2d-a170-b972461103d0     2
2999  648141bd-447d-437f-8706-81a909b48337     2

[3000 rows x 2 columns]


In [120]:
df.to_csv("./data/submission2.csv", index=False)