In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import gensim

In [4]:
train_data = pd.read_json("./data/train.json")
test_data = pd.read_json("./data/test.json")
validation_data = pd.read_json("./data/validation.json")

In [5]:
train_data = train_data.iloc[:len(train_data)]
len(train_data)

58114

In [36]:
train_data['combined_sentences'] = train_data['sentence1'] + " " + train_data['sentence2']
test_data['combined_sentences'] = test_data['sentence1'] + " " + test_data['sentence2']
validation_data['combined_sentences'] = validation_data['sentence1'] + " " + validation_data['sentence2']

In [89]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\allex\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [185]:
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

ss = SnowballStemmer("romanian")
sw = stopwords.words("romanian")

# 1.Lowercase everything
# 2.Remove all symbols other than a-z@#.
# 3.Split on spaces.
# 4.Remove stopwords/empty tokens
# 5.Apply snowball stemmer to remainder

def text_preparetion(text):
    list_text_tokens = [ss.stem(i) for i in  # 5
                          re.split(r" +",  # 3
                                   re.sub(r"[^a-z@# ]", "",  # 2
                                          text.lower()))  # 1
                          if (i not in sw) and len(i)]  # 4
    return list_text_tokens

In [186]:
train_data['all_tokens'] = train_data['combined_sentences'].apply(text_preparetion)

In [194]:
train_data.head()

Unnamed: 0,sentence1,sentence2,label,guid,combined_sentences,all_tokens,tokens,moving_window
0,Primul taragotist român a fost Nicolae Luță Io...,"Colegiul de arhitectură, artă și planificare (...",3,7cec5ac4-c115-4976-8d2a-9badfe9b63b9,Primul taragotist român a fost Nicolae Luță Io...,"[taragotist, romn, nicola, lu, iov, originar, ...","[romn, nicola, banat, coleg, arhitectur, art, ...","[(0, 1), (0, 2), (0, 3), (1, 0), (1, 2), (1, 3..."
1,Lupta revoluționarilor este condusă de Avram I...,Schiul nordic face parte din programul olimpic...,3,bc2fa29f-cf22-4a7c-8b55-9b1ed019f6ac,Lupta revoluționarilor este condusă de Avram I...,"[lupt, revoluionar, condus, avram, iancu, ioan...","[lupt, condus, ioan, frai, ioan, viitor, ioan,...","[(13, 14), (13, 15), (13, 16), (14, 13), (14, ..."
2,Locuitorii liberi au devenit „''iobagiones cas...,"În anii 1960, ea a apărut în drame realizate l...",3,8547b1ef-7bfe-43a9-aedf-bad0c0fbc049,Locuitorii liberi au devenit „''iobagiones cas...,"[locuit, liber, deven, iobagiones, castr, ioba...","[locuit, liber, deven, cet, milit, n, aprar, m...","[(26, 27), (26, 28), (26, 29), (27, 26), (27, ..."
3,În anul 2002 are loc lansarea în domeniul turi...,Se lansează primul hotel al grupului în otopen...,2,0ad1ce19-7aa9-4ddd-b8d6-822072a723b0,În anul 2002 are loc lansarea în domeniul turi...,"[n, an, loc, lans, n, domen, turistichotelier,...","[n, an, loc, n, domen, grup, n, dat, august, c...","[(31, 43), (31, 44), (31, 31), (43, 31), (43, ..."
4,"Zillich a mijlocit, prin revista ''Klingsor'',...","Au apărut lucrări ale lui ion luca caragiale, ...",2,50c44ffa-b0c1-4d98-bc6c-3bbf95f50896,"Zillich a mijlocit, prin revista ''Klingsor'',...","[zillich, mijloc, revist, klingsor, debut, mul...","[mijloc, revist, debut, multor, tiner, autor, ...","[(54, 55), (54, 56), (54, 57), (55, 54), (55, ..."


In [192]:
from collections import Counter

counts = Counter([token for sublist in train_data['all_tokens'] for token in sublist])
counts = {k:v for k, v in counts.items() if v>10} # Filtering
vocab = list(counts.keys())
n_v = len(vocab)
id2tok = dict(enumerate(vocab))
tok2id = {token: id for id, token in id2tok.items()}
# Now correct tokens
def remove_rare_tokens(row):
    row = [t for t in row if t in vocab]
    return row

train_data['tokens'] = train_data['all_tokens'].apply(remove_rare_tokens)

In [193]:
def windowizer(doc, wsize=3):
    """
    Windowizer function for Word2Vec. Converts sentence to sliding-window pairs.
    """
    
    out = []
    for i, wd in enumerate(doc):
        target = tok2id[wd]
        window = [i+j for j in
                  range(-wsize, wsize+1, 1)
                  if (i+j>=0) &
                     (i+j<len(doc)) &
                     (j!=0)]

        out+=[(target, tok2id[doc[w]]) for w in window]
    return out

train_data['moving_window'] = train_data['tokens'].apply(windowizer)

In [None]:
from torch.utils.data import Dataset, DataLoader

class Word2VecDataset(Dataset):
    """
    Takes a HuggingFace dataset as an input, to be used for a Word2Vec dataloader.
    """
    def __init__(self, dataset, vocab_size, wsize=3):
        self.dataset = dataset
        self.vocab_size = vocab_size
        self.data = [i for s in dataset['moving_window'] for i in s]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [None]:
BATCH_SIZE = 2**14
N_LOADER_PROCS = 10

dataloader = {}
for key in train_data['moving_window'].keys():
    dataloader = {key: DataLoader(Word2VecDataset(
                                    dataset[key], vocab_size=n_v),
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=N_LOADER_PROCS)}

In [66]:
vector_size = 64
word2vec_model = gensim.models.Word2Vec(sentences=train_data['all_tokens'], vector_size=vector_size, window=5, min_count=1,
                                        workers=4)

In [74]:
n = 2
test_data['combined_sentences'][n]

"După Dunăre, Inn și Main, Isar este al patrulea râu, ca lungime, din Bavaria. Limes, Cluj, 2010) a fost achiziționată de către: Biblioteca Kubon & Sagner din Munchen, de către [ Biblioteca Academia de Științe] a Rep. Moldova, precum și de către [ Biblioteca „Lucian Blaga” din Madrid]\n* Cartea ''[ Securitatea, Cezarul și sfoara de câlți a lui Elie\xa0Wiesel]'' (Ed."

In [77]:
embeddings = [word2vec_model.wv[word] for word in test_data['combined_sentences'][n].split() if
              word in word2vec_model.wv]
embeddings = torch.tensor(embeddings)
print(len(test_data['combined_sentences'][n].split()))
print(torch.mean(embeddings))
print(embeddings)

62
tensor(0.0007)
tensor([[-0.0606, -0.0230, -0.2749,  0.1191,  0.8304,  0.5394, -0.8490, -0.1324,
          0.5752, -0.4563, -0.7517,  0.2644,  0.6231,  0.0518, -0.5138, -0.5378,
         -0.7700, -0.0088,  0.2464, -0.6010,  0.4204, -0.6892,  0.1346,  0.3070,
         -0.0087,  0.4722,  0.6388, -0.3836,  0.4670, -0.1899,  0.5981, -0.2624,
          0.7431,  0.3321,  0.3341,  0.2017, -0.2177, -0.0672, -0.1043, -0.2189,
          0.5096, -0.2590, -0.9160,  0.2714,  0.3928, -0.6067, -0.2389, -0.0371,
         -0.2111, -0.5222,  0.2207,  0.1477,  0.2344, -0.1478, -0.6726, -0.3229,
          0.0460,  0.1142,  0.1018, -0.7688, -0.1168,  0.2117, -0.1168, -0.0970],
        [ 0.2864, -0.2028,  0.7957, -0.3784, -0.3367, -1.5247,  1.3210,  0.7995,
         -0.6488,  0.4206,  0.9565,  0.1348, -1.0504, -0.7567,  0.2591,  0.0052,
          0.6355,  0.2421,  0.3406, -0.1324, -0.8410,  1.8654,  0.2153, -0.4081,
          0.8646,  0.1761, -0.1320,  0.7988,  1.0315,  0.0773, -0.0338,  0.5530,
         

In [40]:
train_dataset = SentencePairDataset(train_data, word2vec_model)
test_dataset = SentencePairDataset(test_data, word2vec_model, train=False)
validation_dataset = SentencePairDataset(validation_data, word2vec_model)

In [41]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
validation_loader = DataLoader(validation_dataset, batch_size=16, shuffle=False)

In [42]:
class NN(nn.Module):
    def __init__(self, input_dim):
        super(NN, self).__init__()
        # self.fc1 = nn.Linear(input_dim, 1024)
        # self.dropout1 = nn.Dropout(0.5)
        # self.fc2 = nn.Linear(1024, 256)
        # self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(input_dim, 4)
        self.softmax = nn.Softmax()

    def forward(self, x):
        # x = torch.relu(self.fc1(x))
        # x = self.dropout1(x)
        # x = torch.relu(self.fc2(x))
        # x = self.dropout2(x)
        x = self.fc3(x)
        x = self.softmax(x)
        return x

In [43]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

In [51]:
import time

model = NN(input_dim=vector_size)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
optimizer.zero_grad()
num_epochs = 5
for epoch in range(num_epochs):
    start = time.time()
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(inputs.float())
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    model.eval()
    val_losses = 0.0
    with torch.no_grad():
        for inputs, labels in validation_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            output = model(inputs.float())
            val_loss = criterion(output, labels)
            val_losses += val_loss.item()
    end = time.time()
    print(f"Epoch {epoch + 1}/{num_epochs} - Loss: {running_loss / len(train_loader):.4f} - "
          f"Val Loss: {val_losses / len(validation_loader):.4f} - "
          f"Time = {end - start:.1f}s")

RuntimeError: stack expects each tensor to be equal size, but got [2, 1024] at entry 0 and [3, 1024] at entry 2

In [None]:
model.eval()
model.cpu()
val_correct = 0
total_val = 0
with torch.no_grad():
    for inputs, labels in validation_loader:
        # inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs.float())
        _, predicted = torch.max(outputs, 1)
        val_correct += (predicted == labels).sum()
        total_val += labels.size(0)

accuracy = val_correct / total_val
print(f"Validation Accuracy: {accuracy}")

In [None]:
torch.save(model.state_dict(), "./models/nn2epochs")

In [None]:
# Loading the saved model
loaded_model = NN(input_dim=vectorizer_size)  # Create an instance of the model
loaded_model.load_state_dict(torch.load('./models/nn2epochs'))  # Load the model state dictionary
loaded_model.eval()

In [None]:
model.eval()
model.cpu()
val_correct = 0
total_val = 0
df = pd.DataFrame(columns=["guid", "label"])

with torch.no_grad():
    for inputs, guids in test_loader:
        # inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs.float())
        _, predicted = torch.max(outputs, 1)
        df_new = pd.DataFrame({'guid': guids, 'label': predicted.numpy()})
        df = pd.concat([df, df_new], ignore_index=True)


In [None]:
df.to_csv("./data/submission2.csv", index=False)