In [40]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import torchtext
import torch.nn.functional as F
import random
import torch
import spacy
import re
from collections import Counter
import string


In [41]:
#this will be used for tokenizing 
tok = spacy.load("en_core_web_sm")

In [42]:
X_train=pd.read_csv('../input/col774-2022/train_x.csv')[['Title']]
y_train=pd.read_csv('../input/col774-2022/train_y.csv')['Genre']

X_test=pd.read_csv('../input/col774-2022/non_comp_test_x.csv')[['Title']]
y_test=pd.read_csv('../input/col774-2022/non_comp_test_y.csv')['Genre']
# test= pd.concat([X_test[['Id','Title']],y_test['Genre']],axis=1)

X_test_comp=pd.read_csv('../input/col774-2022/comp_test_x.csv')
comp_test=X_test_comp[['Id','Title']]

In [43]:
# X_train=pd.read_csv('../input/col774-2022/train_x.csv')[['Title']]
# y_train=pd.read_csv('../input/col774-2022/train_y.csv')

# X_test=pd.read_csv('../input/col774-2022/non_comp_test_x.csv')[['Title']]
# y_test=pd.read_csv('../input/col774-2022/non_comp_test_y.csv')
# # test= pd.concat([X_test[['Id','Title']],y_test['Genre']],axis=1)

# X_test_comp=pd.read_csv('../input/col774-2022/comp_test_x.csv')
# comp_test=X_test_comp[['Id','Title']]

In [44]:
X_train['Title'].loc[0]

"Lutheran and Catholic Reconciliation on Justification: A Chronology of the Holy See's Contributions, 1961-1999, to a New Relationship ; between ... Declaration on the Doctrine of Justification"

In [45]:
def normalize(text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    text = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(text)]

In [46]:
#count number of occurences of each word
frequency = Counter()
for index, row in X_train.iterrows():
    frequency.update(normalize(row['Title']))
    
#deleting iwords with frequency one (optional)
for word in list(frequency):
    if frequency[word] < 2:
        del frequency[word]
# print("num_words after:",len(frequency.keys()))

#creating mapping
vocab_index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in frequency:
    vocab_index[word] = len(words)
    words.append(word)

In [47]:
def words_to_numbers(text, vocab_index, N=35):
    text= normalize(text)
    enc1 = np.zeros(N, dtype=int)
    i=0
    for word in text:
        if word in vocab_index:
            enc1[i]=vocab_index[word]
        else:
            enc1[i]=vocab_index["UNK"]
        i=i+1
        if i>=N:
            break
    
    return enc1

In [48]:
def preprocess(data,vocab_index):
    data['feature']= data['Title'].apply(lambda x: np.array(words_to_numbers(x,vocab_index )))
    data.drop('Title',axis=1,inplace=True)
    return data

In [49]:
preprocess(X_train,vocab_index)
preprocess(X_test,vocab_index)

Unnamed: 0,feature
0,"[2904, 3, 21, 3463, 1627, 8, 1453, 957, 100, 6..."
1,"[1, 4797, 8, 12, 21, 278, 516, 22, 821, 3, 12,..."
2,"[12, 662, 11830, 8, 12, 21, 1832, 241, 11, 100..."
3,"[247, 5920, 11, 8356, 8, 816, 21, 123, 15, 233..."
4,"[6365, 622, 623, 21, 331, 8, 6371, 6355, 8, 63..."
...,...
5695,"[21, 3397, 8819, 3351, 1457, 0, 0, 0, 0, 0, 0,..."
5696,"[8214, 67, 9, 1284, 393, 0, 0, 0, 0, 0, 0, 0, ..."
5697,"[1348, 931, 4339, 67, 8727, 0, 0, 0, 0, 0, 0, ..."
5698,"[50, 8, 1, 153, 250, 8, 1, 8, 1, 742, 67, 3770..."


In [50]:
class TitleDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx].astype(np.int32)), self.y[idx]

In [51]:

X_train=list(X_train['feature'])
y_train=list(y_train)
X_test=list(X_test['feature'])
y_test=list(y_test)

train_dataset = TitleDataset(X_train, y_train)
test_dataset = TitleDataset(X_test, y_test)

batch_size = 100
vocab_size = len(words)
train_loader = DataLoader(train_dataset, batch_size=batch_size,shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size,shuffle=True)

In [85]:
torch.manual_seed(0)
class RNN(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embedding =torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = torch.nn.Linear(hidden_dim, 30)
        self.dropout = torch.nn.Dropout(0.2)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.rnn(x)
        output= self.linear(ht[-1])
        return output
    

In [86]:
def compute_accuracy(model, test_loader):
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y in test_loader:
        x = x.long()
        y = y.long()
        y_pred = model(x)
        loss = F.cross_entropy(y_pred, y)
        pred = torch.max(y_pred, 1)[1]   
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

def train_model(model, epochs=20, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        print(f'Epoch:{i}')
        sum_loss = 0.0
        total = 0
        for x, y in train_loader:
            x = x.long()
            y = y.long()
            y_pred = model(x)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        test_loss, test_acc = compute_accuracy(model, test_loader)
        print("train loss %.3f, test loss %.3f, test accuracy %.3f" % (sum_loss/total, test_loss, test_acc*100))



In [87]:
model =  RNN(vocab_size, 128, 128)
train_model(model, epochs=30, lr=0.001)

Epoch:0
train loss 3.405, test loss 3.395, test accuracy 4.246
Epoch:1
train loss 3.100, test loss 2.862, test accuracy 16.263
Epoch:2
train loss 2.659, test loss 2.637, test accuracy 22.877
Epoch:3
train loss 2.384, test loss 2.529, test accuracy 27.895
Epoch:4
train loss 2.185, test loss 2.422, test accuracy 31.404
Epoch:5
train loss 2.027, test loss 2.361, test accuracy 33.509
Epoch:6
train loss 1.901, test loss 2.317, test accuracy 34.825
Epoch:7
train loss 1.805, test loss 2.310, test accuracy 36.807
Epoch:8
train loss 1.722, test loss 2.305, test accuracy 37.421
Epoch:9
train loss 1.653, test loss 2.277, test accuracy 38.632
Epoch:10
train loss 1.592, test loss 2.280, test accuracy 39.000
Epoch:11
train loss 1.532, test loss 2.286, test accuracy 39.298
Epoch:12
train loss 1.478, test loss 2.254, test accuracy 40.860
Epoch:13
train loss 1.432, test loss 2.286, test accuracy 40.596
Epoch:14
train loss 1.394, test loss 2.289, test accuracy 41.158
Epoch:15
train loss 1.355, test loss

In [89]:
print(f'Training accuracy:{compute_accuracy(model,train_loader)}')
print(f'Test_accuracy:{compute_accuracy(model,test_loader)}')

Training accuracy:(0.9737972587521313, tensor(0.7137))
Test_accuracy:(2.409226936206483, tensor(0.4391))


In [None]:
# def load_glove_vectors(glove_file="./data/glove.6B/glove.6B.50d.txt"):
#     """Load the glove word vectors"""
#     word_vectors = {}
#     with open(glove_file) as f:
#         for line in f:
#             split = line.split()
#             word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
#     return word_vectors

In [None]:
# def get_emb_matrix(pretrained, word_counts, emb_size = 50):
#     """ Creates embedding matrix from word vectors"""
#     vocab_size = len(word_counts) + 2
#     vocab_to_idx = {}
#     vocab = ["", "UNK"]
#     W = np.zeros((vocab_size, emb_size), dtype="float32")
#     W[0] = np.zeros(emb_size, dtype='float32') # adding a vector for padding
#     W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words 
#     vocab_to_idx["UNK"] = 1
#     i = 2
#     for word in word_counts:
#         if word in word_vecs:
#             W[i] = word_vecs[word]
#         else:
#             W[i] = np.random.uniform(-0.25,0.25, emb_size)
#         vocab_to_idx[word] = i
#         vocab.append(word)
#         i += 1   
#     return W, np.array(vocab), vocab_to_idx

In [None]:
# word_vecs = load_glove_vectors()
# pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts)

In [None]:
# class LSTM_glove_vecs(torch.nn.Module) :
#     def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
#         super().__init__()
#         self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
#         self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
#         self.embeddings.weight.requires_grad = False ## freeze embeddings
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
#         self.linear = nn.Linear(hidden_dim, 5)
#         self.dropout = nn.Dropout(0.2)
        
#     def forward(self, x, l):
#         x = self.embeddings(x)
#         x = self.dropout(x)
#         lstm_out, (ht, ct) = self.lstm(x)
#         return self.linear(ht[-1])

In [None]:
# model = LSTM_glove_vecs(vocab_size, 50, 50, pretrained_weights)
# train_model(model, epochs=30, lr=0.1)