In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import spacy
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error


from collections import Counter
import re
import string
# Loads the small English spaCy pipeline. The returned object is not bound to a
# name here, so it is only echoed as the cell output; a later cell calls
# spacy.load again and keeps that result as `tok`.
spacy.load('en_core_web_sm')

<spacy.lang.en.English at 0x2dce9fd0348>

In [2]:
# Training hyperparameters shared by the cells below.
EPOCHS = 100           # passes over the training DataLoader in train_model
BATCH_SIZE = 128       # samples per mini-batch for both DataLoaders
LEARNING_RATE = 1e-4   # step size handed to the Adam optimizer
NODES = 1000           # not referenced by any of the visible cells

In [3]:
# Read the pipe-delimited train and test files from the working directory.
train = pd.read_csv("train.csv", sep="|")
test = pd.read_csv("test.csv", sep="|")
# Trailing expression: the cell displays the row count of each frame.
len(train),len(test)

(9256, 3085)

In [4]:
tok = spacy.load('en_core_web_sm')

# Both patterns are built once here instead of on every tokenize() call.
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
_PUNCT_DIGIT_CTRL_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def tokenize(text):
    """Split ``text`` into a list of lowercase token strings.

    Steps, in order:
      1. every run of non-ASCII characters is replaced by a single space;
      2. the text is lowercased;
      3. each punctuation character, digit, carriage return, tab and newline
         is replaced by a space;
      4. the result is split with the spaCy tokenizer held in ``tok``.

    Args:
        text: the string to tokenize.

    Returns:
        list[str]: the ``.text`` of every token the spaCy tokenizer produced.
    """
    # NOTE(review): the substitutions can leave runs of spaces, which the spaCy
    # tokenizer may emit as whitespace tokens — confirm and filter if unwanted.
    ascii_only = _NON_ASCII_RE.sub(" ", text)
    nopunct = _PUNCT_DIGIT_CTRL_RE.sub(" ", ascii_only.lower())
    return [token.text for token in tok.tokenizer(nopunct)]
# Count the number of occurrences of each token across all training titles.
# The column is iterated directly: DataFrame.iterrows() would build a full
# Series for every row only to read a single field from it.
counts = Counter()
for title in train['title']:
    counts.update(tokenize(title))

In [5]:
# Build the vocabulary. Position 0 is the empty string (the value that padded
# slots hold in an encoded title), position 1 is "UNK" (used for tokens missing
# from the vocabulary), and every counted token follows in `counts` order.
words = ["", "UNK"]
words.extend(counts)
vocab2index = {word: position for position, word in enumerate(words)}
# Trailing expression: preview the first five vocabulary entries.
words[:5]

['', 'UNK', 'trump', 'family', 'asks']

In [6]:
def encode_sentence(text, vocab2index, N=450):
    """Turn ``text`` into a fixed-size vector of vocabulary ids.

    The text is tokenized, each token is looked up in ``vocab2index`` (tokens
    that are missing map to the "UNK" id), and the ids are written into a
    zero-filled integer array of size ``N``. Tokens beyond the first ``N`` are
    dropped.

    Args:
        text: raw string to encode.
        vocab2index: mapping from token string to integer id; must contain "UNK"
            whenever ``text`` yields at least one token.
        N: size of the returned array.

    Returns:
        (encoded, length): the zero-padded id array and the number of ids that
        were actually stored in it.
    """
    ids = [vocab2index.get(token, vocab2index["UNK"]) for token in tokenize(text)]
    length = min(N, len(ids))
    encoded = np.zeros(N, dtype=int)
    encoded[:length] = ids[:length]
    return encoded, length

In [7]:
# Each row becomes a 2-element object array: [padded id array, token count].
# dtype=object is spelled out because that pair is ragged (a length-N array
# next to a scalar); NumPy >= 1.24 raises ValueError for ragged input when the
# dtype is left to inference, and older releases only warned.
train['encoded'] = train['title'].apply(lambda x: np.array(encode_sentence(x,vocab2index), dtype=object))
test['encoded'] = test['title'].apply(lambda x: np.array(encode_sentence(x,vocab2index), dtype=object))

In [8]:
# Features are the per-title [ids, length] pairs; targets come from the 'label' column.
X_train, y_train = train['encoded'], train['label']
# Trailing expression: preview the first two rows of each.
X_train.head(2),y_train.head(2)

(0    [[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ...
 1    [[16, 17, 18, 19, 20, 0, 0, 0, 0, 0, 0, 0, 0, ...
 Name: encoded, dtype: object,
 0    1
 1    0
 Name: label, dtype: int64)

In [9]:
# Same feature/target split for the frame read from test.csv.
X_test, y_test = test['encoded'], test['label']
# Trailing expression: preview the first two rows of each.
X_test.head(2), y_test.head(2)

(0    [[305, 850, 851, 893, 894, 6, 895, 225, 127, 5...
 1    [[114, 1796, 1916, 8020, 25, 996, 1100, 9, 157...
 Name: encoded, dtype: object,
 0    1
 1    1
 Name: label, dtype: int64)

In [10]:
class NewsDataset(Dataset):
    """Map-style dataset over encoded titles and their labels.

    Each element of ``X`` is read as ``X[i][0]`` (a NumPy array of token ids)
    and ``X[i][1]`` (the number of real tokens). ``Y`` holds one label per
    sample.
    """

    def __init__(self, X, Y):
        # __getitem__ receives positions 0..len-1, whereas ``series[i]`` on a
        # pandas Series looks up the index *label* i. Resetting the index makes
        # the two agree even for a filtered or shuffled Series. Plain sequences
        # (lists, arrays) have no reset_index and pass through untouched.
        if hasattr(X, "reset_index"):
            X = X.reset_index(drop=True)
        if hasattr(Y, "reset_index"):
            Y = Y.reset_index(drop=True)
        self.X = X
        self.y = Y

    def __len__(self):
        """Return the number of samples (one label per sample)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(token_ids, label, length)`` for the sample at position ``idx``.

        ``token_ids`` is an int32 tensor built from the stored id array; the
        label and length are returned exactly as stored.
        """
        sample = self.X[idx]
        ids, length = sample[0], sample[1]
        return torch.from_numpy(ids.astype(np.int32)), self.y[idx], length

In [11]:
# Wrap both splits so they can be indexed one sample at a time.
train_ds = NewsDataset(X_train, y_train)
test_ds = NewsDataset(X_test, y_test)

In [12]:
# Pick the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda:0


In [20]:
def train_model(model, epochs=None, lr=None, checkpoint_path=None):
    """Train ``model`` with Adam on ``train_dl`` and report metrics on ``test_dl``.

    After every epoch one line is printed with the sample-weighted mean
    training loss and the loss / accuracy / RMSE that ``validation_metrics``
    computes on ``test_dl``.

    Args:
        model: module called as ``model(x, l)`` that returns class logits. Only
            parameters with ``requires_grad`` set are optimized.
        epochs: number of passes over ``train_dl``; ``None`` uses ``EPOCHS``.
        lr: Adam learning rate; ``None`` uses ``LEARNING_RATE``.
        checkpoint_path: when given, ``model.state_dict()`` is written there
            each time the validation loss matches or beats the best value seen
            so far. ``None`` (the default) writes nothing.

    Returns:
        None.
    """
    n_epochs = EPOCHS if epochs is None else epochs
    step_size = LEARNING_RATE if lr is None else lr

    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=step_size)
    # Batches follow the model's own parameters, so the loop also works when
    # the model was placed on a device other than the notebook-level default.
    batch_device = trainable[0].device

    best_val_loss = None
    for _ in range(n_epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long().to(batch_device)
            y = y.long().to(batch_device)
            optimizer.zero_grad()
            y_pred = model(x, l)
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            # loss is a batch mean; weight it by batch size so the epoch figure
            # is a per-sample mean even when the last batch is smaller.
            sum_loss += loss.item() * y.shape[0]
            total += y.shape[0]

        val_loss, val_acc, val_rmse = validation_metrics(model, test_dl)
        if best_val_loss is None or val_loss <= best_val_loss:
            best_val_loss = val_loss
            if checkpoint_path is not None:
                torch.save(model.state_dict(), checkpoint_path)

        print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))
def validation_metrics(model, valid_dl):
    """Evaluate ``model`` on every batch of ``valid_dl`` without tracking gradients.

    Args:
        model: module called as ``model(x, l)`` that returns class logits.
        valid_dl: iterable of ``(x, y, l)`` batches.

    Returns:
        ``(loss, accuracy, rmse)`` as Python floats: the sample-weighted mean
        cross-entropy, the fraction of samples whose argmax prediction equals
        the label, and the square root of the mean squared difference between
        predicted and true class indices taken over all samples.

    Raises:
        ZeroDivisionError: if ``valid_dl`` yields no samples.
    """
    model.eval()
    # Inputs are moved to wherever the model's parameters live; a model without
    # parameters gets its inputs unchanged.
    first_param = next(model.parameters(), None)

    correct = 0
    total = 0
    sum_loss = 0.0
    sum_sq_err = 0.0
    # No backward pass happens here, so autograd bookkeeping is switched off.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long()
            if first_param is not None:
                x = x.to(first_param.device)
            y = y.long()
            y_hat = model(x, l).cpu()
            loss = F.cross_entropy(y_hat, y)
            pred = torch.max(y_hat, 1)[1]
            batch_size = y.shape[0]
            correct += (pred == y).sum().item()
            total += batch_size
            sum_loss += loss.item() * batch_size
            # Squared errors are accumulated and rooted once at the end; a mean
            # of per-batch roots is not the RMSE of the whole set.
            sum_sq_err += ((pred - y) ** 2).sum().item()
    return sum_loss / total, correct / total, (sum_sq_err / total) ** 0.5

In [14]:
# Vocabulary size, counting the "" and "UNK" entries; passed to the model as
# the number of embedding rows.
vocab_size = len(words)
print(vocab_size)
# Only the training loader shuffles; the test loader keeps dataset order.
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)
# Disabled: persisting the vocabulary mapping and word list to disk.
# np.save('vocab2index.npy',vocab2index)
# np.save('wordlist.npy',words)

10118


In [15]:
# Sanity check: pull one sample, then show the number of training batches next
# to the shape of a single encoded title with a batch dimension added.
x,y,l = train_ds[0]
len(train_dl), x.unsqueeze(0).shape

(73, torch.Size([1, 450]))

In [16]:
class LSTM_fixed_len(torch.nn.Module):
    """Embedding -> stacked LSTM -> linear classifier over fixed-length id sequences.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers (default 3).
        bidirectional: whether the LSTM also reads the sequence in reverse
            (default True).
        num_classes: number of output logits (default 2).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 num_layers=3, bidirectional=True, num_classes=2):
        super().__init__()
        # Id 0 is registered as the padding index of the embedding table.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True,
                            num_layers=num_layers, bidirectional=bidirectional)
        # A single final hidden state is classified, hence hidden_dim inputs
        # regardless of the number of directions.
        self.linear = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, l):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            x: integer token ids laid out as ``(batch, seq_len)`` (the LSTM is
                created with ``batch_first=True``).
            l: sequence lengths. Accepted so the call signature matches the
                training loop, but not used: every position of ``x``, padding
                included, is fed through the LSTM.
        """
        embedded = self.embeddings(x)
        _, (ht, _) = self.lstm(embedded)
        # ht stacks the final hidden states along dim 0, one entry per layer
        # and direction; the last entry belongs to the top layer (its reverse
        # direction when the LSTM is bidirectional).
        return self.linear(ht[-1])

In [17]:
# 256-dimensional embeddings feeding an LSTM with hidden size 256.
model_fixed = LSTM_fixed_len(vocab_size, 256, 256)
# Trailing expression: display the module structure.
model_fixed

LSTM_fixed_len(
  (embeddings): Embedding(10118, 256, padding_idx=0)
  (lstm): LSTM(256, 256, num_layers=3, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=256, out_features=2, bias=True)
)

In [18]:
# Move the model to the selected device, then run the training loop on it.
train_model(model_fixed.to(device))

train loss 0.687, val loss 0.672, val accuracy 0.595, and val rmse 0.635
train loss 0.648, val loss 0.638, val accuracy 0.635, and val rmse 0.603
train loss 0.605, val loss 0.629, val accuracy 0.650, and val rmse 0.590
train loss 0.558, val loss 0.618, val accuracy 0.677, and val rmse 0.567
train loss 0.504, val loss 0.617, val accuracy 0.682, and val rmse 0.562
train loss 0.448, val loss 0.626, val accuracy 0.694, and val rmse 0.551
train loss 0.401, val loss 0.672, val accuracy 0.696, and val rmse 0.550
train loss 0.350, val loss 0.666, val accuracy 0.699, and val rmse 0.548
train loss 0.320, val loss 0.713, val accuracy 0.702, and val rmse 0.544
train loss 0.290, val loss 0.800, val accuracy 0.697, and val rmse 0.548
train loss 0.269, val loss 0.735, val accuracy 0.696, and val rmse 0.550
train loss 0.253, val loss 0.775, val accuracy 0.697, and val rmse 0.549
train loss 0.235, val loss 0.885, val accuracy 0.690, and val rmse 0.555
train loss 0.232, val loss 0.811, val accuracy 0.69

RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED