## Quick fix final

In [47]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import spacy
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, classification_report
from sklearn.metrics import mean_squared_error


from collections import Counter
import re
import string
import tqdm
spacy.load('en_core_web_sm')
stopwords = stopwords.words('english')
# torch.backends.cudnn.enabled = False 
torch.random.manual_seed(123456)

<torch._C.Generator at 0x12d80955bd0>

In [48]:
EPOCHS = 50
BATCH_SIZE = 64
LEARNING_RATE = 0.00001
NODES = 1000

In [49]:
train_split = 70
split_train = train_split/100
train_file_name = "./dataset_{}/train_{}.csv".format(train_split,train_split) 
test_file_name = "./dataset_{}/test_{}.csv".format(train_split,train_split) 
vocab_file_name =  "./dataset_{}/vocab_{}.csv".format(train_split,train_split)
print(split_train, train_file_name, test_file_name)

0.7 ./dataset_70/train_70.csv ./dataset_70/test_70.csv


In [50]:
train = pd.read_csv(train_file_name, sep="|", index_col=0)
test = pd.read_csv(test_file_name, sep="|", index_col=0)
len(train),len(test), train.columns

(19930,
 8541,
 Index(['tweet_id', 'created_time', 'count', '1', '2', '3', '4', '5', '6',
        'user_id', 'screen_name', 'url', 'follower_count', 'title', 'content',
        'expanded_url', 'title_len', 'content_len', 'max_retweets',
        'label_log_10', 'label_mean', 'label_median', 'label_quantile',
        'label_grouped_median', 'grouped_median'],
       dtype='object'))

In [51]:
# train.loc[train.max_retweets==32870]

In [52]:
tok = spacy.load('en_core_web_sm')
max_len = -1
def tokenize(text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct_num = regex.sub(" ", text.lower()) 
#     text = " ".join([word for word in text.split() if word not in stopwords])
    # Removing the odd apostrophes
    tokens = [token for token in nopunct_num.split() if len(token)>=2 and token not in stopwords]
    return tokens
#     text = re.sub(r"[^\x00-\x7F]+", " ", text)
#     regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
#     nopunct = regex.sub(" ", text.lower())
#     return [token.text for token in tok.tokenizer(nopunct)]
#count number of occurences of each word
counts = Counter()
for index, row in train.iterrows():
    tokenized = tokenize(row['title'])
    if max_len < len(tokenized):
        max_len = len(tokenized)
    counts.update(tokenized)
print("Max number of words in len",max_len)

Max number of words in len 20


In [53]:
#deleting infrequent words

In [54]:
# creating vocab
vocab2index = {"":0, "UNK":1}
words = ["","UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)
words[:5]

['', 'UNK', 'man', 'accused', 'golden']

In [55]:
def encode_sentence(text, vocab2index, N=max_len):
    tokenized = tokenize(text)
    encoded = np.zeros(N,dtype=int)
    enc1 = np.array([vocab2index.get(word,vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [56]:
train['encoded'] = train['title'].apply(lambda x: np.array(encode_sentence(x,vocab2index)))
test['encoded'] = test['title'].apply(lambda x: np.array(encode_sentence(x,vocab2index)))

In [57]:
X_train, y_train = train['encoded'], train['label_log_10']
X_train.head(2),y_train.head(2)

(0    [[2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0,...
 1    [[10, 11, 12, 13, 14, 15, 16, 17, 18, 0, 0, 0,...
 Name: encoded, dtype: object,
 0    1
 1    1
 Name: label_log_10, dtype: int64)

In [58]:
X_test, y_test = test['encoded'], test['label_log_10']
X_test.head(2), y_test.head(2)

(0    [[354, 638, 5250, 12, 4213, 3322, 250, 3750, 7...
 1    [[13, 57, 127, 1, 13439, 4632, 50, 516, 1079, ...
 Name: encoded, dtype: object,
 0    2
 1    2
 Name: label_log_10, dtype: int64)

In [59]:
class NewsDataset(Dataset):
    def __init__(self, X, Y):
            self.X = X
            self.y = Y
            
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [60]:
train_ds = NewsDataset(X_train, y_train)
test_ds = NewsDataset(X_test, y_test)

In [61]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [62]:
def train_model(model):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=LEARNING_RATE)#, momentum=0.7)
#     model_fixed.load_state_dict(torch.load("./model_dropout.pt"))
#     optimizer.load_state_dict(torch.load("./optimizer_dropout.pt"))
    actual_loss = None
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True,
                                                     patience=3, factor=0.6)
    softmax = nn.LogSoftmax(dim=0)
    for i in tqdm.tqdm(range(EPOCHS), total=EPOCHS):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long().to(device)
            y = y.long().to(device)
            y_pred = model(x, l)
#             y_hat = softmax(y_pred)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item() *y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, test_dl, i)
        scheduler.step(val_loss)
        actual_loss = val_loss
        print("Model Dict updated")
        torch.save(model.state_dict(),"./model_dropout.pt" )
        torch.save(optimizer.state_dict(),"./optimizer_dropout_adam.pt")
            
#         if i%5 == 0:
        print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics (model, valid_dl, i):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    total_preds, total_actual= [], []
    for x, y, l in valid_dl:
        x = x.long().to(device)
        y = y.long()
        y_hat = model(x, l).cpu()
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
#         print(pred.numpy())
#         print(y.numpy())
        total_preds += pred.tolist()
        total_actual += y.tolist()
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
#     if i%5 == 0:
#         print(i)
    print(classification_report(total_preds, total_actual))
    return sum_loss/total, correct/total, sum_rmse/total

In [63]:
vocab_size = len(words)
print(vocab_size)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)
# np.save('vocab2index.npy',vocab2index)
# np.save('wordlist.npy',words)

14322


In [64]:
x,y,l = train_ds[0]
len(train_dl), x.unsqueeze(0).shape

(312, torch.Size([1, 20]))

In [69]:
class LSTM_fixed_len(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
#         self.model = nn.Sequential(*[
#             nn.Embedding(vocab_size, embedding_dim, padding_idx=0),
#             nn.Dropout(0.2),
#             nn.LSTM(embedding_dim, hidden_dim, batch_first=True),
#             nn.Linear(hidden_dim, 2)
#         ])
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.dropout = nn.Dropout(0.2)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=3, bidirectional=True)
        self.linear = nn.Linear(hidden_dim*2, 4, bias=False)
        
    def forward(self, x, l):
#         print(x[0])
#         result = self.model(x)
#         print(result)
#         return 0
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out,( ht, ct) = self.lstm(x)
#         print(ht[-1].shape, torch.cat((ht[-2],ht[-1]), dim=1).shape)
#         print(ht.view(3, 2, 128, 256).shape)
        return self.linear(torch.cat((ht[-2],ht[-1]), dim=1))

In [70]:
model_fixed = LSTM_fixed_len(vocab_size, 200, 256)
model_fixed

LSTM_fixed_len(
  (embeddings): Embedding(14322, 200, padding_idx=0)
  (dropout): Dropout(p=0.2, inplace=False)
  (lstm): LSTM(200, 256, num_layers=3, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=4, bias=False)
)

In [71]:
EPOCHS = 100
BATCH_SIZE = 128
LEARNING_RATE = 1e-3

In [72]:
train_model(model_fixed.to(device))  




  _warn_prf(average, modifier, msg_start, len(result))



  1%|█▏                                                                                                                          | 1/100 [00:12<21:20, 12.94s/it][A[A[A

              precision    recall  f1-score   support

           0       0.05      0.39      0.09        66
           1       0.89      0.71      0.79      7119
           2       0.30      0.48      0.37      1356
           3       0.00      0.00      0.00         0

    accuracy                           0.67      8541
   macro avg       0.31      0.40      0.31      8541
weighted avg       0.79      0.67      0.72      8541

Model Dict updated
train loss 0.846, val loss 0.803, val accuracy 0.671, and val rmse 0.607


  _warn_prf(average, modifier, msg_start, len(result))



  2%|██▍                                                                                                                         | 2/100 [00:26<21:23, 13.10s/it][A[A[A

              precision    recall  f1-score   support

           0       0.14      0.46      0.21       156
           1       0.88      0.72      0.79      6915
           2       0.35      0.51      0.42      1470
           3       0.00      0.00      0.00         0

    accuracy                           0.68      8541
   macro avg       0.34      0.42      0.36      8541
weighted avg       0.77      0.68      0.72      8541

Model Dict updated
train loss 0.744, val loss 0.762, val accuracy 0.683, and val rmse 0.592


  _warn_prf(average, modifier, msg_start, len(result))



  3%|███▋                                                                                                                        | 3/100 [00:40<21:26, 13.26s/it][A[A[A

              precision    recall  f1-score   support

           0       0.24      0.40      0.30       307
           1       0.75      0.77      0.76      5583
           2       0.58      0.47      0.52      2651
           3       0.00      0.00      0.00         0

    accuracy                           0.66      8541
   macro avg       0.39      0.41      0.40      8541
weighted avg       0.68      0.66      0.67      8541

Model Dict updated
train loss 0.648, val loss 0.794, val accuracy 0.664, and val rmse 0.610





  4%|████▉                                                                                                                       | 4/100 [00:53<21:22, 13.36s/it][A[A[A

              precision    recall  f1-score   support

           0       0.24      0.35      0.29       358
           1       0.79      0.76      0.77      5948
           2       0.50      0.49      0.49      2220
           3       0.02      0.20      0.03        15

    accuracy                           0.67      8541
   macro avg       0.39      0.45      0.40      8541
weighted avg       0.69      0.67      0.68      8541

Model Dict updated
train loss 0.556, val loss 0.826, val accuracy 0.669, and val rmse 0.613





  5%|██████▏                                                                                                                     | 5/100 [01:07<21:20, 13.48s/it][A[A[A

              precision    recall  f1-score   support

           0       0.25      0.38      0.30       347
           1       0.84      0.74      0.79      6444
           2       0.42      0.52      0.46      1738
           3       0.02      0.25      0.03        12

    accuracy                           0.68      8541
   macro avg       0.38      0.47      0.40      8541
weighted avg       0.73      0.68      0.70      8541

Model Dict updated
train loss 0.477, val loss 0.828, val accuracy 0.681, and val rmse 0.598





  6%|███████▍                                                                                                                    | 6/100 [01:20<21:09, 13.51s/it][A[A[A

              precision    recall  f1-score   support

           0       0.26      0.35      0.30       384
           1       0.77      0.76      0.77      5769
           2       0.53      0.48      0.50      2363
           3       0.02      0.12      0.03        25

    accuracy                           0.66      8541
   macro avg       0.39      0.43      0.40      8541
weighted avg       0.68      0.66      0.67      8541

Epoch     6: reducing learning rate of group 0 to 6.0000e-04.
Model Dict updated
train loss 0.414, val loss 0.937, val accuracy 0.665, and val rmse 0.615





  7%|████████▋                                                                                                                   | 7/100 [01:34<21:04, 13.59s/it][A[A[A

              precision    recall  f1-score   support

           0       0.29      0.33      0.31       464
           1       0.76      0.76      0.76      5648
           2       0.53      0.48      0.50      2381
           3       0.07      0.23      0.11        48

    accuracy                           0.66      8541
   macro avg       0.41      0.45      0.42      8541
weighted avg       0.66      0.66      0.66      8541

Model Dict updated
train loss 0.338, val loss 1.020, val accuracy 0.658, and val rmse 0.623





  8%|█████████▉                                                                                                                  | 8/100 [01:48<20:49, 13.59s/it][A[A[A

              precision    recall  f1-score   support

           0       0.27      0.34      0.30       410
           1       0.78      0.76      0.77      5815
           2       0.48      0.48      0.48      2150
           3       0.14      0.13      0.13       166

    accuracy                           0.66      8541
   macro avg       0.42      0.43      0.42      8541
weighted avg       0.67      0.66      0.66      8541

Model Dict updated
train loss 0.293, val loss 1.131, val accuracy 0.660, and val rmse 0.629





  9%|███████████▏                                                                                                                | 9/100 [02:02<20:41, 13.64s/it][A[A[A

              precision    recall  f1-score   support

           0       0.28      0.33      0.30       440
           1       0.78      0.76      0.77      5852
           2       0.50      0.49      0.49      2198
           3       0.06      0.20      0.09        51

    accuracy                           0.66      8541
   macro avg       0.40      0.44      0.41      8541
weighted avg       0.68      0.66      0.67      8541

Model Dict updated
train loss 0.267, val loss 1.234, val accuracy 0.663, and val rmse 0.620





 10%|████████████▎                                                                                                              | 10/100 [02:16<20:37, 13.75s/it][A[A[A

              precision    recall  f1-score   support

           0       0.29      0.34      0.31       443
           1       0.77      0.76      0.77      5760
           2       0.50      0.49      0.49      2202
           3       0.12      0.14      0.13       136

    accuracy                           0.66      8541
   macro avg       0.42      0.43      0.42      8541
weighted avg       0.66      0.66      0.66      8541

Epoch    10: reducing learning rate of group 0 to 3.6000e-04.
Model Dict updated
train loss 0.249, val loss 1.279, val accuracy 0.659, and val rmse 0.624





 11%|█████████████▌                                                                                                             | 11/100 [02:29<20:24, 13.76s/it][A[A[A

              precision    recall  f1-score   support

           0       0.29      0.33      0.31       466
           1       0.77      0.76      0.77      5828
           2       0.48      0.49      0.48      2118
           3       0.12      0.15      0.13       129

    accuracy                           0.66      8541
   macro avg       0.42      0.43      0.42      8541
weighted avg       0.66      0.66      0.66      8541

Model Dict updated
train loss 0.217, val loss 1.437, val accuracy 0.658, and val rmse 0.629





 12%|██████████████▊                                                                                                            | 12/100 [02:44<20:44, 14.14s/it][A[A[A

              precision    recall  f1-score   support

           0       0.25      0.36      0.29       359
           1       0.80      0.75      0.77      6069
           2       0.45      0.49      0.47      1983
           3       0.11      0.14      0.12       130

    accuracy                           0.66      8541
   macro avg       0.40      0.43      0.41      8541
weighted avg       0.68      0.66      0.67      8541

Model Dict updated
train loss 0.205, val loss 1.482, val accuracy 0.665, and val rmse 0.620





 13%|███████████████▉                                                                                                           | 13/100 [02:59<20:43, 14.30s/it][A[A[A

              precision    recall  f1-score   support

           0       0.25      0.35      0.29       373
           1       0.79      0.76      0.77      5953
           2       0.46      0.48      0.47      2056
           3       0.12      0.13      0.13       159

    accuracy                           0.66      8541
   macro avg       0.41      0.43      0.41      8541
weighted avg       0.67      0.66      0.67      8541

Model Dict updated
train loss 0.196, val loss 1.626, val accuracy 0.660, and val rmse 0.629





 14%|█████████████████▏                                                                                                         | 14/100 [03:13<20:29, 14.30s/it][A[A[A

              precision    recall  f1-score   support

           0       0.26      0.37      0.31       374
           1       0.78      0.76      0.77      5875
           2       0.48      0.48      0.48      2149
           3       0.12      0.13      0.13       143

    accuracy                           0.66      8541
   macro avg       0.41      0.44      0.42      8541
weighted avg       0.67      0.66      0.67      8541

Epoch    14: reducing learning rate of group 0 to 2.1600e-04.
Model Dict updated
train loss 0.192, val loss 1.614, val accuracy 0.662, and val rmse 0.622





 15%|██████████████████▍                                                                                                        | 15/100 [03:28<20:19, 14.35s/it][A[A[A

              precision    recall  f1-score   support

           0       0.27      0.38      0.31       372
           1       0.80      0.75      0.78      6071
           2       0.45      0.49      0.47      1967
           3       0.12      0.15      0.13       131

    accuracy                           0.67      8541
   macro avg       0.41      0.44      0.42      8541
weighted avg       0.69      0.67      0.68      8541

Model Dict updated
train loss 0.178, val loss 1.720, val accuracy 0.668, and val rmse 0.615





 16%|███████████████████▋                                                                                                       | 16/100 [03:42<20:00, 14.29s/it][A[A[A

              precision    recall  f1-score   support

           0       0.27      0.35      0.30       400
           1       0.78      0.76      0.77      5891
           2       0.48      0.49      0.49      2128
           3       0.09      0.12      0.11       122

    accuracy                           0.66      8541
   macro avg       0.41      0.43      0.42      8541
weighted avg       0.67      0.66      0.67      8541

Model Dict updated
train loss 0.170, val loss 1.830, val accuracy 0.664, and val rmse 0.620





 17%|████████████████████▉                                                                                                      | 17/100 [03:56<19:36, 14.17s/it][A[A[A

              precision    recall  f1-score   support

           0       0.27      0.34      0.30       418
           1       0.79      0.76      0.77      5906
           2       0.48      0.50      0.49      2094
           3       0.10      0.13      0.11       123

    accuracy                           0.67      8541
   macro avg       0.41      0.43      0.42      8541
weighted avg       0.68      0.67      0.67      8541

Model Dict updated
train loss 0.168, val loss 1.814, val accuracy 0.667, and val rmse 0.618





 18%|██████████████████████▏                                                                                                    | 18/100 [04:10<19:19, 14.14s/it][A[A[A

              precision    recall  f1-score   support

           0       0.29      0.34      0.31       444
           1       0.79      0.76      0.78      5978
           2       0.46      0.50      0.48      1999
           3       0.10      0.13      0.11       120

    accuracy                           0.67      8541
   macro avg       0.41      0.43      0.42      8541
weighted avg       0.68      0.67      0.67      8541

Epoch    18: reducing learning rate of group 0 to 1.2960e-04.
Model Dict updated
train loss 0.169, val loss 1.796, val accuracy 0.667, and val rmse 0.620





 19%|███████████████████████▎                                                                                                   | 19/100 [04:24<19:04, 14.13s/it][A[A[A

              precision    recall  f1-score   support

           0       0.26      0.35      0.30       379
           1       0.78      0.76      0.77      5860
           2       0.49      0.48      0.48      2187
           3       0.10      0.14      0.12       115

    accuracy                           0.66      8541
   macro avg       0.41      0.43      0.42      8541
weighted avg       0.67      0.66      0.67      8541

Model Dict updated
train loss 0.160, val loss 1.928, val accuracy 0.662, and val rmse 0.621


KeyboardInterrupt: 