In [1]:
import torch 
import torch.nn as nn

In [57]:
# Basic Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# For text preprocessing
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Metrics
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import ConfusionMatrixDisplay


import torch 
import spacy
import torch.nn as nn
from collections import Counter

from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [24]:
data = pd.read_csv("Constraint_English_Train - Sheet1.csv")
val = pd.read_csv("Constraint_English_Val - Sheet1.csv")
test = pd.read_csv("english_test_with_labels - Sheet1.csv")

In [25]:
stop_words = set(stopwords.words("english"))
tok = spacy.load('en_core_web_sm')
def process_text(string):
    """
    Replaces "http", "www" in urls with space.
    Replaces ampersand(&) with "and"
    Removes non-alphanumeric characters
    Removes stop words
    """
    text = string.lower().split()
    text = " ".join(text)
    text = re.sub(r"http(\S)+",' ',text)    
    text = re.sub(r"www(\S)+",' ',text)
    text = re.sub(r"&",' and ',text)  
    tx = text.replace('&amp',' ')
    text = re.sub(r"[^0-9a-zA-Z]+",' ',text)
    text = text.split()
    text = [w for w in text if not w in stop_words]
    text = " ".join(text)
    text = [token.text for token in tok.tokenizer(text)]
    return text

In [26]:
data['tweet'] = data['tweet'].apply(lambda x: process_text(x))
val['tweet'] = val['tweet'].apply(lambda x: process_text(x))
test['tweet'] = test['tweet'].apply(lambda x: process_text(x))

In [27]:
data.head()

Unnamed: 0,id,tweet,label
0,1,"[cdc, currently, reports, 99031, deaths, gener...",real
1,2,"[states, reported, 1121, deaths, small, rise, ...",real
2,3,"[politically, correct, woman, almost, uses, pa...",fake
3,4,"[indiafightscorona, 1524, covid, testing, labo...",real
4,5,"[populous, states, generate, large, case, coun...",real


In [38]:
counts = Counter()
for index, row in data.iterrows():
    counts.update(row['tweet'])

vocab_to_int = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab_to_int [word] = len(words)
    words.append(word)

In [43]:
def encode_sentence(text, vocab2idx, N=32):
    encoded = np.zeros(N, dtype=int)
    enc = np.array([vocab2idx.get(word, vocab2idx["UNK"]) for word in text])
    length = min(N, len(enc))
    encoded[:length] = enc[:length]
    return encoded, length

In [44]:
data['encoded'] = data['tweet'].apply(lambda x: np.array(encode_sentence(x,vocab_to_int), dtype=object))
data.head()

Unnamed: 0,id,tweet,label,encoded
0,1,"[cdc, currently, reports, 99031, deaths, gener...",real,"[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ..."
1,2,"[states, reported, 1121, deaths, small, rise, ...",real,"[[21, 22, 23, 6, 13, 24, 25, 26, 27, 21, 22, 2..."
2,3,"[politically, correct, woman, almost, uses, pa...",fake,"[[29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, ..."
3,4,"[indiafightscorona, 1524, covid, testing, labo...",real,"[[41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ..."
4,5,"[populous, states, generate, large, case, coun...",real,"[[58, 21, 59, 60, 61, 10, 62, 63, 64, 65, 66, ..."


In [45]:
data['encoded']

0       [[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ...
1       [[21, 22, 23, 6, 13, 24, 25, 26, 27, 21, 22, 2...
2       [[29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, ...
3       [[41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ...
4       [[58, 21, 59, 60, 61, 10, 62, 63, 64, 65, 66, ...
                              ...                        
6415    [[5318, 105, 106, 43, 87, 2371, 99, 109, 3004,...
6416    [[6606, 8287, 43, 87, 1239, 8288, 5165, 8289, ...
6417    [[274, 275, 43, 87, 171, 601, 354, 720, 2538, ...
6418    [[2286, 2107, 9428, 11013, 3503, 520, 3084, 80...
6419    [[2414, 235, 559, 25, 61, 43, 87, 946, 947, 94...
Name: encoded, Length: 6420, dtype: object

In [47]:
val['encoded'] = val['tweet'].apply(lambda x: np.array(encode_sentence(x,vocab_to_int), dtype=object))
test['encoded'] = test['tweet'].apply(lambda x: np.array(encode_sentence(x,vocab_to_int), dtype=object))

In [48]:
# stats about vocabulary
print('Unique words: ', len((vocab_to_int)))  # should ~ 74000+
print()

Unique words:  14127



In [49]:
data['label_encoded'] = data['label'].apply(lambda x : 0 if x == "fake" else 1)
val['label_encoded'] = val['label'].apply(lambda x : 0 if x == "fake" else 1)
test['label_encoded'] = test['label'].apply(lambda x : 0 if x == "fake" else 1)

In [67]:
class TweetDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0]), self.y[idx]
    
    def __len__(self):
        return len(self.y)

In [68]:
train_data = TweetDataset(data['encoded'], data['label_encoded'])
val_data = TweetDataset(val['encoded'], val['label_encoded'])
test_data = TweetDataset(test['encoded'], test['label_encoded'])

In [69]:
batch_size = 128
vocab_size = len(words)

trainloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valloader = DataLoader(val_data, batch_size=batch_size)
testloader = DataLoader(test_data, batch_size=batch_size)

In [95]:
class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim,  padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds)
        
        lstm_out = lstm_out[:, -1, :] # getting the last time step output
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
#     def init_hidden(self, batch_size):
#         ''' Initializes hidden state '''
#         # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
#         # initialized to zero, for hidden state and cell state of LSTM
#         weight = next(self.parameters()).data
        
#         if (train_on_gpu):
#             hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
#                   weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
#         else:
#             hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
#                       weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
#         return hidden

In [96]:
# Instantiate the model w/ hyperparams
vocab_size = len(vocab_to_int)
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

SentimentRNN(
  (embedding): Embedding(14127, 400, padding_idx=0)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [97]:
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [98]:
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

No GPU available, training on CPU.


In [100]:
epochs = 4 

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()

# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    # h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in trainloader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

       
        net.zero_grad()

        output, h = net(inputs)

        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        
     
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            # val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valloader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                # val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = net(inputs)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())
            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))


Epoch: 2/4... Step: 100... Loss: 0.084026... Val Loss: 0.323566
Epoch: 4/4... Step: 200... Loss: 0.114681... Val Loss: 0.329537


In [88]:
for a, b in trainloader:
    print(a.shape)
    break

torch.Size([128, 32])


In [103]:
test_losses = [] # track loss
num_correct = 0

# init hidden state
# h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in testloader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    # h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # get predicted outputs
    output, h = net(inputs)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(testloader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.290
Test accuracy: 0.903
