In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from collections import Counter

import torch as torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.optim as optim

import torchtext
import torchtext.data as ttd
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import spacy

import re
import string


2023-08-04 14:16:19.943206: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-04 14:16:23.526662: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-04 14:16:23.527348: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your 

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Cleaning Text

In [3]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)



def remove_url(text): 
    url_pattern  = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return url_pattern.sub(r'', text)
 # converting return value from list to string



def clean_text(text): 
    delete_dict = {sp_character: '' for sp_character in string.punctuation} 
    delete_dict[' '] = ' ' 
    table = str.maketrans(delete_dict)
    text1 = text.translate(table)
    #print('cleaned:'+text1)
    textArr= text1.split()
    text2 = ' '.join([w for w in textArr if ( not w.isdigit() and  ( not w.isdigit() and len(w)>2))]) 

    return text2.lower()


In [4]:
def get_label(label):
    if label == -1:
        return 2
    else:
        return label

In [5]:
data = pd.read_csv('Reddit_Data.csv')
data.columns = ['text', 'label']
print(data.dtypes)
data.head()

text     object
label     int64
dtype: object


Unnamed: 0,text,label
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


It has 3 sentiment category. -1 as negative, 0 as neutral and 1 as positive.

mapping into:

0. Neutral
1. Positive
2. Negative

In [6]:
print("SHAPE",data.shape)
print("NULL: ",data.isna().sum())
print("\n")
data.dropna(how='any',axis=0,inplace=True)
print("SHAPE",data.shape)
print("NULL: ",data.isna().sum())


SHAPE (37249, 2)
NULL:  text     100
label      0
dtype: int64


SHAPE (37149, 2)
NULL:  text     0
label    0
dtype: int64


In [7]:
data['Num_words_text'] = data['text'].apply(lambda x:len(str(x).split())) 
data.head()

Unnamed: 0,text,label,Num_words_text
0,family mormon have never tried explain them t...,1,39
1,buddhism has very much lot compatible with chr...,1,196
2,seriously don say thing first all they won get...,-1,86
3,what you have learned yours and only yours wha...,0,29
4,for your own benefit you may want read living ...,1,112


In [8]:
# accepting data only with more than 3 words
mask = data['Num_words_text'] > 3
data = data[mask]

In [9]:
print(data.label.value_counts())
max_sentence_length  = data['Num_words_text'].max()
print('Max Sentence Length :'+str(max_sentence_length))

 1    15082
 0     9869
-1     7887
Name: label, dtype: int64
Max Sentence Length :1307


In [10]:
data.text = data.text.apply(remove_emoji)
data.text = data.text.apply(remove_url)
data.text = data.text.apply(clean_text)

In [11]:
data.label = data.label.apply(get_label)
data.head()

Unnamed: 0,text,label,Num_words_text
0,family mormon have never tried explain them th...,1,39
1,buddhism has very much lot compatible with chr...,1,196
2,seriously don say thing first all they won get...,2,86
3,what you have learned yours and only yours wha...,0,29
4,for your own benefit you may want read living ...,1,112


In [12]:
X_train, X_test, Y_train, Y_test= train_test_split(data['text'].tolist(),\
                                                      data['label'].tolist(),\
                                                      test_size=0.2,\
                                                      stratify = data['label'].tolist(),\
                                                      random_state=0)


print('Train data len:'+str(len(X_train)))
print('Class distribution'+str(Counter(Y_train)))


print('Test data len:'+str(len(X_test)))
print('Class distribution'+ str(Counter(Y_test)))


train_dat =list(zip(Y_train,X_train))
test_dat =list(zip(Y_test,X_test))

print("\n We need a data iterator as specified in torchtext Datset class which is why we used zip formatted tuples\n")
print(train_dat[0])


Train data len:26270
Class distributionCounter({1: 12065, 0: 7895, 2: 6310})
Test data len:6568
Class distributionCounter({1: 3017, 0: 1974, 2: 1577})

 We need a data iterator as specified in torchtext Datset class which is why we used zip formatted tuples

(0, 'maybe they could start american branch')


In [13]:
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

train_iter = train_dat
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text) # yield: return a generator, a sequence of data that can only be iterated over once

# build the vocabulary
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"]) # set the default index when the token is not found


### Preprocessing Pipeline

In [14]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) 

In [15]:
print(text_pipeline("This movie was great, I really like it"))
print(label_pipeline(2))

[0, 169, 14, 157, 0, 0, 83, 22, 33537]
2


**Collation**: sorted sequencing of data

In [16]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text =torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))

    label_list = torch.tensor(label_list, dtype=torch.float64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

In [17]:
class RNN(nn.Module):
    def __init__(self,
                 vocab_size,
                 batch_size,
                 output_dim,
                 embedding_dim=100,
                 hidden_dim=128,
                 n_layers=1,
                 device='cpu'):
        super(RNN,self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.output_dim = output_dim
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.device = device

        self.encoder = nn.EmbeddingBag(vocab_size, embedding_dim, sparse=True)
        # self.encoder = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim,num_layers=n_layers, batch_first=True)
        self.decoder = nn.Linear(hidden_dim, output_dim)


    def init_hidden(self):
        return torch.zeros(self.n_layers, self.batch_size, self.hidden_dim).to(self.device)
    

    def forward(self, text,offset):
        batch_size = text.size(0)
        if batch_size != self.batch_size:
            self.batch_size = batch_size

        encoded = self.encoder(text,offset)
        hidden = self.init_hidden()
        output, hidden = self.rnn(encoded, hidden)
        output = self.decoder(output[:,:,-1]).squeeze()
        return output



In [18]:
train_iter1 = train_dat
output_dim = len(set(category for (category,text) in train_iter1))
print(output_dim)
vocab_size = len(vocab)
batch_size = 256
embedding_dim = 128
hidden_dim = 256
n_layers = 2
model = RNN(vocab_size, batch_size, output_dim, embedding_dim, hidden_dim, n_layers,device)
model.to(device)

3


RNN(
  (encoder): EmbeddingBag(46057, 128, mode='mean')
  (rnn): RNN(128, 256, num_layers=2, batch_first=True)
  (decoder): Linear(in_features=256, out_features=3, bias=True)
)

In [19]:
# Training Hyperparameters
lr = 0.001
epoch = 5



criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

total_accuracy = None


# Dataloader
train_loader = DataLoader(dataset=train_dat, batch_size=batch_size, shuffle=True,collate_fn=collate_batch)
test_loader = DataLoader(dataset=test_dat, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)


In [20]:
import time

def train(dataloader):
    model.train()
    total_count, total_accuracy = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label,text,offset) in enumerate(dataloader): 
        predicted_label = model(text, offset)
        loss = criterion(predicted_label, label)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_count += label.size(0)
        total_accuracy += (predicted_label.argmax(1) == label).sum().item()

        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | accuracy {:8.3f} | time {:5.2f}s'.format(
                epoch, idx, len(dataloader), total_accuracy/total_count, elapsed))
            total_count, total_accuracy = 0, 0
            start_time = time.time()


def evaluate(dataloader):
    model.eval()
    total_count, total_accuracy = 0, 0

    with torch.no_grad():
        for idx, (label,text,offset) in enumerate(dataloader):
            predicted_label = model(text, offset)
            loss = criterion(predicted_label, label)
            total_count += label.size(0)
            total_accuracy += (predicted_label.argmax(1) == label).sum().item()

    return total_accuracy/total_count



In [21]:
for epoch in range(1, epoch + 1):
    epoch_start_time = time.time()
    train(train_loader)
    accu_val = evaluate(test_loader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

RuntimeError: For unbatched 2-D input, hx should also be 2-D but got 3-D tensor