## Sentiment Analysis on YouTube comments

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os

ModuleNotFoundError: No module named 'pandas'

### Load the tweets dataset

In [None]:
# Where the raw training CSV lives and where this notebook writes its artefacts.
data_dir = '../data/twitter_sentiment'
output_dir = '../Outputs'
train_file_name = 'train.csv'

# header=None: the file is read as having no header row, so the six column
# names are supplied explicitly.
dataset = pd.read_csv(
    os.path.join(data_dir, train_file_name),
    encoding="ISO-8859-1",
    header=None,
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
)
dataset.head()

In [None]:
# Sentiment name -> numeric code (presumably the encoding of the `target` column; verify against the data).
dict_target = dict(negative=0, neutral=2, positive=4)

print('Training data')
print('Number of Training examples', dataset.shape)

We will extract out only the required columns, that is target and text.

In [None]:
# Keep only `target` and `text`. errors='ignore' makes the cell idempotent:
# re-running it after the columns are already gone no longer raises KeyError.
dataset = dataset.drop(columns=['id', 'date', 'flag', 'user'], errors='ignore')
dataset.head()

### Data Preprocessing

1. We need to remove stop words, links, usernames, and other noise from the tweets, as they don't convey any sentiment.
    
    So let us write a function for that.

In [None]:
import nltk
# Fetch the NLTK resources needed by the preprocessing cell below.
# nltk.download is a no-op (apart from a status message) when the resource is already present.
nltk.download('stopwords')  # word lists read through stopwords.words("english")
nltk.download('wordnet')    # corpus behind WordNetLemmatizer
nltk.download('punkt')      # presumably the tokenizer model word_tokenize relies on; confirm for your NLTK version

In [None]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Most common English words; preprocess() filters them out of every tweet.
# Kept as a set so that each `token not in stop_words` test is O(1) instead of
# a linear scan of the whole list for every token of every tweet.
stop_words = set(stopwords.words("english"))

# Lemmatizer shared by all preprocess() calls (built once, reused per token).
wordnet_lemmatizer = WordNetLemmatizer()


def preprocess(text):
    """Normalise one raw tweet into a space-separated string of lemmatised tokens.

    Steps, in order: lowercase, drop @usernames, drop links, drop punctuation,
    drop digits, shorten long character runs, strip, tokenize, remove stop
    words, lemmatize.

    Args:
        text: the raw tweet as a ``str``.

    Returns:
        The cleaned tweet; an empty string when nothing survives the filters.
    """
    # convert to lowercase
    text = text.lower()

    # Usernames and links have to go BEFORE punctuation is stripped: once '@',
    # '.', ':' and '/' are deleted these patterns can no longer match, and the
    # handle / URL text would stay in the tweet as a junk token.
    text = re.sub(r'@[^\s]+', '', text)
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', text)

    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # remove numbers
    text = re.sub(r'\d+', '', text)

    # shorten runs of 5 or more identical characters to exactly 4
    # (e.g. "soooooo" -> "soooo"); shorter runs are left untouched
    text = re.sub(r"(.)\1{4,}", r"\1" * 4, text)

    # remove whitespace from beginning and end
    text = text.strip()

    # tokenize, then drop stop words
    tokens = [token for token in word_tokenize(text) if token not in stop_words]

    # lemmatization to reduce words to their base forms
    return " ".join(wordnet_lemmatizer.lemmatize(token) for token in tokens)


# Cache of the preprocessed tweets. The file is read from and written to the
# SAME path; previously it was saved under '../data/Tweet_data/' but looked up
# under data_dir, so the cache could never be hit on a later run.
prep_file = 'preprocessed_tweets.csv'
prep_path = os.path.join(data_dir, prep_file)

try: # load the already preprocessed dataset
    dataset = pd.read_csv(prep_path, index_col=0)
except FileNotFoundError:
    print('Preprocessing the data. Will take few minutes!')
    dataset['text'] = dataset['text'].apply(preprocess)
    dataset.to_csv(prep_path) # save it for later

In [None]:
dataset.head()

### Word Embedding Matrix using Word2Vec algorithm

***Word Embeddings*** are vector representations that capture the context of the underlying words in relation to other words in the sentence. This transformation results in words with similar meanings being clustered closer together in the vector space, while dissimilar words are positioned further apart.

And ***Word2Vec*** is a 2-layer neural network whose input is a text corpus and whose output is a set of vectors, which form the ***Word Embedding matrix***.

We can use ***pre-trained Word Embeddings*** as written in this keras [blog](https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html), which is a better option when our training data is relatively small.

But since we have a large amount of data, ***we will train our own Word Embeddings***, specific to our data.

In [None]:
from gensim.models.word2vec import Word2Vec

In [None]:
# One token list per tweet: the corpus format Word2Vec is trained on.
# Non-string cells (e.g. the NaN pandas produces when an empty text field is
# read back from CSV) are skipped explicitly instead of being swallowed by a
# bare `except`, so genuine errors are no longer hidden.
Bigger_list = [
    tweet.split(" ")
    for tweet in dataset['text']
    if isinstance(tweet, str)
]

In [None]:
#hyperparams
W2V_SIZE = 100    #Size of vector representing each word
W2V_WINDOW = 7    #Context window size passed to Word2Vec
W2V_EPOCH = 32    #Number of training passes over the corpus
W2V_MIN_COUNT = 10  #Minimum number of times, the word should appear in text corpus
                    #for it to be included in vocabulary
                    #keeping 10, helps to avoid usernames present in tweets

# The model is loaded from and saved to the SAME path; previously it was saved
# into the working directory but looked up in output_dir, so it was retrained
# on every run.
model_file = 'model.w2v'
model_path = os.path.join(output_dir, model_file)

try: # load already saved model
    w2v_model = Word2Vec.load(model_path)
except FileNotFoundError:
    print('Training the Word2Vec model. Will take few mins!')
    # `size` / `iter` are the gensim 3.x keyword names, matching the 3.x API
    # used elsewhere in this notebook (`wv.vocab`). W2V_EPOCH was declared but
    # never passed before, so training silently used the library default.
    w2v_model = Word2Vec(Bigger_list, size = W2V_SIZE, window = W2V_WINDOW,
                         min_count = W2V_MIN_COUNT, iter = W2V_EPOCH, workers = 8)
    os.makedirs(output_dir, exist_ok=True)
    w2v_model.save(model_path) #save the model

Now, Let's create a dictionary mapping each word in Vocabulary to an integer.

In [None]:
# Words that survived the min_count filter of the Word2Vec model.
vocab = list(w2v_model.wv.vocab)
print('Length of Vocabulary :',len(vocab))

# Word -> integer id, numbered from 1 so that id 0 is never assigned to a word.
word_index = {word: position for position, word in enumerate(vocab, 1)}

Let us check whether our Word2Vec model has learned sensible relations between the words in the text corpus. We can do that by finding the words most similar to a given word.

In [None]:
# Sanity check of the trained embeddings: list the 5 vocabulary words the
# model rates as most similar to a probe word. The last expression is the
# cell's displayed result.
test_word = "great"
print('Top 5 words similar to', test_word)
w2v_model.wv.most_similar(test_word, topn = 5)

Now we will club all the vectors together and form a ***Word Embedding Matrix*** which will be passed into the Neural Network.

In [None]:
# word_index ids start at 1, so row 0 is never written below and stays all-zero;
# it is the extra row reserved for padding / out-of-vocabulary words.
vocab_size = 1 + len(word_index)
embedding_matrix = np.zeros(shape=(vocab_size, W2V_SIZE))

# Copy each word's Word2Vec vector into the row given by its integer id.
for word, i in word_index.items():
    if word not in w2v_model.wv:
        continue
    embedding_matrix[i] = w2v_model.wv[word]

print('Shape of embedding matrix :', embedding_matrix.shape)

## Model

#### Preparing the input to the Neural Network.

We will transform the tweets to their integer form using the word_index dictionary. And, since not all the tweets are of same length, we will pad the shorter tweets with zeros.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# import importlib
# importlib.reload(model)

In [None]:
class RNNModel(nn.Module):
    """LSTM classifier on top of an embedding layer initialised from a given matrix.

    The token ids are embedded, run through a (possibly stacked) LSTM, and the
    final hidden state of every layer is concatenated and mapped to a single
    sigmoid output.

    Args:
        embedding_matrix: 2-D tensor or array of shape
            (num_embeddings, embedding_dim) with the initial embedding weights.
        hidden_size: number of features in the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, embedding_matrix, hidden_size,  num_layers):
        super(RNNModel, self).__init__()
        self.embedding, num_embeddings, embedding_dim = self.create_embedding_layer(embedding_matrix)
        self.num_layers = num_layers
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=self.num_layers)
        self.linear = nn.Linear(num_layers*hidden_size, 1)

    def forward(self, inp, hidden):
        """Run one batch through the network.

        Args:
            inp: integer tensor of token ids, shape (seq_len, batch). The LSTM
                is built without ``batch_first``, so the sequence dimension
                comes first.
            hidden: tuple ``(h, c)``, each of shape
                (num_layers, batch, hidden_size).

        Returns:
            ``(prob, hidden)``: ``prob`` has shape (batch, 1) with values in
            [0, 1]; ``hidden`` is the LSTM's final ``(h, c)`` tuple.
        """
        output, hidden = self.lstm(self.embedding(inp), hidden)
        h, c = hidden
        # (num_layers, batch, hidden) -> (batch, hidden, num_layers) -> (batch, hidden*num_layers)
        h = h.permute(1, 2, 0)
        h = h.reshape(h.shape[0], -1)
        y = self.linear(h)
        # torch.sigmoid replaces the deprecated F.sigmoid
        return torch.sigmoid(y), hidden

    @staticmethod
    def create_embedding_layer(embed_matrix, trainable=True):
        """Build an ``nn.Embedding`` whose weights are copied from ``embed_matrix``.

        Args:
            embed_matrix: 2-D tensor or NumPy array (num_embeddings, embedding_dim).
                It is converted to float32 and copied, so the caller's data is
                not modified by training.
            trainable: when False the embedding weights are frozen.

        Returns:
            ``(emb_layer, num_embeddings, embedding_dim)``.
        """
        weights = torch.as_tensor(embed_matrix, dtype=torch.float32)
        num_embeddings, embedding_dim = weights.shape
        emb_layer = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        with torch.no_grad():
            emb_layer.weight.copy_(weights)
        # Previously requires_grad was only ever set to True, so trainable=False
        # did not actually freeze the layer.
        emb_layer.weight.requires_grad = trainable
        return emb_layer, num_embeddings, embedding_dim


In [None]:
def text_to_int(df, word_index, max_len):
    '''
        Encode every tweet as a fixed-length row of word ids.

        Each row is filled from the right: the tweet's last in-vocabulary word
        lands in the last column, the one before it in the second-to-last, and
        so on. Unused leading positions stay 0 (left padding). Words missing
        from word_index are skipped. If a tweet has more than max_len
        in-vocabulary words, only its last max_len are kept.

        df : dataframe containing column "text"
        word_index : Dictionary containing mapping from words to int
        max_len : maximum length of each tweet

        Returns an nd-array of shape (len(df), max_len). Rows whose text is not
        a string (e.g. NaN) are left all-zero.
    '''
    X = np.zeros((df.shape[0], max_len))  #initialising the nd-array

    for i, tweet in enumerate(df.text):
        if not isinstance(tweet, str):
            continue  # missing text -> row stays all zeros
        j = 0
        for word in reversed(tweet.split(" ")):
            if j >= max_len:
                # Row is full. Without this guard max_len-1-j turns negative
                # and silently wraps around, overwriting the end of the row.
                break
            idx = word_index.get(word)
            if idx is not None:   #if present in our vocab
                X[i, max_len-1-j] = idx
                j += 1
    return X

# Token count of the longest tweet; every encoded row is padded to this length.
max_len = max((len(tokens) for tokens in Bigger_list), default=0)

print('Length of longest tweet is',max_len)

# Encode the training tweets as fixed-length rows of word ids.
X_train = text_to_int(dataset, word_index, max_len)
print(dataset.text[1], '\n mapped to \n', X_train[1])

In [None]:
X_train.shape

In [None]:
# converting to torch tensors
x_train = torch.from_numpy(X_train).to(torch.int64)
#embed_matrix = torch.from_numpy(embedding_matrix)

# The raw labels use the codes in dict_target (0 / 2 / 4), but the training
# loop applies binary cross-entropy, which requires targets in [0, 1].
# Dividing by the 'positive' code maps negative -> 0.0, neutral -> 0.5 and
# positive -> 1.0.
target = torch.from_numpy(dataset.target.values / dict_target['positive']).to(torch.float32)

In [None]:
class tweetDataset(Dataset):
    """Map-style dataset yielding ``[token_ids, label]`` pairs.

    Args:
        features: indexable collection of encoded tweets (e.g. a tensor of
            shape (n_samples, max_len)). Defaults to the notebook-level
            ``x_train`` so that the existing ``tweetDataset()`` call keeps working.
        labels: indexable collection of labels with the same length.
            Defaults to the notebook-level ``target``.

    Raises:
        ValueError: if ``features`` and ``labels`` differ in length.
    """

    def __init__(self, features=None, labels=None):
        # Captured once at construction, so later rebinding of the notebook
        # globals cannot silently change an existing dataset.
        self.features = x_train if features is None else features
        self.labels = target if labels is None else labels
        if len(self.features) != len(self.labels):
            raise ValueError('features and labels must have the same length')

    def __len__(self):
        return int(len(self.features))

    def __getitem__(self, idx):
        sample = [self.features[idx], self.labels[idx]]
        return sample

In [None]:
def train(model, h0, c0, device, train_loader, optimizer, epoch, log_interval):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: module called as ``model(data, (h, c))`` with ``data`` of shape
            (seq_len, batch); returns ``(probabilities, (h, c))``.
        h0, c0: initial hidden / cell state, shape (num_layers, batch, hidden).
        device: device the batches and states are moved to.
        train_loader: yields ``(data, target)`` with ``data`` of shape
            (batch, seq_len) and ``target`` holding values in [0, 1].
        optimizer: optimizer over ``model``'s parameters.
        epoch: epoch number, used only in the progress message.
        log_interval: print the loss every ``log_interval`` batches.

    The hidden state is carried from one batch to the next, detached from the
    previous batch's graph.
    """
    model.train()
    h = h0.to(device)
    c = c0.to(device)
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)

        # The loader stacks samples as (batch, seq_len) while the model is
        # sequence-first, so swap the two axes before the forward pass.
        data = data.t().contiguous()
        n_samples = data.size(1)

        # A final, shorter batch would not match the carried state's batch
        # dimension; keep only the first n_samples state columns.
        if h.size(1) != n_samples:
            h = h[:, :n_samples].contiguous()
            c = c[:, :n_samples].contiguous()

        optimizer.zero_grad()
        output, (h, c) = model(data, (h, c))

        # Detaching from graph so the next batch does not backpropagate into this one
        h, c = h.detach(), c.detach()

        # The model returns (batch, 1) while targets are (batch,);
        # binary_cross_entropy needs both to have the same shape.
        loss = F.binary_cross_entropy(output.view(-1), target.view(-1).to(output.dtype))
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * n_samples, len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

In [None]:
batch_size = 56
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
lr = 0.01
epochs = 1

train_dataset = tweetDataset()
# shuffle=True mixes the classes within each batch regardless of how the CSV
# is ordered. drop_last=True keeps every batch at exactly batch_size, which is
# the batch dimension h0 / c0 are allocated with below.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size,
                          shuffle=True, drop_last=True)
# test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


hidden_size = 50
num_layers = 2


h0 = torch.rand((num_layers, batch_size, hidden_size)).to(device)
c0 = torch.rand((num_layers, batch_size, hidden_size)).to(device)

# `embed_matrix` was referenced here without ever being defined (its only
# assignment is commented out in an earlier cell), which raised NameError.
# Build it from the Word2Vec embedding matrix as a float32 tensor.
embed_matrix = torch.as_tensor(embedding_matrix, dtype=torch.float32)

model = RNNModel(embed_matrix, hidden_size, num_layers).to(device)
optimizer = optim.SGD(model.parameters(), lr=lr)


# Honour the `epochs` setting instead of a single hard-coded pass.
for epoch in range(1, epochs + 1):
    train(model, h0, c0, device, train_loader, optimizer, epoch, log_interval=1000)
    #    test(model, device, test_loader)

In [None]:
# hidden_size = 200
# num_layers = 2
# embed_matrix = torch.from_numpy(embedding_matrix)
# model = Model(embed_matrix, hidden_size, num_layers)

In [None]:
# Shape smoke test on a tiny random model.
# Every name is prefixed with `demo_` so this cell no longer overwrites the
# real `embedding_matrix`, the trained `model`, or `hidden_size` / `num_layers`.
demo_embedding = torch.rand((50, 10))
demo_hidden_size = 20
demo_num_layers = 2
demo_model = RNNModel(demo_embedding, demo_hidden_size, demo_num_layers)

demo_h = torch.rand((demo_num_layers, 1, demo_hidden_size))
demo_c = torch.rand((demo_num_layers, 1, demo_hidden_size))
demo_x = torch.arange(10).view((10, 1))   # (seq_len=10, batch=1) token ids

demo_output, (demo_h, demo_c) = demo_model(demo_x, (demo_h, demo_c))
print(demo_output.shape)

### Testing on Youtube Comments data