# Sentiment Analysis on YouTube comments

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os

### Load the tweets dataset

In [3]:
# Project folders, all relative to the notebook's location.
data_dir = os.path.join('..', 'data', 'twitter_sentiment')
output_dir = os.path.join('..', 'Outputs')
model_dir = os.path.join('..', 'Models')

# The CSV is read as headerless (header=None), so column names are supplied here.
train_file_name = 'train.csv'
train_path = os.path.join(data_dir, train_file_name)
tweet_columns = ['target', 'id', 'date', 'flag', 'user', 'text']
dataset = pd.read_csv(
    train_path,
    encoding="ISO-8859-1",
    header=None,
    names=tweet_columns,
)
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Integer encoding of the two sentiment classes.
dict_target = dict(negative=0, positive=1)

# Report the size of the loaded frame as (rows, columns).
print('Training data')
print('Number of Training examples', dataset.shape)

Training data
Number of Training examples (1600000, 6)


We will extract out only the required columns, that is target and text.

In [5]:
# Keep only the label and the tweet text.
# Selecting the wanted columns (instead of dropping the unwanted ones) keeps
# this cell re-runnable: drop() raises KeyError once the columns are gone.
# .copy() makes the result an independent frame, so the later in-place
# assignment to dataset['text'] does not trigger a SettingWithCopyWarning.
dataset = dataset[['target', 'text']].copy()
dataset.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


### Data Preprocessing

1. We need to remove stop words, links, usernames and other noise from the tweets, as they don't convey any sentiment.
    
    So let us write a function for that.

In [6]:
import nltk
# Fetch the NLTK resources used by the preprocessing cell below:
#   stopwords -> nltk.corpus.stopwords
#   wordnet   -> WordNetLemmatizer
#   punkt     -> word_tokenize
# nltk.download() is a no-op when the package is already up to date; the value
# of the last call is what the cell displays.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/sachin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sachin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/sachin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import re
import emoji
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stop words, held in a set so that the per-token membership test in
# preprocess() is a constant-time lookup instead of a scan over a list.
stop_words = set(stopwords.words("english"))
# initialise the lemmatizer once; it is reused for every tweet
wordnet_lemmatizer = WordNetLemmatizer()


def preprocess(text):
    """Normalise one tweet/comment into a space-separated string of lemmas.

    Steps, in order: lowercase -> strip links -> strip @usernames -> strip
    ASCII punctuation -> strip digits -> shorten long character runs ->
    tokenize -> drop English stop words -> lemmatize.

    Links and usernames are removed *before* punctuation on purpose: their
    patterns rely on '@', '.', ':' and '/', which the punctuation step deletes.
    Running punctuation removal first leaves the mention and URL text behind
    as ordinary words.

    Emoji and other non-ASCII symbols are not removed: string.punctuation only
    covers ASCII punctuation.

    Parameters
    ----------
    text : str
        Raw text of a single tweet or comment.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    # convert to lowercase
    text = text.lower()

    # remove links (must happen while '://' and '.' are still present)
    text = re.sub(r'(www\.\S+)|(https?://\S+)', '', text)

    # remove usernames (must happen while '@' is still present)
    text = re.sub(r'@\S+', '', text)

    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # remove numbers
    text = re.sub(r'\d+', '', text)

    # shorten runs of five or more identical characters to four,
    # e.g. "soooooo" => "soooo"
    text = re.sub(r"(.)\1{4,}", r"\1" * 4, text)

    # remove whitespace from beginning and end
    text = text.strip()

    # tokenize, drop stop words, and reduce the rest to their base forms
    lemm_tokens = [
        wordnet_lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]

    return " ".join(lemm_tokens)


# Re-use the cached preprocessed tweets when they exist; otherwise preprocess
# the raw tweets and cache the result. Both branches use the SAME path, so the
# file written on the first run is the one found on the next run.
prep_file = 'preprocessed_tweets.csv'
prep_path = os.path.join(data_dir, prep_file)

try:
    # index_col=0: the first CSV column is the index written by to_csv below.
    dataset = pd.read_csv(prep_path, index_col=0)
except FileNotFoundError:
    print('Preprocessing the data. Will take few minutes!')
    dataset['text'] = dataset['text'].apply(preprocess)
    dataset.to_csv(prep_path)  # save it for later

dataset.head()

  mask |= (ar1 == a)


Unnamed: 0,target,text
0,0,switchfoot httptwitpiccomyzl awww thats bummer...
1,0,upset cant update facebook texting might cry r...
2,0,kenichan dived many time ball managed save res...
3,0,whole body feel itchy like fire
4,0,nationwideclass behaving im mad cant see


### Word Embedding Matrix using Word2Vec algorithm

***Word Embeddings*** are vector representations that capture the context of the underlying words in relation to other words in the sentence. As a result, words with similar meanings are clustered close together in the embedding space, while dissimilar words lie further apart.

And ***Word2Vec*** is a two-layer neural network whose input is a text corpus and whose output is a set of vectors, which form the ***Word Embedding matrix***.

We can use ***pre-trained Word Embeddings***, as described in this Keras [blog post](https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html), which is the better option when the training data is relatively small.

But since we have a large amount of data, ***we will train our own Word Embeddings***, specific to our data.

In [8]:
from gensim.models.word2vec import Word2Vec

In [9]:
#We will create a list of words present in our text corpus
# One token list per tweet -- the "list of sentences" format Word2Vec trains on.
# Entries that are not strings (e.g. missing values in the text column) are
# skipped explicitly rather than through a catch-all `except`, so genuine
# errors are no longer silently swallowed.
Bigger_list = [
    tweet.split(" ")
    for tweet in dataset['text']
    if isinstance(tweet, str)
]

In [10]:
#hyperparams
W2V_SIZE = 100      #Size of vector representing each word
W2V_WINDOW = 7      #Context window passed to Word2Vec
W2V_EPOCH = 32      #Number of training passes over the corpus
W2V_MIN_COUNT = 10  #Minimum number of times, the word should appear in text corpus
                    #for it to be included in vocabulary
                    #keeping 10, helps to avoid usernames present in tweets

# Load and save use the SAME path, so a model trained on one run is found on
# the next run instead of being retrained.
model_file = 'model.w2v'
w2v_path = os.path.join(output_dir, model_file)

try: # load already saved model
    w2v_model = Word2Vec.load(w2v_path)
except FileNotFoundError:
    print('Training the Word2Vec model. Will take few mins!')
    # `size` / `iter` are the gensim 3.x keyword names (this notebook uses the
    # 3.x API, e.g. `wv.vocab`). W2V_EPOCH was previously declared but never
    # passed, so training silently ran with the library default.
    w2v_model = Word2Vec(
        Bigger_list,
        size=W2V_SIZE,
        window=W2V_WINDOW,
        min_count=W2V_MIN_COUNT,
        iter=W2V_EPOCH,
        workers=8,
    )
    os.makedirs(output_dir, exist_ok=True)  # the folder may not exist yet
    w2v_model.save(w2v_path) #save the model

Now, let's create a dictionary mapping each word in the vocabulary to an integer.

In [11]:
# Vocabulary held by the trained Word2Vec model.
vocab = list(w2v_model.wv.vocab)
print('Length of Vocabulary :',len(vocab))

# Word -> integer id. Ids start at 1, so 0 is never assigned to a word.
word_index = {word: position for position, word in enumerate(vocab, 1)}

Length of Vocabulary : 41207


Let us check whether our Word2Vec model has learned sensible relations between the words in the text corpus. We can do that by looking up the words most similar to a given word.

In [12]:
# Sanity check: list the nearest neighbours of a probe word in embedding space.
test_word = "great"
print('Top 5 words similar to', test_word)
nearest_words = w2v_model.wv.most_similar(test_word, topn=5)
nearest_words

Top 5 words similar to great


[('fantastic', 0.8335270881652832),
 ('wonderful', 0.7792121171951294),
 ('good', 0.7575284242630005),
 ('fabulous', 0.7514442205429077),
 ('awesome', 0.742106556892395)]

Now we will club all the vectors together and form a ***Word Embedding Matrix*** which will be passed into the Neural Network.

In [13]:
# Row 0 stays all-zero (word ids start at 1); it is the extra row for
# "out of vocabulary words".
vocab_size = 1 + len(word_index)
embedding_matrix = np.zeros(shape=(vocab_size, W2V_SIZE))

# Copy each word's Word2Vec vector into the row given by its integer id.
word_vectors = w2v_model.wv
for word, row in word_index.items():
    if word not in word_vectors:
        continue
    embedding_matrix[row] = word_vectors[word]

print('Shape of embedding matrix :', embedding_matrix.shape)

Shape of embedding matrix : (41208, 100)


#### Preparing the input to the Neural Network.

We will transform the tweets to their integer form using the word_index dictionary. And, since not all the tweets are of same length, we will pad the shorter tweets with zeros.

In [14]:
def text_to_int(df, column, word_index, max_len):
    '''
    Encode every text in ``df[column]`` as a fixed-length row of word ids.

    Each text is split on single spaces; words found in ``word_index`` are
    written right-aligned into the row, in their original order, and the
    remaining leading positions stay 0 (padding). Words missing from
    ``word_index`` are skipped.

    If a text has more in-vocabulary words than ``max_len``, only its LAST
    ``max_len`` words are kept. (Without this guard the column index goes
    negative and wraps round, overwriting the end of the row.)

    Entries that are not strings (e.g. NaN) produce an all-zero row.

        df : dataframe containing the text column
        column : name of the text column in df
        word_index : dictionary containing the mapping from words to int
        max_len : length of every encoded row

    Returns a float ndarray of shape (len(df), max_len).
    '''
    X = np.zeros((df.shape[0], max_len))  #initialising the nd-array

    for i, tweet in enumerate(df[column]):
        if not isinstance(tweet, str):
            continue  # missing / non-text entry: leave the row as padding

        j = 0  # number of slots already filled, counted from the right
        for word in reversed(tweet.split(" ")):
            if j >= max_len:
                break  # row is full; drop the earlier words
            idx = word_index.get(word)
            if idx is not None:   #if present in our vocab
                X[i, max_len - 1 - j] = idx
                j += 1
    return X

# Token count of the longest tweet (0 when there are no tweets at all);
# every encoded row is padded to this length.
max_len = max((len(tokens) for tokens in Bigger_list), default=0)

print('Length of longest tweet is',max_len)

# Encode the training tweets as rows of word ids and show one example.
X_train = text_to_int(dataset, 'text', word_index, max_len)
print(dataset.text[1], '\n mapped to \n', X_train[1])

Length of longest tweet is 56
upset cant update facebook texting might cry result school today also blah 
 mapped to 
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0. 11. 12. 13. 14. 15. 16. 17. 18. 19. 20.
 21. 22.]


## Model

In [36]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from datetime import datetime
import importlib

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# personal scripts
import rnnmodel
#importlib.reload(rnnmodel)

### Dataset and DataLoader

In [16]:
class tweetDataset(Dataset):
    """Map-style dataset pairing each encoded text with its label.

    Parameters
    ----------
    features : indexable with ``len()``, optional
        Encoded texts, one entry per sample. When omitted, the notebook-level
        ``x_train`` tensor is used, so ``tweetDataset()`` keeps working as before.
    targets : indexable with ``len()``, optional
        Labels aligned with ``features``. When omitted, the notebook-level
        ``target`` tensor is used.

    Raises
    ------
    ValueError
        If ``features`` and ``targets`` differ in length.
    """

    def __init__(self, features=None, targets=None):
        # The globals are looked up only when an argument is omitted, and only
        # at construction time, so they must exist before the dataset is built.
        self.features = x_train if features is None else features
        self.targets = target if targets is None else targets
        if len(self.features) != len(self.targets):
            raise ValueError(
                'features and targets must have the same length, got '
                f'{len(self.features)} and {len(self.targets)}'
            )

    def __len__(self):
        """Number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``[features[idx], targets[idx]]`` for one sample."""
        return [self.features[idx], self.targets[idx]]

In [55]:
def train(model, device, train_loader, optimizer, epoch, log_interval):
    """Run one training epoch of a binary classifier and print progress.

    Parameters
    ----------
    model : torch.nn.Module
        Must return one score in [0, 1] per sample (required by binary
        cross-entropy); shape ``(batch,)`` or ``(batch, 1)`` both work.
    device : torch.device
        Device each batch is moved to before the forward pass.
    train_loader : iterable of (data, target) batches
        ``len(train_loader)`` and ``len(train_loader.dataset)`` are used for
        the progress and accuracy printouts.
    optimizer : torch.optim.Optimizer
        Optimizer stepping ``model``'s parameters.
    epoch : int
        Epoch number, used only in the log line.
    log_interval : int
        A progress line is printed every ``log_interval`` batches.

    Returns
    -------
    None
    """
    model.train()
    correct = 0
    # Start training
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        # Binary cross-entropy: one probability per sample, two classes.
        # The target is reshaped to the output's shape because the loss needs
        # both to match; a (batch,) target against a (batch, 1) output
        # otherwise warns or raises depending on the PyTorch version.
        target = target.view_as(output)
        loss = F.binary_cross_entropy(output, target)
        loss.backward()
        optimizer.step()

        # Accuracy bookkeeping only -- kept off the autograd graph.
        pred = torch.round(output.detach())
        correct += pred.eq(target).sum().item()

        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    # print accuracy
    print('\nTraining Accuracy: {}/{} ({:.4f}%) \n'.format(correct, len(train_loader.dataset),
                                                    100. * correct / len(train_loader.dataset)))

### Training

In [18]:
# Converting to torch tensors
# Encoded tweets as integer word ids.
x_train = torch.from_numpy(X_train).to(device=device, dtype=torch.int64)

# Word2Vec vectors, one row per word id.
embed_matrix = torch.from_numpy(embedding_matrix).to(device)

# Labels are divided by 4 -- presumably the raw labels are 0/4, which this
# rescales to 0/1 for binary cross-entropy; confirm against the raw data.
raw_labels = dataset.target.values
target = torch.from_numpy(raw_labels / 4).to(device=device, dtype=torch.float32)

In [95]:
load = False       # True: restore the newest checkpoint in model_dir instead of training
batch_size = 64
lr = 0.01
epochs = 3
hidden_size = 50
num_layers = 2

# Data loader
train_dataset = tweetDataset()
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
# model
model = rnnmodel.RNNModel(embed_matrix, hidden_size, num_layers).to(device)
optimizer = optim.SGD(model.parameters(), lr=lr)

if load:
    # Pick the most recently modified 'model_*' checkpoint. Previously this
    # branch referenced an undefined `model_path` and could never run.
    checkpoints = []
    if os.path.isdir(model_dir):
        checkpoints = sorted(
            (os.path.join(model_dir, name)
             for name in os.listdir(model_dir)
             if name.startswith('model_')),
            key=os.path.getmtime,
        )
    if not checkpoints:
        raise FileNotFoundError(f'No saved model found in {model_dir}')
    model_path = checkpoints[-1]
    model.load_state_dict(torch.load(model_path, map_location=device))
else:
    for epoch in range(epochs):
        train(model, device, train_loader, optimizer, epoch, log_interval=1000)
    # Save only freshly trained weights (a loaded checkpoint is not re-saved),
    # under a timestamp free of spaces and colons so the name is valid on
    # every filesystem.
    os.makedirs(model_dir, exist_ok=True)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    torch.save(model.state_dict(), os.path.join(model_dir, f'model_{timestamp}'))

  if __name__ == '__main__':



Training Accuracy: 1153754/1600000 (72.1096%) 


Training Accuracy: 1231483/1600000 (76.9677%) 


Training Accuracy: 1244682/1600000 (77.7926%) 



## Testing on Youtube Comments data (for product - CyberTruck)

In [58]:
import youtube_scraper

In [68]:
# NOTE(review): this flag appears to let oauthlib run the OAuth flow over plain
# HTTP -- acceptable for a local run only; confirm before using elsewhere.
os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
service = youtube_scraper.get_authenticated_service()

# Search term and number of result pages are asked for interactively.
keyword = input('Enter a keyword(related to product): ')
max_pages = int(input('Enter max_pages: '))

# Extra keyword arguments forwarded to the scraper's video search.
search_options = {
    'q': keyword,
    'part': 'id,snippet',
    'eventType': 'completed',
    'type': 'video',
}
yt_data = youtube_scraper.extract_comments_by_video_keyword(service, max_pages, **search_options)
yt_data.head()

Enter a keyword(related to product): cybertruck
Enter max_pages: 1


Unnamed: 0,video_id,title,comment
0,SwvDOdBHYBw,WATCH LIVE! Elon Musk presents the new Tesla C...,They released this robotic version of Elon Mus...
1,SwvDOdBHYBw,WATCH LIVE! Elon Musk presents the new Tesla C...,That’s literally a halo warthog
2,SwvDOdBHYBw,WATCH LIVE! Elon Musk presents the new Tesla C...,When you see the car which you used to draw in...
3,SwvDOdBHYBw,WATCH LIVE! Elon Musk presents the new Tesla C...,Now hit it with a sledgehammer that's not rubb...
4,SwvDOdBHYBw,WATCH LIVE! Elon Musk presents the new Tesla C...,Him trying to speak gives me anxiety.


### Preprocess youtube comments data

In [80]:
# Clean a copy of the scraped comments with the same pipeline used for tweets,
# leaving the raw comments in yt_data untouched.
yt_pro_data = yt_data.copy()
yt_pro_data['comment'] = yt_pro_data['comment'].apply(preprocess)

# Encode the cleaned comments as rows of word ids.
yt_cmts = text_to_int(yt_pro_data, 'comment', word_index, max_len)
num_cmts = len(yt_cmts)  # feeding all comments together

# Preparing input to the model
state_shape = (num_layers, num_cmts, hidden_size)
h = torch.zeros(state_shape).to(device)
c = torch.zeros(state_shape).to(device)
cmts_data = torch.tensor(yt_cmts).to(device, dtype=torch.int64)

### Sentiment classification of youtube comments

In [94]:
# Score thresholds: below the first -> negative, above the second -> positive,
# anything in between -> neutral.
NEGATIVE_BELOW = 0.4
POSITIVE_ABOVE = 0.6

model.eval()
with torch.no_grad():  # inference only, no gradients needed
    output = model(cmts_data)

# Flatten to one score per comment so both (n,) and (n, 1) outputs work.
scores = output.reshape(-1).cpu().tolist()

pred = []
for score in scores:
    if score < NEGATIVE_BELOW:
        pred.append('negative')
    elif score > POSITIVE_ABOVE:
        pred.append('positive')
    else:
        pred.append('neutral')

# save sentiment to dataframe: attach one label per comment, aligned by position
sent_data = pd.DataFrame(pred, columns=['sentiment'])
yt_sent_data = yt_data.reset_index(drop=True).join(sent_data)

# Print comments and their sentiment (at most num_print, fewer if less were scraped)
num_print = 10
for i in range(min(num_print, len(yt_sent_data))):
    print('-'*10)
    print(yt_sent_data.comment[i], '-->',yt_sent_data.sentiment[i])

# saving output (to_csv is a DataFrame method; there is no pd.to_csv)
os.makedirs(output_dir, exist_ok=True)
yt_sent_data.to_csv(os.path.join(output_dir, f'{keyword}.csv'))

----------
They released this robotic version of Elon Musk before the A.I. was ready. --> positive
----------
That’s literally a halo warthog --> neutral
----------
When you see the car which you used to draw in the first grade.. --> neutral
----------
Now hit it with a sledgehammer that's not rubber on the outside --> neutral
----------
Him trying to speak gives me anxiety. --> negative
----------
great mind, less great speeches. I still like him :) --> positive
----------
I appreciate elon musk. I feel that he is ahead of the pack in spearheading electric technologies --> positive
----------
Your going to see the roads full of Tesla’s soon. Idk the cons but I’d like to see the reviews. I can’t imagine what will be going on 100yrs from now. Good job --> neutral
----------
Welcome back drawing I made about 6 years ago --> positive
----------
The man is brilliant, he hires the best engineers and challenges them with vision --> positive


#### Thank you