<a href="https://colab.research.google.com/github/prachi-ovs/Twitter_Sentiment_Analysis_Using_RNN_and_TF-IDF/blob/main/Airline_data_Twitter_Sentiment_Analysis_using_RNN_and_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os,sys
import numpy as np
import pandas as pd


import re  
import nltk 
nltk.download('stopwords')  
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import DataLoader, TensorDataset


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
tweets = pd.read_csv("https://raw.githubusercontent.com/kolaveridi/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv")
tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
x = tweets.loc[:,'text'].values
y = tweets.loc[:,'airline_sentiment'].values

In [5]:
cleaned_tweets = []

for tweet in x:
  cleaned_tweet = re.sub(r'\W', ' ', tweet)  # replace all non alphanumeric characters with space
  cleaned_tweet = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned_tweet) # replace single characters in the tweet with space 
  cleaned_tweet = re.sub(r'^b\s+', ' ', cleaned_tweet) # replace the character 'b' from prefix of a tweet which can be present to indicate dataset in byte format
  cleaned_tweet = re.sub(r'\s+', ' ', cleaned_tweet, flags= re.I) #replaces multiple spaces with a single space
  cleaned_tweets.append(cleaned_tweet.lower())


### **Tf-idf value for top 2000 words from all tweets**

In [6]:
# vectorizer = TfidfVectorizer(max_features= 200, stop_words='english')
# vocab = vectorizer.fit_transform(corpus) # top 200 words across all documents which forms the vocabulary
# y = np.array(labels)
# print(vocab.shape, y.shape) 

In [7]:
vectorizer = TfidfVectorizer(max_features=2000, stop_words=stopwords.words('english'))  
X = vectorizer.fit_transform(cleaned_tweets).toarray() # to convert tweets into corresponding TF-IDF feature vectors
y = np.array(y)
print(X.shape, y.shape)

(14640, 2000) (14640,)


In [None]:
# vocabulary items
vectorizer.get_feature_names()

### **Preparing each tweet**

In [9]:
vocabulary = vectorizer.vocabulary_  # vocabulary of 2000 words is learned from all tweets and each wordo is assigned a unique integer index in the output vector
document_tokens = vectorizer.build_tokenizer()  # splits a string into a sequence of tokens in a list

print('The vocabulary is', vocabulary)

The vocabulary is {'virginamerica': 1893, 'said': 1532, 'plus': 1356, 'added': 96, 'experience': 657, 'today': 1782, 'must': 1203, 'mean': 1132, 'need': 1212, 'take': 1729, 'another': 155, 'trip': 1816, 'really': 1437, 'entertainment': 618, 'amp': 147, 'little': 1064, 'big': 258, 'bad': 227, 'thing': 1756, 'seriously': 1574, 'would': 1973, 'pay': 1309, '30': 36, 'flight': 735, 'seats': 1555, 'flying': 758, 'va': 1884, 'yes': 1989, 'nearly': 1211, 'every': 634, 'time': 1776, 'fly': 754, 'go': 821, 'away': 220, 'missed': 1178, 'opportunity': 1266, 'without': 1953, 'https': 919, 'co': 391, 'well': 1933, 'amazing': 140, 'arrived': 184, 'hour': 910, 'early': 588, 'good': 829, 'know': 1003, 'second': 1557, 'cause': 335, 'death': 508, 'among': 145, '10': 2, '24': 26, 'lt': 1099, 'pretty': 1382, 'much': 1200, 'better': 256, 'great': 834, 'deal': 505, 'already': 134, 'thinking': 1759, '2nd': 34, 'australia': 208, 'even': 629, 'gone': 827, '1st': 19, 'yet': 1991, 'fabulous': 672, 'skies': 1616, 

**Number of Timestamps**

In [10]:
list_tweet_tokens = [document_tokens(tweet) for tweet in cleaned_tweets]  # each document which is a string is converted to a list of tokens


# Finding the number of timestamps

# length = 0  # to check the length of each document after tokenization 
tweets = []    # list of all documents
for i in range(len(list_tweet_tokens)): # for all 14640 tweets
  tokens = []  # list of imp tokens per tweet
  for j in range(len(list_tweet_tokens[i])): # length of tokens in tweet 
    token = list_tweet_tokens[i][j]   # ith tweet jth word 

    # check if token part of the top 200 words vocabulary to reduce sparsity
    if token in vocabulary:
      tokens.append(token)
  tweets.append(tokens)  # new tweet with tokens appended
   
max_length = max(map(len, tweets))  # map function iterates over the inner list to create a list of lengths
longest_tweet = max(tweets, key = len)

print(tweets[3])
print('<----->')
print(longest_tweet)
print(max_length)

['virginamerica', 'really', 'entertainment', 'amp', 'little']
<----->
['united', 'today', 'take', 'flight', 'san', 'francisco', 'refuse', 'let', 'us', 'board', 'baby', 'early', 'time', 'board', 'wont', 'let', 'us', 'take', 'baby', 'carryon', 'bag']
21


**Document Padding**

In [11]:
# Adding padding to each tweet to make its size equal to the number of timestamps

tweets_with_padding_list = []
def tweetPadding(all_tweets, max_length): 
  for i in all_tweets:
    diff_length = max_length - len(i)
    i = [0]*diff_length + i
    tweets_with_padding_list.append(i)
  return tweets_with_padding_list

tweets_with_padding = tweetPadding(tweets,max_length)

In [12]:
tweets_with_padding_array = np.array(tweets_with_padding)
tweets_with_padding_array[0]

array(['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', 'virginamerica', 'said'],
      dtype='<U21')

**Word Vectorization**

In [13]:
all_tweets = []
max_features = 2000
for i in range(len(tweets_with_padding)):
  list_words_for_tweet = []
  print("Tweet #", i)
  
  for j in range(len(tweets_with_padding[i])):
    #print("----New word---")
    list_for_word = [0]*max_features
    word = tweets_with_padding[i][j]

    if word in vocabulary:
      index = vocabulary[word]
      tfidf_value = X[i,index]
      list_for_word[index] = tfidf_value
      #print(list_for_word)
      list_words_for_tweet.append(list_for_word)

    else:
      list_words_for_tweet.append(list_for_word)

  all_tweets.append(list_words_for_tweet)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Tweet # 9640
Tweet # 9641
Tweet # 9642
Tweet # 9643
Tweet # 9644
Tweet # 9645
Tweet # 9646
Tweet # 9647
Tweet # 9648
Tweet # 9649
Tweet # 9650
Tweet # 9651
Tweet # 9652
Tweet # 9653
Tweet # 9654
Tweet # 9655
Tweet # 9656
Tweet # 9657
Tweet # 9658
Tweet # 9659
Tweet # 9660
Tweet # 9661
Tweet # 9662
Tweet # 9663
Tweet # 9664
Tweet # 9665
Tweet # 9666
Tweet # 9667
Tweet # 9668
Tweet # 9669
Tweet # 9670
Tweet # 9671
Tweet # 9672
Tweet # 9673
Tweet # 9674
Tweet # 9675
Tweet # 9676
Tweet # 9677
Tweet # 9678
Tweet # 9679
Tweet # 9680
Tweet # 9681
Tweet # 9682
Tweet # 9683
Tweet # 9684
Tweet # 9685
Tweet # 9686
Tweet # 9687
Tweet # 9688
Tweet # 9689
Tweet # 9690
Tweet # 9691
Tweet # 9692
Tweet # 9693
Tweet # 9694
Tweet # 9695
Tweet # 9696
Tweet # 9697
Tweet # 9698
Tweet # 9699
Tweet # 9700
Tweet # 9701
Tweet # 9702
Tweet # 9703
Tweet # 9704
Tweet # 9705
Tweet # 9706
Tweet # 9707
Tweet # 9708
Tweet # 9709
Tweet # 9710
Tweet # 9711

In [14]:
datasets = np.array(all_tweets)

In [15]:
datasets.shape

(14640, 21, 2000)

In [1]:
datasets = datasets.astype(np.float32)
y = y.astype(np.float32)

x_train, x_val, y_train, y_val = train_test_split(datasets, y, test_size= 0.2, random_state= 2020)
print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

NameError: ignored

### **Data Loader and Batching**

In [None]:
batch_size = 16
training_data = TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
val_data = TensorDataset(torch.from_numpy(x_val), torch.from_numpy(y_val))

train_loader = DataLoader(training_data, shuffle= True, batch_size= batch_size)
val_loader = DataLoader(val_data, shuffle= True, batch_size= batch_size)

### **RNN Model**


In [None]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim

In [None]:
class Model(nn.Module):
  def __init__(self, input_size, output_size, hidden_size, n_layers):
    super().__init__()
    self.hidden_size = hidden_size
    self.n_layers = n_layers

    self.rnn = nn.RNN(input_size,hidden_size,n_layers,batch_first=True)
    self.fc1 = nn.Linear(hidden_size,output_size)
    self.fc2 = nn.Linear(output_size,2) 

  def forward(self,x, hidden):
    batch_size = x.size()[0]
    hidden = self.init_hidden(batch_size)

    rnn_out,hidden = self.rnn(x,hidden)
    rnn_out = self.fc1(rnn_out)
    last_out = rnn_out[:,-1,:].view(batch_size,-1)
    out = F.softmax(self.fc2(last_out))

    return out,hidden

  def init_hidden(self,batch_size):
    hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size)
    return hidden

In [None]:
model = Model(200,32,256,3)
print(model)

### **Training and Validation**

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
  model.to(device)

# Hyper parameters
epochs = 10
learning_rate = 1e-4
counter = 0
clip = 5

# Loss and Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr= learning_rate)
criterion = nn.CrossEntropyLoss()

In [None]:
for epoch in range(epochs):

  model.train()

  train_hidden_values = model.init_hidden( batch_size)
  for step, (inputs, labels) in enumerate(train_loader):
  #for inputs, labels in train_loader:
    inputs, labels = inputs.to(device),labels.to(device)
    optimizer.zero_grad()
    predicted_outputs, h = model(inputs, train_hidden_values)
    loss = criterion(predicted_outputs, torch.max(labels, 1)[1]) # 1 indicates column wise
    loss.backward()
    nn.utils.clip_grad_norm(model.parameters(), clip)
    optimizer.step() 
  
    model.eval()

    val_hidden_values = model.init_hidden(batch_size= batch_size)
    all_val_loss =[]
    for inputs, labels in val_loader:
      inputs, labels = inputs.to(device),labels.to(device)
      val_predicted_outputs, val_h = model(inputs, val_hidden_values)
      val_loss = criterion(val_predicted_outputs, torch.max(labels, 1)[1])
      all_val_loss.append(val_loss.item())

    print('Epoch: {}'.format(epoch+1), 
    'Batch: {}'.format(step), 
    'Training Loss: {:.5f}'.format(loss.item()),
    'Validation Loss: {:.5f}'.format(np.mean(all_val_loss)))
