In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F # Provides nD convolution functions
from torch.utils.data import DataLoader, TensorDataset

In [2]:
import numpy as np
import pandas as pd

import re

In [3]:

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import nltk
from nltk.tokenize import word_tokenize

import matplotlib.pyplot as plt

In [4]:
nltk.download('punkt')#This tokenizer divides a text into a list of sentences by using an unsupervised algorithm to build a model for abbreviation words, 
#collocations, and words that start sentences. It must be trained on a large collection of plaintext in the target language before it can be used.

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     Hostname mismatch, certificate is not valid for
[nltk_data]     'raw.githubusercontent.com'. (_ssl.c:1129)>


False

In [5]:
train_path = "lstm_data.csv"
test_path = "lstm_test.csv"

In [6]:
train_df = pd.read_csv(train_path)

In [7]:
test_df = pd.read_csv(test_path)

In [8]:
train_df

Unnamed: 0,Id,Category,Tweet
0,635769805279248384,negative,Not Available
1,635930169241374720,neutral,IOS 9 App Transport Security. Mm need to check...
2,635950258682523648,neutral,"Mar if you have an iOS device, you should down..."
3,636030803433009153,negative,@jimmie_vanagon my phone does not run on lates...
4,636100906224848896,positive,Not sure how to start your publication on iOS?...
...,...,...,...
5965,639016598477651968,neutral,@YouAreMyArsenal Wouldn't surprise me if we en...
5966,640276909633486849,neutral,Rib injury for Zlatan against Russia is a big ...
5967,640296841725235200,neutral,Noooooo! I was hoping to see Zlatan being Zlat...
5968,641017384908779520,neutral,Not Available


In [9]:
train_df=train_df.drop(columns=['Id'])
train_df=train_df.dropna()
train_df=train_df[train_df['Tweet']!= "Not Available"]
train_df

Unnamed: 0,Category,Tweet
1,neutral,IOS 9 App Transport Security. Mm need to check...
2,neutral,"Mar if you have an iOS device, you should down..."
3,negative,@jimmie_vanagon my phone does not run on lates...
4,positive,Not sure how to start your publication on iOS?...
5,neutral,"Two Dollar Tuesday is here with Forklift 2, Qu..."
...,...,...
5963,positive,"Ok ed let's do this, Zlatan, greizmann and Lap..."
5964,neutral,Goal level: Zlatan 90k by Friday? = Posting e...
5965,neutral,@YouAreMyArsenal Wouldn't surprise me if we en...
5966,neutral,Rib injury for Zlatan against Russia is a big ...


In [10]:
test_df

Unnamed: 0,Id,Category
0,6.289494e+17,dear @Microsoft the newOoffice for Mac is grea...
1,6.289766e+17,@Microsoft how about you make a system that do...
2,6.290232e+17,Not Available
3,6.291792e+17,Not Available
4,6.291863e+17,If I make a game as a #windows10 Universal App...
...,...,...
9963,,
9964,,
9965,,
9966,,


In [11]:
test_df = test_df.rename(columns={"Category" : "Tweet"})

In [12]:
test_df=test_df.drop(columns=['Id'])
test_df=test_df.dropna()
test_df=test_df[test_df['Tweet']!= "Not Available"]
test_df

Unnamed: 0,Tweet
0,dear @Microsoft the newOoffice for Mac is grea...
1,@Microsoft how about you make a system that do...
4,If I make a game as a #windows10 Universal App...
5,"Microsoft, I may not prefer your gaming branch..."
6,@MikeWolf1980 @Microsoft I will be downgrading...
...,...
3994,Anybody with a Steak &amp; Shake or IHOP move ...
3995,I am assembling an epic Pancake Posse for an I...
3996,do you work at Ihop tomorrow @carlysunshine_
3997,23 Aug 00;30 #771NAS Rescue193 returned from T...


In [13]:
train_df['Category'].value_counts()

positive    2599
neutral     1953
negative     869
Tweet          1
Name: Category, dtype: int64

In [14]:
remove_pos = 2599 - 869
remove_neut = 1953 - 869

In [15]:
neg_df= train_df[train_df["Category"]=="negative"]
pos_df=train_df[train_df["Category"] == "positive"]
neut_df =train_df[train_df["Category"]=="neutral"]

In [16]:
pos_drop_indices=np.random.choice(pos_df.index, remove_pos, replace=False)
neut_drop_indices=np.random.choice(neut_df.index, remove_neut, replace=False)

In [17]:
pos_undersampled=pos_df.drop(pos_drop_indices)
neut_undersampled=neut_df.drop(neut_drop_indices)

In [18]:
pos_undersampled

Unnamed: 0,Category,Tweet
11,positive,"Today @YouTubeGaming launches, with apps for i..."
13,positive,"Met with iOS Developer today. We may have a ""g..."
18,positive,See news through the eyes of real people &amp;...
21,positive,"@facebook As I've been on FB for many years, I..."
33,positive,@tim_cook loving my new iPhone 6 from T-Mobile...
...,...,...
5939,positive,7 more days till we start the campaign that wi...
5940,positive,The VP of France's refereeing union Laurent Ug...
5947,positive,@DaveEllis11 @klavierstuk but if Zlatan is ava...
5952,positive,"Mourinho, Zlatan, Serie A - who are the winner..."


In [19]:
balanced_train_df= pd.concat([neg_df, pos_undersampled, neut_undersampled])

In [20]:
balanced_train_df["Category"].value_counts()

negative    869
positive    869
neutral     869
Name: Category, dtype: int64

In [21]:
train_clean_df, test_clean_df=train_test_split(balanced_train_df, test_size=0.15)

In [22]:
train_clean_df

Unnamed: 0,Category,Tweet
4280,positive,According to @SalSports we may see little to n...
1790,positive,Messi and Ronaldo both go up a rating in FUT 1...
1907,positive,"it's August 5 in MetLife stadium, stay safe Ha..."
5513,positive,"valentine's day, jesus christ you got some goo..."
1479,negative,Sound like Lexus I'll believe it when I see i...
...,...,...
215,neutral,Today's the 9th? We've got some apple news com...
4964,positive,"@andycastro @MiklCraw4d ok, so he can pretend ..."
3628,neutral,@charles_pence @CarrieKHutchens 1st found gu...
4704,positive,@gorgeousgg @watsb4me hello and good morning l...


In [23]:
test_clean_df

Unnamed: 0,Category,Tweet
1441,neutral,Lexus Is trying to text me about a math proble...
122,neutral,what she gonna buy with $200? 1/4th of an ipad...
3646,neutral,@JimDarcy2 @LTider @RWSurferGirl That law brea...
5008,negative,Theresa May is the Tory leader Labour should f...
5241,neutral,With Tsipras resignaton #Greece will head to t...
...,...,...
5724,positive,I notice several people having issues with SAM...
1459,positive,"Moving out by January and buying a new Lexus, ..."
4313,negative,"T-Mobile promised me next day air, so if I don..."
1325,positive,"Tomorrow... Ima hit this gym thang, and sauna ..."


In [24]:
train_set= list(train_clean_df.to_records(index=False))
test_set = list(test_clean_df.to_records(index=False))

In [25]:
train_set[:10]

[('positive', 'According to @SalSports we may see little to none of Matt Cassel tonight. Predominately T-Mobile and EJ'),
 ('positive', 'Messi and Ronaldo both go up a rating in FUT 16 with Messi a 94 and Ronaldo 93 Suarez is a 90 as 3rd best player in the game.'),
 ('positive', "it's August 5 in MetLife stadium, stay safe Harry, thinking about you the whole night #HarryBeCareful"),
 ('positive', "valentine's day, jesus christ you got some good 'd' right there https://t.co/kM8Q1kF9Je"),
 ('negative', "Sound like Lexus  I'll believe it when I see it  https://t.co/Sm1Xjrf2Pg"),
 ('neutral', "Wonder if Sting will be able to speak with his actual voice on #Raw on Monday, or if they'll dub him over like a movie trailer again."),
 ('neutral', 'I found my old ipod classic from the 5th grade the songs on it oh my god'),
 ('positive', '@ValhallaD6w the sun is out in Limerick and most definitely shining in Kerry. We almost in Ballybunion. Actual blue sky'),
 ('neutral', "Europe spent months tryi

In [26]:
def remove_links_mentions(tweet):
    link_re_pattern = "https?:\/\/t.co/[\w]+"
    mention_re_pattern = "@\w+"
    tweet = re.sub(link_re_pattern, "", tweet)
    tweet = re.sub(mention_re_pattern, "", tweet)
    return tweet.lower()


In [27]:
remove_links_mentions('...and Jeb Bush is third in the polls and losing donors. Be fair and balance...@karlrove @FoxNews. https://t.co/Ka2km3bua6')

'...and jeb bush is third in the polls and losing donors. be fair and balance... . '

In [28]:
train_set = [(label,word_tokenize(remove_links_mentions(tweet)))for label,tweet in train_set]
train_set[:3]

[('positive',
  ['according',
   'to',
   'we',
   'may',
   'see',
   'little',
   'to',
   'none',
   'of',
   'matt',
   'cassel',
   'tonight',
   '.',
   'predominately',
   't-mobile',
   'and',
   'ej']),
 ('positive',
  ['messi',
   'and',
   'ronaldo',
   'both',
   'go',
   'up',
   'a',
   'rating',
   'in',
   'fut',
   '16',
   'with',
   'messi',
   'a',
   '94',
   'and',
   'ronaldo',
   '93',
   'suarez',
   'is',
   'a',
   '90',
   'as',
   '3rd',
   'best',
   'player',
   'in',
   'the',
   'game',
   '.']),
 ('positive',
  ['it',
   "'s",
   'august',
   '5',
   'in',
   'metlife',
   'stadium',
   ',',
   'stay',
   'safe',
   'harry',
   ',',
   'thinking',
   'about',
   'you',
   'the',
   'whole',
   'night',
   '#',
   'harrybecareful'])]

In [29]:
test_set = [(label,word_tokenize(remove_links_mentions(tweet)))for label,tweet in test_set]
test_set[:3]

[('neutral',
  ['lexus',
   'is',
   'trying',
   'to',
   'text',
   'me',
   'about',
   'a',
   'math',
   'problem',
   ',',
   'idk',
   'what',
   'math',
   'is',
   'till',
   'tuesday',
   ',',
   'do',
   "n't",
   'text',
   'me',
   'with',
   'yo',
   'homework',
   '.']),
 ('neutral',
  ['what',
   'she',
   'gon',
   'na',
   'buy',
   'with',
   '$',
   '200',
   '?',
   '1/4th',
   'of',
   'an',
   'ipad']),
 ('neutral',
  ['that',
   'law',
   'breaks',
   'the',
   '14th',
   'amendment',
   ',',
   'so',
   'it',
   'is',
   'in',
   'scotus',
   'power',
   'to',
   'strike',
   'it',
   'down',
   '.'])]

In [30]:
index2word = ["<PAD>", "<SOS>", "<EOS>"]

for ds in [train_set, test_set]:
    for label, tweet in ds:
        for token in tweet:
            if token not in index2word:
                index2word.append(token)

In [31]:
print(index2word[10])
print(len(index2word))
print(index2word)
print(type(index2word))

of
7754
<class 'list'>


In [32]:
word2index ={token : idx for idx, token in enumerate(index2word)}
print(type(word2index))

<class 'dict'>


In [33]:
word2index["the"]

39

In [34]:
def label_map(label):
  if label == "negative":
    return 0
  elif label == "neutral":
    return 1
  else : #positive labels
    return 2     

In [35]:
seq_length = 32

In [36]:
def encode_and_pad(tweet, length):
    sos = [word2index["<SOS>"]]
    eos = [word2index["<EOS>"]]
    pad = [word2index["<PAD>"]]

    if len(tweet) < length - 2: # -2 for SOS and EOS
        n_pads = length - 2 - len(tweet)
        encoded = [word2index[w] for w in tweet]
        print(encoded)
        return sos + encoded + eos + pad * n_pads 
    else: # tweet is longer than possible; truncating
        encoded = [word2index[w] for w in tweet]
        truncated = encoded[:length - 2]
        return sos + truncated + eos

In [37]:
train_encoded = [(encode_and_pad(tweet, seq_length), label_map(label)) for label, tweet in train_set]

[3, 4, 5, 6, 7, 8, 4, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
[41, 42, 43, 44, 26, 45, 46, 47, 48, 49, 50, 47, 51, 52, 53, 39, 54, 55, 56, 57]
[58, 42, 59, 47, 60, 61, 53, 62, 63, 64, 65, 66, 67, 68]
[69, 70, 71, 72, 73, 74, 41, 75, 72, 7, 41]
[72, 97, 98, 99, 100, 101, 102, 39, 103, 104, 39, 105, 86, 41, 106, 98, 107]
[39, 108, 33, 109, 26, 110, 17, 111, 112, 113, 26, 114, 14, 5, 115, 26, 116, 14, 84, 117, 118]
[119, 120, 121, 122, 4, 123, 124, 14, 125, 126, 127, 128, 24, 129, 130, 119, 6, 131, 132, 133, 86, 134, 10, 135, 42, 124]
[72, 73, 80, 136, 26, 137, 42, 138, 139, 140, 14, 141, 29, 142, 143, 56, 144, 56, 145, 56, 146]
[147, 148, 149, 150, 151, 152, 153, 154, 155, 14, 156, 157, 150, 158, 29, 24, 159, 14, 160, 161, 162, 33, 163, 164, 160, 165, 162]
[72, 166, 167, 168, 169, 24, 170, 10, 171, 86, 39, 172, 173, 174, 175, 176, 172, 177]
[75, 53, 178, 179, 180, 109, 4, 39, 181, 17, 182, 14]
[183, 184, 185, 186, 5, 187, 39, 188, 189, 190, 191, 10, 192, 193, 194, 195, 17, 196, 41, 197, 1

[6082, 41, 203, 70, 231, 6, 322, 80, 39, 734, 14, 83, 734, 3080, 3599, 6083, 5860, 765, 389, 322, 64, 14]
[816, 466, 26, 1118, 236, 551, 4, 219, 59, 5, 495, 322, 233, 24, 88, 6086, 14, 88, 4412, 79, 22, 109, 29, 460, 42, 1024, 14]
[39, 69, 10, 140, 29, 267, 268, 529, 6087, 86, 39, 6088, 56, 6089, 56, 6090]
[1432, 3839, 271, 5437, 17, 2234, 339, 39, 1034, 4, 6093, 93, 4808, 10, 934, 935, 14, 4808, 2219, 109, 1131, 4385, 14]
[56, 831, 185, 438, 42, 3876, 3602, 1481, 6, 1444, 4736, 26, 923, 185, 438, 1924, 127, 222, 6094, 363, 345, 806, 14]
[4091, 1110, 457, 709, 10, 567, 921, 837, 4695, 14, 72, 222, 232, 6095, 6, 80, 6096, 1678, 14, 685, 24, 283, 2746, 309]
[1085, 6101, 47, 319, 320, 22, 4, 1767, 411, 775, 1681, 269, 33, 86, 39, 460, 573, 14, 56, 6102]
[6103, 4732, 4216, 1349, 244, 39, 6104, 6105, 185, 467, 6, 233, 321, 1128, 285, 6106, 345, 56, 2187, 56, 6107]
[457, 271, 53, 691, 1561, 86, 645, 253, 70, 2322, 853, 582, 320, 1136, 411, 339, 53, 691, 953]
[78, 6, 430, 53, 63, 3565, 14]
[2

[68, 271, 307, 6465, 324, 26, 39, 6466, 4015, 244, 39, 934, 935, 2435, 26, 6467, 14, 41, 389, 227, 6466, 193, 357, 6468, 286, 14]
[271, 53, 377, 39, 138, 139, 861, 244, 152, 5409, 6469, 1544, 72, 431, 998, 4744, 41, 33, 152, 5409, 6470, 345]
[72, 201, 281, 4, 289, 290, 266, 638, 143]
[1813, 805, 339, 411, 114, 47, 5058, 2580, 10, 380, 53, 178, 2532, 224, 1155, 1571, 47, 233, 24, 6472, 59, 4939]
[1083, 1336, 1337, 6473, 997, 42, 789, 244, 403, 1201, 1764, 2895, 185, 24, 8, 303, 304, 1125, 453, 1548, 126, 6474, 345]
[98, 6475, 102, 39, 148, 6476, 72, 6477, 1124, 567, 305, 4, 39, 530, 222, 125, 33, 322, 3660, 584, 585, 42, 6475, 26, 583, 10, 411, 14]
[816, 39, 2038, 1004, 44, 33, 6213, 47, 419, 47, 1332, 47, 39, 6478, 6479, 47, 17, 6480, 6481, 26, 236, 1224, 29, 557, 6482, 35, 39, 1225, 1150]
[2124, 6, 47, 26, 1114, 6483, 4, 985, 93, 39, 6484, 375, 946, 26, 743, 4287, 4004, 47, 166, 582, 6485, 39, 1724, 1326, 14]
[72, 167, 456, 186, 131, 41, 26, 224, 1428, 4, 863, 39, 131, 408, 6486, 102,

In [38]:
test_encoded = [(encode_and_pad(tweet, seq_length), label_map(label)) for label, tweet in test_set]

[71, 33, 122, 4, 790, 215, 52, 24, 3015, 1275, 47, 994, 457, 3015, 33, 587, 604, 47, 370, 320, 790, 215, 29, 2428, 7070, 14]
[457, 820, 739, 429, 2453, 29, 497, 6290, 198, 7071, 10, 357, 575]
[236, 2085, 6345, 39, 354, 230, 47, 223, 41, 33, 26, 349, 1762, 4, 7072, 41, 1020, 14]
[2124, 6, 33, 39, 743, 1064, 2272, 335, 2748]
[29, 124, 7073, 56, 135, 79, 3125, 4, 39, 1550, 339, 39, 1225, 500, 26, 426, 650, 14, 806, 56, 2704]
[1195, 128, 39, 831, 22, 3575, 139, 2177, 26, 289, 290, 47, 319, 86, 140, 17, 75, 53, 5471, 357, 197, 345]
[2124, 6, 4400, 339, 7077, 4, 2097, 1289, 3886, 17, 7078, 1069, 589, 3565, 26, 39, 4236, 185]
[484, 1020, 4, 625, 645, 47, 7079, 72, 233, 4, 336, 1911, 339, 411, 14, 7080, 7081, 7082, 29, 29, 98, 7083, 4208, 14]
[7084, 4, 39, 1352, 1072, 10, 182, 177, 24, 142, 7085, 10, 6389, 47, 7086, 10, 7087, 14, 783, 586, 236, 7088, 527, 320, 2839, 41, 14, 6580, 1579]
[7089, 6, 233, 24, 2710, 567, 1015, 29, 1012, 5547, 39, 557, 530, 223, 2913, 143]
[7090, 941, 53, 335, 233, 2

In [39]:
for i in train_encoded[:3]:
    print(i)

([1, 3, 4, 5, 6, 7, 8, 4, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 2)
([1, 19, 17, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 19, 24, 30, 17, 20, 31, 32, 33, 24, 34, 35, 36, 37, 38, 26, 39, 40, 14, 2], 2)
([1, 41, 42, 43, 44, 26, 45, 46, 47, 48, 49, 50, 47, 51, 52, 53, 39, 54, 55, 56, 57, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 2)


In [57]:
batch_size = 50


2


In [58]:
train_x = np.array([tweet for tweet, label in train_encoded])
print(train_x)
train_y = np.array([label for tweet, label in train_encoded])
print(train_y)
test_x = np.array([tweet for tweet, label in test_encoded])
test_y = np.array([label for tweet, label in test_encoded])

[[   1    3    4 ...    0    0    0]
 [   1   19   17 ...   40   14    2]
 [   1   41   42 ...    0    0    0]
 ...
 [   1  152   97 ...    0    0    0]
 [   1 2476   17 ...    0    0    0]
 [   1   98 7069 ...    0    0    0]]
[2 2 2 ... 1 2 1]


In [53]:
print((train_x.shape))

(2215, 32)


In [51]:
A=torch.from_numpy(test_x)
print(A)
print(type(A))
print(A.shape)

tensor([[   1,   71,   33,  ...,    0,    0,    0],
        [   1,  457,  820,  ...,    0,    0,    0],
        [   1,  236, 2085,  ...,    0,    0,    0],
        ...,
        [   1,   16, 3331,  ...,    0,    0,    0],
        [   1,  411,  345,  ...,   59, 1546,    2],
        [   1,   41,   42,  ...,    0,    0,    0]], dtype=torch.int32)
<class 'torch.Tensor'>
torch.Size([392, 32])


In [42]:
train_ds = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
test_ds = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

In [43]:
train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size, drop_last=True)
test_dl = DataLoader(test_ds, shuffle=True, batch_size=batch_size, drop_last=True)

In [44]:
class BiLSTM_SentimentAnalysis(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout) :
        super().__init__()

        # The embedding layer takes the vocab size and the embeddings size as input
        # The embeddings size is up to you to decide, but common sizes are between 50 and 100.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # The LSTM layer takes in the the embedding size and the hidden vector size.
        # The hidden dimension is up to you to decide, but common values are 32, 64, 128
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # We use dropout before the final layer to improve with regularization
        self.dropout = nn.Dropout(dropout)

        # The fully-connected layer takes in the hidden dim of the LSTM and
        #  outputs a a 3x1 vector of the class scores.
        self.fc = nn.Linear(hidden_dim, 3)

    def forward(self, x, hidden):
        """
        The forward method takes in the input and the previous hidden state 
        """

        # The input is transformed to embeddings by passing it to the embedding layer
        embs = self.embedding(x)

        # The embedded inputs are fed to the LSTM alongside the previous hidden state
        out, hidden = self.lstm(embs, hidden)

        # Dropout is applied to the output and fed to the FC layer
        out = self.dropout(out)
        out = self.fc(out)

        # We extract the scores for the final hidden state since it is the one that matters.
        out = out[:, -1]
        return out, hidden
    
    def init_hidden(self):
        return (torch.zeros(1, batch_size, 32), torch.zeros(1, batch_size, 32))

In [45]:
model = BiLSTM_SentimentAnalysis(len(word2index), 64, 32, 0.2)
model = model.to(device)


NameError: name 'device' is not defined

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4)

In [None]:
epochs = 50
losses = []
for e in range(epochs):

    h0, c0 =  model.init_hidden()

    h0 = h0.to(device)
    c0 = c0.to(device)

    for batch_idx, batch in enumerate(train_dl):

        input = batch[0].to(device)
        target = batch[1].to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            out, hidden = model(input, (h0, c0))
            loss = criterion(out, target)
            loss.backward()
            optimizer.step()
    losses.append(loss.item())

In [None]:
plt.plot(losses)

In [None]:
batch_acc = []
for batch_idx, batch in enumerate(test_dl):

    input = batch[0].to(device)
    target = batch[1].to(device)

    optimizer.zero_grad()
    with torch.set_grad_enabled(False):
        out, hidden = model(input, (h0, c0))
        _, preds = torch.max(out, 1)
        preds = preds.to("cpu").tolist()
        batch_acc.append(accuracy_score(preds, target.tolist()))

sum(batch_acc)/len(batch_acc)