<a href="https://colab.research.google.com/github/rohankavari/DeepLearning/blob/main/TwitterSentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MAIN

In [None]:
import torch
import pandas as pd
import regex as re
import numpy as np
from collections import Counter

from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data.dataset import random_split
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from string import punctuation

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Read Data

In [None]:
DATASET_COLUMNS=['target','ids','date','flag','user','text']
DATASET_ENCODING = "ISO-8859-1"
df = pd.read_csv('/content/drive/MyDrive/dataset/twitter/training.1600000.processed.noemoticon.csv', encoding=DATASET_ENCODING, names=DATASET_COLUMNS)

In [None]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
review_lens = Counter([len(x) for x in df['text']])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 0
Maximum review length: 374


In [None]:
print("There are {} tweets".format(df.shape[0]))

There are 1600000 tweets


In [None]:
print("There are {} positive tweets and {} negative tweets".format(df.target.value_counts()[0],df.target.value_counts()[4]))

There are 800000 positive tweets and 800000 negative tweets


In [None]:
data=df[['target','text']]
data.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


# Remove @usernames from the tweet

In [None]:
def rem_usr(a):
  return re.sub("@[a-z]*","",a)

In [None]:
data['text']=data['text'].apply(rem_usr)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
data.head()

Unnamed: 0,target,text
0,0,"http://twitpic.com/2y1zl - Awww, that's a bum..."
1,0,is upset that he can't update his Facebook by ...
2,0,Kenichan I dived many times for the ball. Mana...
3,0,my whole body feels itchy and like its on fire
4,0,"no, it's not behaving at all. i'm mad. why am..."


# Remove emoji

In [None]:
# !wget https://github.com/ajayshewale/Sentiment-Analysis-of-Text-Data-Tweets-/raw/master/data/emoticons.txt -P /content/drive/MyDrive/dataset/twitter

edit it to remove the words positive and negetive

In [None]:
with open("/content/drive/MyDrive/dataset/twitter/emoticons.txt", encoding = 'utf-8') as f:
  emojis = [i.rstrip() for i in f.readlines()]
def rem_emoji(line):
  return " ".join([word for word in line.split(' ') if word not in emojis])

In [None]:
data['text']=data['text'].apply(rem_emoji)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
data.head()

Unnamed: 0,target,text
0,0,"http://twitpic.com/2y1zl - Awww, that's a bum..."
1,0,is upset that he can't update his Facebook by ...
2,0,Kenichan I dived many times for the ball. Mana...
3,0,my whole body feels itchy and like its on fire
4,0,"no, it's not behaving at all. i'm mad. why am..."


# Remove links

In [None]:
def cleaning_URLs(a):
    return re.sub('(www|http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])','',a)

In [None]:
data['text']=data['text'].apply(cleaning_URLs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
data.head()

Unnamed: 0,target,text
0,0,"- Awww, that's a bummer. You shoulda got Da..."
1,0,is upset that he can't update his Facebook by ...
2,0,Kenichan I dived many times for the ball. Mana...
3,0,my whole body feels itchy and like its on fire
4,0,"no, it's not behaving at all. i'm mad. why am..."


# Making it lowercase

In [None]:
data['text']=data['text'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
data.head()

Unnamed: 0,target,text
0,0,"- awww, that's a bummer. you shoulda got da..."
1,0,is upset that he can't update his facebook by ...
2,0,kenichan i dived many times for the ball. mana...
3,0,my whole body feels itchy and like its on fire
4,0,"no, it's not behaving at all. i'm mad. why am..."


# Removing punctuation

In [None]:
def rem_punc(line):
  return "".join([word for word in line if word not in punctuation])

In [None]:
data['text']=data['text'].apply(rem_punc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
data.head()

Unnamed: 0,target,text
0,0,awww thats a bummer you shoulda got david ...
1,0,is upset that he cant update his facebook by t...
2,0,kenichan i dived many times for the ball manag...
3,0,my whole body feels itchy and like its on fire
4,0,no its not behaving at all im mad why am i he...


# Remove stopwords

In [None]:
def rem_stopwords(line):
  return " ".join([word for word in line.split(' ') if word not in stopwords])

In [None]:
stopwords=stopwords.words('english')
data['text']=data['text'].apply(rem_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
data.head()

Unnamed: 0,target,text
0,0,awww thats bummer shoulda got david carr t...
1,0,upset cant update facebook texting might cry r...
2,0,kenichan dived many times ball managed save 50...
3,0,whole body feels itchy like fire
4,0,behaving im mad cant see


# Encode Lables

In [None]:
data.target.value_counts()

0    800000
4    800000
Name: target, dtype: int64

In [None]:
data.target.replace({0: 0, 4: 1},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [None]:
data.target.value_counts()

0    800000
1    800000
Name: target, dtype: int64

# Generate Vocab

In [None]:
from torchtext.vocab import build_vocab_from_iterator

def yield_tokens():
    for line in data['text']:
        yield line.strip().split()

In [None]:
vocab = build_vocab_from_iterator(yield_tokens(), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [None]:
len(vocab)

586744

In [None]:
torch.save(vocab, '/content/drive/MyDrive/dataset/twitter/vocab/vocab.pth')

# Feature Gen

[358, 43, 1101, 3430, 13, 756, 9845, 1771, 3]

In [None]:
def featureGen(line):
  return vocab(line.strip().split())

In [None]:
featureGen("awww thats bummer shoulda got david carr third day")

[358, 43, 1101, 3430, 13, 756, 9845, 1771, 3]

In [None]:
data['features']=data['text'].apply(featureGen)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
data.head()

Unnamed: 0,target,text,features
0,0,awww thats bummer shoulda got david carr t...,"[358, 43, 1101, 3430, 13, 756, 9845, 1771, 3]"
1,0,upset cant update facebook texting might cry r...,"[671, 12, 437, 442, 1932, 194, 422, 2218, 74, ..."
2,0,kenichan dived many times ball managed save 50...,"[26945, 95728, 214, 251, 1242, 1530, 789, 1139..."
3,0,whole body feels itchy like fire,"[323, 711, 371, 2848, 5, 1052]"
4,0,behaving im mad cant see,"[10705, 1, 486, 12, 21]"


In [None]:
data=data[data['features'].str.len()!=0]

In [None]:
data[data['features'].str.len()>25]

Unnamed: 0,target,text,features
2026,0,bestflights need 2 get ass gear wana go away 2...,"[232860, 35, 31, 4, 454, 2604, 2539, 6, 137, 2..."
24320,0,nguyennoir c ht c b ri thng starr lï¿½c u mï¿½...,"[58776, 656, 10951, 656, 379, 7710, 21588, 158..."
40870,0,n bday may 3 n dad wnt b abl 2 gt ne mac tings...,"[193, 500, 236, 93, 193, 345, 6075, 379, 73108..."
40906,0,3 n bday may 3 n dad wnt b abl 2 gt ne mac tin...,"[93, 193, 500, 236, 93, 193, 345, 6075, 379, 7..."
48547,0,1 ngï¿½y mt mi vt ln vi cï¿½i eclipse cui cï¿½...,"[129, 29596, 4231, 1654, 15343, 16865, 7949, 1..."
...,...,...,...
1562796,1,trá» vá» tá»« phan thiáº¿t äáº§y giã³ náº¯ng...,"[114465, 30092, 27447, 52243, 176856, 117109, ..."
1565029,1,äã£ xong máº¥y viá»c chã­nh nhæ° sá»­a xe rá...,"[40748, 17754, 29580, 35458, 93941, 32869, 860..."
1580177,1,em tháº¥y máº¥y cã¡i tweet gáº§n äã¢y chá» ...,"[657, 23713, 29580, 14083, 150, 329776, 37917,..."
1583033,1,chá» tã­ nhã© mã¬nh cã i tweetdeck thá»­ láº§...,"[75402, 179611, 158998, 12884, 34006, 11916, 9..."


# fixing the dataset size as 1.5 mil 
og:1596610

In [None]:
data.shape

(1600000, 2)

In [None]:
data=data[0:1500000]

In [None]:
data.shape

(1500000, 3)

In [None]:
data.head()

Unnamed: 0,target,text,features
0,0,awww thats bummer shoulda got david carr t...,"[358, 43, 1101, 3430, 13, 756, 9845, 1771, 3]"
1,0,upset cant update facebook texting might cry r...,"[671, 12, 437, 442, 1932, 194, 422, 2218, 74, ..."
2,0,kenichan dived many times ball managed save 50...,"[26945, 95728, 214, 251, 1242, 1530, 789, 1139..."
3,0,whole body feels itchy like fire,"[323, 711, 371, 2848, 5, 1052]"
4,0,behaving im mad cant see,"[10705, 1, 486, 12, 21]"


# ~~ Saving the preprocessed data ~~

In [None]:
# from sklearn.utils import shuffle
# data = shuffle(data)
# data.reset_index(drop=True,inplace=True)

In [None]:
# data.head()

Unnamed: 0,target,text,features
0,1,aleksandrorlov thanks found many play friends ...,"[121379, 29, 218, 214, 200, 109, 707, 29]"
1,1,mstimab pretty good suns cursed ex goin sons s...,"[67939, 134, 2, 3554, 9689, 2178, 512, 3319, 1..."
2,0,9 rows stage lol saw julia outside ah meet gr...,"[588, 11466, 1493, 16, 186, 5577, 291, 380, 35..."
3,1,aww sweet girlie thanx today misssoflyy,"[239, 285, 3853, 1252, 8, 156177]"
4,0,missing chris like whoa,"[238, 1136, 5, 2085]"


In [None]:
# data[0:1500000].to_csv("/content/drive/MyDrive/dataset/twitter/processedData.csv",index=False,encoding="ISO-8859-1")

# ~~ Read the saved pre-processed data ~~

In [None]:
# DATASET_ENCODING = "ISO-8859-1"
# data=pd.read_csv("/content/drive/MyDrive/dataset/twitter/processedData.csv",encoding=DATASET_ENCODING)

In [None]:
# data.head()

Unnamed: 0,target,text,features
0,1,morethanmommy guess reading like quoti didnt k...,"[156871, 151, 368, 5, 1322, 50, 19, 68, 3373, ..."
1,1,mirandabuzzfans mexico school calendar horrible,"[51735, 2372, 74, 4316, 639]"
2,0,write back,"[518, 15]"
3,1,hopped brawndo quite ready bed yet lot fun new...,"[12211, 242226, 409, 119, 70, 130, 205, 47, 25..."
4,1,clubcali actually never,"[262041, 185, 83]"


In [None]:
# data[data['features'].str.len()==295]

Unnamed: 0,target,text,features
188675,1,ì´ì°¨í¼ bandwidthë ë¶ì° íì¼ ìì¤í...,"[583431, 227344, 583290, 583618, 185359, 58140..."


# Padding

In [None]:
data.shape

(1500000, 3)

In [None]:
features=data.features.to_list()
encoded_labels=data.target.to_numpy()

In [None]:
encoded_labels[0:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
review_lens = Counter([len(x) for x in features])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 0
Maximum review length: 37


In [None]:
def pad_features(reviews_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    features=[]
    # implement function
    for i in reviews_ints:
      if len(i)>seq_length:
        features.append(i[:seq_length])
      elif len(i)<seq_length:
        features.append(np.pad(i, (seq_length-len(i),0), 'constant', constant_values=0).tolist())
      else:
        features.append(i)    
    return np.array(features)

In [None]:
seq_length = 37

features = pad_features(features, seq_length=seq_length)

In [None]:
len(features)

1500000

2 × 5 × 67 × 2383

# Split dataset

In [None]:
train_data = TensorDataset(torch.from_numpy(features), torch.from_numpy(encoded_labels))
# dataloader = DataLoader(train_data, batch_size=8, shuffle=True)

In [None]:
len(train_data)

1500000

In [None]:
BATCH_SIZE=1000
num_train = int(len(train_data) * 0.90)
num_test = int(len(train_data) * 0.05)

In [None]:
print(num_train,num_test)

1350000 75000


In [None]:
split_train_, split_valid_ ,split_test= random_split(train_data, [num_train,num_test,len(train_data)-num_train-num_test])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True)
test_dataloader = DataLoader(split_test, batch_size=BATCH_SIZE,
                             shuffle=True)

In [None]:
len(train_dataloader)

1350

In [None]:
for i,b in train_dataloader:
  print(i.shape)

In [None]:
for i,b in valid_dataloader:
  print(i.shape)

# Training

In [None]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [None]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)

        # dropout layer
        self.dropout = nn.Dropout(0.3)

        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)

        # sigmoid function
        sig_out = self.sig(out)

        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels

        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())

        return hidden
        

In [None]:
# Instantiate the model w/ hyperparams
vocab_size = len(vocab)+1
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

# net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers) checkpointing

#########FOR CHECKPOINT################
# Initialize model
net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
# Load state_dict
net.load_state_dict(torch.load('/content/drive/MyDrive/dataset/twitter/model/mod1.pth'))
#########FOR CHECKPOINT################


print(net)

SentimentRNN(
  (embedding): Embedding(586745, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [None]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

4351 sec for 1 epoc

In [None]:
# training params

epochs = 4 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(BATCH_SIZE)

    # batch loop
    for inputs, labels in train_dataloader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(BATCH_SIZE)
            val_losses = []
            net.eval()
            for inputs, labels in valid_dataloader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/4... Step: 100... Loss: 0.374585... Val Loss: 0.385488
Epoch: 1/4... Step: 200... Loss: 0.401490... Val Loss: 0.385888
Epoch: 1/4... Step: 300... Loss: 0.387386... Val Loss: 0.382064
Epoch: 1/4... Step: 400... Loss: 0.405070... Val Loss: 0.381447
Epoch: 1/4... Step: 500... Loss: 0.416662... Val Loss: 0.380805
Epoch: 1/4... Step: 600... Loss: 0.409396... Val Loss: 0.379965
Epoch: 1/4... Step: 700... Loss: 0.389190... Val Loss: 0.379870
Epoch: 1/4... Step: 800... Loss: 0.392654... Val Loss: 0.379140
Epoch: 1/4... Step: 900... Loss: 0.365571... Val Loss: 0.380795
Epoch: 1/4... Step: 1000... Loss: 0.361191... Val Loss: 0.378750
Epoch: 1/4... Step: 1100... Loss: 0.394786... Val Loss: 0.379170
Epoch: 1/4... Step: 1200... Loss: 0.395431... Val Loss: 0.378210
Epoch: 1/4... Step: 1300... Loss: 0.371053... Val Loss: 0.378105
Epoch: 2/4... Step: 1400... Loss: 0.376500... Val Loss: 0.384129
Epoch: 2/4... Step: 1500... Loss: 0.366920... Val Loss: 0.382600
Epoch: 2/4... Step: 1600... Loss: 

KeyboardInterrupt: ignored

In [None]:
## ALWAYS CHANGE THE NUMBER
PATH="/content/drive/MyDrive/dataset/twitter/model/mod2.pth"
torch.save(net.state_dict(), PATH)

In [None]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(BATCH_SIZE)

net.eval()
# iterate over test data
for inputs, labels in test_dataloader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # get predicted outputs
    output, h = net(inputs, h)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_dataloader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.461
Test accuracy: 0.818


# Inference

Preprocessing Pipeline

1.   remove @
2.   remove emoji
3.   remove links
4.   lowercase
5. remove punc



In [None]:
def rem_usr(a):
  return re.sub("@[a-z]*","",a)

with open("/content/drive/MyDrive/dataset/twitter/emoticons.txt", encoding = 'utf-8') as f:
  emojis = [i.rstrip() for i in f.readlines()]
def rem_emoji(line):
  return " ".join([word for word in line.split(' ') if word not in emojis])

def cleaning_URLs(a):
    return re.sub('(www|http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])','',a)

def rem_punc(line):
  return "".join([word for word in line if word not in punctuation])

def rem_stopwords(line):
  stopwords=nltk.corpus.stopwords.words('english')
  return " ".join([word for word in line.split(' ') if word not in stopwords])

def featureGen(line):
  return vocab(line.strip().split())

In [None]:
def predict(net, test_review, sequence_length=37):    
    # print custom response based on whether test_review is pos/neg
    
    test_review=rem_usr(test_review)

    with open("/content/drive/MyDrive/dataset/twitter/emoticons.txt", encoding = 'utf-8') as f:
      emojis = [i.rstrip() for i in f.readlines()]
    test_review=rem_emoji(test_review)
    
    test_review=cleaning_URLs(test_review)

    test_review=test_review.lower()

    test_review=rem_punc(test_review)

    
    test_review=rem_stopwords(test_review)
    
    vocab = torch.load('/content/drive/MyDrive/dataset/twitter/vocab/vocab.pth')
    features=featureGen(test_review)

    features=torch.tensor(features)
    features=torch.reshape(features,(1,-1))
    net.eval()
     # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    # h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs= features.cuda()
    h = net.init_hidden(1)
    # get predicted outputs
    output, h = net(inputs, h)
    
    if output.item()<0.5:
      return ['Negetive',output.item()]
    else:
      return ['Positive',output.item()]
     


In [None]:
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')
vocab = torch.load('/content/drive/MyDrive/dataset/twitter/vocab/vocab.pth')
vocab_size = len(vocab)+1
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2
net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
# Load state_dict
net.load_state_dict(torch.load('/content/drive/MyDrive/dataset/twitter/model/mod2.pth'))
if(train_on_gpu):
    net.cuda()



Training on GPU.


In [None]:
test_review_neg = 'The worst movie I have seen; acting was terrible and I want my money back. This movie had bad acting and the dialogue was slow.'
predict(net, test_review_neg )

['Negetive', 0.0011623827740550041]

In [None]:
test_review_neg = 'The best movie I have seen acting was great.'
predict(net, test_review_neg )

['Positive', 0.7507647275924683]

In [None]:
test_review_neg = 'my dog is not well '
predict(net, test_review_neg )

['Negetive', 0.18143893778324127]

In [None]:
test_review_neg = 'my dog is not fine '
predict(net, test_review_neg )

['Positive', 0.8664206862449646]

# Flask deployment

In [None]:
!pip install flask-ngrok
!pip install flask==0.12.2
!pip install pyngrok

Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25
Collecting flask==0.12.2
  Downloading Flask-0.12.2-py2.py3-none-any.whl (83 kB)
[K     |████████████████████████████████| 83 kB 1.1 MB/s 
Installing collected packages: flask
  Attempting uninstall: flask
    Found existing installation: Flask 1.1.4
    Uninstalling Flask-1.1.4:
      Successfully uninstalled Flask-1.1.4
Successfully installed flask-0.12.2
Collecting pyngrok
  Downloading pyngrok-5.1.0.tar.gz (745 kB)
[K     |████████████████████████████████| 745 kB 5.4 MB/s 
Building wheels for collected packages: pyngrok
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone
  Created wheel for pyngrok: filename=pyngrok-5.1.0-py3-none-any.whl size=19007 sha256=dee0e8832314915e344934b82f251113439880a8fc3ef52c25ee147f96c067ed
  Stored in directory: /root/.cache/pip/wheels/bf/e6/af/ccf6598ecefecd44104069371795cb9b

In [None]:
!ngrok authtoken 2166TespcaSIOXmMUpX6UO0pG4O_3H31LPqBF9FxqH7cCDcyZ

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [None]:
from flask import Flask
from flask_ngrok import run_with_ngrok
from flask import Flask, request, render_template
app = Flask(__name__, template_folder='/')
run_with_ngrok(app)  # Start ngrok when app is run
@app.route("/")
def hello():
    return  render_template("/content/drive/MyDrive/dataset/twitter/webapp/infer.html")

@app.route('/result', methods = ['POST'])
def result():
    if request.method == 'POST':
        to_predict_list = request.form.to_dict()
        # to_predict_list = list(to_predict_list.values())
        # to_predict_list = list(map(int, to_predict_list))
        print(to_predict_list)
        result = predict(net, to_predict_list['text'] )
        prediction=result[0]+" "+str(result[1])     
        return render_template("/content/drive/MyDrive/dataset/twitter/webapp/result.html", prediction = prediction)
if __name__ == '__main__':
    app.run()  # If address is in use, may need to terminate other sessions:
               # Runtime > Manage Sessions > Terminate Other Sessions

 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://372e-107-178-218-119.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [15/Mar/2022 06:03:19] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [15/Mar/2022 06:03:19] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


{'text': 'my dog is not well'}


127.0.0.1 - - [15/Mar/2022 06:03:26] "[37mPOST /result HTTP/1.1[0m" 200 -
