In [None]:
!python -m spacy download en

In [None]:
import spacy
import torch
import torchtext
from torchtext import datasets

import re
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

### Twitter Sentiment Analysis Dataset
Source: http://thinknook.com/twitter-sentiment-analysis-training-corpus-dataset-2012-09-22/

In [None]:
# Load the raw corpus. error_bad_lines=False makes pandas skip malformed rows
# instead of raising.
# NOTE(review): this keyword was deprecated in pandas 1.3 in favour of
# on_bad_lines='skip' -- confirm the pandas version before changing it.
tweets = pd.read_csv('datasets/tweets/tweets.csv', error_bad_lines = False)

# Optional: uncomment to work on the first 50,000 rows only.
#tweets = tweets.head(50000)

tweets.head()

The dataframe consists of 4 columns, but we only need `Sentiment` and `SentimentText`.

In [None]:
# Keep only the label and the text. Selecting the wanted columns (instead of
# dropping the unwanted ones) lets this cell be re-run safely -- a second
# drop() would raise KeyError -- and pins the column order that the CSV
# export and the torchtext field list rely on.
tweets = tweets[['Sentiment', 'SentimentText']]

tweets.head()

In [None]:
# (rows, columns) of the trimmed dataframe.
tweets.shape

In [None]:
# Distinct label values present in the Sentiment column.
tweets['Sentiment'].unique()

In [None]:
# Number of tweets per sentiment label (class-balance check).
tweets.Sentiment.value_counts()

In [None]:
# Plot the class distribution. Bar labels come from the index of the same
# value_counts() Series that supplies the heights, so every label sits under
# its own count. (unique() lists values in order of first appearance while
# value_counts() is sorted by descending count, so pairing the two can attach
# a count to the wrong label.)
label_counts = tweets.Sentiment.value_counts()

fig, ax = plt.subplots(figsize=(12, 8))

sns.barplot(x=label_counts.index, y=label_counts.values, ax=ax)

ax.set(xlabel='Labels', ylabel='Number of tweets', title='Sentiment label distribution');

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for testing; random_state fixes the shuffle so the
# split is reproducible across runs.
train, test = train_test_split(tweets, test_size=0.2, random_state=42)

In [None]:
# reset_index() returns a new dataframe, so the result must be assigned back;
# otherwise both splits keep the shuffled row labels from train_test_split.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# (rows, columns) of the training and test splits.
train.shape, test.shape

In [None]:
# Persist both splits as CSV without the index column; these are the files the
# TabularDataset cell reads back later.
train.to_csv('datasets/tweets/train_tweets.csv', index=False)
test.to_csv('datasets/tweets/test_tweets.csv', index=False)

In [None]:
# List the dataset directory; the two split files written above should appear.
!ls datasets/tweets

#### Removing non-alphanumeric characters

In [None]:
def tweet_clean(text):
    """Strip URLs and punctuation from a tweet.

    URLs are removed first: once the non-alphanumeric pass has replaced
    ``://``, ``.`` and ``/`` with spaces, the URL pattern can no longer match
    and the link fragments would stay in the text as ordinary tokens.

    Args:
        text: Raw tweet text.

    Returns:
        The text with every ``http(s)://...`` link removed and each run of
        non-alphanumeric characters collapsed to a single space, without
        leading or trailing whitespace.
    """
    text = re.sub(r'https?://\S+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)

    return text.strip()

#### Preprocessing and tokenization, so that the text can be converted into indices

In [None]:
# Only tokenization is needed, so the parser, tagger and NER components are
# switched off when the pipeline is loaded.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])

def tokenizer(s):
    """Clean a raw tweet and return its tokens as lower-cased strings."""
    doc = nlp(tweet_clean(s))
    return [token.text.lower() for token in doc]

In [None]:
# Sanity check: punctuation is stripped and the tokens are lower-cased.
tokenizer("pytorch is #awesome!!")

In [None]:
# Text field: tweets are tokenized with the custom tokenizer defined above.
TEXT = torchtext.data.Field(tokenize = tokenizer)

# Label field: labels are stored as floats, the target type used with the
# BCEWithLogitsLoss criterion created later.
LABEL = torchtext.data.LabelField(dtype = torch.float)

In [None]:
# (column name, field) pairs. NOTE(review): presumably these must follow the
# column order of the CSV files (Sentiment, then SentimentText) -- confirm.
datafields = [('Sentiment', LABEL), ('SentimentText', TEXT)]

#### TabularDataset to read csv files and process them

In [None]:
# Read the two split files back as torchtext datasets; skip_header drops the
# CSV header row and each column is processed by its field from `datafields`.
trn, tst = torchtext.data.TabularDataset.splits(path = 'datasets/tweets/', 
                                                train = 'train_tweets.csv',
                                                test = 'test_tweets.csv',    
                                                format = 'csv',
                                                skip_header = True,
                                                fields = datafields)

In [None]:
# Report how many examples each dataset holds.
print(f'Number of training examples: {len(trn)}')
print(f'Number of testing examples: {len(tst)}')

In [None]:
# Inspect the attributes (label and token list) of the first training example.
vars(trn.examples[0])

In [None]:
# Inspect the attributes (label and token list) of the first test example.
vars(tst.examples[0])

#### Building vocabulary using glove.6B.100d

In [None]:
# Build the text vocabulary from the training set only, capped at 25,000
# tokens, and attach 100-dimensional GloVe vectors to it.
# NOTE(review): unk_init presumably initialises the vectors of vocabulary
# tokens that are missing from GloVe (normal distribution) -- confirm.
TEXT.build_vocab(trn, max_size=25000,
                 vectors="glove.6B.100d",
                 unk_init=torch.Tensor.normal_)

# Map the label values found in the training set to indices.
LABEL.build_vocab(trn)

In [None]:
# The 50 most frequent tokens with their counts.
print(TEXT.vocab.freqs.most_common(50))

In [None]:
# The first ten entries of the index -> token table.
print(TEXT.vocab.itos[:10])

In [None]:
# The label -> index mapping; needed to interpret the model's output later.
print(LABEL.vocab.stoi)

#### Loading the data in batches (`BucketIterator` groups sentences of similar length)

In [None]:
# Batch both datasets 64 examples at a time; sort_key tells the iterator to
# use the token count of each tweet as its length, and the examples inside a
# batch are not sorted.
train_iterator, test_iterator = torchtext.data.BucketIterator.splits(
                                (trn, tst),
                                batch_size = 64,
                                sort_key=lambda x: len(x.SentimentText),
                                sort_within_batch=False)

#### RNN architecture


In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """GRU sentence classifier: embedding -> (bi)GRU -> dropout -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the GRU hidden state per direction.
        output_dim: Number of output logits per sentence.
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU reads the sequence in both directions.
        dropout: Dropout probability applied to the embeddings, between
            stacked GRU layers and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 output_dim, n_layers, bidirectional, dropout):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # nn.GRU applies dropout only between stacked layers and warns when it
        # is requested for a single layer, so pass 0 in that case.
        self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers,
                          bidirectional=bidirectional,
                          dropout=dropout if n_layers > 1 else 0.0)

        # The classifier receives one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Compute one row of logits per sentence.

        Args:
            text: LongTensor of token indices shaped [seq_len, batch_size]
                (the GRU is not batch-first).

        Returns:
            Tensor of logits shaped [batch_size, output_dim].
        """
        embedded = self.dropout(self.embedding(text))

        # hidden: [n_layers * num_directions, batch_size, hidden_dim]
        _, hidden = self.rnn(embedded)

        if self.rnn.bidirectional:
            # Last two entries are the top layer's forward and backward states.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # A single direction has one final state for the top layer.
            hidden = hidden[-1, :, :]

        # The batch dimension is kept even for a single sentence, so callers
        # can always apply .squeeze(1) to the result.
        return self.fc(self.dropout(hidden))

In [None]:
# One embedding row per vocabulary entry.
input_dim = len(TEXT.vocab)

# Must equal the dimensionality of the pre-trained vectors (glove.6B.100d).
embedding_dim = 100

# GRU hidden size per direction, and a single output logit per tweet.
hidden_dim = 20
output_dim = 1

# Two stacked GRU layers, each reading the sequence in both directions.
n_layers = 2
bidirectional = True

# Dropout probability used throughout the model.
dropout = 0.5

In [None]:
# Instantiate the classifier; keyword arguments make explicit which
# hyperparameter feeds which constructor parameter.
model = RNN(vocab_size=input_dim,
            embedding_dim=embedding_dim,
            hidden_dim=hidden_dim,
            output_dim=output_dim,
            n_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout)

In [None]:
# Display the layer summary of the model.
model

We retrieve the embeddings from the field's vocab, and check they're the correct size, [vocab size, embedding dim]

In [None]:
# Vectors attached to the vocabulary by build_vocab.
pretrained_embeddings = TEXT.vocab.vectors

# Expected to be [vocab size, embedding dim].
print(pretrained_embeddings.shape)

Replace the initial weights of the embedding layer with the pre-trained embeddings.

In [None]:
# Overwrite the embedding matrix in place with the pre-trained vectors.
model.embedding.weight.data.copy_(pretrained_embeddings)

In [None]:
# Indices of the unknown-token and padding-token entries in the vocabulary.
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

# Start both special tokens from an all-zero embedding.
# NOTE(review): the embedding layer is created without padding_idx, so these
# rows will still receive gradient updates during training -- confirm intent.
model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

print(model.embedding.weight.data)

#### Optimizer and loss function

In [None]:
import torch.optim as optim

# Adam with its default settings over all model parameters.
optimizer = optim.Adam(model.parameters())

# Binary cross-entropy on raw logits; the sigmoid is applied inside the loss,
# so the model outputs are passed to it without an activation.
criterion = nn.BCEWithLogitsLoss()

#### Training the model

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and report the mean per-batch loss and accuracy.

    Args:
        model: Module mapping ``batch.SentimentText`` to logits whose second
            dimension can be squeezed away.
        iterator: Sized iterable of batches exposing ``SentimentText`` and
            ``Sentiment`` attributes.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called as ``criterion(logits, targets)``.

    Returns:
        Tuple ``(loss, accuracy)``, each averaged over the number of batches.
    """
    model.train()

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        targets = batch.Sentiment

        optimizer.zero_grad()

        logits = model(batch.SentimentText).squeeze(1)
        loss = criterion(logits, targets)

        # A tweet counts as correct when its rounded probability equals its label.
        hits = (torch.sigmoid(logits).round() == targets).float()
        batch_acc = hits.sum() / len(hits)

        loss.backward()
        optimizer.step()

        loss_total += loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
# Number of full passes over the training set.
num_epochs = 20

for epoch in range(num_epochs):
     
    # One epoch of optimisation; returns the mean per-batch loss and accuracy.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% |')

### Testing the model

In [None]:
# Running totals of the per-batch loss and accuracy over the test set.
epoch_loss = 0
epoch_acc = 0

# Evaluation mode turns dropout off.
model.eval()

# No gradients are needed while evaluating.
with torch.no_grad():

    for batch in test_iterator:

        # Logits for the batch, with the trailing output dimension removed.
        predictions = model(batch.SentimentText).squeeze(1)

        loss = criterion(predictions, batch.Sentiment)

        # A tweet counts as correct when its rounded probability equals its label.
        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == batch.Sentiment).float() 
        
        acc = correct.sum()/len(correct)

        epoch_loss += loss.item()
        epoch_acc += acc.item()


# Average over the number of batches (every batch weighs the same).
test_loss = epoch_loss / len(test_iterator)
test_acc = epoch_acc / len(test_iterator)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

#### Predicting the sentiment of a new sentence

In [None]:
# Sentence to classify.
sentence = 'I hate that show' 

# Other sentences to try (re-run the cells below after changing `sentence`):
#   "That movie was really nice"
#   "I hate that show but recently it has been quite good"
#   "That movie was decent but kind of fizzled out towards the end"

In [None]:
# Tokenize with the same function the TEXT field used for the training data
# (cleaning + lower-casing). The vocabulary only contains lower-cased tokens,
# so a raw spaCy tokenization would turn every capitalised word into <unk>.
tokenized = tokenizer(sentence)

In [None]:
# Look up the vocabulary index of every token.
# NOTE(review): presumably out-of-vocabulary tokens map to the <unk> index -- confirm.
indexed = [TEXT.vocab.stoi[t] for t in tokenized]

In [None]:
# 1-D tensor of token indices, shape [seq_len].
tensor = torch.LongTensor(indexed)

In [None]:
# Add a batch dimension of size 1: [seq_len] -> [seq_len, 1].
tensor = tensor.unsqueeze(1)

In [None]:
# Switch off dropout and gradient tracking explicitly, so the prediction does
# not depend on which cell last called model.train() or model.eval().
model.eval()

with torch.no_grad():
    prediction = torch.sigmoid(model(tensor))

In [None]:
# Sigmoid output as a Python float. Values near 1 correspond to the label that
# LABEL.vocab.stoi maps to index 1 -- check the mapping printed earlier.
prediction.item()