<a href="https://colab.research.google.com/github/shazzad-hasan/practice-deep-learning-with-pytorch/blob/main/text_classification/tweet_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# upload kaggle API key from your local machine
# (Colab-only helper; the following cell copies a file named kaggle.json
#  from the working directory, so upload the key under that name)
from google.colab import files
files.upload()

In [None]:
# make a kaggle dir, copy the API key to it
# and make sure the file in only readable by yourself (chmod 600)
!mkdir ~/.kaggle 
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# use API command to download the dataset
# (the archive is expected to land in the working directory as
#  sentiment140.zip, which is the name the unzip cell uses)
!kaggle datasets download -d kazanova/sentiment140

In [None]:
# uncompress the dataset
!unzip -qq sentiment140.zip

In [None]:
# pinned versions: the import cell below relies on the `torchtext.legacy` API
# NOTE(review): torch is pinned next to torchtext — presumably the matching
#               release pair; confirm before bumping either one
# NOTE(review): googletrans is not referenced by any cell visible in this notebook
!pip install torchtext==0.9.1
!pip install torch==1.8.1
!pip install googletrans==3.1.0a0

In [None]:
# import required libraries
import torch
import torchtext
from torchtext.legacy import data

import numpy as np
import pandas as pd
import random
import spacy

In [None]:
# detect whether a CUDA device can be used and report the result
train_on_gpu = torch.cuda.is_available()
print("CUDA is available" if train_on_gpu else "CUDA is not available")

# every tensor / model in the notebook is moved to this device
device = torch.device('cuda' if train_on_gpu else 'cpu')

In [None]:
# load the raw training CSV; header=None means the file is read without a
# header row, so columns get integer labels (column 0 is used as the
# sentiment label further down)
tweets_df = pd.read_csv("training.1600000.processed.noemoticon.csv",
                       encoding='latin-1', header=None)

In [None]:
# peek at the first rows to see the raw column layout
tweets_df.head(3)

In [None]:
# how many rows there are per raw label value (column 0)
tweets_df[0].value_counts()

### Pre-process the dataset

In [None]:
# view the raw label column (column 0) as a pandas categorical
label_as_category = tweets_df[0].astype('category')
tweets_df["sentiment_cat"] = label_as_category
# the integer code of each category becomes the numeric label (sentiment)
tweets_df["sentiment"] = label_as_category.cat.codes
# write the augmented frame back to disk, without header row or index column
tweets_df.to_csv("train-processed.csv", header=None, index=None)

In [None]:
# first rows again, now including the sentiment_cat / sentiment columns
tweets_df.head(3)

In [None]:
# last rows, to compare the encoded label with the start of the file
tweets_df.tail(3)

In [None]:
# label field; its vocabulary is built from the training split later on
LABEL = data.LabelField()
# tweet text field: spaCy tokenization + lower-casing.
# `tokenize` has to be passed by keyword: the first positional parameter of
# Field is `sequential`, so Field('spacy', ...) left `tokenize` at its default
# and the spaCy tokenizer was never used.
TWEET = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm', lower=True)

# create a list that maps the fields onto the columns of train-processed.csv,
# in column order; columns paired with None are not loaded
fields = [('score',None), ('id',None), ('date',None), ('query',None),
          ('name',None), ('tweet', TWEET), ('category',None), ('label',LABEL)]

In [None]:
# build the torchtext dataset from the processed CSV written earlier;
# `fields` assigns one (name, Field) pair per CSV column, and
# skip_header=False because that file was saved without a header row
tweet_data = data.dataset.TabularDataset(
        path="train-processed.csv", 
        format="CSV", 
        fields=fields,
        skip_header=False)

In [None]:
# split the dataset into train, validation, and test sets (stratified on the label)
# NOTE: split_ratio is interpreted as [train, test, valid], but split() returns
# the datasets in (train, valid, test) order — unpack in that order so the
# names match the data they hold.
train_data, valid_data, test_data = tweet_data.split(split_ratio=[0.8, 0.1, 0.1],
                                            stratified=True, strata_field='label')

print("Num of training data: ", len(train_data))
print("Num of test data: ", len(test_data))
print("Num of validation_data: ", len(valid_data))

In [None]:
# limit the vocabulary in the training data
# (both vocabularies are built from train_data only, so validation / test
#  text does not influence them)
vocab_size = 20000
# NOTE(review): max_size presumably caps the number of regular tokens, with
# special tokens added on top — the model cell uses vocab_size+2; confirm
TWEET.build_vocab(train_data, max_size = vocab_size)
LABEL.build_vocab(train_data)
# most common words in the vocabulary
TWEET.vocab.freqs.most_common(10)

In [None]:
# define data loader
# one iterator per split, returned in the same order the splits are passed in
# (train, valid, test); batches are placed on `device`
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = 32,
    device = device,
    # sort key: number of tokens in the tweet
    # NOTE(review): presumably used to batch tweets of similar length — confirm
    sort_key = lambda x: len(x.tweet),
    sort_within_batch = False)

### Define a model

In [None]:
import torch.nn as nn

class Net(nn.Module):
  """LSTM text classifier: embedding -> LSTM -> linear layer.

  Args:
    num_embeddings: number of rows in the embedding table.
    embedding_dim: size of each token embedding.
    hidden_dim: size of the LSTM hidden state.
    output_dim: number of scores produced per example.
    num_layers: number of stacked LSTM layers.
  """

  def __init__(self, num_embeddings, embedding_dim, hidden_dim, output_dim, num_layers):
    super(Net, self).__init__()

    self.embedding = nn.Embedding(num_embeddings, embedding_dim)
    self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, seq):
    """Score a batch of token-index sequences.

    Args:
      seq: integer tensor of token indices laid out as (seq_len, batch);
        the LSTM is created with its default batch_first=False.

    Returns:
      Tensor of shape (batch, output_dim) with unnormalized class scores.
    """
    _, (hidden, _) = self.encoder(self.embedding(seq))
    # hidden is (num_layers, batch, hidden_dim). Use the last layer's final
    # state; squeeze(0) only removed the layer axis when num_layers == 1 and
    # otherwise produced a (num_layers, batch, output_dim) result.
    # NOTE: sequences are not packed, so padded positions also pass through the LSTM.
    return self.fc(hidden[-1])

In [None]:
# model hyper-parameters
num_layers = 1
output_dim = 2
hidden_dim = 256
embedding_dim = 400
# NOTE(review): the +2 presumably accounts for special tokens added on top of
# the vocab_size cap — confirm against len(TWEET.vocab)
num_embeddings = vocab_size+2

model = Net(
    num_embeddings=num_embeddings,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    num_layers=num_layers,
)
# move the parameters to the selected device; as the last expression of the
# cell this also displays the module structure
model.to(device)

### Define a loss function and an optimizer

In [None]:
import torch.optim as optim

# specify loss function
# (the model's forward returns the raw output of its linear layer, which is
#  what is passed to this criterion in train())
criterion = nn.CrossEntropyLoss()

# specify optimizer
lr = 0.01
# all parameters of `model` are optimized
params = model.parameters()
optimizer = optim.Adam(params, lr=lr)

### Train the model

In [None]:
def train(epochs, model, optimizer, criterion, train_iterator, valid_iterator):
  """Run one pass over the training data followed by one validation pass.

  Args:
    epochs: not used by this function; kept so existing calls keep working.
    model: module called as model(batch.tweet).
    optimizer: optimizer stepping the model parameters.
    criterion: loss called as criterion(predictions, batch.label); assumed to
      return the mean loss over the batch (the nn.CrossEntropyLoss default).
    train_iterator: iterable of batches exposing `.tweet` and `.label`.
    valid_iterator: iterable of batches exposing `.tweet` and `.label`.

  Returns:
    (train_loss, valid_loss): average loss per example for each pass
    (0.0 for a pass that saw no examples).
  """
  train_total, train_count = 0.0, 0

  model.train()
  for batch in train_iterator:
    optimizer.zero_grad()
    loss = criterion(model(batch.tweet), batch.label)
    loss.backward()
    optimizer.step()
    # weight by the number of examples: batch.tweet is (seq_len, batch), so
    # its size(0) is the sequence length, not the batch size
    batch_size = batch.label.size(0)
    train_total += loss.item() * batch_size
    train_count += batch_size

  valid_total, valid_count = 0.0, 0

  model.eval()
  with torch.no_grad():
    for batch in valid_iterator:
      loss = criterion(model(batch.tweet), batch.label)
      batch_size = batch.label.size(0)
      valid_total += loss.item() * batch_size
      valid_count += batch_size

  # divide by the number of examples (not the number of batches) so the
  # result is a true per-example mean
  train_loss = train_total / train_count if train_count else 0.0
  valid_loss = valid_total / valid_count if valid_count else 0.0

  return train_loss, valid_loss

In [None]:
num_epochs = 5
# per-epoch loss history
train_losses = []
valid_losses = []

for epoch in range(1, num_epochs+1):
  # each call performs one training pass and one validation pass
  train_loss, valid_loss = train(
      num_epochs, model, optimizer, criterion, train_iterator, valid_iterator)

  train_losses.append(train_loss)
  valid_losses.append(valid_loss)

  print('Epoch: {} | Training Loss: {:.2f} | Validation Loss: {:.2f}'.format(epoch, train_loss, valid_loss))

### Test the model

In [None]:
def classify_tweet(tweet):
  """Predict the sentiment category of a single raw tweet.

  Emulates the dataset pipeline by hand: TWEET.preprocess is applied to the
  raw text, TWEET.process turns the result into a one-example batch tensor,
  and the model scores it.

  Args:
    tweet: raw tweet text.

  Returns:
    "Negative" or "Positive".
  """
  categories = {0: "Negative", 1: "Positive"}
  processed = TWEET.process([TWEET.preprocess(tweet)]).to(device)

  model.eval()
  # inference only: no autograd graph needed
  with torch.no_grad():
    class_idx = model(processed).argmax().item()

  # translate the model's output index back to the label value stored in the
  # CSV via the LABEL vocabulary, instead of assuming the vocabulary order
  # happens to match the 0/1 codes
  return categories[int(LABEL.vocab.itos[class_idx])]

In [None]:
# sanity check on a hand-written example; the cell displays the category
# string returned by classify_tweet
tweet1 = "Watching this movie is just waste of time"
classify_tweet(tweet1)

In [None]:
# second hand-written example; the cell displays the category string
# returned by classify_tweet
tweet2 = "This movie is one of my favorite movies"
classify_tweet(tweet2)