<a href="https://colab.research.google.com/github/sujitpal/nlp-deeplearning-ai-examples/blob/master/01_01_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import collections
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

%matplotlib inline

## Mount Drive

In [2]:
# Mount Google Drive
from google.colab import drive # import drive from google colab

ROOT = "/content/drive"     # default location for the drive
print(ROOT)                 # print content of ROOT (Optional)

drive.mount(ROOT)           # we mount the google drive at /content/drive

/content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%ls "drive/My Drive/nlp-deeplearning-ai-data"

testdata.manual.2009.06.14.csv  training.1600000.processed.noemoticon.csv


In [4]:
# %rm "drive/My Drive/nlp-deeplearning-ai-data/vocab.pkl"
# %rm "drive/My Drive/nlp-deeplearning-ai-data/index2word.pkl"
# %rm "drive/My Drive/nlp-deeplearning-ai-data/word2index.pkl"

In [5]:
DATA_DIR = "drive/My Drive/nlp-deeplearning-ai-data"
CUDA_LAUNCH_BLOCKING="1"

## Data Preprocessing

In [6]:
train_df = pd.read_csv(os.path.join(DATA_DIR, "training.1600000.processed.noemoticon.csv"), 
                       names=["target", "tid", "tdate", "flag", "user", "text"],
                       encoding="latin1")
train_df.head()

Unnamed: 0,target,tid,tdate,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
test_df = pd.read_csv(os.path.join(DATA_DIR, "testdata.manual.2009.06.14.csv"),
                      names=["target", "tid", "tdate", "flag", "user", "text"])
test_df.head()

Unnamed: 0,target,tid,tdate,flag,user,text
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [8]:
test_df["target"].unique()

array([4, 0, 2])

### Decide thresholds

These are one-time computations needed to determine thresholds
* __vocabulary size__ : vocabulary is composed of the most frequent words used in training set, words outside this vocabulary will be marked as UNK.
* __maximum length of input sequence__ : we want to figure out the distribution of input sentence lengths (in tokens) so we have an optimum max length. Sentences longer than this length will be truncated, and sentences shorter than this will be PADded.

In [9]:
vocab_file = os.path.join(DATA_DIR, "vocab.pkl")
if not os.path.exists(vocab_file):
  num_recs = len(train_df)
  word_counts = collections.Counter()
  for i, text in enumerate(train_df["text"].values):
    if i % (num_recs // 100) == 0:
      print(".", end="")
    text = text.lower()
    for token in text.split():
      if token.startswith("@"):
        continue
      word_counts[token] += 1
  print(".")

len([w for w, c in word_counts.most_common() if c > 10])

.....................................................................................................


51145

In [10]:
num_tokens = []
for text in train_df.text.values:
  num_tokens.append(len(text.split()))

np.percentile(np.array(num_tokens), np.array([75, 80, 90, 95, 99]))

array([19., 20., 23., 25., 28.])

### Dataset and DataLoader

In [11]:
# train_df = train_df.sample(frac=0.001)
texts = train_df["text"].values
labels = train_df["target"].values

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2)
len(train_texts), len(train_labels), len(val_texts), len(val_labels)

(1280000, 1280000, 320000, 320000)

In [12]:
class SentimentDataset(Dataset):
  def __init__(self, texts, labels=None, vocab=None, 
               vocab_size=50000, maxlen=30):
    self.texts = texts
    self.labels = labels
    if vocab is None:
      self.vocab_ = self._build_vocab(texts, vocab_size)
    else:
      self.vocab_ = vocab
    self.word2index_ = {w:i+2 for i, w in enumerate(self.vocab_)}
    self.word2index_["PAD"] = 0
    self.word2index_["UNK"] = 1
    self.index2word_ = {v:k for k, v in self.word2index_.items()}
    self.vocab_size = vocab_size
    self.maxlen = maxlen

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    tokens = []
    for token in self.texts[idx].split():
      if token in self.word2index_.keys():
        tokens.append(self.word2index_[token])
      else:
        tokens.append(self.word2index_["UNK"])
    if len(tokens) > self.maxlen:
      tokens = tokens[0:self.maxlen]
    elif len(tokens) < self.maxlen:
      tokens = [self.word2index_["PAD"]] * (self.maxlen - len(tokens)) + tokens
    else:
      pass
    tokens = torch.tensor(tokens, dtype=torch.long)
    if self.labels is not None:
      label = self.labels[idx]
      label = 1 if label == 4 else 0
      return (tokens, label)
    else:
      return tokens

  def _build_vocab(self, texts, vocab_size):
    num_recs = len(texts)
    word_counts = collections.Counter()
    for i, text in enumerate(texts):
      if i % (num_recs // 100) == 0:
        print(".", end="")
      text = text.lower()
      for token in text.split():
        if token.startswith("@"):
          continue
        word_counts[token] += 1
    # truncate to vocab_size
    vocab = [w for w, c in word_counts.most_common(vocab_size)]
    vocab.append("PAD")
    vocab.append("UNK")
    return vocab

train_dataset = SentimentDataset(train_texts, train_labels)
val_dataset = SentimentDataset(val_texts, val_labels, 
                               vocab=train_dataset.vocab_)
test_dataset = SentimentDataset(test_df["text"].values,
                                labels=test_df["target"].values,
                                vocab=train_dataset.vocab_)
train_dataset[10]

....................................................................................................

(tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             1,     1,    75,   137,  1734,   125,  1615,     4,   175,   467,
             1,    12,   543,   102,     7,     8,    55,    28,    62, 47222]),
 1)

In [13]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

### Network Definition

In [14]:
class SentimentNet(torch.nn.Module):
  def __init__(self, vocab_size, embed_dim, hidden_dim, num_targets):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
    self.dropout = nn.Dropout(0.2)
    self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
    self.linear = nn.Linear(hidden_dim, num_targets)

  def forward(self, x):
    x = self.embedding(x)
    x = self.dropout(x)
    lstm_out, (ht, ct) = self.lstm(x)
    return self.linear(ht[-1])


net = SentimentNet(vocab_size=len(train_dataset.vocab_), 
                   embed_dim=128,
                   hidden_dim=50,
                   num_targets=2)
net

SentimentNet(
  (embedding): Embedding(50002, 128, padding_idx=0)
  (dropout): Dropout(p=0.2, inplace=False)
  (lstm): LSTM(128, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=2, bias=True)
)

### Training Loop

In [15]:
def train(net, train_dataloader, val_dataloader, dev, num_epochs=10, lr=0.001):
  params = filter(lambda p: p.requires_grad, net.parameters())
  optimizer = torch.optim.Adam(params, lr=lr)
  for i in range(num_epochs):
    net.train()
    sum_loss, total = 0, 0
    for x, y in train_dataloader:
      x, y = x.to(dev), y.to(dev)
      # print(x.size(), y.size())
      y_ = net(x)
      optimizer.zero_grad()
      loss = F.cross_entropy(y_, y)
      loss.backward()
      optimizer.step()
      sum_loss += loss.item() * y.shape[0]
      total += y.shape[0]
    val_loss, val_acc = evaluate(net, val_dataloader, dev)
    print("EPOCH {:d}: train loss: {:.3f}, val loss: {:.3f}, val acc: {:.3f}"
      .format(i, sum_loss / total, val_loss, val_acc))
    

def evaluate(net, val_dataloader, dev):
  net.eval()
  correct, total, sum_loss = 0, 0, 0
  for x, y in val_dataloader:
    x, y = x.to(dev), y.to(dev)
    y_ = net(x)
    loss = F.cross_entropy(y_, y)
    _, pred = torch.max(y_, 1)
    correct += (pred == y).float().sum()
    total += y.shape[0]
    sum_loss += loss.item() * y.shape[0]
  return sum_loss / total, correct / total

In [16]:
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# dev = torch.device("cpu")
net.to(dev)

train(net, train_dataloader=train_dataloader, val_dataloader=val_dataloader, dev=dev)

EPOCH 0: train loss: 0.486, val loss: 0.445, val acc: 0.791
EPOCH 1: train loss: 0.438, val loss: 0.435, val acc: 0.797
EPOCH 2: train loss: 0.423, val loss: 0.432, val acc: 0.800
EPOCH 3: train loss: 0.412, val loss: 0.429, val acc: 0.801
EPOCH 4: train loss: 0.404, val loss: 0.431, val acc: 0.802
EPOCH 5: train loss: 0.398, val loss: 0.432, val acc: 0.802
EPOCH 6: train loss: 0.393, val loss: 0.434, val acc: 0.802
EPOCH 7: train loss: 0.388, val loss: 0.430, val acc: 0.802
EPOCH 8: train loss: 0.385, val loss: 0.435, val acc: 0.802
EPOCH 9: train loss: 0.381, val loss: 0.436, val acc: 0.803


### Evaluate on Test set

In [17]:
test_loss, test_acc = evaluate(net, test_dataloader, dev)
print("test_loss: {:.3f}, test_acc: {:.3f}".format(test_loss, test_acc))

test_loss: 0.819, test_acc: 0.627


In [18]:
torch.save(net.state_dict(), os.path.join(DATA_DIR, "sentiment-01.pt"))


In [19]:
%ls "drive/My Drive/nlp-deeplearning-ai-data"

sentiment-01.pt                 training.1600000.processed.noemoticon.csv
testdata.manual.2009.06.14.csv
