<a href="https://colab.research.google.com/github/sujitpal/nlp-deeplearning-ai-examples/blob/master/01_01_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import collections
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

%matplotlib inline

In [None]:
# Mount Google Drive so files stored there can be read through the local filesystem.
from google.colab import drive # helper module provided by the Colab runtime

ROOT = "/content/drive"     # mount point used for the drive
print(ROOT)                 # echo the mount point (optional)

drive.mount(ROOT)           # mount Google Drive at ROOT

/content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%ls "drive/My Drive/nlp-deeplearning-ai-data"

sentiment-01.pt                 training.1600000.processed.noemoticon.csv
testdata.manual.2009.06.14.csv


In [None]:
# Folder that holds the training and test CSV files. The path is relative, so
# it is resolved against the current working directory.
# NOTE(review): presumably relies on the working directory being /content,
# where the drive is mounted at /content/drive -- confirm.
DATA_DIR = "drive/My Drive/nlp-deeplearning-ai-data"

In [None]:
# Load the training CSV, supplying the column names explicitly and decoding
# the file as Latin-1.
train_df = pd.read_csv(os.path.join(DATA_DIR, "training.1600000.processed.noemoticon.csv"), 
                       names=["target", "tid", "tdate", "flag", "user", "text"],
                       encoding="latin1")
# Uncomment to work on a 0.1% random sample of the rows for quick experiments:
# train_df = train_df.sample(frac=0.001)
train_df.head()

Unnamed: 0,target,tid,tdate,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Load the test CSV with the same explicit column names as the training frame.
# No encoding is passed here, so pandas' default decoding is used.
test_df = pd.read_csv(os.path.join(DATA_DIR, "testdata.manual.2009.06.14.csv"),
                      names=["target", "tid", "tdate", "flag", "user", "text"])
test_df.head()

Unnamed: 0,target,tid,tdate,flag,user,text
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [None]:
def preprocess_label(i):
  """Convert a raw target value into a two-element one-hot list.

  Values less than or equal to 2 map to ``[1, 0]``; every other value maps
  to ``[0, 1]``.

  NOTE(review): presumably the raw targets are 0 / 2 / 4 for negative /
  neutral / positive, in which case neutral rows are folded into the first
  class -- confirm this is intended.
  """
  if i <= 2:
    return [1, 0]
  return [0, 1]

def preprocess_text(s):
  """Lower-case ``s`` and remove every whitespace-separated token that starts with "@".

  The remaining tokens are joined back together with single spaces, so runs
  of whitespace in the input collapse to one space.
  """
  kept = []
  for word in s.lower().split():
    if word.startswith("@"):
      continue
    kept.append(word)
  return " ".join(kept)

# Clean every training tweet and one-hot encode its target.
texts_trainval = [preprocess_text(t) for t in train_df["text"].values]
labels_trainval = [preprocess_label(i) for i in train_df["target"].values]

# Hold out 20% of the training data for validation. random_state pins the
# shuffle so the split -- and the vocabulary and model fitted on it -- is the
# same on every run.
texts_train, texts_val, labels_train, labels_val = train_test_split(
    texts_trainval, labels_trainval, test_size=0.2, random_state=42)

# The test file is preprocessed the same way as the training data.
texts_test = [preprocess_text(t) for t in test_df["text"].values]
labels_test = [preprocess_label(i) for i in test_df["target"].values]

# Sanity check: texts and labels must have matching lengths in every split.
print(len(texts_train), len(labels_train), len(texts_val), 
      len(labels_val), len(texts_test), len(labels_test))

1280000 1280000 320000 320000 498 498


In [None]:
# Bag-of-words count features. The vocabulary is fitted on the training texts
# only; validation and test texts are transformed with that fitted vocabulary.
# min_df / max_features restrict the vocabulary (at most 5000 terms).
vectorizer = CountVectorizer(min_df=5, max_features=5000)
Xtrain = vectorizer.fit_transform(texts_train)
Xval = vectorizer.transform(texts_val)
Xtest = vectorizer.transform(texts_test)

# Show the three feature-matrix shapes as the cell output.
Xtrain.shape, Xval.shape, Xtest.shape

((1280000, 5000), (320000, 5000), (498, 5000))

In [None]:
class SentimentDataset(Dataset):
  """Dataset over a row-indexable sparse feature matrix with optional labels.

  Args:
    X: matrix whose rows are examples; ``X[i]`` must support ``.toarray()``
      and ``X.shape[0]`` is the number of examples.
    y: optional indexable collection of per-example labels (``y[i]`` is
      converted to a float32 tensor). ``None`` means unlabeled data.

  Each item is a pair ``(features, label)`` where ``features`` is a 1-D
  float32 tensor of length ``X.shape[1]`` and ``label`` is a float32 tensor,
  or ``None`` when no labels were given.

  NOTE(review): DataLoader's default collate function is not expected to
  accept ``None`` entries -- confirm before batching unlabeled data.
  """

  def __init__(self, X, y=None):
    self.X = X
    self.y = y

  def __len__(self):
    return self.X.shape[0]

  def __getitem__(self, i):
    # A single sparse row densifies to a 2-D (1, num_features) array. Flatten
    # it so a batch is (batch, num_features) rather than
    # (batch, 1, num_features), which would not line up with the
    # (batch, num_classes) targets.
    dense_row = np.asarray(self.X[i].toarray()).ravel()
    x = torch.tensor(dense_row, dtype=torch.float32)
    if self.y is None:
      return x, None
    return x, torch.tensor(np.array(self.y[i]), dtype=torch.float32)

# Wrap each split's feature matrix and label list (converted to a NumPy array)
# in a SentimentDataset.
train_ds = SentimentDataset(Xtrain, np.array(labels_train))
val_ds = SentimentDataset(Xval, np.array(labels_val))
test_ds = SentimentDataset(Xtest, np.array(labels_test))

In [None]:
# Batch loaders (32 examples per batch, 4 loader workers each). Only the
# training loader shuffles; validation and test keep their original order.
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=4)
val_dl = DataLoader(val_ds, batch_size=32, shuffle=False, num_workers=4)
test_dl = DataLoader(test_ds, batch_size=32, shuffle=False, num_workers=4)

In [None]:
class SentimentLogisticNet(torch.nn.Module):
  """Single linear layer followed by a softmax over the class scores.

  Args:
    vocab_size: number of input features per example.
    output_size: number of output classes.
  """

  def __init__(self, vocab_size, output_size):
    super().__init__()
    self.linear = nn.Linear(vocab_size, output_size)

  def forward(self, x):
    """Return class probabilities for ``x`` of shape ``(..., vocab_size)``.

    The result has shape ``(..., output_size)`` and sums to 1 along the last
    axis.
    """
    x = self.linear(x)
    # Normalise over the class axis explicitly. Calling softmax without `dim`
    # is deprecated and, for 3-D input, picks dim 0 (the batch axis), which
    # yields outputs that are not per-example probability distributions.
    x = F.softmax(x, dim=-1)
    return x

# Build the model: input width equals the size of the fitted vocabulary and
# there are two outputs, one per label position.
net = SentimentLogisticNet(vocab_size=len(vectorizer.vocabulary_),
                           output_size=2) 
# Display the module structure as the cell output.
net

SentimentLogisticNet(
  (linear): Linear(in_features=5000, out_features=2, bias=True)
)

In [None]:
def train(net, dev, train_dl, val_dl, num_epochs=10, lr=1e-3):
  """Optimise ``net`` with Adam and print metrics after every epoch.

  Args:
    net: model whose outputs are probabilities compatible with
      ``F.binary_cross_entropy``.
    dev: device that each batch is moved to.
    train_dl: iterable yielding ``(inputs, targets)`` training batches.
    val_dl: iterable of validation batches, passed to ``evaluate``.
    num_epochs: number of passes over ``train_dl``.
    lr: Adam learning rate.

  Returns nothing. For each epoch it prints the example-weighted mean training
  loss together with the validation loss and accuracy from ``evaluate``.
  """
  # Only parameters that require gradients are handed to the optimizer.
  trainable = [p for p in net.parameters() if p.requires_grad]
  optimizer = torch.optim.Adam(trainable, lr=lr)
  for epoch in range(num_epochs):
    net.train()
    running_loss = 0
    seen = 0
    for inputs, targets in train_dl:
      inputs = inputs.to(dev)
      targets = targets.to(dev)
      optimizer.zero_grad()
      probs = net(inputs)
      batch_loss = F.binary_cross_entropy(probs, targets)
      batch_loss.backward()
      optimizer.step()
      # Weight each batch's mean loss by its size so a smaller final batch
      # does not skew the epoch average.
      batch_size = targets.shape[0]
      running_loss += batch_loss.item() * batch_size
      seen += batch_size
    val_loss, val_acc = evaluate(net, dev, val_dl)
    print("EPOCH {:d}: train loss: {:.3f}, val loss: {:.3f}, val acc: {:.3f}"
      .format(epoch, running_loss / seen, val_loss, val_acc))


def evaluate(net, dev, val_dl):
  """Compute mean loss and accuracy of ``net`` over ``val_dl``.

  Args:
    net: model returning class probabilities whose last axis is the class axis.
    dev: device that each batch is moved to.
    val_dl: iterable yielding ``(inputs, one_hot_targets)`` batches.

  Returns:
    ``(loss, accuracy)`` as Python floats: the example-weighted mean binary
    cross-entropy and the fraction of examples whose predicted class matches
    the target class.
  """
  net.eval()
  correct, total, sum_loss = 0, 0, 0.0
  # No gradients are needed for evaluation; skipping them saves memory and time.
  with torch.no_grad():
    for x, y in val_dl:
      x, y = x.to(dev), y.to(dev)
      y_ = net(x)
      loss = F.binary_cross_entropy(y_, y)
      # Targets are one-hot, so compare class indices on both sides. Comparing
      # predicted indices with the raw one-hot rows does not measure accuracy.
      pred = y_.argmax(dim=-1).reshape(-1)
      gold = y.argmax(dim=-1).reshape(-1)
      correct += (pred == gold).sum().item()
      total += y.shape[0]
      sum_loss += loss.item() * y.shape[0]
  return sum_loss / total, correct / total

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move
# the model's parameters to that device.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net.to(dev)

# Train for 3 epochs; per-epoch metrics are printed by train().
train(net, dev, train_dl, val_dl, num_epochs=3)

  
  # This is added back by InteractiveShellApp.init_path()


EPOCH 0: train loss: 1.749, val loss: 1.749, val acc: 1.000
EPOCH 1: train loss: 1.749, val loss: 1.749, val acc: 1.000
EPOCH 2: train loss: 1.749, val loss: 1.749, val acc: 1.000


In [None]:
# Score the trained model on the test loader and report loss and accuracy.
test_loss, test_acc = evaluate(net, dev, test_dl)
print("test loss: {:.3f}, test acc: {:.3f}".format(test_loss, test_acc))

test loss: 1.739, test acc: 1.000


  
