Code based in HW1
Credits: This assignment and notebook was originally created by Zewei Chu (zeweichu@uchicago.edu)

In [1]:
import torch
import torch.utils.data as tud
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter, defaultdict
import operator
import os, math
import numpy as np
import random
import copy
import pandas as pd

# Feel free to define your own word_tokenizer instead of this naive 
# implementation. You may also use word_tokenize from nltk library 
# (from nltk import word_tokenize), which works better but slower. 
def word_tokenize(s):
    return s.split()

# set the random seeds so the experiments can be replicated exactly
seed = 30255
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

# Global class labels.
POS_LABEL = 'pos'
NEG_LABEL = 'neg'     

In [2]:
def load_data(data_file):
    data = []
    with open(data_file,'r',  encoding="utf8") as fin:
        for line in fin:
            label, content = line.split(",", 1)
            data.append((content.lower(), label))
    return data
data_dir = "large_movie_review_dataset"
train_data = load_data(os.path.join(data_dir, "train.txt"))
dev_data = load_data(os.path.join(data_dir, "dev.txt"))

In [3]:
print("number of TRAIN data", len(train_data))
print("number of DEV data", len(dev_data))

number of TRAIN data 25000
number of DEV data 5000


We have defined a generic model class as below. The model has 2 functions, train and classify. 

In [4]:
VOCAB_SIZE = 5000
class Model:
    def __init__(self, data):
        # Vocabulary is a set that stores every word seen in the 
        # training data
        self.vocab = Counter([word for content, label in data 
                              for word in word_tokenize(content)]
                            ).most_common(VOCAB_SIZE-1)
        # word to index mapping
        self.word_to_idx = {k[0]: v+1 for v, k in 
                            enumerate(self.vocab)}
        # all the unknown words will be mapped to index 0
        self.word_to_idx["UNK"] = 0 
        self.idx_to_word = {v:k for k, v in self.word_to_idx.items()}
        self.label_to_idx = {POS_LABEL: 0, NEG_LABEL: 1}
        self.idx_to_label = [POS_LABEL, NEG_LABEL]
        self.vocab = set(self.word_to_idx.keys())
        
    def train_model(self, data):
        '''
        Train the model with the provided training data
        '''
        raise NotImplementedError
        
    def classify(self, data):
        '''
        Classify the documents with the model
        '''
        raise NotImplementedError

# Logistic Regression with Bag of Words

In [5]:
class TextClassificationDataset(tud.Dataset):
    '''
    PyTorch provides a common dataset interface. 
    See https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
    The dataset encodes documents into indices. 
    With the PyTorch dataloader, you can easily get batched data for 
    training and evaluation. 
    '''
    def __init__(self, word_to_idx, data):
        
        self.data = data
        self.word_to_idx = word_to_idx
        self.label_to_idx = {POS_LABEL: 0, NEG_LABEL: 1}
        self.vocab_size = VOCAB_SIZE
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = np.zeros(self.vocab_size)
        
        item = torch.FloatTensor(item)
        # in training or tuning, we use both the document (review)
        # and its corresponding label
        if len(self.data[idx]) == 2: 
            for word in word_tokenize(self.data[idx][0]):
                item[self.word_to_idx.get(word, 0)] += 1
            label = self.label_to_idx[self.data[idx][1]]
            #label = torch.LongTensor([label])
            return item, label
        else: # in testing, we only use the document without label
            for word in word_tokenize(self.data[idx]):
                item[self.word_to_idx.get(word, 0)] += 1
            return item

In [6]:
best_model = None
class BoWLRClassifier(nn.Module, Model):
    '''
    Define your logistic regression model with bag of words features.
    '''
    def __init__(self, data):
        nn.Module.__init__(self)
        Model.__init__(self, data)
        
        '''
        In this model initialization phase, write code to do the 
        following: 
        1. Define a linear layer to transform bag of words features 
           into 2 classes. 
        2. Define the loss function; use cross entropy loss (see
            https://pytorch.org/docs/stable/nn.html?highlight=crossen#torch.nn.CrossEntropyLoss)
        3. Define an optimizer for the model; choose the Adam optimizer,
           which uses a version of the stochastic gradient descent 
           algorithm. (See https://pytorch.org/docs/stable/optim.html?highlight=sgd#torch.optim.Adam)
        '''
        self.mod= Model(data)
        self.linear = nn.Linear(VOCAB_SIZE, len(self.mod.label_to_idx))
        self.loss = nn.CrossEntropyLoss()
        self.opt = optim.Adam(self.linear.parameters())
        
        
    def forward(self, bow):
        '''
        Run the linear layer in the model for a single bag of words vector. 
        '''
        # WRITE YOUR CODE HERE
        # (You might be wondering why we don't explicitly have a
        # softmax component in our model. It is included in something
        # defined earlier. In what?)
        return self.linear(bow)
    
    
    def train_epoch(self, train_data):
        '''
        Train the model for one epoch with the training data
        When training a model, you repeat the following procedure:
        1. Get one batch of features and labels
        2. Make a forward pass with the features to get predictions
        3. Calculate the loss with the predictions and target labels
        4. Run a backward pass from the loss function to get the gradients
        5. Apply the optimizer step to update the model paramters
        
        For (1) you will have to understand how the PyTorch dataloader
        functions.
        '''
        # WRITE YOUR CODE HERE
        
        #mod = Model(train_data)
        #w2i = Model(train_data).word_to_idx
        #batches = TextClassificationDataset(w2i, train_data)
        for i in range(len(train_data)):
            self.linear.zero_grad()
            x, y = train_data[i]
            pred = self.forward(x)
            loss = self.loss(pred.view(1, -1), torch.LongTensor([y]))
            loss.backward()
            self.opt.step()
            if i%5000==0:
                print('still working: {}% of completion of this epoch. Loss = {}'.format((i/len(train_data))*100,loss))
        
    
    def classify(self, docs):
        '''
        This function classifies documents into their categories. 
        docs are documents without labels.
        '''
        # WRITE YOUR CODE HERE
        probs = self.forward(docs)
        cl = 1
        if probs[0] >= probs[1]:
            cl = 0
        return cl
                
    def evaluate_classifier_accuracy(self, data):
        '''
        This function evaluates the data with the current model. 
        data contains both documents and labels. 
        It calls classify() to make predictions, 
        and compares with the correct labels to return 
        the model accuracy on "data". 
        '''
        # WRITE YOUR CODE HERE
        correct = 0
        for i in range(len(data)):
            bow, label = data[i]
            pred = self.classify(bow)
            if pred == label:
                correct += 1
        return correct/len(data)
            
    
    def train_model(self, train_data, dev_data):
        """
        This function processes the entire training set for multiple epochs.
        After each training epoch, evaluate your model on the DEV set. 
        Save the best performing model on the DEV set to best_model
        """  
        num_epoch = 5
        batches_train=TextClassificationDataset(self.mod.word_to_idx, train_data)
        batches_dev=TextClassificationDataset(self.mod.word_to_idx, dev_data)
        best_acc = 0
        for e in range(num_epoch):
            self.train_epoch(batches_train)
            acc = self.evaluate_classifier_accuracy(batches_dev)
            if acc < best_acc:
                return 0
            best_acc = acc
            print('The accuracy after epoch {} is {}%'.format(e+1,round(acc*100, 2)))
        return acc

Train the model

In [116]:
lr_model = BoWLRClassifier(train_data)

In [117]:
lr_model.train_model(train_data, dev_data)

still working: 0.0% of completion of this epoch. Loss = 0.6903829574584961
still working: 20.0% of completion of this epoch. Loss = 0.07361388206481934
still working: 40.0% of completion of this epoch. Loss = 0.0046694278717041016
still working: 60.0% of completion of this epoch. Loss = 1.9550323486328125e-05
still working: 80.0% of completion of this epoch. Loss = 0.007635593414306641
The accuracy after epoch 1 is 86.3%
still working: 0.0% of completion of this epoch. Loss = 8.726119995117188e-05
still working: 20.0% of completion of this epoch. Loss = 0.014612436294555664
still working: 40.0% of completion of this epoch. Loss = 1.621246337890625e-05
still working: 60.0% of completion of this epoch. Loss = 3.7670135498046875e-05
still working: 80.0% of completion of this epoch. Loss = 0.0024690628051757812


0

# Tuning the model

In [7]:
class BoWLRClassifierT(nn.Module, Model):
    '''
    Define your logistic regression model with bag of words features.
    '''
    def __init__(self, data, EPOCHs, op, lr, weight_decay):
        nn.Module.__init__(self)
        Model.__init__(self, data)
        
        '''
        In this model initialization phase, write code to do the 
        following: 
        1. Define a linear layer to transform bag of words features 
           into 2 classes. 
        2. Define the loss function; use cross entropy loss (see
            https://pytorch.org/docs/stable/nn.html?highlight=crossen#torch.nn.CrossEntropyLoss)
        3. Define an optimizer for the model; choose the Adam optimizer,
           which uses a version of the stochastic gradient descent 
           algorithm. (See https://pytorch.org/docs/stable/optim.html?highlight=sgd#torch.optim.Adam)
        '''
        self.epochs = EPOCHs
        self.mod= Model(data)
        self.linear = nn.Linear(VOCAB_SIZE, len(self.mod.label_to_idx))
        self.loss = nn.CrossEntropyLoss()
        self.opt = op(self.linear.parameters(), lr=lr, weight_decay=weight_decay)
        
        
    def forward(self, bow):
        '''
        Run the linear layer in the model for a single bag of words vector. 
        '''
        # WRITE YOUR CODE HERE
        # (You might be wondering why we don't explicitly have a
        # softmax component in our model. It is included in something
        # defined earlier. In what?)
        return self.linear(bow)
    
    
    def train_epoch(self, train_data):
        '''
        Train the model for one epoch with the training data
        When training a model, you repeat the following procedure:
        1. Get one batch of features and labels
        2. Make a forward pass with the features to get predictions
        3. Calculate the loss with the predictions and target labels
        4. Run a backward pass from the loss function to get the gradients
        5. Apply the optimizer step to update the model paramters
        
        For (1) you will have to understand how the PyTorch dataloader
        functions.
        '''
        # WRITE YOUR CODE HERE
        sum_loss = 0
        for i in range(len(train_data)):
            self.linear.zero_grad()
            x, y = train_data[i]
            pred = self.forward(x)
            loss = self.loss(pred.view(1, -1), torch.LongTensor([y]))
            sum_loss += loss
            loss.backward()
            self.opt.step()
            if i%2500==0 and i>0:
                print('{}% of completion of this epoch. Average Loss = {}'.format((i/len(train_data))*100,sum_loss/2500))
                sum_loss = 0
        
    
    def classify(self, docs):
        '''
        This function classifies documents into their categories. 
        docs are documents without labels.
        '''
        # WRITE YOUR CODE HERE
        probs = F.log_softmax(self.forward(docs).view(1, -1), dim=1)[0]
        cl = 1
        if probs[0] >= probs[1]:
            cl = 0
        return cl
                
    def evaluate_classifier_accuracy(self, data):
        '''
        This function evaluates the data with the current model. 
        data contains both documents and labels. 
        It calls classify() to make predictions, 
        and compares with the correct labels to return 
        the model accuracy on "data". 
        '''
        # WRITE YOUR CODE HERE
        correct = 0
        for i in range(len(data)):
            bow, label = data[i]
            pred = self.classify(bow)
            if pred == label:
                correct += 1
        return correct/len(data)
            
    
    def train_model(self, train_data, dev_data):
        """
        This function processes the entire training set for multiple epochs.
        After each training epoch, evaluate your model on the DEV set. 
        Save the best performing model on the DEV set to best_model
        """  
        batches_train=TextClassificationDataset(self.mod.word_to_idx, train_data)
        batches_dev=TextClassificationDataset(self.mod.word_to_idx, dev_data)
        best_acc = 0
        for e in range(self.epochs):
            self.train_epoch(batches_train)
            acc = self.evaluate_classifier_accuracy(batches_dev)
            if acc < best_acc or acc < 0.6:
                print('Model stopped')
                return 0
            best_acc = acc
            print('The accuracy after epoch {} is {}%'.format(e+1,round(acc*100, 2)))
        return acc

In [12]:
results = []

### Running best model

In [13]:
best_model = None
epoch_numbers = [12]
optimizer = [optim.Adam]#[optim.SGD, optim.Adam]
learning_rates = [0.00001]
decays = [0]
best_accuracy = 0
for e in epoch_numbers:
    for o in optimizer:
        for l in learning_rates:
            for d in decays:
                model_def = '{} epochs, using {} optimizer, lr={} and weight_decay = {}'.format(e, o, l, d)
                lr_model_t = BoWLRClassifierT(data=train_data, EPOCHs=e, op=o, lr=l, weight_decay=d)
                accuracy = lr_model_t.train_model(train_data, dev_data)
                results.append([model_def, accuracy])
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_model = copy.deepcopy(lr_model_t)

10.0% of completion of this epoch. Average Loss = 0.7018959522247314
20.0% of completion of this epoch. Average Loss = 0.6727177500724792
30.0% of completion of this epoch. Average Loss = 0.6563047766685486
40.0% of completion of this epoch. Average Loss = 0.6330961585044861
50.0% of completion of this epoch. Average Loss = 0.621982753276825
60.0% of completion of this epoch. Average Loss = 0.6039065718650818
70.0% of completion of this epoch. Average Loss = 0.588162362575531
80.0% of completion of this epoch. Average Loss = 0.582866370677948
90.0% of completion of this epoch. Average Loss = 0.569302499294281
The accuracy after epoch 1 is 79.48%
10.0% of completion of this epoch. Average Loss = 0.5480149388313293
20.0% of completion of this epoch. Average Loss = 0.5391193628311157
30.0% of completion of this epoch. Average Loss = 0.5304416418075562
40.0% of completion of this epoch. Average Loss = 0.5207783579826355
50.0% of completion of this epoch. Average Loss = 0.5194603204727173
6

In [14]:
best_accuracy

0.8728

In [15]:
best_model

BoWLRClassifierT(
  (linear): Linear(in_features=5000, out_features=2, bias=True)
  (loss): CrossEntropyLoss()
)

In [23]:
import json
import util

## Evaluation of segments

In [25]:
beach = util.open_json('beach_to_process.json')
nature = util.open_json('nature_to_process.json')
culture = util.open_json('culture_to_process.json')
shopping = util.open_json('shopping_to_process.json')

In [75]:
def log_sent(segments_dict, best_model):
    results = {}
    cities = segments_dict.keys()
    for city in cities:
        segments = segments_dict[city]
        vocab = Counter([v for sent in segments for v in word_tokenize(sent)])
        seg_word_to_idx = {k: v+1 for v, k in enumerate(vocab)}
        segmens_rev = TextClassificationDataset(seg_word_to_idx, segments)
        city_sents = [best_model.classify(r) for r in segmens_rev]
        if len(city_sents)>10:
            ratios = {'positive':round(len([j for j in city_sents if j ==0]),2)/len(city_sents),
                      'negative':round(len([j for j in city_sents if j ==1]),2)/len(city_sents)}
            results[city]=ratios
    return results

In [76]:
shopping_sent = log_sent(shopping, best_model)

In [77]:
pd.DataFrame.from_dict(shopping_sent, orient='index').sort_values(by='positive', ascending=False)

Unnamed: 0,positive,negative
hermosillo,0.545455,0.454545
chihuahua,0.538462,0.461538
cancun,0.513064,0.486936
ciudad juarez,0.444444,0.555556
zapopan,0.423077,0.576923
tijuana,0.393443,0.606557
leon,0.384615,0.615385
puerto penasco,0.153846,0.846154


In [78]:
nature_sent = log_sent(nature, best_model)
pd.DataFrame.from_dict(nature_sent, orient='index').sort_values(by='positive', ascending=False)

Unnamed: 0,positive,negative
monterrey,0.666667,0.333333
oaxaca,0.591837,0.408163
valle de bravo,0.5,0.5
ciudad valles,0.421053,0.578947
palenque,0.394667,0.605333
comitan,0.277778,0.722222


In [79]:
culture_sent = log_sent(culture, best_model)
pd.DataFrame.from_dict(culture_sent, orient='index').sort_values(by='positive', ascending=False)

Unnamed: 0,positive,negative
saltillo,0.608696,0.391304
xalapa,0.607843,0.392157
cuernavaca,0.474359,0.525641
monterrey,0.45679,0.54321
chihuahua,0.4,0.6
ciudad juarez,0.4,0.6
aguascalientes,0.391304,0.608696
oaxaca,0.370474,0.629526
mexico city,0.358798,0.641202
san cristobal de las casas,0.214286,0.785714


In [80]:
beach_sent = log_sent(beach, best_model)
pd.DataFrame.from_dict(beach_sent, orient='index').sort_values(by='positive', ascending=False)

Unnamed: 0,positive,negative
todos santos,0.715447,0.284553
sayulita,0.654485,0.345515
manzanillo,0.622222,0.377778
zihuatanejo de azueta,0.605042,0.394958
cabo san lucas,0.573472,0.426528
zihuatanejo,0.525,0.475
puerto escondido,0.484848,0.515152
rosarito,0.47619,0.52381
huatulco,0.427119,0.572881
punta de mita,0.266667,0.733333
