I think that we use pandas as the underlying data-cleaning tool and then PyTorch to do the hard work of computation. The key to understanding the PyTorch workflow is understanding pandas first.

In [2]:
import collections
import re
import string
from argparse import Namespace
from collections import Counter

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader

In [3]:
from data import download

In [4]:
import os
# Show the kernel's current working directory.
os.getcwd()

'/Users/thomassullivan/projects/GitHub/PyTorchNLPBook/chapters/chapter_3'

In [5]:
# Directory holding the Yelp CSV files. Every data path below is derived from
# it, so relocating the dataset only requires editing this one line.
YELP_DATA_DIR = "/Users/thomassullivan/projects/GitHub/PyTorchNLPBook/data/yelp"

# Settings for the data-munging stage of the notebook.
args = Namespace(
    # Raw input files.
    raw_train_dataset_csv=YELP_DATA_DIR + "/raw_train.csv",
    raw_test_dataset_csv=YELP_DATA_DIR + "/raw_test.csv",
    # Fraction of each rating class kept when building the smaller subset.
    proportion_subset_of_train=0.1,
    # Fractions of each rating class assigned to the train / val / test splits.
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    # Destination of the cleaned, split-labelled reviews.
    output_munged_csv=YELP_DATA_DIR + "/reviews_with_splits_lite.csv",
    # Seed handed to NumPy before shuffling.
    seed=1337
)

In [6]:


# Load the raw training reviews. header=None tells pandas the file has no
# header row, so the two column names are supplied here.
train_reviews = pd.read_csv(
    args.raw_train_dataset_csv,
    header=None,
    names=['rating', 'review'],
)

In [7]:
# Making the subset equal across the review classes: for every rating class
# (visited in sorted order) keep the first `proportion_subset_of_train`
# fraction of its rows, in their original order.
# groupby + head lets pandas do the slicing and avoids a Python-level pass
# that builds one dict per row of the full training frame.
review_subset = pd.concat(
    [
        rating_rows.head(int(args.proportion_subset_of_train * len(rating_rows)))
        for _, rating_rows in train_reviews.groupby("rating", sort=True)
    ],
    ignore_index=True,  # fresh 0..n-1 index for the combined subset
)

In [8]:
# Preview the first rows of the subset.
review_subset.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,1,I don't know what Dr. Goldberg was like before...
2,1,I'm writing this review to give you a heads up...
3,1,Wing sauce is like water. Pretty much a lot of...
4,1,Owning a driving range inside the city limits ...


In [9]:
# Number of reviews per rating class in the full training frame.
train_reviews.rating.value_counts()

2    280000
1    280000
Name: rating, dtype: int64

In [10]:
# Number of reviews per rating class in the subset.
review_subset.rating.value_counts()

2    28000
1    28000
Name: rating, dtype: int64

In [11]:
# Unique rating classes present in the subset
set(review_subset.rating)

{1, 2}

In [12]:
# Splitting the subset by rating to create our new train, val, and test splits.
# Rows are bucketed per rating class so that every class is divided using the
# same proportions.
by_rating = collections.defaultdict(list)
for _, row in review_subset.iterrows():
    by_rating[row.rating].append(row.to_dict())

final_list = []
np.random.seed(args.seed)  # fixed seed so the shuffles below are repeatable

for _, item_list in sorted(by_rating.items()):

    np.random.shuffle(item_list)  # shuffles this class's rows in place

    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion * n_total)
    n_assigned = n_train + n_val + n_test

    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'

    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'

    for item in item_list[n_train+n_val:n_assigned]:
        item['split'] = 'test'

    # Add to final list. int() truncates, so n_assigned can be smaller than
    # n_total; rows past n_assigned never received a split label and are left
    # out instead of being carried along without a 'split' value.
    final_list.extend(item_list[:n_assigned])

In [13]:
# Create split data
# NOTE(review): this cell repeats the split procedure of the preceding cell on
# the same `by_rating` lists. np.random.shuffle works in place, so the lists are
# shuffled again here and the split labels assigned earlier are overwritten.
final_list = []
np.random.seed(args.seed)  # fixed seed so the shuffles below are repeatable

for _, item_list in sorted(by_rating.items()):

    np.random.shuffle(item_list)  # shuffles this class's rows in place

    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = int(args.test_proportion * n_total)
    n_assigned = n_train + n_val + n_test

    # Give data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'

    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'

    for item in item_list[n_train+n_val:n_assigned]:
        item['split'] = 'test'

    # Add to final list. int() truncates, so n_assigned can be smaller than
    # n_total; only rows that were given a label in this pass are kept, so no
    # row reaches the output with a missing or stale 'split' value.
    final_list.extend(item_list[:n_assigned])

In [14]:
# Collect the split-labelled records into a single DataFrame
final_reviews = pd.DataFrame(final_list)

In [15]:
# Number of rows assigned to each split.
final_reviews.split.value_counts()

train    39200
val       8400
test      8400
Name: split, dtype: int64

In [16]:
# Preprocess the reviews
def preprocess_text(text):
    """Normalize one review string.

    The text is lower-cased, each of the characters . , ! ? is padded with a
    space on both sides, and every run of characters other than ASCII letters
    and those four punctuation marks is collapsed into a single space.

    Args:
        text (str): the raw review text
    Returns:
        str: the normalized text
    """
    lowered = text.lower()
    padded = re.sub(r"([.,!?])", r" \1 ", lowered)
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", padded)
    
# Replace each raw review with its normalized text.
final_reviews.review = final_reviews.review.apply(preprocess_text)

In [17]:
# Turn the numeric rating codes into readable class labels. Values that are not
# a known code are passed through unchanged, so running this cell a second time
# keeps the labels instead of replacing every one of them with None.
rating_labels = {1: 'negative', 2: 'positive'}
final_reviews['rating'] = final_reviews.rating.map(lambda rating: rating_labels.get(rating, rating))

In [18]:
# Preview the processed reviews.
final_reviews.head()

Unnamed: 0,rating,review,split
0,negative,all i can say is that a i had no other option ...,train
1,negative,i went here once when my long time stylist mov...,train
2,negative,i don t know why i stopped here for lunch this...,train
3,negative,did i order the wrong thing ? or maybe it was ...,train
4,negative,i went here for restaurant week . the restaura...,train


In [19]:
# Write the processed, split-labelled reviews to disk; index=False leaves the
# DataFrame index out of the file.
final_reviews.to_csv(args.output_munged_csv, index=False)

In [20]:
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    '''PyTorch Dataset over a review DataFrame that carries a `split` column.

    The frame is partitioned once into train / val / test views; `set_split`
    chooses which view `__len__` and `__getitem__` operate on.
    '''
    def __init__(self, review_df, vectorizer):
        '''
        Args:
            review_df (pandas.DataFrame): the dataset
            vectorizer (ReviewVectorizer): vectorizer instantiated from dataset
        '''
        self.review_df = review_df
        self._vectorizer = vectorizer

        self.train_df = self.review_df[self.review_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.review_df[self.review_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.review_df[self.review_df.split=='test']
        self.test_size = len(self.test_df)

        # split name -> (rows of that split, number of rows)
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                            'val': (self.val_df, self.validation_size),
                            'test': (self.test_df, self.test_size)}

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv):
        '''Load dataset and make a new vectorizer from scratch

        Args:
            review_csv (str): location of the dataset
        Returns:
            an instance of ReviewDataset'''
        review_df = pd.read_csv(review_csv)
        return cls(review_df, ReviewVectorizer.from_dataframe(review_df))

    def get_vectorizer(self):
        '''Returns the vectorizer'''
        return self._vectorizer

    def set_split(self, split="train"):
        '''Selects the split in the dataset using a column in the dataframe

        Args:
            split(str): one of "train", "val", or "test"
        Raises:
            KeyError: if `split` is not one of the three known names'''
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        '''Number of rows in the currently selected split'''
        return self._target_size

    def __getitem__(self, index):
        """
        the primary entry point method for PyTorch datasets

        Args:
            index(int): the index to the data point
        Return:
            a dict of the data point's features (x_data) and label (y_target)
        """
        # positional lookup inside the currently selected split
        row = self._target_df.iloc[index]

        review_vector = \
            self._vectorizer.vectorize(row.review)
        rating_index = \
            self._vectorizer.rating_vocab.lookup_token(row.rating)
        return {'x_data': review_vector,
               'y_target': rating_index}

    def get_num_batches(self, batch_size):
        '''Given a batch size, return the number of batches in the dataset

        Args:
            batch_size(int)
        Returns:
            number of full batches in the currently selected split
        '''
        return len(self) // batch_size

In [21]:
class Vocabulary(object):
    '''Class to process text and extract Vocabulary for mapping'''

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK> "):
        """Args:
            token_to_idx (dict): a pre-existing map of tokens to indices.
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
        """

        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token

        # -1 means "no UNK token"; lookup_token relies on this sentinel
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def to_serializable(self):
        '''Returns a dictionary that can be serialized'''
        return {'token_to_idx': self._token_to_idx,
               'add_unk': self._add_unk,
               'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        '''Instantiate the Vocabulary from a serialized dictionary

        Args:
            contents (dict): a dictionary shaped like the output of to_serializable
        Returns:
            an instance of Vocabulary
        '''
        return cls(**contents)

    def add_token(self, token):
        '''
        Update mapping dicts based on the token.

        Args: token (str): the item to add into the Vocabulary

        Returns: index (int): the integer corresponding to the token
        '''
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            # new tokens take the next free index
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        '''Retrieve the index associated with the token
        or the UNK index if token isn't present

        Args:
            token(str): the token to look up

        Returns:
            index(int): the index corresponding to the token
        Raises:
            KeyError: if the token is absent and the Vocabulary has no UNK token
        Notes:
            'unk_index' needs to be >=0 (having been added into the vocabulary for the UNK functionality)
            '''
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        '''
        Args:
            index(int): the index to look up
        Returns:
            token(str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        '''
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(Size=%d)" % len(self)

    def __len__(self):
        '''Number of tokens in the Vocabulary (the UNK token included, if any)'''
        return len(self._token_to_idx)

In [22]:
class ReviewVectorizer(object):
    """The Vectorizer which coordinates the Vocabularies and puts them to use"""
    def __init__(self, review_vocab, rating_vocab):
        """
        Args:
            review_vocab (Vocabulary) : map words to integers
            rating_vocab (Vocabulary) : maps class labels to integers
        """
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab

    def vectorize(self, review):
        """ Create a collapsed one-hot vector for the review
        Args:
            review(str): the review
        Returns:
            one_hot(np.ndarray): the collapsed one-hot encoding
        """
        one_hot = np.zeros(len(self.review_vocab), dtype=np.float32)

        for token in review.split(" "):
            # `in string.punctuation` is a substring test, so the empty strings
            # produced by consecutive spaces are skipped as well
            if token not in string.punctuation:
                one_hot[self.review_vocab.lookup_token(token)] = 1

        return one_hot

    @classmethod
    def from_dataframe(cls, review_df, cutoff=25):
        """Instantiate the vectorizer from the dataset dataframe
        Args:
            review_df (pandas.DataFrame): the review dataset
            cutoff(int): the parameter for frequency-based filtering
        Returns:
            an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        #Add ratings
        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)

        #Count every word exactly once per occurrence across all reviews
        word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1

        #Add words whose count is strictly greater than the cutoff
        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """Instantiate a ReviewVectorizer from a serializable dictionary

        Args:
            contents(dict): the serializable dictionary
        Returns:
            an instance of the ReviewVectorizer class
        """
        review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
        rating_vocab = Vocabulary.from_serializable(contents['rating_vocab'])

        return cls(review_vocab=review_vocab, rating_vocab=rating_vocab)

    def to_serializable(self):
        '''Create the serializable dictionary for caching.

        Returns:
            contents (dict): the serializable dictionary'''
        return {'review_vocab': self.review_vocab.to_serializable(),
               'rating_vocab': self.rating_vocab.to_serializable()}

In [23]:
def generate_batches(dataset, batch_size, shuffle=True,
                    drop_last=True, device='cpu'):
    '''
    A generator function which wraps the PyTorch DataLoader.
    It will ensure each tensor is on the right device location.

    Args:
        dataset: the dataset handed to the DataLoader; each batch it yields
            must be a dict mapping names to tensors
        batch_size (int): number of items per batch
        shuffle (bool): passed through to the DataLoader
        drop_last (bool): passed through to the DataLoader
        device: target passed to each tensor's `.to(...)`
    Yields:
        dict: one dict per batch with the same keys, every tensor moved to `device`
    '''
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = tensor.to(device)
        yield out_data_dict

In [24]:
import torch.nn as nn
import torch.nn.functional as F

class ReviewClassifier(nn.Module):
    """A simple perceptron-based classifier"""
    def __init__(self, num_features):
        """
        Args:
            num_features (int): the size of the input feature vector
        """
        super(ReviewClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=1)

    def forward(self, x_in, apply_sigmoid=False):
        """The forward pass of the classifier

        Args:
            x_in (torch.Tensor): an input data tensor
                x_in.shape should be (batch, num_features)
            apply_sigmoid(bool): a flag for the sigmoid activation
                should be false if used with the cross-entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch,)"""
        # Drop only the trailing size-1 output dimension. A bare squeeze() would
        # also remove the batch dimension when the batch holds a single item.
        y_out = self.fc1(x_in).squeeze(dim=-1)
        if apply_sigmoid:
            # torch.sigmoid replaces the deprecated F.sigmoid
            y_out = torch.sigmoid(y_out)
        return y_out

In [25]:
from argparse import Namespace

# Settings for the training stage. This rebinds `args`, replacing the
# data-munging settings defined earlier in the notebook.
args=Namespace(
    #data and path information
    frequency_cutoff = 25,
    model_state_file = 'model.pth',
    review_csv='/Users/thomassullivan/projects/GitHub/PyTorchNLPBook/data/yelp/reviews_with_splits_lite.csv',
    # the path separator before "model_storage" was missing, which pointed
    # save_dir at a sibling "PyTorchNLPBookmodel_storage" directory
    save_dir='/Users/thomassullivan/projects/GitHub/PyTorchNLPBook/model_storage/ch3/yelp',
    vectorizer_file = 'vectorizer.json',
    #no model hyperparameters
    #Training hyperparameters
    batch_size = 128,
    early_stopping_criteria=5,
    learning_rate = 0.001,
    num_epochs=100,
    seed=1337,
    #runtime options omitted for space
    )

In [26]:
import torch.optim as optim
def make_train_state(args):
    """Create a fresh bookkeeping dictionary for a training run.

    Args:
        args: accepted for interface compatibility; not read by this function
    Returns:
        dict: epoch counter at 0, four empty per-epoch history lists, and
            the test loss / accuracy placeholders set to -1
    """
    state = {'epoch_index': 0}
    # each history key gets its own new list so calls never share state
    for history_key in ('train_loss', 'train_acc', 'val_loss', 'val_acc'):
        state[history_key] = []
    state['test_loss'] = -1
    state['test_acc'] = -1
    return state

In [27]:
# Start from an empty training-state record.
train_state = make_train_state(args)

In [28]:
# Use CUDA only when it is both requested and actually available.
# `args.cuda` may never have been set (the Namespace above omits the runtime
# options), so fall back to requesting it instead of raising AttributeError
# on machines where CUDA is available.
args.cuda = bool(getattr(args, "cuda", True)) and torch.cuda.is_available()
args.device = torch.device("cuda" if args.cuda else "cpu")


In [29]:
# Show which device was selected.
args.device

device(type='cpu')

In [30]:
# Inspect the initial training state.
train_state

{'epoch_index': 0,
 'train_loss': [],
 'train_acc': [],
 'val_loss': [],
 'val_acc': [],
 'test_loss': -1,
 'test_acc': -1}

In [None]:
# Read the munged review CSV and build a new vectorizer from its contents.
dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
    #vectorizer = dataset.get_vectorizer()