<a href="https://colab.research.google.com/github/mutherr/CS6120-PS1/blob/master/PS1_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import json
import requests
import torch
import random
from torch.autograd import Variable
from torch.utils import data
import torch.nn.functional as F
from torch.utils.data.dataset import random_split

In [0]:
#read in the movie review corpus
def readReviews():
  raw = requests.get("https://raw.githubusercontent.com/mutherr/CS6120-PS1-data/master/cornell_reviews.json").text.strip()
  corpus = [json.loads(line) for line in raw.split("\n")]
  #shuffle the dataset, but always the same way for consistency
  random.Random(4).shuffle(corpus)
  return corpus

In [0]:
#Here is where you will featurize the data.
#NB: The current contents are for testing only
#This function should return: 
#  -a numpy matrix of document features
#  -a list of the correct class for each document
#  -a list of the vocabulary used by the features, such that the ith term of the
#    list is the word whose counts appear in the ith column of the matrix. 
def createFeatures(corpus):
  from sklearn.feature_extraction.text import CountVectorizer
  import string

  texts = [entry["text"] for entry in corpus]
  genres = [entry["class"] for entry in corpus]

  vectorizer = CountVectorizer(max_df=.75)
  matrix = vectorizer.fit_transform(texts).todense()
  vocab = vectorizer.get_feature_names()

  return matrix,genres,vocab

In [0]:
#Model code.
class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        #outputs = F.softmax(self.linear(x))
        outputs = self.linear(x)
        return outputs

#given a numpy matrix representation of the features for the training set, the 
# vector of true classes for each example, and the vocabulary as described 
# above, this computes the accuracy of the model using leave 10-fold cv
# validation and reports the most indicative features for each class
def evaluateModel(X,y,vocab,penalty="l1"):
  #prepare the data for pytorch

  #convert y from a list of class names to a list of correct class ids
  # (1=pos, 0=neg)
  y_binarized = [1 if y[i]=="pos" else 0 for i in range(len(y))]

  #convert to pytorch tensors and create dataset and loader for the
  # train and test sets
  tensorX = torch.Tensor(X)
  tensorY = torch.LongTensor(y_binarized)
  fullDataset = data.TensorDataset(tensorX,tensorY)

  train_len = int(len(fullDataset) * 0.9)
  trainDataset, testDataset = \
      random_split(fullDataset, [train_len, len(fullDataset) - train_len])

  train_loader = data.DataLoader(trainDataset,batch_size=train_len)
  test_loader = data.DataLoader(testDataset,batch_size=len(fullDataset) - train_len)

  #create the model
  #determine input dimensionality (i.e. vocab size)
  input_dim = len(vocab)
  #known output dim for binary classification task
  output_dim = 2

  batch_size = train_len
  n_iters = 300
  epochs = n_iters / (len(trainDataset) / batch_size)
  learning_rate = 0.001
  model = LogisticRegression(input_dim, output_dim)

  criterion = torch.nn.CrossEntropyLoss()
  optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,weight_decay=.1)
  
  iter = 0
  for epoch in range(int(epochs)):
      for i, (freqs, labels) in enumerate(train_loader):
          freqs = Variable(freqs)
          labels = Variable(labels)

          optimizer.zero_grad()
          outputs = model(freqs)

          loss = criterion(outputs, labels)
          loss.backward()
          optimizer.step()

          iter+=1
          if iter%50==0:
              # calculate Accuracy
              correct = 0
              total = 0
              with torch.no_grad():
                for i,(freqs,labels) in enumerate(train_loader):
                  freqs = Variable(freqs)
                  outputs = model(freqs)
                  _, predicted = torch.max(outputs.data, 1)
                  total+= labels.size(0)
                  correct+= (predicted == labels).sum()
              train_accuracy = 100 * correct/total
              correct = 0
              total = 0
              for freqs, labels in test_loader:
                  freqs = Variable(freqs)
                  outputs = model(freqs)
                  _, predicted = torch.max(outputs.data, 1)
                  total+= labels.size(0)
                  correct+= (predicted == labels).sum()
              accuracy = 100 * correct/total
              print("Iteration: {}. Loss: {}. Train Accuracy: {} Test Accuracy: {}.".format(iter, loss.item(), train_accuracy, accuracy))
  
  #get the vocab terms most indicative of each class
  params = next(model.parameters())
  neg_weights_sorted = params[0, :].argsort()
  pos_weights_sorted = params[1, :].argsort()
  print(neg_weights_sorted)
  print(pos_weights_sorted)

  termsToTake = 20
  pos_indicators = [vocab[i] for i in pos_weights_sorted[:termsToTake]]
  neg_indicators = [vocab[i] for i in neg_weights_sorted[:termsToTake]]
  print(pos_indicators)
  print(neg_indicators)

  #return avg_score,pos_indicators,neg_indicators

def evaluateModelL2(X,y,vocab):
  return evaluateModel(X,y,vocab,penalty="l2")

In [198]:
corpus = readReviews()

X,y,vocab = createFeatures(corpus)

# print("L1 norm")
# avg_score,pos_indicators,neg_indicators = evaluateModel(X,y,vocab)

# print("The model's average accuracy is %f"%avg_score)
# print("The most informative terms for neg are: %s"%pos_indicators)
# print("The most informative terms for pos are: %s"%neg_indicators)

print("L2 norm")
avg_score,pos_indicators,neg_indicators = evaluateModelL2(X,y,vocab)

print("The model's average accuracy is %f"%avg_score)
print("The most informative terms for neg are: %s"%pos_indicators)
print("The most informative terms for pos are: %s"%neg_indicators)

L2 norm
Iteration: 100. Loss: 0.693328857421875. Train Accuracy: 50 Test Accuracy: 42.
Iteration: 200. Loss: 0.6914485692977905. Train Accuracy: 51 Test Accuracy: 33.
Iteration: 300. Loss: 0.6898007988929749. Train Accuracy: 53 Test Accuracy: 22.
Iteration: 400. Loss: 0.6883254051208496. Train Accuracy: 53 Test Accuracy: 18.
Iteration: 500. Loss: 0.6869875192642212. Train Accuracy: 53 Test Accuracy: 14.
Iteration: 600. Loss: 0.6857595443725586. Train Accuracy: 54 Test Accuracy: 12.
Iteration: 700. Loss: 0.6846251487731934. Train Accuracy: 54 Test Accuracy: 10.


KeyboardInterrupt: ignored