<a href="https://colab.research.google.com/github/mutherr/CS6120-PS1/blob/master/PS1_Reviews-NB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import json
import requests
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate,LeaveOneOut,KFold
import numpy as np

In [0]:
#read in the shakespeare corpus
def readShakespeare():
  """Download the Shakespeare plays corpus and drop the history plays.

  The payload is parsed as one JSON object per line. Only the "genre" key of
  each decoded entry is inspected here.

  Returns:
    A list of the decoded entries whose "genre" is not "history".

  Raises:
    requests.RequestException: if the download fails, times out, or the
      server answers with an error status.
    json.JSONDecodeError: if a non-blank line is not valid JSON.
  """
  url = "https://raw.githubusercontent.com/mutherr/CS6120-PS1-data/master/shakespeare_plays.json"

  #a timeout keeps the notebook from hanging forever on a stalled connection
  response = requests.get(url, timeout=60)
  #fail loudly on 4xx/5xx instead of trying to parse an error page as JSON
  response.raise_for_status()

  #blank lines carry no entry, so skip them rather than crash json.loads
  lines = response.text.strip().split("\n")
  corpus = [json.loads(line) for line in lines if line.strip()]

  #remove histories from the data, as we're only working with tragedies and comedies
  return [entry for entry in corpus if entry["genre"] != "history"]

In [0]:
#Here is where you will featurize the data.
#NB: The current contents are for testing only
#This function should return: 
#  -a numpy matrix of document features
#  -a list of the correct genre for each document
#  -a list of the vocabulary used by the features, such that the ith term of the
#    list is the word whose counts appear in the ith column of the matrix. 
def createFeatures(corpus):
  """Turn the corpus into bag-of-words count features.

  Args:
    corpus: a sequence of entries, each providing a "text" and a "genre" key.

  Returns:
    A tuple (features, genres, vocab) where
      -features is the document-term count matrix produced by
        CountVectorizer.fit_transform (one row per document),
      -genres is the list of each document's "genre" value, in corpus order,
      -vocab is a list of terms such that vocab[i] is the word counted in
        column i of features.
  """
  from sklearn.feature_extraction.text import CountVectorizer

  texts = [entry["text"] for entry in corpus]
  genres = [entry["genre"] for entry in corpus]

  vectorizer = CountVectorizer()
  features = vectorizer.fit_transform(texts)

  #get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
  # prefer its replacement and only fall back on old installations
  if hasattr(vectorizer, "get_feature_names_out"):
    #the replacement returns an array, so convert to keep returning a list
    vocab = vectorizer.get_feature_names_out().tolist()
  else:
    vocab = vectorizer.get_feature_names()

  return features,genres,vocab

In [0]:
#given a numpy matrix representation of the features for the training set, the 
# vector of true classes for each example, and the vocabulary as described 
# above, this computes the accuracy of the model using leave one out cross 
# validation and reports the most indicative features for each class
def evaluateModel(X,y,vocab,penalty="l1",termsToTake=20):
  """Score a logistic regression with leave-one-out CV and list its top terms.

  Args:
    X: feature matrix with one row per document.
    y: the true class of each document, aligned with the rows of X.
    vocab: list of terms such that vocab[i] names column i of X.
    penalty: regularization passed to LogisticRegression ("l1" or "l2").
    termsToTake: how many terms to report for each class (defaults to 20).

  Returns:
    A tuple (avg_score, pos_indicators, neg_indicators) where
      -avg_score is the mean test score over the leave-one-out folds,
      -pos_indicators holds the terms with the most negative coefficients,
        i.e. for a two-class problem the terms pointing to model.classes_[0]
        (the label that sorts first, "comedy" for comedy/tragedy labels),
      -neg_indicators holds the terms with the most positive coefficients,
        i.e. the terms pointing to model.classes_[1].
  """
  #create the model and score it on every leave-one-out split
  model = LogisticRegression(penalty=penalty,solver="liblinear")
  results = cross_validate(model,X,y,cv=LeaveOneOut())

  #determine the average accuracy
  scores = results["test_score"]
  avg_score = sum(scores)/len(scores)

  #determine the most informative features
  # this requires us to fit the model to everything, because we need a
  # single model to draw coefficients from, rather than one per fold
  model.fit(X,y)
  weights = model.coef_[0, :]

  #ascending order puts the most negative weights first; sorting the negated
  # weights puts the most positive weights first
  most_negative_first = weights.argsort()
  most_positive_first = (-weights).argsort()

  pos_indicators = [vocab[i] for i in most_negative_first[:termsToTake]]
  neg_indicators = [vocab[i] for i in most_positive_first[:termsToTake]]

  return avg_score,pos_indicators,neg_indicators

def evaluateModelL2(X,y,vocab):
  """Run evaluateModel with an L2 penalty instead of its default.

  Takes the same X, y and vocab as evaluateModel and returns its result
  unchanged.
  """
  l2_results = evaluateModel(X=X,y=y,vocab=vocab,penalty="l2")
  return l2_results

In [0]:
#Run this to read the corpus and fit the model using your featurization scheme

corpus = readShakespeare()

X,y,vocab = createFeatures(corpus)

#each entry pairs a report banner with the evaluator to run: the first fits a
# model with L1 regularization, the second with L2 regularization
for banner,evaluate in (
  ("----------L1 Norm-----------",evaluateModel),
  ("----------L2 Norm-----------",evaluateModelL2),
):
  print(banner)
  avg_score,pos_indicators,neg_indicators = evaluate(X,y,vocab)
  print("The model's average accuracy is %f"%avg_score)
  print("The most informative terms for comedy are: %s"%pos_indicators)
  print("The most informative terms for tragedy are: %s"%neg_indicators)