<a href="https://colab.research.google.com/github/mutherr/CS6120-PS1/blob/master/PS1_Shakespeare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import json
import requests
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate,LeaveOneOut
import numpy as np

In [0]:
#read in the shakespeare corpus
def readShakespeare():
  raw = requests.get("https://raw.githubusercontent.com/mutherr/CS6120-PS1/master/shakespeare_plays.json").text.strip()
  corpus = [json.loads(line) for line in raw.split("\n")]

  #remove histories from the data, as we're only working with tragedies and comedies
  corpus = [entry for entry in corpus if entry["genre"] != "history"]
  return corpus

In [0]:
#Here is where you will featurize the data.
# The current contents are for testing only
def createFeatures(corpus):
  from sklearn.feature_extraction.text import CountVectorizer

  texts = [entry["text"] for entry in corpus]
  genres = [entry["genre"] for entry in corpus]

  vectorizer = CountVectorizer()
  texts = vectorizer.fit_transform(texts)

  return texts,genres,vectorizer

In [0]:
#given a numpy matrix representation of the features for the training set and the 
# vector of true classes for each example, this computes the accuracy of the
# model using leave one out cross validation and reports the most indicative
# features for each class
def evaluateModel(X,y,vectorizer,penalty="l1"):
  #create and fit the model
  model = LogisticRegression(penalty=penalty,solver="liblinear")
  results = cross_validate(model,X,y,cv=LeaveOneOut())
  
  #determine the average accuracy
  scores = results["test_score"]
  avg_score = sum(scores)/len(scores)
  print("The model's average accuracy is %f"%avg_score)
  
  #determine the most informative features
  # this requires us to fit the model to everything, because we need a
  # single model to draw coefficients from, rather than 26
  model.fit(X,y)
  neg_class_prob_sorted = model.coef_[0, :].argsort()
  pos_class_prob_sorted = (-model.coef_[0, :]).argsort()

  termsToTake = 20
  pos_indicators = np.take(vectorizer.get_feature_names(), neg_class_prob_sorted[:termsToTake])
  neg_indicators = np.take(vectorizer.get_feature_names(), pos_class_prob_sorted[:termsToTake])

  print("The most informative terms for comedies are: %s"%pos_indicators)
  print("The most informative terms for tragedies are: %s"%neg_indicators)

def evaluateModelL2(X,y,vectorizer):
  return evaluateModel(X,y,vectorizer,penalty="l2")

In [203]:
#Run this to read the corpus and fit the model using your featurization scheme

corpus = readShakespeare()

X,y,vectorizer = createFeatures(corpus)

print("----------L1 Norm-----------")
#this call will fit a model with L1 normalization
evaluateModel(X,y,vectorizer)
print("----------L2 Norm-----------")
#this call will fit a model with L2 normalization
evaluateModelL2(X,y,vectorizer)

----------L1 Norm-----------
The model's average accuracy is 0.769231
The most informative terms for comedies are: ['helena' 'prospero' 'sir' 'you' 'for' 'your' 'me' 'duke' 'of' 'love'
 'preserver' 'preserved' 'preserve' 'preserv' 'preservation' 'preservers'
 'presents' 'presentment' 'presently' 'presenting']
The most informative terms for tragedies are: ['our' 'him' 'rom' 'thy' 'iago' 'ham' 'imogen' 'what' 'brutus' 'his'
 'lear' 'timon' 'preservers' 'preserver' 'preserved' 'preserv'
 'preservative' 'preservation' 'presents' 'presentment']
----------L2 Norm-----------
The model's average accuracy is 0.730769
The most informative terms for comedies are: ['you' 'prospero' 'duke' 'helena' 'antonio' 'me' 'for' 'your' 'sir'
 'ariel' 'sebastian' 'hermia' 'lysander' 'parolles' 'stephano' 'will'
 'leontes' 'caliban' 'demetrius' 'love']
The most informative terms for tragedies are: ['ham' 'iago' 'him' 'our' 'othello' 'what' 'his' 'lear' 'imogen' 'brutus'
 'rom' 'nurse' 'romeo' 'caesar' 'thy' 'c