## Context

* First, replicating the analysis done in http://blog.yhat.com/posts/harry-potter-classification.html
* Second, pulling in additional data trying Naive Bayes instead to do multi-class classification

## Pull in data from Harry Potter wiki

In [309]:
import pandas as pd
import requests

In [310]:
# constants
houses = ['Gryffindor', 'Ravenclaw', 'Hufflepuff', 'Slytherin']
base_url = "http://harrypotter.wikia.com/api/v1/Articles/List?expand=1&limit=1000&category="

In [311]:
# pull in articles about characters in each house
data = pd.DataFrame()

for house in houses:
    formatted_url = base_url + house + 's'
    r = requests.get(formatted_url)
    info = r.json()
    # pull the data we want into a dataframe with the house name
    temp = pd.DataFrame(info['items'])[['id', 'title', 'url', 'type']]
    temp = temp[temp.type == 'article']
    temp.drop(['type'], axis=1, inplace=True)
    temp['house'] = pd.Series([house] * len(temp))
    # add it into the final dataset
    data = pd.concat([data, temp])

In [312]:
data.head()

Unnamed: 0,id,title,url,house
1,33349,Astrix Alixan,/wiki/Astrix_Alixan,Gryffindor
2,33353,Filemina Alchin,/wiki/Filemina_Alchin,Gryffindor
3,7018,Euan Abercrombie,/wiki/Euan_Abercrombie,Gryffindor
4,99282,Sakura Akagi,/wiki/Sakura_Akagi,Gryffindor
5,99036,Zakir Akram,/wiki/Zakir_Akram,Gryffindor


In [313]:
# looks like we have between 150-240 characters in each house
# makes sense there would be more in Gryffindor & Slytherin because Hufflepuffs are boring af
data.groupby('house').count()

Unnamed: 0_level_0,id,title,url
house,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Gryffindor,238,238,238
Hufflepuff,147,147,147
Ravenclaw,151,151,151
Slytherin,206,206,206


In [314]:
# no dupes / no characters found to be in multiple houses / no double crossers!
print len(data.id)
print len(data.id.unique())

749
749


In [None]:
article_id_text = {}
base_url = "http://harrypotter.wikia.com/api/v1/Articles/AsSimpleJson?id="

# loops through articles for characters in each house
for article_id in data.id:
    formatted_url = base_url + str(article_id)
    r = requests.get(formatted_url)
    sections = r.json()['sections']
    # pull in the content section if there is a Personality and Traits sections
    content = [sections[i]['content'] for i, x in enumerate(sections) if sections[i]['title'] == 'Personality and traits']
    if content:
        paragraphs = content[0]
        text = [paragraphs[i]['text'] for i, x in enumerate(paragraphs)]
        all_text = ' '.join(text)
    else:
        all_text = ''
    
    article_id_text[article_id] = all_text

text_data = pd.DataFrame.from_dict(article_id_text, orient='index').reset_index()
text_data.columns = ['id', 'text']
text_data['text_length'] = text_data.text.map(lambda x: len(x))

In [317]:
# combine article data with character data
# and then filter out characters with no article data
df = pd.merge(data, text_data, on='id', how='left')
df = df[df.text_length > 0]

In [318]:
# welp, this dramatically reduces the size of our dataset
df.groupby('house').count()['id']

house
Gryffindor    42
Hufflepuff    11
Ravenclaw     15
Slytherin     26
Name: id, dtype: int64

## Use NLTK to pull in a list of traits

In [152]:
from nltk.corpus import wordnet as wn
import copy
from itertools import combinations

In [319]:
# from these manually curated baseline list of traits
# build upon it to pull in all synonyms and antonyms for traits and anti-traits
trait_dict = {}
trait_dict['Gryffindor'] = [
    'bravery',
    'nerve',
    'chivalry',
    'daring',
    'courage',
]
trait_dict['Slytherin'] = [
    'resourceful',
    'cunning',
    'ambition',
    'determination',
    'self-preservation',
    'fraternity',
    'cleverness',
]
trait_dict['Ravenclaw'] = [
    'intelligence',
    'wit',
    'wisdom',
    'creativity',
    'originality',
    'individuality',
    'acceptance',
]
trait_dict['Hufflepuff'] = [
    'dedication',
    'diligence',
    'fairness',
    'patience',
    'kindness',
    'tolerance',
    'persistence',
    'loyalty',
]

In [131]:
# again, manually curated synonyms
relevant_synsets = {}
relevant_synsets['Ravenclaw'] = [
    wn.synset('intelligence.n.01'), wn.synset('wit.n.01'), wn.synset('brain.n.02'),
    wn.synset('wisdom.n.01'), wn.synset('wisdom.n.02'), wn.synset('wisdom.n.03'),
    wn.synset('wisdom.n.04'), wn.synset('creativity.n.01'), wn.synset('originality.n.01'),
    wn.synset('originality.n.02'), wn.synset('individuality.n.01'), wn.synset('credence.n.01'),
    wn.synset('acceptance.n.03')
]
relevant_synsets['Hufflepuff'] = [
    wn.synset('dedication.n.01'), wn.synset('commitment.n.04'), wn.synset('commitment.n.02'),
    wn.synset('diligence.n.01'), wn.synset('diligence.n.02'), wn.synset('application.n.06'),
    wn.synset('fairness.n.01'), wn.synset('fairness.n.01'), wn.synset('patience.n.01'),
    wn.synset('kindness.n.01'), wn.synset('forgivingness.n.01'), wn.synset('kindness.n.03'),
    wn.synset('tolerance.n.03'), wn.synset('tolerance.n.04'), wn.synset('doggedness.n.01'),
    wn.synset('loyalty.n.01'), wn.synset('loyalty.n.02')
]
relevant_synsets['Gryffindor'] = [
    wn.synset('courage.n.01'), wn.synset('fearlessness.n.01'), wn.synset('heart.n.03'),
    wn.synset('boldness.n.02'), wn.synset('chivalry.n.01'), wn.synset('boldness.n.01')
]
relevant_synsets['Slytherin'] = [
    wn.synset('resourcefulness.n.01'), wn.synset('resource.n.03'), wn.synset('craft.n.05'),
    wn.synset('cunning.n.02'), wn.synset('ambition.n.01'), wn.synset('ambition.n.02'),
    wn.synset('determination.n.02'), wn.synset('determination.n.04'),
    wn.synset('self-preservation.n.01'), wn.synset('brotherhood.n.02'),
    wn.synset('inventiveness.n.01'), wn.synset('brightness.n.02'), wn.synset('ingenuity.n.02')
]

In [132]:
def get_forms(lemma):
    drfs = lemma.derivationally_related_forms()
    output_list = []
    if drfs:
        for drf in drfs:
            drf_pos = str(drf).split(".")[1]
            if drf_pos in ['n', 's', 'a']:
                output_list.append(drf.name().lower())
                if drf_pos in ['s', 'a']:
                    if len(drf.name()) == 3:
                        last_letter = drf.name()[-1:]
                        output_list.append(drf.name().lower() + last_letter + 'er')
                        output_list.append(drf.name().lower() + last_letter + 'est')
                        output_list.append(drf.name().lower()+'ness')
                        output_list.append(drf.name().lower()+'ly')
                    elif drf.name()[-4:] in ['able', 'ible']:
                        output_list.append(drf.name().lower()+'r')
                        output_list.append(drf.name().lower()+'st')
                        output_list.append(drf.name().lower()+'ness')
                        output_list.append(drf.name()[:-1].lower()+'y')
                    elif drf.name()[-1:] == 'e':
                        output_list.append(drf.name().lower()+'r')
                        output_list.append(drf.name().lower()+'st')
                        output_list.append(drf.name().lower()+'ness')
                        output_list.append(drf.name().lower()+'ly')
                    elif drf.name()[-2:] == 'ic':
                        output_list.append(drf.name().lower()+'er')
                        output_list.append(drf.name().lower()+'est')
                        output_list.append(drf.name().lower()+'ness')
                        output_list.append(drf.name().lower()+'ally')
                    elif drf.name()[-1:] == 'y':
                        output_list.append(drf.name()[:-1].lower()+'ier')
                        output_list.append(drf.name()[:-1].lower()+'iest')
                        output_list.append(drf.name()[:-1].lower()+'iness')
                        output_list.append(drf.name()[:-1].lower()+'ily')
                    else:
                        output_list.append(drf.name().lower()+'er')
                        output_list.append(drf.name().lower()+'est')
                        output_list.append(drf.name().lower()+'ness')
                        output_list.append(drf.name().lower()+'ly')
        return output_list
    else:
        return output_list

In [320]:
new_trait_dict = copy.deepcopy(trait_dict)
antonym_dict = {}

In [321]:
# Add synonyms and word forms to the (new) trait dictionary
# Also add antonyms (and their word forms) to the antonym dictionary
for house, traits in trait_dict.items():
    antonym_dict[house] = []
    for trait in traits:
        # first get a list of the synsets for each trait that we've curated above
        synsets = wn.synsets(trait, pos=wn.NOUN)
        for synset in synsets:
            # if the synset is within the relevant synsets that we've also curated above then go ahead
            if synset in relevant_synsets[house]:
                for lemma in synset.lemmas():
                    # put the synonym into the dictionary
                    new_trait_dict[house].append(lemma.name().lower())
                    # and if there are additional forms of it, add it into the list as well
                    if get_forms(lemma):
                        new_trait_dict[house].extend(get_forms(lemma))
                    # and if there are any antonyms, put it into the antonyms dictionary
                    if lemma.antonyms():
                        for antonym in lemma.antonyms():
                            antonym_dict[house].append(antonym.name().lower())
                            if get_forms(antonym):
                                antonym_dict[house].extend(get_forms(antonym))
    # then let's make sure that all the words are unique and sorted for each house
    new_trait_dict[house] = sorted(list(set(new_trait_dict[house])))
    antonym_dict[house] = sorted(list(set(antonym_dict[house])))

In [322]:
# make sure that there aren't repeat words across houses in each of the traits and antonyms dict
def is_no_overlap(dict):
    results = []
    # create pairs of houses using itertools combinations
    house_pairs = list(combinations(dict.keys(), 2))
    for pair in house_pairs:
        # check if two sets are separate
        results.append(set(dict[pair[0]]).isdisjoint(dict[pair[1]]))
    return results

In [323]:
print is_no_overlap(new_trait_dict)
print is_no_overlap(antonym_dict)

[True, True, True, True, True, True]
[True, True, True, True, True, True]


## Sorting Students into Houses

In [186]:
from nltk import word_tokenize
from collections import defaultdict

In [241]:
# input text
# output house
def sorting_hat(text):
    scores = defaultdict(int)
    word_list = [word.lower() for word in word_tokenize(text)]
    for house in houses:
        scores[house] = sum([True for word in word_list if word in new_trait_dict[house]]) - sum([True for word in word_list if word in antonym_dict[house]])
    sorted_house = max(scores, key=scores.get)
    if sum([True for i in scores.values() if i==scores[sorted_house]]) == 1:
        return sorted_house
    else:
        return 'Tie'

In [324]:
df['sorted_house'] = df['text'].map(lambda x: sorting_hat(x))

In [325]:
df.head()

Unnamed: 0,id,title,url,house,text,text_length,sorted_house
16,325,Katie Bell,/wiki/Katie_Bell,Gryffindor,Katie was a friendly person who was very inter...,698,Tie
23,31,Sirius Black,/wiki/Sirius_Black,Gryffindor,Sirius was true to the ideal of a Gryffindor s...,3483,Hufflepuff
30,20,Lavender Brown,/wiki/Lavender_Brown,Gryffindor,Lavender was a somewhat silly and sentimental ...,585,Gryffindor
45,449,Colin Creevey,/wiki/Colin_Creevey,Gryffindor,Colin was a person who was very easily excited...,1457,Slytherin
46,451,Dennis Creevey,/wiki/Dennis_Creevey,Gryffindor,Dennis apparently had a similar personality to...,185,Slytherin


In [326]:
df.groupby('sorted_house').count()['id']

sorted_house
Gryffindor    16
Hufflepuff    18
Ravenclaw     18
Slytherin     11
Tie           32
Name: id, dtype: int64

In [327]:
# pct of ties
print "% of ties: " + "{0:.2f}".format(len(df[df.sorted_house=='Tie'])/float(len(df))*100)
# pct of matches - damn, this is sucks
print "% of matches: " + "{0:.2f}".format(len(df[df.sorted_house==df.house])/float(len(df))*100)

% of ties: 33.68
% of matches: 26.32


## Try using a Naive Bayes classifier

* <b>naive bayes</b> methods are supervised learning algorithms that applies bayes theorem
  * naive assumption of independence between every pair of features
  * not the best for actual prediction probabilities but good for classification
* <b>gaussiannb</b> assumes a gaussian distribution for the likelihood of a given feature for a given class
* <b>multinomialnb</b> assumes multinomially distributed data
  * most suitable for text classification, data is represented as word vector counts (tf-idf also works)
  * smoothing prevents zero probabilities in computation e.g. laplace smoothing (prob of x = (count of x + k) / N + k * count of x)
* <b>bernoullinb</b> assumes features are bernoulli - different from multinomial and explicitly penalizes the non-occurrence of a feature

In [335]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np

* converting data into <b>bag of words</b> representation where each column is a unique word and each row is a character (typically, high dimensional sparse datasets)
* can use countvectorizer in sklearn to tokenize the text data (can also support n-grams)
* occurrence count can be problematic though because larger documents will have higher counts; in that case, use <b>term frequencies (tf)</b> which is the occurrence divided by the total occurrences
* then you might want to de-emphasize words that are common to all documents, which brings us to <b>term frequency times inverse document frequency (tf-idf)</b>

In [None]:
df = df[df.house.isnull() == False]

In [371]:
# we are vectorizing all of the unique words in each character description
# we have 95 characters and 5096 unique words
count_vect = CountVectorizer(ngram_range=(1,3), stop_words='english')
X_counts = count_vect.fit_transform(df['text'].values)
X_counts.shape

(94, 37754)

In [372]:
# figure out term frequency and inverse term frequencies
X_tfidf = TfidfTransformer(use_idf=True,).fit_transform(X_counts)
X_tfidf.shape

(94, 37754)

In [373]:
# y is the house they belong to
df['house_target'] = df['house'].map(lambda x: houses.index(x))
y = df['house_target'].values
y.shape

(94,)

In [374]:
# welp, less than 50% match with naive bayes multinomial on tf idf of text
accuracy = []
skf = StratifiedKFold(n_splits=5, random_state=213, shuffle=True)
for train, test in skf.split(X_tfidf, y):
    predicted_house = MultinomialNB().fit(X_tfidf[train], y[train]).predict(X_tfidf[test])
    accuracy.append(sum(y[test]==predicted_house)/float(len(y[test])))
accuracy

[0.42857142857142855,
 0.47368421052631576,
 0.44444444444444442,
 0.44444444444444442,
 0.44444444444444442]

In [375]:
# welp, less than 50% match with naive bayes bernoulli on tf idf of text
# not super successful - should try to see if more/better data is need, more refinement of features
accuracy = []
skf = StratifiedKFold(n_splits=5, random_state=213, shuffle=True)
for train, test in skf.split(X_tfidf, y):
    predicted_house = BernoulliNB().fit(X_tfidf[train], y[train]).predict(X_tfidf[test])
    accuracy.append(sum(y[test]==predicted_house)/float(len(y[test])))
accuracy

[0.47619047619047616,
 0.42105263157894735,
 0.3888888888888889,
 0.44444444444444442,
 0.44444444444444442]