In [89]:
import pandas as pd
import numpy as np
from collections import Counter as ctr

In [90]:
data = pd.read_csv('pnp-train-2.txt',delimiter='\t',encoding='latin-1', names=['type','name'])
# basic tokenization
data['clean'] = data.name.apply(lambda x: x.lower().split())

### Split into train and test data

In [91]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=200)

# create ctr
type_ctr = ctr(train.type)

### Create Helper Functions for Naive-Based Classifier
$$ΠP(C|w_1,w_2...) = \frac{P(w_i|C)P(C)}{P(w_i)}$$

In [92]:
# P(C) - prior probability
def Pa(C=''):
    return type_ctr[C] / len(train)

In [93]:
# P(w_i) - probablity of word
words_ctr = ctr([word for row in train.clean for word in row])

def Pb(W=''):
    if W not in words_ctr:
        return 0.0001
    return words_ctr[W] / sum(words_ctr.values())

In [94]:
# Get word counts given a type
type_word_count = {}

for col_name in list(set(data.type)):
    sub_df = train[train.type == col_name]
    type_word_count[col_name] = []
    for row in sub_df.clean:
        for word in row:
            type_word_count[col_name].append(word)
    type_word_count[col_name] = ctr(type_word_count[col_name])

# Define P(w_i|C)
def Pba(W='', C=''):
    t = type_word_count[C]
    if W not in t: return 0.0000001
    return t[W] / sum(t.values())

In [95]:
# Define P(C|w1, w2, w3,...,wn)
def Pab(C='', W=''):
    return (Pba(W=W, C=C) * Pa(C=C) / Pb(W=W))

In [96]:
# define P(type|sentence)
def Ps(S, C=''):
    return np.prod([Pab(C=C, W=word) for word in S])

In [118]:
def bayesClassifier(df):
    guesses = []
    for _, row in df.iterrows():
        bestProb = 0
        bestGuess = ''
        for currType in df['type'].unique():
            currProb = Ps(S=row['clean'], C=currType)
            if (currProb > bestProb):
                bestGuess = currType
                bestProb = currProb
        guesses.append(bestGuess)
    return guesses

In [119]:
train['guesses'] = bayesClassifier(train)

In [120]:
sum(train['type'] == train['guesses']) / len(train)

0.991547619047619

In [121]:
test['guesses'] = bayesClassifier(test)

In [123]:
sum(test['type'] == test['guesses']) / len(test)

0.6662699357295881