In [50]:
import pandas as pd
import numpy as np
from nltk.util import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [51]:
df = pd.read_csv("../Bank.csv")
df = df.iloc[:94]
df.head()

Unnamed: 0,Sentence,Class
0,I deposited my paycheck at the bank.,Financial Institution
1,The bank of the river was eroded by the heavy ...,River Border
2,She works at the bank as a teller.,Financial Institution
3,Let's have a picnic by the bank of the river.,River Border
4,The bank approved my loan application.,Financial Institution


In [52]:
stops = set(stopwords.words("english"))

In [53]:
for _,row in df.iterrows():
    row["Sentence"] = row["Sentence"].lower()
    row["Sentence"] = word_tokenize(row["Sentence"])
    row["Sentence"] = [i for i in row["Sentence"] if i not in stops and i not in string.punctuation]
df.head()

Unnamed: 0,Sentence,Class
0,"[deposited, paycheck, bank]",Financial Institution
1,"[bank, river, eroded, heavy, rains]",River Border
2,"[works, bank, teller]",Financial Institution
3,"[let, 's, picnic, bank, river]",River Border
4,"[bank, approved, loan, application]",Financial Institution


In [54]:
sentences = df["Sentence"]
sentences = sentences.tolist()
sentences[:5]

[['deposited', 'paycheck', 'bank'],
 ['bank', 'river', 'eroded', 'heavy', 'rains'],
 ['works', 'bank', 'teller'],
 ['let', "'s", 'picnic', 'bank', 'river'],
 ['bank', 'approved', 'loan', 'application']]

In [55]:
classes = df["Class"]
classes = classes.tolist()
classes[:5]

['Financial Institution',
 'River Border',
 'Financial Institution',
 'River Border',
 'Financial Institution']

## Class Frequency Counters

In [56]:
from collections import defaultdict

In [57]:
F_freq = defaultdict(int)
R_freq = defaultdict(int)
length = 0
F_counter = 0
R_counter = 0

In [58]:
for i in range(94):
    length = length + 1
    if classes[i] == "Financial Institution":
        F_counter += 1

        for j in sentences[i]:
            F_freq[j] += 1
    if classes[i] == "River Border":
        R_counter += 1

        for j in sentences[i]:
            F_freq[j] += 1     
F_counter, R_counter

(45, 48)

### Priors

In [59]:
import math

In [60]:
total_classes = F_counter + R_counter
prior_fin = math.log2(F_counter/total_classes)
prior_riv = math.log2(R_counter/total_classes)

prior_fin, prior_riv

(-1.0473057147783567, -0.9541963103868752)

In [61]:
Vocab = set(list(F_freq.keys()) + list(R_freq.keys()))
V = len(Vocab)
V

255

## Test

In [64]:
df = pd.read_csv("../Bank.csv")
test = df.iloc[95:, :]
test.head()

Unnamed: 0,Sentence,Class
95,I need to update my contact information with t...,?
96,The bank provides online banking services for ...,?
97,The beavers constructed a dam along the bank o...,?
98,I need to check my transaction history at the ...,?
99,She works as a financial consultant at the bank.,?


In [67]:
for _,row in test.iterrows():
    row["Sentence"] = row["Sentence"].lower()
    row["Sentence"] = word_tokenize(row["Sentence"])
    row["Sentence"] = [i for i in row["Sentence"] if i not in stops and i not in string.punctuation]
test.head()

Unnamed: 0,Sentence,Class
95,"[need, update, contact, information, bank]",?
96,"[bank, provides, online, banking, services, co...",?
97,"[beavers, constructed, dam, along, bank, river]",?
98,"[need, check, transaction, history, bank]",?
99,"[works, financial, consultant, bank]",?


In [78]:
for _, row in test.iterrows():
    score_fin = prior_fin
    score_riv = prior_riv

    for token in row["Sentence"]:
        score_fin += math.log2(F_freq[token] + 1) - math.log2(F_counter + V)
        score_riv += math.log2(R_freq[token] + 1) - math.log2(R_counter + V)

    if score_fin > score_riv:
        row["Class"] = "Financial Institution"
    else:
        row["Class"] = "River Border"

test

Unnamed: 0,Sentence,Class
95,"[need, update, contact, information, bank]",Financial Institution
96,"[bank, provides, online, banking, services, co...",Financial Institution
97,"[beavers, constructed, dam, along, bank, river]",Financial Institution
98,"[need, check, transaction, history, bank]",Financial Institution
99,"[works, financial, consultant, bank]",Financial Institution
