In [0]:
#@title

from google.colab import drive
drive.mount('/content/drive/')

In [0]:
#@title
import os
os.chdir('/content/drive/My Drive/ML')

In [0]:
genres = ['DETECT', 'FANTAST', 'RELIGION', 'TALES']

In [0]:
import os
import glob
path = 'ENGLISH/'
texts = {}

for genre in genres:
  texts[genre] = {}
  inpath = path + genre
  for filename in glob.glob(os.path.join(inpath, '*.TXT')):    
    try:
      with open(filename, 'r', encoding='utf-8') as f:      
        texts[genre][filename.split('/')[-1]] = ''.join(f.readlines())
      print(filename)
    except:      
      with open(filename, 'r', encoding='latin-1') as f:      
        texts[genre][filename.split('/')[-1]] = ''.join(f.readlines())
      print(filename)

In [12]:
tcount = 0
for genre in texts:
  print(genre + " : " + str(len(texts[genre])))
  tcount += len(texts[genre])
print('Total : ', str(tcount))

DETECT : 29
FANTAST : 75
RELIGION : 22
TALES : 28
Total :  154


In [13]:
# Split texts to train and test subsets

import random

train_texts = {}
test_texts = {}
train_num = 0


for genre in texts:
  train_texts[genre] = {}
  test_texts[genre] = {}
  train_num = int(len(texts[genre])*0.8)
  train_sample = random.sample(texts[genre].keys(), train_num)
  for text in texts[genre]:
    if text in train_sample:
      train_texts[genre][text] = texts[genre][text]
    else:
      test_texts[genre][text] = texts[genre][text]

      
print('Train texts:')      
tcount = 0
for genre in train_texts:
  print(genre + " : " + str(len(train_texts[genre])))
  tcount += len(train_texts[genre])
print('-----')
print('Total :', str(tcount))  

print('\n')    

print('Test texts:')      
tcount = 0
for genre in test_texts:
  print(genre + " : " + str(len(test_texts[genre])))
  tcount += len(test_texts[genre])
print('-----')
print('Total :', str(tcount)) 

Train texts:
DETECT : 23
FANTAST : 60
RELIGION : 17
TALES : 22
-----
Total : 122


Test texts:
DETECT : 6
FANTAST : 15
RELIGION : 5
TALES : 6
-----
Total : 32


In [14]:
import math

p_genres = {}
for genre in genres:  
  p_genres[genre] = math.log((len(texts[genre])) / tcount)
p_genres

{'DETECT': -0.09844007281325252,
 'FANTAST': 0.8517522107365839,
 'RELIGION': -0.3746934494414107,
 'TALES': -0.13353139262452263}

### Calculate Likelihood

In [0]:
from bs4 import BeautifulSoup
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
import re
from nltk.tag import pos_tag
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

In [0]:
def lemmatize_all(sentence):
    wnl = WordNetLemmatizer()
    for word, tag in pos_tag(sentence):
        if tag.startswith("NN"):
            yield wnl.lemmatize(word, pos='n')
        elif tag.startswith('VB'):
            yield wnl.lemmatize(word, pos='v')
        elif tag.startswith('JJ'):
            yield wnl.lemmatize(word, pos='a')
        else:
            yield word
  
def clean_text_lemma(text, lemma=True): 
    ''' 
    Utility function to clean text by removing links, special characters 
    using simple regex statements. Also converting words to lemmas.
    '''    
    text = BeautifulSoup(text, 'html.parser').get_text()    
    stripped_text = re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).lower().split()
    lemma_text = lemmatize_all(stripped_text)
    filtered_text = [word for word in lemma_text if word not in stopWords]
    join_text = ' '.join(filtered_text)
    return join_text

In [0]:
def make_bag(text):
  bag = {}
  max_count = 0
  words = text.split()
  
  for word in words:      
    if word in bag:
      bag[word] += 1
    else:
      bag[word] = 1
  
  for word in bag:
    if bag[word] > max_count:
      max_count = bag[word]
  
   
  for word in bag:
    bag[word] = (float(bag[word]) / max_count) * 100
  
  return bag

In [0]:
# Clean train text and count words per text

p_word = {}
for genre in train_texts:
  p_word[genre] = {}
  for text in train_texts[genre]:    
    train_texts[genre][text] = clean_text_lemma(train_texts[genre][text])
    p_word[genre][text] = make_bag(train_texts[genre][text])    
    print(text)

In [20]:
# Gather mean and variance per word per genre
import numpy as np
p_word_stat = {}

for genre in p_word:  
  p_word_stat[genre] = {}
  
  for text in p_word[genre]:   
    for word in p_word[genre][text]:     
      if word in p_word_stat[genre]:
        p_word_stat[genre][word]['values'] = np.append(p_word_stat[genre][word]['values'], p_word[genre][text][word])
      else:
        p_word_stat[genre][word] = {}
        p_word_stat[genre][word]['values'] = np.array([p_word[genre][text][word]])
        
  for word in p_word_stat[genre]:
    if len(p_word_stat[genre][word]['values']) < len(p_word[genre]):      
      dif = len(p_word[genre]) - len(p_word_stat[genre][word]['values'])      
      p_word_stat[genre][word]['values'] = np.pad(p_word_stat[genre][word]['values'], (0,dif), 'constant')
    
    p_word_stat[genre][word]['mean'] = p_word_stat[genre][word]['values'].mean()
    p_word_stat[genre][word]['var'] = p_word_stat[genre][word]['values'].var()
    p_word_stat[genre][word]['std'] = p_word_stat[genre][word]['values'].std()
  
  print(len(p_word[genre]))

23
60
17
22


In [0]:
import math

def calculateProbability(x, mean, stdev):
  exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
  return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent


def naive_product(text_bag, class_stats):
  naive_p = 1
  for word in text_bag:
    try:
      p = calculateProbability(text_bag[word], class_stats[word]['mean'], class_stats[word]['std'])
      p = math.log(p)
    except:
      p = math.log(2.2250738585072014e-250)
      
    if not p == 0.0:      
      naive_p += p      
    else:
      naive_p += math.log(2.2250738585072014e-250)
      print(naive_p)
  return naive_p

In [0]:
def predict(text, priors=p_genres, classes=genres, word_stats=p_word_stat):
  predictions = {}
  text_lemma = clean_text_lemma(text)
  text_bag = make_bag(text_lemma)  
  
  for genre in classes:
    prediction = priors[genre] + naive_product(text_bag, word_stats[genre])
    predictions[genre] = prediction
  sorted_predictions = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
  return sorted_predictions 

### Model accuracy

In [0]:
total = 0
guessed = 0
for genre in test_texts:
  print("Predict", genre)
  for text in test_texts[genre]:
    total += 1
    prediction = predict(test_texts[genre][text])
    print(prediction)
    if prediction[0][0] == genre:
      guessed += 1    

In [24]:
print("Model accuracy:", "{0:8.2f}".format(guessed / total))

Model accuracy:     0.69
