<a href="https://colab.research.google.com/github/somilasthana/MachineLearningSkills/blob/master/Application_Author_Readability_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np

In [0]:
def automatic_readibility_index(nchars, nwords, nsents):
  return 4.71 * (nchars/nwords) + 0.5 * (nwords/nsents) - 21.43

In [0]:
print(abs(automatic_readibility_index(300, 40, 10) - 15.895) < 0.001)

True


In [0]:
def extract_counts(sentences):
  nsents = len(sentences)
  nwords = 0
  nchars = 0
  for sentence in sentences:
    if isinstance(sentence, str):
      wordlist = sentence.strip().split(" ")
    elif isinstance(sentence, list):
      wordlist = sentence
    nwords += len(wordlist)
    for word in wordlist:
      nchars += len(word.strip())
      
  return nchars, nwords, nsents

In [0]:
print(extract_counts(
    [["This", "was", "rather", "easy", "."], 
     ["Please", "give", "me", "something", "more", "challenging"]]) == (54, 11, 2))

True


In [0]:
sentences = [["This", "was", "rather", "easy", "."], 
             ["Please", "give", "me", "something", "more", "challenging"]]

n_chars, n_words, n_sents = extract_counts(sentences)
print(abs(automatic_readibility_index(n_chars, n_words, n_sents) - 4.442) < 0.001)

True


In [0]:
def compute_ari(sentences):
  n_chars, n_words, n_sents = extract_counts(sentences)
  return automatic_readibility_index(n_chars, n_words, n_sents)

In [0]:
print(abs(compute_ari(sentences) - 4.442) < 0.001)

True


In [0]:
"""Authorship attribution

The naive bayes classifier is a probabilistic classifier that given a set of 
features tries to find the class with the highest probability.
"""

In [0]:
from collections import defaultdict

In [0]:
def classify(scores):
  return max(scores, key=scores.get)

In [51]:
scores = {"Hermans": 0.15, "Voskuil": 0.55, "Reve": 0.2, "Mulisch": 0.18, "Claus": 0.02}
print(classify(scores) == "Voskuil")

True


In [0]:
def read_corpus_file(filepath):
  with open(filepath, 'rt', encoding='utf-8') as fp:
    text = fp.read()
  return text

In [0]:
from glob import glob
def find_corpus_files(corpus_directory, extension="*.txt"):
  return glob(corpus_directory + "/" + extension)

In [0]:
PUNCTUATION = (".",",",":",";","\"","'","!","?","(",")","[","]","/")
WHITESPACE = (" ", "\t", "\n", "\r")
def tokenize(text):
  tokens = []
  begin = 0
  if text[-1] != ' ': text += ' '
  
  for i, c in enumerate(text):
    if c in PUNCTUATION or c in WHITESPACE:
      token = text[begin:i]
      if token not in WHITESPACE:
        tokens.append(token)
      begin = i + 1
  return tokens

In [0]:
def extract_features(filename):
  return tokenize(read_corpus_file(filename))

In [0]:
def update_counts(author, text, feature_database):
  for feature in text:
    feature_database[author][feature] += 1
  return feature_database

In [0]:
def add_file_to_database(author, filename, feature_database):
  return update_counts(author, extract_features(filename), feature_database)

In [0]:
def add_authorlist_to_database(author_filename, feature_database):
  for author, filename in author_filename:
    feature_database = add_file_to_database(author, filename, feature_database)
  return feature_database

In [0]:
from math import log
def log_probability(feature_count, feature_sum, n_features):
  return log( (feature_count + 1.0) / (feature_sum + n_features) )

In [0]:
def score(features, feature_database):
  scores = defaultdict(float)
  # compute the number of features in the feature database here
  n_features = len(set([feature for author in feature_database for feature in feature_database[author].keys()]))
  
  for author in feature_database:
    features_sum = sum(feature_database[author].values())
    
    for feature in features:
      scores[author] += log_probability(feature_database[author][feature], features_sum, n_features)
      
  return scores

In [0]:
tfeature_database = defaultdict(lambda: defaultdict(int))
tfeature_database["A"]["the"] = 2
tfeature_database["A"]["a"] = 5
tfeature_database["A"]["book"]= 1
tfeature_database["B"]["the"] = 5
tfeature_database["B"]["a"] = 1
tfeature_database["B"]["book"] = 6

In [40]:
score(["the", "a", "the", "be", "book"], tfeature_database)

defaultdict(float, {'A': -7.307345136867633, 'B': -7.317674737439682})

In [0]:
def extract_author(filename):
    # insert your code here
    return filename.split('/')[-1].split('_')[0]

In [0]:
def predict_author(text, feature_database):
  return classify(score(extract_features(text), feature_database))

In [0]:
#get data
#!wget https://liferay.de.dariah.eu/tatom/_downloads/datasets.zip
#!unzip /content/datasets.zip

In [45]:
find_corpus_files("/content/data/austen-brontë")

['/content/data/austen-brontë/CBronte_Jane.txt',
 '/content/data/austen-brontë/Austen_Sense.txt',
 '/content/data/austen-brontë/Austen_Emma.txt',
 '/content/data/austen-brontë/CBronte_Professor.txt',
 '/content/data/austen-brontë/Austen_Pride.txt',
 '/content/data/austen-brontë/CBronte_Villette.txt']

In [61]:
feature_database = defaultdict(lambda: defaultdict(int))
author_filename = [
    (extract_author(fname), fname) for fname in find_corpus_files("/content/data/austen-brontë")
]

feature_database = add_authorlist_to_database(author_filename, feature_database)
print(predict_author('/content/data/austen-brontë/CBronte_Jane.txt', feature_database))

CBronte


In [62]:
print(predict_author('/content/data/austen-brontë/Austen_Pride.txt', feature_database))

Austen
