**Module 8 Project: Bayesian Machine Learning**

In [85]:
!git clone https://github.com/VVV-3/FMML_Module8_Project.git

Cloning into 'FMML_Module8_Project'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 18 (delta 0), reused 18 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (18/18), 438.62 KiB | 2.18 MiB/s, done.


In [86]:
!pip install pdfreader



In [87]:
import os
# Guard the directory change so re-running this cell does not fail once the
# kernel is already inside the cloned repository.
if os.path.basename(os.getcwd()) != "FMML_Module8_Project":
    os.chdir("FMML_Module8_Project")

In [88]:
!ls

autohire  data	README.md


In [89]:
!ls autohire

bow.py	    explainer.py    __init__.py  model.py	   utils.py
encoder.py  hyperparams.py  __main__.py  requirements.txt


In [90]:
!cat autohire/bow.py

from os import stat
import typing

import numpy as np

import typing


class BagOfWords:
    """
    A type of encoder: maps each distinct word to an index in a vocabulary.
    """

    def __init__(self, data: typing.Iterable) -> None:
        """
        Generate the bag of words
        :param data: an array of words, or an iterable containing arrays of words
        """
        data = np.array(self.__linearize_array(data))
        self.index_to_words = np.unique(data)
        self.words_to_index = {w: i for i, w in enumerate(self.index_to_words)}

    @classmethod
    def __linearize_array(cls, text):
        x = []
        for item in text:
            if isinstance(item, str):
                x.append(item)
            else:
                x.extend(cls.__linearize_array(item))
        return x

    def __call__(self, text: typing.Iterable[str]) -> np.array:
        """
        Shorthand for ``get_counts``: calling the instance forwards ``text`` to it.
        :param text: the words to encode
        :return: whatever ``get_counts`` returns for ``text``
        """
        return self.get_counts(text)

    def __len__(self) -> int:
        """
        Number of entries in ``index_to_words``, i.e. the vocabulary size.
        """
        return len(self.index_to_words)

    def encode_data(
        self: "BagOfWords",
        text

In [91]:
!cat autohire/encoder.py

import numpy as np


class LabelEncoder:
    """
    Label encode a series of labels.

    Every distinct label gets an integer index, assigned in order of first
    appearance so that the mapping is the same on every run.
    """

    def __init__(self, data) -> None:
        """
        Build the label <-> index mapping from the training labels.
        :param data: iterable of hashable labels; kept for ``encoded_data``
        """
        self.__training_data = data
        # dict.fromkeys de-duplicates while preserving first-seen order; a set
        # would order string labels differently from run to run (hash
        # randomization), making the encoded indices non-reproducible.
        self.index_to_token = list(dict.fromkeys(data))
        self.token_to_index = {
            token: index for index, token in enumerate(self.index_to_token)
        }

    def __len__(self):
        """Number of distinct labels."""
        return len(self.token_to_index)

    @property
    def encoded_data(self):
        """The labels passed to the constructor, as a numpy array of indices."""
        return self.encode(self.__training_data)

    def encode(self, data):
        """
        Convert labels to their indices.
        :param data: iterable of labels that were present at construction time
        :return: numpy array of indices
        :raises KeyError: if a label was not present at construction time
        """
        return np.array([self.token_to_index[token] for token in data])

    def decode(self, data):
        """
        Convert indices back to labels.
        :param data: a single integer index, or an iterable of indices
        :return: one label for a single index, otherwise a numpy array of labels
        """
        # np.integer covers every numpy integer width (int32, int64, ...), not
        # only int64.
        if isinstance(data, (int, np.integer)):
            return self.index_to_token[data]
        return np.array([self.index_to_token[index] for index in data])


In [92]:
!cat autohire/explainer.py

import numpy as np

from .model import BayesianMulticlassModel
from .bow import BagOfWords
from .encoder import LabelEncoder


class BayesianModelExplainer(BayesianMulticlassModel):
    """
    Explainer of the decision made by the base model
    """

    def __init__(self, label_encoder: LabelEncoder, bag_of_words: BagOfWords) -> None:
        """
        Set up the underlying model and keep both encoders on the instance.
        :param label_encoder: its length is passed to the base model as the class count
        :param bag_of_words: its length is passed to the base model as the token count
        """
        class_count = len(label_encoder)
        token_count = len(bag_of_words)
        super().__init__(class_count, token_count)
        self.label_encoder = label_encoder
        self.bag_of_words = bag_of_words

    def explain(self, text=None, label_filter=None):
        """
        Visualize what are the prior probabilities of classes and which words
        add to the likelihood of each class.
        """
        class_frequencies = np.sum(self.counts, axis=1)
        word_frequencies = np.sum(self.counts, axis=0)

        prior = class_frequencies / np.sum(class_frequencies)  # p(label)
        likelihood = self.counts / np.expand_dims(
            class_frequencies, axis=1
        )  # p(word|label)
        e

In [93]:
!cat autohire/hyperparams.py

WORD_LENGTH_THRESHOLD = 2
WORD_COUNT_THRESHOLD = 1


In [94]:
!cat autohire/__init__.py

In [95]:
!cat autohire/__main__.py

import numpy as np

from autohire.utils import parse_pdf, parse_resume_df
from autohire.bow import BagOfWords
from autohire.encoder import LabelEncoder
from autohire.model import BayesianMulticlassModel
from autohire.explainer import BayesianModelExplainer


if __name__ == "__main__":
    # Build the vocabulary and the label mapping from the training data.
    x_train, y_train = parse_resume_df()
    bag_of_words = BagOfWords(x_train)
    label_encoder = LabelEncoder(y_train)

    # Encode the training data and fit the classifier.
    x_train = bag_of_words.get_counts(x_train)
    y_train = label_encoder.encode(y_train)
    model = BayesianMulticlassModel(len(label_encoder), len(bag_of_words))
    model.fit(x_train=x_train, y_train=y_train)

    # Encode the resume PDF with the same vocabulary and classify it.
    x_test_input = parse_pdf("data/resumes/computers_2.pdf")
    x_test = bag_of_words.get_counts(x_test_input)
    # Predict on the parsed PDF (x_test), not on a training sample.
    result = model.predict(x_test)
    result = label_encoder.decode(result)

    # Show the first five decoded labels.
    for job in result[:5]:
        print(job)

    explainable_model = BayesianModelExplainer(label_encoder, bag_of_words)
    explainable_model.fit(x_train=x_train, y_train=y_train)

In [96]:
!cat autohire/model.py

import typing
import numpy as np


class BayesianMulticlassModel:
    """
    A multi-class Bayesian classifier from encoded text tokens
    """

    def __init__(self, num_classes: int, num_tokens: int) -> None:
        """
        Allocate the count table.
        :param num_classes: number of rows of ``self.counts`` (one per class)
        :param num_tokens: number of columns of ``self.counts`` (one per token)
        """
        # Starts at zero everywhere; ``fit`` accumulates into it.
        self.counts = np.zeros(shape=(num_classes, num_tokens))

    def fit(self, x_train: typing.Iterable[np.ndarray], y_train: typing.Iterable[int]):
        for x, y in zip(x_train, y_train):
        self.counts[y] += x

    def predict(self, counts_vector):
        class_frequencies = np.sum(self.counts, axis=1)
        word_frequencies = np.sum(self.counts, axis=0)

        prior = class_frequencies / np.sum(class_frequencies)  # p(label)
        likelihood = self.counts / np.expand_dims(
            class_frequencies, axis=1
        )  # p(word|label)
        evidence = word_frequencies / np.sum(word_frequencies)  # p(word)

        likelihood = np.multiply(likelihood, counts_vector)
        prior = np.expand_dims(prior, axis=1)

        posterior_marginal = prior * likelihood / e

In [97]:
!cat autohire/requirements.txt

numpy>=1.19.0
pandas>=1.3.0

black
mypy
pylint
pytest
sphinx
sphinx-rtd-docs


In [98]:
!cat autohire/utils.py

import re
from collections import defaultdict

import numpy as np
import pandas as pd
from pdfreader import PDFDocument, SimplePDFViewer, document

from .hyperparams import *


def clean_text(text: str):
    """
    Normalize raw text into a list of lowercase word tokens.
    The text is lowercased, every character outside a-z is replaced by a space,
    the result is split on whitespace, and tokens shorter than
    WORD_LENGTH_THRESHOLD are dropped.
    :param text: The text to be cleaned
    """
    letters_only = re.sub("[^a-z]", " ", text.lower())
    return [
        token for token in letters_only.split() if len(token) >= WORD_LENGTH_THRESHOLD
    ]


def parse_pdf(filename: str):
    """
    Read text from a PDF file.
    Clean the text, tokenize it, and return as a list of tokens.
    :param filename: path of the PDF file to read
    """
    fd = open(filename, "rb")
    document = PDFDocument(fd)
    viewer = SimplePDFViewer(fd)
    output_strings = []
    for i in range(len(list(document.pages()))):
        viewer.navigate(1)
        viewer.render()
        output_strings.extend(viewer.canvas.st