In [1]:
# Hypernym based classifiers

import numpy as np
from nltk import pos_tag, word_tokenize
import file_reader
from pprint import pprint
from collections import Counter
from nltk.corpus import wordnet
from collections import OrderedDict
from copy import deepcopy
from math import ceil
from random import shuffle

In [2]:
# function to return test and training sets
def make_train_test_set(n_files=5, random=True, seed=None):
    """Split the files of every genre into a training and a test portion.

    The file listing comes from ``file_reader.readfile(num=n_files)``, whose
    first return value is used as a mapping ``genre -> sequence of files``.
    For each genre the first ``ceil(len(files) / 8)`` entries form the test
    set and the remainder forms the training set.

    Parameters
    ----------
    n_files : int
        Forwarded to ``file_reader.readfile`` as ``num``.
    random : bool
        When true, each genre's file list is shuffled before it is split.
    seed : hashable, optional
        Seed for a private random generator so that a shuffled split can be
        reproduced. ``None`` (the default) keeps the previous behaviour of
        shuffling with the module-level generator.

    Returns
    -------
    (dict, dict)
        ``(file_structure_train, file_structure_test)``, both keyed by genre.
    """
    # Local import: the ``random`` parameter shadows the module name here.
    from random import Random

    file_structure, _categories = file_reader.readfile(num=n_files)
    shuffle_in_place = shuffle if seed is None else Random(seed).shuffle

    file_structure_train = {}
    file_structure_test = {}
    for genre, files in file_structure.items():
        # Work on a copy so the structure handed back by readfile is not mutated.
        files = list(files)
        if random:
            shuffle_in_place(files)
        n_test = ceil(len(files) / 8)
        file_structure_test[genre] = files[:n_test]
        file_structure_train[genre] = files[n_test:]
    return file_structure_train, file_structure_test

In [3]:
def train(file_structure):
    """Build per-genre noun/lemma/hypernym weight vectors from training files.

    For every file the text is stripped of punctuation, tokenised and
    POS-tagged; tokens whose tag contains ``'NN'`` are kept. The ten most
    common of those contribute ``count / total_token_count`` to their genre.
    Each genre's words are then augmented through WordNet: the weight of a
    word is also added to the first lemma of its first synset (unless that
    lemma is the word itself) and to the first lemma of that synset's first
    hypernym.

    Parameters
    ----------
    file_structure : dict
        Maps a genre name to an iterable of file paths.

    Returns
    -------
    val_dict : dict
        ``genre -> list of weights`` aligned position by position with
        ``label_vector``; labels absent from a genre have weight 0.
    unreadable_files : list
        Files that could not be opened/decoded, were empty, or produced no
        noun tokens (nothing can be learned from them).
    label_vector : list
        Every label (word, lemma or hypernym) seen in any genre, each listed
        once in first-seen order.
    """
    # Same characters as before; the backslash is now written as an explicit
    # escape because "\|" is an invalid escape sequence that newer Python
    # versions warn about.
    punctuation_to_space = {ord(c): " " for c in "!@#$%^&*()[]{};:,/<>?\\|`~-=_+"}

    unreadable_files = []
    genre_dict = {}
    for folder, files in file_structure.items():
        word_dict = {}
        for file in files:
            try:
                # Context manager so the handle is closed even when read() fails.
                with open(file) as handle:
                    text = handle.read().replace(u'\ufeff', '').replace(u'\n', ' ')
            except (OSError, ValueError):
                # OSError: missing/unopenable file; ValueError covers decode errors.
                unreadable_files.append(file)
                continue

            if len(text) == 0:
                unreadable_files.append(file)
                continue

            tokenized_text = word_tokenize(text.translate(punctuation_to_space))
            nouns = [token for token, tag in pos_tag(tokenized_text) if 'NN' in tag]
            if not nouns:
                # Previously this crashed when unpacking zip(*[]); a file
                # without nouns is reported like an empty one instead.
                unreadable_files.append(file)
                continue

            # Frequencies are relative to all tokens, not only the nouns.
            token_list_len = len(tokenized_text)
            for noun, count in Counter(nouns).most_common(10):
                word_dict[noun] = word_dict.get(noun, 0) + count / token_list_len

        genre_dict[folder] = word_dict

    new_genre_dict = OrderedDict()
    for category, word_weights in genre_dict.items():
        # Seed with every observed word first so the accumulation below does
        # not depend on the order in which the words are visited.
        augmented = dict(word_weights)
        for word, weight in word_weights.items():
            # Explicit emptiness checks replace the former bare ``except``,
            # which also hid real failures such as a missing WordNet corpus.
            synsets = wordnet.synsets(word)
            if not synsets:
                continue
            first_synset = synsets[0]

            lemma_names = first_synset.lemma_names('eng')
            # A word that is its own lemma must not count itself twice.
            if lemma_names and lemma_names[0] != word:
                lemma = lemma_names[0]
                augmented[lemma] = augmented.get(lemma, 0) + weight

            hypernyms = first_synset.hypernyms()
            if hypernyms:
                hypernym_names = hypernyms[0].lemma_names('eng')
                if hypernym_names:
                    hypernym = hypernym_names[0]
                    augmented[hypernym] = augmented.get(hypernym, 0) + weight
        new_genre_dict[category] = augmented

    # One slot per distinct label; the dict gives O(1) position lookups in
    # place of repeated list.index scans.
    label_vector = []
    label_position = {}
    for labels in new_genre_dict.values():
        for label in labels:
            if label not in label_position:
                label_position[label] = len(label_vector)
                label_vector.append(label)

    val_dict = {}
    for genre, labels in new_genre_dict.items():
        weights = [0] * len(label_vector)
        # Iterate the augmented labels so lemma and hypernym weights are
        # actually written into the vector, not just the raw words.
        for label, weight in labels.items():
            weights[label_position[label]] = weight
        val_dict[genre] = weights

    return val_dict, unreadable_files, label_vector




In [29]:
def test(file_structure, lv):
    unreadable_files = []
    token_count = {x:0 for x in lv}
    vec_dict = {}
    for folder in file_structure.keys():
        for file in file_structure[folder]:
            train_vector = []

            try:
                text = open(file).read().replace(u'\ufeff', '').replace(u'\n', ' ')
            except:
                unreadable_files.append(file)
                continue
                
            if len(text) == 0:
                unreadable_files.append(file)
                continue

            text = text.translate({ord(c): " " for c in "!@#$%^&*()[]{};:,/<>?\|`~-=_+"})
            tokenized_text = word_tokenize(text)
            tmp = pos_tag(tokenized_text)
            filtered_tok_pos_text = list(filter(lambda x: ('NN' in x[1]), tmp))
            filtered_tok_text, filtered_pos = zip(*filtered_tok_pos_text)
            for word in filtered_tok_text:
                train_vector.append(word)
                try:
                    train_vector.append(wordnet.synsets(word)[0].lemma_names('eng')[0])
                except:
                    pass

                try:
                    train_vector.append(wordnet.synsets(word)[0].hypernyms()[0].lemma_names('eng')[0])
                except:
                    pass
              
            for word in lv:
                try:
                    token_count[word] = train_vector.count(word)
                except:
                    token_count[word] = 0
            
            vec_dict[file] = token_count            
    return vec_dict,unreadable_files

In [6]:
# Build the per-genre train/test split. n_files is forwarded to file_reader.readfile;
# with random=True each genre's file list is shuffled before it is split.
file_structure_train, file_structure_test = make_train_test_set(n_files=5, random=True)

In [7]:
# Fit on the training files: per-genre value vectors, the files that were skipped,
# and the label vocabulary the vectors are aligned with.
val_dict_train, unreadable_files_train, label_vector = train(file_structure_train)

In [30]:
# Count the training labels in each held-out file (result is keyed by file path).
vec_dict, unreadable_files_test = test(file_structure_test, label_vector)

In [31]:
# Dump the full file -> {label: count} mapping for inspection.
pprint(vec_dict)

{'./data/Animals/101': {'A': 70,
                        'Arcoll': 0,
                        'Aunt': 0,
                        'BY': 23,
                        'Baron': 0,
                        'Bert': 0,
                        'Bobbsey': 0,
                        'Bradwardine': 0,
                        'C.': 1,
                        'Catholic': 15,
                        'Christ': 79,
                        'Colonel': 0,
                        'Cygnes': 0,
                        'Dik': 0,
                        'Edward': 0,
                        'European': 13,
                        'Fergus': 0,
                        'Fig': 0,
                        'Flossie': 0,
                        'Foundation': 24,
                        'Frank': 0,
                        'Freddie': 0,
                        'Frost': 0,
                        'Galen': 0,
                        'God': 1616,
                        'Gutenberg': 170,
                        'Hare': 0,
  

                        'Colonel': 0,
                        'Cygnes': 0,
                        'Dik': 0,
                        'Edward': 0,
                        'European': 13,
                        'Fergus': 0,
                        'Fig': 0,
                        'Flossie': 0,
                        'Foundation': 24,
                        'Frank': 0,
                        'Freddie': 0,
                        'Frost': 0,
                        'Galen': 0,
                        'God': 1616,
                        'Gutenberg': 170,
                        'Hare': 0,
                        'Henriques': 0,
                        'Him': 128,
                        'Illustration': 0,
                        'Jesus': 178,
                        'John': 21,
                        'KU': 0,
                        'Kansas': 0,
                        'Laputa': 0,
                        'Lord': 27,
                        'Mac': 0,
                        'Marais':