In [214]:
import os
import re
import collections as col

import pandas as pd
import numpy as np
from tqdm import tqdm

import books_utils as bu

from nltk import stem
from sklearn import metrics
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [50]:
import importlib
importlib.reload(bu)

<module 'books_utils' from '/Users/sudodoki/Projects/AI_ML/projector-nlp/final-project-public/experiment-3/books_utils.py'>

In [167]:
stemmer = stem.PorterStemmer()

In [2]:
annotations = pd.read_csv('../data/character_relation_annotations.txt.gz', sep='\t')
# dropping values that have gibberish affinity - might transform this later based on category
annotations = annotations[(annotations['affinity'] != 'NR') & (annotations['character_1'] != 'NR') & (annotations['character_2'] != 'NR')].copy()
annotations['book_name'] = (annotations['title'] + ' ' + annotations['author']).str.replace("\s", "_")
print(annotations.shape)
# making sure no NR in character_1/character_2/affinity
annotations.describe()

(2137, 11)


Unnamed: 0,annotator,change,title,author,character_1,character_2,affinity,coarse_category,fine_category,detail,book_name
count,2137,2137,2137,2137,2137,2137,2137,2137,2137,2137,2137
unique,14,3,109,49,1005,825,3,4,30,528,109
top,annotator_1,no,Narrative of the Life of Frederick Douglass,William Shakespeare,Joseph K.,Timon,positive,social,friend,NR,Madame_Bovary_Gustave_Flaubert
freq,760,1712,20,613,15,17,1120,886,342,1591,20


In [3]:
# https://git.io/vpzth
def splitDataFrameList(df,target_column,separator):
    ''' df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    row_accumulator = []

    def splitListToRows(row, separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            # super custom hack for Mr & Mrs. case
            substitution = s
            if s.endswith('Mr.') or s.endswith('Mrs.'):
                substitution = s + ' ' + re.sub('^Mrs?\.\s?', '', split_row[-1])
            new_row[target_column] = substitution
            row_accumulator.append(new_row)

    df.apply(splitListToRows, axis=1, args = (separator, ))
    new_df = pd.DataFrame(row_accumulator)
    return new_df

new_df = splitDataFrameList(annotations, 'character_1', ' and ')
new_df = splitDataFrameList(new_df, 'character_2', ' and ')
new_df = splitDataFrameList(new_df, 'character_1', ';')
new_df = splitDataFrameList(new_df, 'character_2', ';')

In [39]:
existing_files = []
chars = {}
book_names = new_df['book_name'].unique()
for name in book_names:
    chars[name] = np.unique(new_df[new_df['book_name'] == name][['character_1', 'character_2']].values)
    file = '../data/books/{}.txt'.format(name)
    existing_files.append(os.path.isfile(file))
len(book_names), len(existing_files), all(existing_files)

(109, 109, True)

In [5]:
books = [bu.Book(name, book_NLP_folder="../data/bookNLP_output", source_folder="../data/books") for name in book_names]

In [6]:
for book in books:
    len(book.characters.all)

In [8]:
book_by_name = {}
for book in books:
    book_by_name[book.name] = book

In [9]:
new_df.shape

(2165, 11)

In [47]:
def count_matching_chars(book, char):
    found_count = 0
    found_chars = []
    for bchar in book.characters.all:
        name_in_book = bu.book_name_to_annotated_name(book.name, bchar, [char], False)
        if name_in_book:
            found_count += 1
            found_chars.append(bchar)
    return found_count, found_chars
def has_known_characters(book_by_name_map):
    def do_the_actual_thing(row):
        book = book_by_name_map[row['book_name']]
        character_1 = row['character_1']
        character_2 = row['character_2']
        count_1, _ = count_matching_chars(book, character_1)
        count_2, _ = count_matching_chars(book, character_2)
        return count_1 == 1 and count_2 == 1
    return do_the_actual_thing

In [61]:
filtered_df = new_df[new_df.apply(has_known_characters(book_by_name), axis=1)]

In [59]:
# for book in books:
#     print(re.sub('.', '=', f'Book: {book.name}'))
#     print(f'Book: {book.name}')
#     print(re.sub('.', '-', f'Book: {book.name}'))
#     for char in chars[book.name]:
#         count, found_chars = count_matching_chars(book, char)
#         if count > 1:
#             print(f'{char} <- {", ".join([bu.longest_name(c) for c in found_chars])}')

In [65]:
len(filtered_df['character_1'].unique()), len(filtered_df['character_2'].unique())

(488, 411)

In [66]:
def sort_chars_alphabetically(row):
    char_1, char_2 = sorted([row['character_1'], row['character_2']])
    copy = row.to_dict()
    copy['character_1'] = char_1
    copy['character_2'] = char_2
    return pd.Series(copy)
sorted_filtered_df = filtered_df.apply(sort_chars_alphabetically, axis=1)

In [67]:
len(sorted_filtered_df['character_1'].unique()), len(sorted_filtered_df['character_2'].unique())

(372, 387)

In [80]:
affinity_stats = sorted_filtered_df.groupby(['character_1', 'character_2', 'affinity']).size().unstack(fill_value=0)
affinity_stats

Unnamed: 0_level_0,affinity,negative,neutral,positive
character_1,character_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aaron,Tamora,0,0,2
Abel Whittle,Michael Henchard,2,0,0
Abigail,Barabas,1,1,0
Abigail,Don Lodowick,0,1,0
Abigail,Don Mathias,0,0,2
Achilles,Hector,0,1,0
Achilles,Peleus,0,0,1
Adam,Eve,1,0,1
Adam,Orlando,0,0,1
Adam Bede,Bartle Massey,0,0,1


In [82]:
print(affinity_stats['negative'].value_counts())
print(affinity_stats['neutral'].value_counts())
print(affinity_stats['positive'].value_counts())

0    405
1    150
2     46
3      1
Name: negative, dtype: int64
0    458
1    136
2      8
Name: neutral, dtype: int64
1    243
0    226
2    131
4      1
3      1
Name: positive, dtype: int64


In [83]:
filtered_df

Unnamed: 0,affinity,annotator,author,book_name,change,character_1,character_2,coarse_category,detail,fine_category,title
0,positive,annotator_1,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,yes,Sancho Panza,Don Quixote,professional,he ends up taking more authority and advising ...,servant,Don Quixote
1,positive,annotator_1,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Rocinante,Don Quixote,professional,Rocinante is his horse,person offering service to client,Don Quixote
2,positive,annotator_1,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Dulcinea del Toboso,Don Quixote,social,NR,unrequited love interest,Don Quixote
3,positive,annotator_1,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Cervantes,Cide Hamete Benengeli,professional,NR,colleague,Don Quixote
8,negative,annotator_1,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Don Quixote,Altisidora,social,she is mocking him,lovers,Don Quixote
13,neutral,annotator_2,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Don Quixote,Sancho Panza,professional,Sancho is Don's squire.,employer,Don Quixote
14,positive,annotator_2,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Don Quixote,Rocinante,social,Rocinante is Don's horse.,friend,Don Quixote
15,neutral,annotator_2,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Sancho Panza,Dapple,social,Dappie is Sancho's donkey.,friend,Don Quixote
16,neutral,annotator_2,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Dulcinea del Toboso,Don Quixote,social,NR,unrequited love interest,Don Quixote
26,positive,annotator_3,Louisa May Alcott,Little_Women_Louisa_May_Alcott,no,Marmee,Mr. March,familial,NR,husband/wife,Little Women


In [86]:
def get_unique_lambda(df):
    def unique_affinity_and_category(row):
        matching_categories = df[
            (df['book_name'] == row['book_name']) &
            ((df['character_1'] == row['character_1']) & (df['character_2'] == row['character_2']) |
            (df['character_2'] == row['character_1']) & (df['character_1'] == row['character_2']))
            & (df['coarse_category'] != 'NR')
        ]['coarse_category'].unique()
        matching_affinity = df[
            (df['book_name'] == row['book_name']) &
            ((df['character_1'] == row['character_1']) & (df['character_2'] == row['character_2']) |
            (df['character_2'] == row['character_1']) & (df['character_1'] == row['character_2']))
        ]['affinity'].unique()
        # Being able to 'play' the data, going to get more balanced classes
        # professional > familial > social
        category = 'NR'
        if 'professional' in matching_categories:
            category = 'professional'
        elif 'familial' in matching_categories:
            category = 'familial'
        elif 'social' in matching_categories:
            category = 'social'
        # negative > neutral > positive
        affinity = 'positive'
        if 'negative' in matching_affinity:
            affinity = 'negative'
        elif 'neutral' in matching_affinity:
            affinity = 'neutral'
        elif 'positive' in matching_categories:
            affinity = 'positive'
        return pd.Series({'affinity': affinity, 'coarse_category': category})
    return unique_affinity_and_category
lookup_non_unique = get_unique_lambda(sorted_filtered_df)
new_categories_affinity = sorted_filtered_df.apply(lookup_non_unique, axis=1)

In [93]:
sorted_filtered_df['affinity'] = new_categories_affinity['affinity']
sorted_filtered_df['coarse_category'] = new_categories_affinity['coarse_category']
sorted_filtered_df

Unnamed: 0,affinity,annotator,author,book_name,change,character_1,character_2,coarse_category,detail,fine_category,title
0,neutral,annotator_1,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,yes,Don Quixote,Sancho Panza,professional,he ends up taking more authority and advising ...,servant,Don Quixote
1,positive,annotator_1,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Don Quixote,Rocinante,professional,Rocinante is his horse,person offering service to client,Don Quixote
2,neutral,annotator_1,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Don Quixote,Dulcinea del Toboso,social,NR,unrequited love interest,Don Quixote
3,positive,annotator_1,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Cervantes,Cide Hamete Benengeli,professional,NR,colleague,Don Quixote
8,negative,annotator_1,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Altisidora,Don Quixote,social,she is mocking him,lovers,Don Quixote
13,neutral,annotator_2,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Don Quixote,Sancho Panza,professional,Sancho is Don's squire.,employer,Don Quixote
14,positive,annotator_2,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Don Quixote,Rocinante,professional,Rocinante is Don's horse.,friend,Don Quixote
15,neutral,annotator_2,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Dapple,Sancho Panza,social,Dappie is Sancho's donkey.,friend,Don Quixote
16,neutral,annotator_2,Miguel de Cervantes,Don_Quixote_Miguel_de_Cervantes,no,Don Quixote,Dulcinea del Toboso,social,NR,unrequited love interest,Don Quixote
26,positive,annotator_3,Louisa May Alcott,Little_Women_Louisa_May_Alcott,no,Marmee,Mr. March,familial,NR,husband/wife,Little Women


In [92]:
unique_known_relations = sorted_filtered_df.drop_duplicates(['character_1', 'character_2', 'affinity', 'coarse_category'])

In [95]:
unique_known_relations.reset_index(inplace=True)

In [97]:
unique_known_relations.shape

(603, 12)

In [103]:
all_X = unique_known_relations[['book_name', 'character_1', 'character_2']]
all_y = unique_known_relations[['affinity', 'coarse_category']]
# stratifying on affinity because on joint it yields error of too few items in group
X_train, X_test, y_train, y_test = train_test_split(all_X, all_y, test_size=0.33, random_state=42, stratify=all_y['affinity'])

In [194]:
def build_char_ids_index(book):
    res = col.defaultdict(list)
    for p in book.paragraphs:
        chars = frozenset(p[p['characterId'] > 0 ]['characterId'].unique())
        id = p['paragraphId'].unique()[0]
        res[chars].append(id)
    return res
def gather_relevant_paragraphs(X, book_mapping):
    xs = X.to_dict(orient='records')
    
    all_words = []
    all_lemmas = []
    all_stems = []
    words_mapping = col.defaultdict(str)
    lemmas_mapping = col.defaultdict(str)
    stems_mapping = col.defaultdict(str)
    characters_to_paragraph_mapping = {}
    for x in tqdm(xs):
        book_name = x['book_name']
        char_1 = x['character_1']
        char_2 = x['character_2']
        book = book_mapping[book_name]
        _, book_chars_1 = count_matching_chars(book, char_1)
        _, book_chars_2 = count_matching_chars(book, char_1)
        book_chars_1_id = [char['id'] for char in book_chars_1]
        book_chars_2_id = [char['id'] for char in book_chars_2]
        if not book in characters_to_paragraph_mapping:
            characters_to_paragraph_mapping[book] = build_char_ids_index(book)
        for charset in characters_to_paragraph_mapping[book]:
            if any([cid in charset for cid in book_chars_1_id]) and any([cid in charset for cid in book_chars_2_id]):
                paragraphIds = characters_to_paragraph_mapping[book][charset]
                subset = book.tokens[book.tokens['paragraphId'].isin(paragraphIds)]

                p_tokens = subset # subset[(subset['characterId'] == -1)]
                p_words = ''
                p_lemmas = ''
                p_stems = ''
                for (i, tok) in enumerate(p_tokens.to_dict(orient='records')):
                    if i > 0:
                        p_words += ' '
                        p_lemmas += ' '
                        p_stems += ' '
                    if tok['characterId'] == -1:
                        p_words += str(tok['normalizedWord'])
                        p_lemmas += str(tok['lemma'])
                        p_stems += stemmer.stem(str(tok['normalizedWord']))
                    else:
                        if tok['characterId'] in book_chars_1_id and not p_words[-9:-1] == "%CHAR_1%":
                            p_words += "%CHAR_1%"
                            p_lemmas += "%CHAR_1%"
                            p_stems += "%CHAR_1%"
                        elif tok['characterId'] in book_chars_2_id and not p_words[-9:-1] == "%CHAR_3%":
                            p_words += "%CHAR_2%"
                            p_lemmas += "%CHAR_2%"
                            p_stems += "%CHAR_2%"
                        elif not p_words[-12:-1] == "%CHAR_MISC%":
                            p_words += "%CHAR_MISC%"
                            p_lemmas += "%CHAR_MISC%"
                            p_stems += "%CHAR_MISC%"

 
                all_words.append(p_words)
                words_mapping[f"{book_name}_{char_1}_{char_2}"] += f' #{p_words}'

                all_lemmas.append(p_lemmas)
                lemmas_mapping[f"{book_name}_{char_1}_{char_2}"] += f' #{p_lemmas}'
                
                all_stems.append(p_stems)
                stems_mapping[f"{book_name}_{char_1}_{char_2}"] += f' #{p_stems}'
    return all_words, words_mapping, all_lemmas, lemmas_mapping, all_stems, stems_mapping

In [201]:
processed_X = gather_relevant_paragraphs(all_X, book_by_name)
print("Done")

100%|██████████| 603/603 [15:44<00:00,  2.10it/s] 

Done





In [206]:
import json
with open('processed_X.json', 'w') as fout:
    json.dump(processed_X, fout)
# with open('processed_X.json', 'r') as fin:
#     dumped_processed = json.load(fin)

In [215]:
all_words, words_mapping, all_lemmas, lemmas_mapping, all_stems, stems_mapping = processed_X

In [220]:
class ParagraphMapper(TransformerMixin):
    """Map row with character_1, character_2 and book_name key to corresponding values in dict"""
    def __init__(self, paragraphs_mapping):
        self.paragraphs_mapping = paragraphs_mapping
    
    def transform(self, X, *_):
        result = []
        for x in X.to_dict(orient='records'):
            book_name = x['book_name']
            char_1 = x['character_1']
            char_2 = x['character_2']
            key = f"{book_name}_{char_1}_{char_2}"
            if key in self.paragraphs_mapping:
                paragraphs = self.paragraphs_mapping[key]
            else:
                paragraphs = ""
            result.append(paragraphs)
        return result
    
    def fit(self, *_):
        return self

In [223]:
def report(cross_scores):
    def avg(numbers):
        return float(sum(numbers)) / max(len(numbers), 1)
    for mode in ['test', 'train']:
        print(f'On {mode} ', end='')
        for metric in ['precision', 'recall', 'f1']:
            nums = cross_scores[f'{mode}_{metric}_macro']
            print(f' avg {metric}: {avg(nums):.2f} ± {np.std(nums):.2f}', end = '')
        print('')

One more time - redoing all the approaches in baseline but on known chars only this time

# BoW (single) + LogReg on whole paragraphs
## BoW on 'originalWord'

In [224]:
vectorizer = CountVectorizer()
vectorizer.fit(all_words)
print("Done with fitting vectorizer")
mapper = ParagraphMapper(words_mapping)

X_train_vect = vectorizer.transform(mapper.transform(X_train))
X_test_vect = vectorizer.transform(mapper.transform(X_test))
print("Done with vectorizing input")
# do affinity first
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['affinity'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['affinity'], cv=5)
print(metrics.classification_report(y_train['affinity'], y_predicted_cv))

# do the category
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['coarse_category'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['coarse_category'], cv=5)
print(metrics.classification_report(y_train['coarse_category'], y_predicted_cv))
"DONE"

Done with fitting vectorizer
Done with vectorizing input
On test  avg precision: 0.48 ± 0.04 avg recall: 0.48 ± 0.05 avg f1: 0.48 ± 0.05
On train  avg precision: 0.89 ± 0.01 avg recall: 0.89 ± 0.01 avg f1: 0.89 ± 0.00




             precision    recall  f1-score   support

   negative       0.44      0.45      0.45       132
    neutral       0.43      0.42      0.43        79
   positive       0.56      0.56      0.56       193

avg / total       0.50      0.50      0.50       404



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


On test  avg precision: 0.32 ± 0.04 avg recall: 0.32 ± 0.04 avg f1: 0.32 ± 0.04
On train  avg precision: 0.86 ± 0.05 avg recall: 0.88 ± 0.05 avg f1: 0.86 ± 0.04




              precision    recall  f1-score   support

          NR       0.00      0.00      0.00         2
    familial       0.45      0.47      0.46       157
professional       0.23      0.20      0.21        75
      social       0.45      0.45      0.45       170

 avg / total       0.40      0.41      0.41       404



'DONE'

This seems to be slightly better, than initial previous experiment (probably due to fact we skipped unknown characters) for affinity, providing baseline for further investigate

## BoW on 'lemmas'

In [225]:
vectorizer = CountVectorizer()
vectorizer.fit(all_lemmas)
print("Done with fitting vectorizer")
mapper = ParagraphMapper(lemmas_mapping)

X_train_vect = vectorizer.transform(mapper.transform(X_train))
X_test_vect = vectorizer.transform(mapper.transform(X_test))
print("Done with vectorizing input")
# do affinity first
print('affinity')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['affinity'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['affinity'], cv=5)
print(metrics.classification_report(y_train['affinity'], y_predicted_cv))

# do the category
print('category')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['coarse_category'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['coarse_category'], cv=5)
print(metrics.classification_report(y_train['coarse_category'], y_predicted_cv))
"DONE"

Done with fitting vectorizer
Done with vectorizing input
affinity
On test  avg precision: 0.47 ± 0.06 avg recall: 0.47 ± 0.07 avg f1: 0.47 ± 0.06
On train  avg precision: 0.88 ± 0.01 avg recall: 0.89 ± 0.01 avg f1: 0.89 ± 0.00




             precision    recall  f1-score   support

   negative       0.47      0.45      0.46       132
    neutral       0.42      0.38      0.40        79
   positive       0.54      0.58      0.56       193

avg / total       0.49      0.50      0.49       404

category


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


On test  avg precision: 0.32 ± 0.04 avg recall: 0.31 ± 0.02 avg f1: 0.31 ± 0.02
On train  avg precision: 0.88 ± 0.05 avg recall: 0.85 ± 0.06 avg f1: 0.85 ± 0.04




              precision    recall  f1-score   support

          NR       0.00      0.00      0.00         2
    familial       0.45      0.48      0.46       157
professional       0.26      0.23      0.24        75
      social       0.46      0.46      0.46       170

 avg / total       0.42      0.42      0.42       404



'DONE'

## BoW on 'stems' + LogReg (whole paragraphs)

In [226]:
vectorizer = CountVectorizer()
vectorizer.fit(all_stems)
print("Done with fitting vectorizer")
mapper = ParagraphMapper(stems_mapping)

X_train_vect = vectorizer.transform(mapper.transform(X_train))
X_test_vect = vectorizer.transform(mapper.transform(X_test))
print("Done with vectorizing input")
# do affinity first
print('affinity')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['affinity'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['affinity'], cv=5)
print(metrics.classification_report(y_train['affinity'], y_predicted_cv))

# do the category
print('category')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['coarse_category'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['coarse_category'], cv=5)
print(metrics.classification_report(y_train['coarse_category'], y_predicted_cv))
"DONE"

Done with fitting vectorizer
Done with vectorizing input
affinity
On test  avg precision: 0.46 ± 0.06 avg recall: 0.46 ± 0.07 avg f1: 0.45 ± 0.07
On train  avg precision: 0.89 ± 0.01 avg recall: 0.89 ± 0.01 avg f1: 0.89 ± 0.00




             precision    recall  f1-score   support

   negative       0.43      0.41      0.42       132
    neutral       0.41      0.38      0.39        79
   positive       0.54      0.58      0.56       193

avg / total       0.48      0.49      0.48       404

category


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


On test  avg precision: 0.34 ± 0.06 avg recall: 0.33 ± 0.05 avg f1: 0.33 ± 0.05
On train  avg precision: 0.84 ± 0.10 avg recall: 0.82 ± 0.09 avg f1: 0.82 ± 0.09




              precision    recall  f1-score   support

          NR       0.00      0.00      0.00         2
    familial       0.46      0.46      0.46       157
professional       0.26      0.21      0.24        75
      social       0.46      0.49      0.47       170

 avg / total       0.42      0.43      0.42       404



'DONE'

## Results 
### Affinity

BoW on 'originalWord' + LogReg (whole paragraphs 
+ On test  avg precision: 0.48 ± 0.04 avg recall: 0.48 ± 0.05 avg f1: 0.48 ± 0.05
+ On train  avg precision: 0.89 ± 0.01 avg recall: 0.89 ± 0.01 avg f1: 0.89 ± 0.00

BoW on 'lemmas' + LogReg (whole paragraphs)  
+ On test  avg precision: 0.47 ± 0.06 avg recall: 0.47 ± 0.07 avg f1: 0.47 ± 0.06
+ On train  avg precision: 0.88 ± 0.01 avg recall: 0.89 ± 0.01 avg f1: 0.89 ± 0.00

BoW on 'stems' + LogReg (whole paragraphs)  
+ On test  avg precision: 0.46 ± 0.06 avg recall: 0.46 ± 0.07 avg f1: 0.45 ± 0.07
+ On train  avg precision: 0.89 ± 0.01 avg recall: 0.89 ± 0.01 avg f1: 0.89 ± 0.00


As we can see, even though generalizing capability (based on train stats) are somewhat the same, working with whole words seem to be the best way for identifying affinity

### Category 

BoW on 'originalWord' + LogReg (whole paragraphs)  
+ On test  avg precision: 0.32 ± 0.04 avg recall: 0.32 ± 0.04 avg f1: 0.32 ± 0.04
+ On train  avg precision: 0.86 ± 0.05 avg recall: 0.88 ± 0.05 avg f1: 0.86 ± 0.04

BoW on 'lemmas' + LogReg (whole paragraphs)  
+ On test  avg precision: 0.32 ± 0.04 avg recall: 0.31 ± 0.02 avg f1: 0.31 ± 0.02
+ On train  avg precision: 0.88 ± 0.05 avg recall: 0.85 ± 0.06 avg f1: 0.85 ± 0.04

BoW on 'stems' + LogReg (whole paragraphs)  
+ On test  avg precision: 0.34 ± 0.06 avg recall: 0.33 ± 0.05 avg f1: 0.33 ± 0.05
+ On train  avg precision: 0.84 ± 0.10 avg recall: 0.82 ± 0.09 avg f1: 0.82 ± 0.09

Even though stems seems to have best cv precision/recall/f1, it also features the biggest std and also lowest scores for train set. Based on size and std I would consider using lemmas.

# BoW (n-grams) + LogReg on whole paragraphs

## BoW on Lemmas

In [229]:
vectorizer = CountVectorizer(ngram_range=(2, 3))
vectorizer.fit(all_lemmas)
print("Done with fitting vectorizer")
mapper = ParagraphMapper(lemmas_mapping)

X_train_vect = vectorizer.transform(mapper.transform(X_train))
X_test_vect = vectorizer.transform(mapper.transform(X_test))
print("Done with vectorizing input")
# do affinity first
print('affinity')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['affinity'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['affinity'], cv=5)
print(metrics.classification_report(y_train['affinity'], y_predicted_cv))

# do the category
print('category')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['coarse_category'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['coarse_category'], cv=5)
print(metrics.classification_report(y_train['coarse_category'], y_predicted_cv))
"DONE"

Done with fitting vectorizer
Done with vectorizing input
affinity
On test  avg precision: 0.52 ± 0.07 avg recall: 0.51 ± 0.07 avg f1: 0.51 ± 0.07
On train  avg precision: 0.89 ± 0.00 avg recall: 0.88 ± 0.01 avg f1: 0.89 ± 0.00




             precision    recall  f1-score   support

   negative       0.50      0.50      0.50       132
    neutral       0.48      0.41      0.44        79
   positive       0.59      0.63      0.61       193

avg / total       0.54      0.54      0.54       404

category


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


On test  avg precision: 0.33 ± 0.06 avg recall: 0.32 ± 0.06 avg f1: 0.31 ± 0.06
On train  avg precision: 0.86 ± 0.10 avg recall: 0.79 ± 0.09 avg f1: 0.82 ± 0.09




              precision    recall  f1-score   support

          NR       0.00      0.00      0.00         2
    familial       0.43      0.48      0.45       157
professional       0.26      0.15      0.19        75
      social       0.45      0.49      0.47       170

 avg / total       0.40      0.42      0.41       404



'DONE'

In [231]:
vectorizer = CountVectorizer(ngram_range=(2, 5), max_features=100000)
vectorizer.fit(all_lemmas)
print("Done with fitting vectorizer")
mapper = ParagraphMapper(lemmas_mapping)

X_train_vect = vectorizer.transform(mapper.transform(X_train))
X_test_vect = vectorizer.transform(mapper.transform(X_test))
print("Done with vectorizing input")
# do affinity first
print('affinity')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['affinity'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['affinity'], cv=5)
print(metrics.classification_report(y_train['affinity'], y_predicted_cv))

# do the category
print('category')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['coarse_category'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['coarse_category'], cv=5)
print(metrics.classification_report(y_train['coarse_category'], y_predicted_cv))
"DONE"

Done with fitting vectorizer
Done with vectorizing input
affinity
On test  avg precision: 0.51 ± 0.05 avg recall: 0.49 ± 0.05 avg f1: 0.50 ± 0.05
On train  avg precision: 0.89 ± 0.00 avg recall: 0.88 ± 0.01 avg f1: 0.89 ± 0.00




             precision    recall  f1-score   support

   negative       0.46      0.46      0.46       132
    neutral       0.47      0.41      0.44        79
   positive       0.58      0.62      0.60       193

avg / total       0.52      0.52      0.52       404

category


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


On test  avg precision: 0.31 ± 0.05 avg recall: 0.30 ± 0.05 avg f1: 0.30 ± 0.05
On train  avg precision: 0.86 ± 0.10 avg recall: 0.79 ± 0.09 avg f1: 0.82 ± 0.09




              precision    recall  f1-score   support

          NR       0.00      0.00      0.00         2
    familial       0.40      0.45      0.42       157
professional       0.25      0.16      0.20        75
      social       0.42      0.45      0.43       170

 avg / total       0.38      0.39      0.38       404



'DONE'

In [232]:
vectorizer = CountVectorizer(ngram_range=(2, 5), max_features=10000)
vectorizer.fit(all_lemmas)
print("Done with fitting vectorizer")
mapper = ParagraphMapper(lemmas_mapping)

X_train_vect = vectorizer.transform(mapper.transform(X_train))
X_test_vect = vectorizer.transform(mapper.transform(X_test))
print("Done with vectorizing input")
# do affinity first
print('affinity')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['affinity'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['affinity'], cv=5)
print(metrics.classification_report(y_train['affinity'], y_predicted_cv))

# do the category
print('category')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['coarse_category'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['coarse_category'], cv=5)
print(metrics.classification_report(y_train['coarse_category'], y_predicted_cv))
"DONE"

Done with fitting vectorizer
Done with vectorizing input
affinity
On test  avg precision: 0.47 ± 0.05 avg recall: 0.47 ± 0.05 avg f1: 0.47 ± 0.05
On train  avg precision: 0.89 ± 0.00 avg recall: 0.88 ± 0.01 avg f1: 0.89 ± 0.00




             precision    recall  f1-score   support

   negative       0.44      0.43      0.44       132
    neutral       0.42      0.38      0.40        79
   positive       0.56      0.59      0.57       193

avg / total       0.49      0.50      0.49       404

category


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


On test  avg precision: 0.32 ± 0.05 avg recall: 0.31 ± 0.05 avg f1: 0.30 ± 0.05
On train  avg precision: 0.86 ± 0.10 avg recall: 0.79 ± 0.09 avg f1: 0.82 ± 0.09




              precision    recall  f1-score   support

          NR       0.00      0.00      0.00         2
    familial       0.41      0.46      0.43       157
professional       0.24      0.17      0.20        75
      social       0.44      0.44      0.44       170

 avg / total       0.39      0.40      0.39       404



'DONE'

In [233]:
vectorizer = CountVectorizer(ngram_range=(1, 2))
vectorizer.fit(all_lemmas)
print("Done with fitting vectorizer")
mapper = ParagraphMapper(lemmas_mapping)

X_train_vect = vectorizer.transform(mapper.transform(X_train))
X_test_vect = vectorizer.transform(mapper.transform(X_test))
print("Done with vectorizing input")
# do affinity first
print('affinity')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['affinity'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['affinity'], cv=5)
print(metrics.classification_report(y_train['affinity'], y_predicted_cv))

# do the category
print('category')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['coarse_category'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['coarse_category'], cv=5)
print(metrics.classification_report(y_train['coarse_category'], y_predicted_cv))
"DONE"

Done with fitting vectorizer
Done with vectorizing input
affinity
On test  avg precision: 0.49 ± 0.06 avg recall: 0.48 ± 0.06 avg f1: 0.48 ± 0.06
On train  avg precision: 0.88 ± 0.01 avg recall: 0.89 ± 0.00 avg f1: 0.89 ± 0.00
             precision    recall  f1-score   support

   negative       0.48      0.45      0.47       132
    neutral       0.40      0.39      0.40        79
   positive       0.57      0.60      0.59       193

avg / total       0.51      0.51      0.51       404

category


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


On test  avg precision: 0.32 ± 0.04 avg recall: 0.31 ± 0.03 avg f1: 0.31 ± 0.03
On train  avg precision: 0.86 ± 0.09 avg recall: 0.80 ± 0.09 avg f1: 0.82 ± 0.09




              precision    recall  f1-score   support

          NR       0.00      0.00      0.00         2
    familial       0.42      0.48      0.45       157
professional       0.23      0.19      0.20        75
      social       0.45      0.44      0.44       170

 avg / total       0.40      0.40      0.40       404



'DONE'

In [234]:
vectorizer = CountVectorizer(ngram_range=(1, 3))
vectorizer.fit(all_lemmas)
print("Done with fitting vectorizer")
mapper = ParagraphMapper(lemmas_mapping)

X_train_vect = vectorizer.transform(mapper.transform(X_train))
X_test_vect = vectorizer.transform(mapper.transform(X_test))
print("Done with vectorizing input")
# do affinity first
print('affinity')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['affinity'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['affinity'], cv=5)
print(metrics.classification_report(y_train['affinity'], y_predicted_cv))

# do the category
print('category')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['coarse_category'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['coarse_category'], cv=5)
print(metrics.classification_report(y_train['coarse_category'], y_predicted_cv))
"DONE"

Done with fitting vectorizer
Done with vectorizing input
affinity
On test  avg precision: 0.48 ± 0.08 avg recall: 0.47 ± 0.07 avg f1: 0.47 ± 0.07
On train  avg precision: 0.88 ± 0.01 avg recall: 0.89 ± 0.00 avg f1: 0.89 ± 0.00
             precision    recall  f1-score   support

   negative       0.45      0.45      0.45       132
    neutral       0.39      0.38      0.39        79
   positive       0.57      0.59      0.58       193

avg / total       0.50      0.50      0.50       404

category


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


On test  avg precision: 0.32 ± 0.04 avg recall: 0.31 ± 0.03 avg f1: 0.31 ± 0.03
On train  avg precision: 0.87 ± 0.05 avg recall: 0.87 ± 0.05 avg f1: 0.86 ± 0.04




              precision    recall  f1-score   support

          NR       0.00      0.00      0.00         2
    familial       0.43      0.45      0.44       157
professional       0.24      0.19      0.21        75
      social       0.44      0.46      0.45       170

 avg / total       0.40      0.41      0.40       404



'DONE'

## Results 
### Affinity

BoW on Lemmas (2-3)
+ On test  avg precision: 0.52 ± 0.07 avg recall: 0.51 ± 0.07 avg f1: 0.51 ± 0.07
+ On train  avg precision: 0.89 ± 0.00 avg recall: 0.88 ± 0.01 avg f1: 0.89 ± 0.00

BoW on Lemmas (2, 5), max_features=100000
+ On test  avg precision: 0.51 ± 0.05 avg recall: 0.49 ± 0.05 avg f1: 0.50 ± 0.05
+ On train  avg precision: 0.89 ± 0.00 avg recall: 0.88 ± 0.01 avg f1: 0.89 ± 0.00

BoW on Lemmas (2, 5), max_features=10000
+ On test  avg precision: 0.47 ± 0.05 avg recall: 0.47 ± 0.05 avg f1: 0.47 ± 0.05
+ On train  avg precision: 0.89 ± 0.00 avg recall: 0.88 ± 0.01 avg f1: 0.89 ± 0.00

BoW on Lemmas (1, 2)
+ On test  avg precision: 0.49 ± 0.06 avg recall: 0.48 ± 0.06 avg f1: 0.48 ± 0.06
+ On train  avg precision: 0.88 ± 0.01 avg recall: 0.89 ± 0.00 avg f1: 0.89 ± 0.00

BoW on Lemmas (1, 3)
+ On test  avg precision: 0.48 ± 0.08 avg recall: 0.47 ± 0.07 avg f1: 0.47 ± 0.07
+ On train  avg precision: 0.88 ± 0.01 avg recall: 0.89 ± 0.00 avg f1: 0.89 ± 0.00

### Category

BoW on Lemmas (2-3)
+ On test  avg precision: 0.33 ± 0.06 avg recall: 0.32 ± 0.06 avg f1: 0.31 ± 0.06
+ On train  avg precision: 0.86 ± 0.10 avg recall: 0.79 ± 0.09 avg f1: 0.82 ± 0.09

BoW on Lemmas (2, 5), max_features=100000
+ On test  avg precision: 0.31 ± 0.05 avg recall: 0.30 ± 0.05 avg f1: 0.30 ± 0.05
+ On train  avg precision: 0.86 ± 0.10 avg recall: 0.79 ± 0.09 avg f1: 0.82 ± 0.09

BoW on Lemmas (2, 5), max_features=10000
+ On test  avg precision: 0.32 ± 0.05 avg recall: 0.31 ± 0.05 avg f1: 0.30 ± 0.05
+ On train  avg precision: 0.86 ± 0.10 avg recall: 0.79 ± 0.09 avg f1: 0.82 ± 0.09

BoW on Lemmas (1, 2)
+ On test  avg precision: 0.32 ± 0.04 avg recall: 0.31 ± 0.03 avg f1: 0.31 ± 0.03
+ On train  avg precision: 0.86 ± 0.09 avg recall: 0.80 ± 0.09 avg f1: 0.82 ± 0.09

BoW on Lemmas (1, 3)
+ On test  avg precision: 0.32 ± 0.04 avg recall: 0.31 ± 0.03 avg f1: 0.31 ± 0.03
+ On train  avg precision: 0.87 ± 0.05 avg recall: 0.87 ± 0.05 avg f1: 0.86 ± 0.04

Seems that taking bi- & trigrams might give better score for affinity and categories, but it has higher std, for categories I would probably stick with bow on unigrams and bigrams

In [241]:
unique_known_relations[all_y['coarse_category'] == 'NR']

Unnamed: 0,index,affinity,annotator,author,book_name,change,character_1,character_2,coarse_category,detail,fine_category,title
18,65,negative,annotator_1,William Shakespeare,Antony_and_Cleopatra_William_Shakespeare,no,Cleopatra,Dolabella,NR,NR,enemy,Antony and Cleopatra
172,695,neutral,annotator_2,George Eliot,Adam_Bede_George_Eliot,no,Hetty Sorrel,Martin Poyser,NR,Martin is a foster parent to Hetty.,foster parent,Adam Bede


In [242]:
all_y['coarse_category'].unique()

array(['professional', 'social', 'familial', 'NR'], dtype=object)

# DataFix: NR in coarse_category

Need to cleanup those, one is clearly familial based on 'fine_category', let another one be social yet it's unclear as fine_category is 'enemy'

In [268]:
y_train.loc[(X_train['book_name'] == 'Antony_and_Cleopatra_William_Shakespeare') &
(X_train['character_1'] == 'Cleopatra') &
(X_train['character_2'] == 'Dolabella'), 'coarse_category'] = 'social'
y_train[(X_train['book_name'] == 'Antony_and_Cleopatra_William_Shakespeare') &
(X_train['character_1'] == 'Cleopatra') &
(X_train['character_2'] == 'Dolabella')]['coarse_category']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


18    social
Name: coarse_category, dtype: object

In [269]:
y_train.loc[(X_train['book_name'] == 'Adam_Bede_George_Eliot') &
(X_train['character_1'] == 'Hetty Sorrel') &
(X_train['character_2'] == 'Martin Poyser'), 'coarse_category'] = 'familial'
y_train[(X_train['book_name'] == 'Adam_Bede_George_Eliot') &
(X_train['character_1'] == 'Hetty Sorrel') &
(X_train['character_2'] == 'Martin Poyser')]['coarse_category']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


172    familial
Name: coarse_category, dtype: object

# Trying to use framePolarity

Okay, there are at least 3 tools that might help extract frames:
+ https://github.com/google/sling
+ http://www.ark.cs.cmu.edu/SEMAFOR
+ https://github.com/Noahs-ARK/open-sesame

but they are not trivial to use on mac. 

I might get back to this later, but for now just gonna use words identified in there to populate vocabulary (I know this is absolutely not what frame do, but still gonna use this as 'features engineering' example)

In [263]:
with open('../data/FramePolarityLexicon.txt') as input_txt:
    res = []
    readme_count = 0
    for line in input_txt.readlines():
        if "# ReadMe #" in line:
            readme_count += 1
            continue
        if readme_count < 2:
            continue
        if line.startswith('###'):
            continue
        if len(line.strip()) == 0:
            continue
        if line.startswith("'"):
            word = line.split(":")[0][1:-1]
            res += word.split('_')
vocab = list(set(res))

In [270]:
vectorizer = CountVectorizer(ngram_range=(1, 2), vocabulary=vocab)
vectorizer.fit(all_words)
print("Done with fitting vectorizer")
mapper = ParagraphMapper(words_mapping)

X_train_vect = vectorizer.transform(mapper.transform(X_train))
X_test_vect = vectorizer.transform(mapper.transform(X_test))
print("Done with vectorizing input")
# do affinity first
print('affinity')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['affinity'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['affinity'], cv=5)
print(metrics.classification_report(y_train['affinity'], y_predicted_cv))

# do the category
print('category')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['coarse_category'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['coarse_category'], cv=5)
print(metrics.classification_report(y_train['coarse_category'], y_predicted_cv))
"DONE"

Done with fitting vectorizer
Done with vectorizing input
affinity
On test  avg precision: 0.41 ± 0.06 avg recall: 0.41 ± 0.06 avg f1: 0.40 ± 0.06
On train  avg precision: 0.70 ± 0.01 avg recall: 0.64 ± 0.02 avg f1: 0.66 ± 0.02
             precision    recall  f1-score   support

   negative       0.37      0.26      0.30       132
    neutral       0.39      0.35      0.37        79
   positive       0.49      0.61      0.54       193

avg / total       0.43      0.44      0.43       404

category
On test  avg precision: 0.38 ± 0.05 avg recall: 0.37 ± 0.03 avg f1: 0.36 ± 0.04
On train  avg precision: 0.68 ± 0.01 avg recall: 0.59 ± 0.01 avg f1: 0.61 ± 0.01
              precision    recall  f1-score   support

    familial       0.41      0.39      0.40       158
professional       0.28      0.16      0.20        75
      social       0.45      0.55      0.50       171

 avg / total       0.40      0.42      0.40       404



'DONE'

In [273]:
stem_vocab = list(set([stemmer.stem(word) for word in res]))

In [275]:
vectorizer = CountVectorizer(ngram_range=(1, 2), vocabulary=stem_vocab)
vectorizer.fit(all_stems)
print("Done with fitting vectorizer")
mapper = ParagraphMapper(stems_mapping)

X_train_vect = vectorizer.transform(mapper.transform(X_train))
X_test_vect = vectorizer.transform(mapper.transform(X_test))
print("Done with vectorizing input")
# do affinity first
print('affinity')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['affinity'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['affinity'], cv=5)
print(metrics.classification_report(y_train['affinity'], y_predicted_cv))

# do the category
print('category')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['coarse_category'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['coarse_category'], cv=5)
print(metrics.classification_report(y_train['coarse_category'], y_predicted_cv))
"DONE"

Done with fitting vectorizer
Done with vectorizing input
affinity
On test  avg precision: 0.45 ± 0.05 avg recall: 0.45 ± 0.05 avg f1: 0.44 ± 0.05
On train  avg precision: 0.75 ± 0.02 avg recall: 0.70 ± 0.02 avg f1: 0.72 ± 0.02
             precision    recall  f1-score   support

   negative       0.41      0.36      0.38       132
    neutral       0.37      0.39      0.38        79
   positive       0.54      0.57      0.56       193

avg / total       0.46      0.47      0.47       404

category
On test  avg precision: 0.40 ± 0.05 avg recall: 0.39 ± 0.04 avg f1: 0.39 ± 0.04
On train  avg precision: 0.74 ± 0.03 avg recall: 0.69 ± 0.02 avg f1: 0.71 ± 0.02
              precision    recall  f1-score   support

    familial       0.45      0.45      0.45       158
professional       0.24      0.16      0.19        75
      social       0.49      0.57      0.53       171

 avg / total       0.43      0.45      0.43       404



'DONE'

In [276]:
# https://github.com/zfsang/CharacterGo/blob/master/code/Match_label.ipynb
family = ['father','mother','aunt','wife','daughter','sibling','twin','family','heir','ancestor',
          'brother','uncle','sister','niece','grand','cousin','adopt','relat','nephew','son','child','divorce']
friend = ['friend','playmate','widow','frien']
romance = ['ex','lover','love','girlfriend','attraction','boyfriend','affair','engage',
          'fiance','crush','date','sweet','partner','couple','flirt','marr']
enemy = ['enem','victim','traitor','compet','parties','riv','dislike','foe','death','counter',
         'murder','accuse','duel','conflicts','hate','foil','opposition','disguise','kill']
acquaintance = ['acqua','coworker','student','prof','pup','roommate','school','work','host','housemates','companion',
               'neighbor','roomate','wizard','ally','allies','flatmate','mate','group','miss','member','peasant',
               'coll','train','comrade','land']
service = ['fellow','assist','doctor','detect','devil','master','mistress','slave','rule','henchman',
           'employer','serv','lead','law','king','prison',',coach','proph','resear','edit','ward',
           'cook','sale','officer','boss','office','lord','emperor','interview','chief','support','advis',
          'nurse','man','owner','mentor','benef','manager','ruler','starbuck','super','tetrarch','tour',
          'counsel','judge','merchant','employ','flower','general','warder','house','soldier','maid','major','help',
           'patient','cook','ward','business','bank','tenant','keeper','captain','tutor','keeper','actor','buy','lend',
          'porter','caller','scout','hire','protect','guide','attorney','coach','caretaker','associate','advers',
          'interrogator','harpooner','henchman']

In [280]:
res_extended = list(set(res + family + friend + romance + enemy + acquaintance + service))

In [281]:
vectorizer = CountVectorizer(ngram_range=(1, 2), vocabulary=res_extended)
vectorizer.fit(all_words)
print("Done with fitting vectorizer")
mapper = ParagraphMapper(words_mapping)

X_train_vect = vectorizer.transform(mapper.transform(X_train))
X_test_vect = vectorizer.transform(mapper.transform(X_test))
print("Done with vectorizing input")
# do affinity first
print('affinity')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['affinity'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['affinity'], cv=5)
print(metrics.classification_report(y_train['affinity'], y_predicted_cv))

# do the category
print('category')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['coarse_category'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['coarse_category'], cv=5)
print(metrics.classification_report(y_train['coarse_category'], y_predicted_cv))
"DONE"

Done with fitting vectorizer
Done with vectorizing input
affinity
On test  avg precision: 0.47 ± 0.03 avg recall: 0.47 ± 0.04 avg f1: 0.46 ± 0.03
On train  avg precision: 0.84 ± 0.01 avg recall: 0.83 ± 0.01 avg f1: 0.83 ± 0.01
             precision    recall  f1-score   support

   negative       0.47      0.40      0.43       132
    neutral       0.38      0.41      0.39        79
   positive       0.56      0.60      0.58       193

avg / total       0.49      0.50      0.49       404

category
On test  avg precision: 0.40 ± 0.06 avg recall: 0.39 ± 0.05 avg f1: 0.39 ± 0.05
On train  avg precision: 0.83 ± 0.03 avg recall: 0.80 ± 0.02 avg f1: 0.81 ± 0.02
              precision    recall  f1-score   support

    familial       0.47      0.48      0.48       158
professional       0.19      0.17      0.18        75
      social       0.51      0.53      0.52       171

 avg / total       0.44      0.44      0.44       404



'DONE'

In [282]:
stem_vocab = list(set([stemmer.stem(word) for word in res_extended]))

In [283]:
vectorizer = CountVectorizer(ngram_range=(1, 2), vocabulary=stem_vocab)
vectorizer.fit(all_stems)
print("Done with fitting vectorizer")
mapper = ParagraphMapper(stems_mapping)

X_train_vect = vectorizer.transform(mapper.transform(X_train))
X_test_vect = vectorizer.transform(mapper.transform(X_test))
print("Done with vectorizing input")
# do affinity first
print('affinity')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['affinity'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['affinity'], cv=5)
print(metrics.classification_report(y_train['affinity'], y_predicted_cv))

# do the category
print('category')
predictor = LogisticRegression()

scores = cross_validate(predictor, X_train_vect, y_train['coarse_category'], scoring=['precision_macro', 'recall_macro', 'f1_macro'], cv=5, return_train_score=True)
report(scores)
y_predicted_cv = cross_val_predict(predictor, X_train_vect, y_train['coarse_category'], cv=5)
print(metrics.classification_report(y_train['coarse_category'], y_predicted_cv))
"DONE"

Done with fitting vectorizer
Done with vectorizing input
affinity
On test  avg precision: 0.47 ± 0.04 avg recall: 0.47 ± 0.06 avg f1: 0.47 ± 0.05
On train  avg precision: 0.86 ± 0.01 avg recall: 0.85 ± 0.01 avg f1: 0.86 ± 0.01
             precision    recall  f1-score   support

   negative       0.41      0.41      0.41       132
    neutral       0.44      0.48      0.46        79
   positive       0.55      0.53      0.54       193

avg / total       0.48      0.48      0.48       404

category
On test  avg precision: 0.41 ± 0.04 avg recall: 0.41 ± 0.04 avg f1: 0.41 ± 0.04
On train  avg precision: 0.87 ± 0.01 avg recall: 0.84 ± 0.01 avg f1: 0.85 ± 0.01
              precision    recall  f1-score   support

    familial       0.46      0.47      0.47       158
professional       0.26      0.25      0.26        75
      social       0.51      0.50      0.51       171

 avg / total       0.44      0.45      0.44       404



'DONE'

# Approaches to try: 

- ✅ BoW with ngrams
- framePolarity thingy
- word2vec averaging for words between char1 - char2 -> classification
- ulmfit / http://nlp.fast.ai/classification/2018/05/15/introducting-ulmfit.html
- custom LSTM 
- BoW for windows of chars between
- BoW with S-V-O
