In [1]:
import pandas as pd
import re
import gzip
import tarfile
import spacy
import ast
from collections import defaultdict, Counter

In [27]:
nlp = spacy.load('en')
ner = spacy.load('xx')

In [3]:
original_chars_df = pd.read_csv('movie_chars.csv.gz')
original_movie_df = pd.read_csv('movie_list.csv.gz')
original_chars_df = original_chars_df.set_index('id')
original_chars_df['cast'] = original_chars_df['cast'].apply(ast.literal_eval)
original_movie_df = original_movie_df.set_index('id')

In [4]:
# given pair movie, actor tell whether actor was in movie
# dict id <-> name
name_by_id = {}
# dict name <-> aliases
aliases_by_name = {}
# http://mentalfloss.com/article/24761/origins-10-nicknames
mapping = {
    "Richard": "Dick",
    "William": "Bill",
    "Henry": "Hank",
    "John": "Jack",
    "Charles": "Chuck",
    "Margaret": "Peggy",
    "Edward": "Ted",
    "Henry": "Harry",
    "James": "Jim",
    "Sarah": "Sally",
}

In [5]:
def aliasify(name):
    res = []
    parts = name.strip().split(' ')
    first_part = parts[0]
    res.append(parts[-1])
    res.append('{} {}'.format(first_part[0], ' '.join(parts[1:])))
    res.append('{}. {}'.format(first_part[0], ' '.join(parts[1:])))
    if first_part in mapping:
        res.append('{} {}'.format(mapping[first_part], ' '.join(parts[1:])))
    return res

def process_cast(cast):
    global i
    for actor in cast:
        name_by_id[actor['id']] = actor['name']
        if not actor['name'] in aliases_by_name:
            aliases_by_name[actor['name']] = aliasify(actor['name'])
    return cast
def possible_names_by_alias(alias):
    if alias in aliases_by_name:
        return [alias]
    return [ent for ent in aliases_by_name if alias in aliases_by_name[ent]]

flatten = lambda l: [item for sublist in l for item in sublist]

def actors_id_by_name(name):
    res = list()
    for id in name_by_id:
        if name_by_id[id] == name:
            res.append(id) 
    return res

def movie_ids_by_title(title):
    # need to check for title not being stop words
    title_wo_leading_trailing_digits = re.sub("(^\s*\d+\s*|\s*\d+\s*$)", "", title)
    ids = original_movie_df[(original_movie_df['title'].str.contains(title_wo_leading_trailing_digits)) | (original_movie_df['title'].str.contains(title)) | (original_movie_df['original_title'] == title)].index.ravel()
    return [int(id) for id in ids]

def cast_ids_by_movie_id(movie_id):
    cast = original_chars_df.get_value(movie_id, 'cast')
    return [p['id'] for p in cast]

def fact_check(actor, movie):
    possible_actors = flatten([actors_id_by_name(name) for name in possible_names_by_alias(actor)])
    possible_movies = movie_ids_by_title(movie)
    match = []
    for movie_id in possible_movies:
        for actor_id in possible_actors:
            if actor_id in cast_ids_by_movie_id(movie_id):
                match.append((movie_id, original_movie_df.get_value(str(movie_id), 'title'), actor_id, name_by_id[actor_id]))
    return len(match) > 0 , match

In [6]:
original_chars_df['cast'].apply(process_cast)
"DONE"

'DONE'

In [7]:
def split_label(label_str):
     return label_str.split(" ")

def parse_annotation(text):
    entries = text.split("\n")
    entries = [entry for entry in entries if entry]
    result = []
    for entry in entries:
        [_, label, text] = entry.split("\t")
        [type, start, end] = split_label(label)
        result.append({ 'type': type, 'start': int(start), 'end': int(end), 'text': text})
    return result

In [8]:
input_file = tarfile.open('movie_data.tar.gz')
related_members = []
for member in input_file.getmembers():
        name = member.name
        if re.search(u"\.txt(\.ann)?$", str(member.name)):
            related_members.append(member)

all_data = defaultdict(dict)
for member in related_members:
    is_annotation = True if re.search(u"\.txt\.ann", str(member.name)) else False
    if is_annotation:
        id = member.name.split("/")[-1][0:-8]
    else:
        id = member.name.split("/")[-1][0:-4]
    with input_file.extractfile(member) as single_input:
        if is_annotation:
            all_data[id]['ann'] = parse_annotation(single_input.read().decode("utf-8") )
        else:
            all_data[id]['sent'] = single_input.read().decode("utf-8")
    
input_file.close()

In [9]:
def print_sent(sent, anns):
    copied = sorted(anns.copy(), key=lambda k: k['start']) 
    for char in range(0, len(sent)):
        if len(copied) and char == copied[0]['start']:
            print('[', end = '')
        if len(copied) and char == copied[0]['end']:
            print(']<-{}'.format(copied[0]['type']), end = '')
            copied.pop(0)
        print(sent[char], end = '')

def get_metrics(raw_dict):
    tp = raw_dict['tp']
    fp = raw_dict['fp']
    fn = raw_dict['fn']
    try:
        precision = tp / (tp + fp)
    except: 
        precision = 0
    try:
        recall = tp / (tp + fn)
    except:
        recall = 0
    try:
        f1 = 2 / (1 / recall + 1/ precision)
    except:
        f1 = 0
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
def evaluate_single(ground_anns, predicted_anns):
    tp = 0
    fp = 0
    fn = 0
    gr_copy_rel = set([frozenset(an.items()) for an in ground_anns if 'REL' in an['type']])
    gr_copy_misc = set([frozenset(an.items()) for an in ground_anns if 'REL' not in an['type']])
    pr_copy_rel = set([frozenset(an.items()) for an in predicted_anns if 'REL' in an['type']])
    pr_copy_misc = set([frozenset(an.items()) for an in predicted_anns if 'REL' not in an['type']])
    tp = len(pr_copy_rel & gr_copy_rel)
    fp = len(pr_copy_rel - gr_copy_rel)
    fn = len(gr_copy_rel - pr_copy_rel) # not really false negative, but ¯\_(ツ)_/¯
    return {
        'tp': tp,
        'fp': fp,
        'fn': fn
    }

def print_compare(sent, ground_anns, predicted_anns):
    print("ORIG: {}".format(sent))
    print("GOLD: ", end='')
    print_sent(sent, ground_anns)
    print('')
    print("OURS: ", end='')
    print_sent(sent, predicted_anns)
    print('')

def evaluate_all(dataset, rules):
    all_precision = Counter()
    for datum in dataset:
        predicted = rules(datum['sent'])
        all_precision.update(evaluate_single(datum['ann'], predicted))
    print(get_metrics(all_precision))
    
def print_top5(dataset, rules, by_what = 'tp', from_top = True):
    all_preds = []
    for datum in dataset:
        predicted = rules(datum['sent'])
        all_preds.append([evaluate_single(datum['ann'], predicted), datum, predicted])
    to_print = sorted(all_preds, key = lambda row: -row[0][by_what])
    if from_top:
        to_print = to_print[0:5]
    else:
        to_print = to_print[-5:]
    for pr in to_print:
        _, datum, predicted = pr
        print_compare(datum['sent'], datum['ann'], predicted)

In [96]:
def ent2hashable(ent):
    return frozenset({'text': ent.text, 'start': ent.start_char, 'end': ent.end_char}.items())
def hashable2ent(frozen):
    dict = {}
    for pair in frozen:
        dict[pair[0]] = pair[1]
    return dict
def get_movies_candidates(sent):
    doc = nlp(sent)
    ners = ner(sent)
    doc_movies = set([ent2hashable(ent) for ent in doc.ents if ent.label_ in ["WORK_OF_ART"]])
    ner_movies = set([ent2hashable(ent) for ent in ners.ents if ent.label_ in ["MISC"]])
    possible_movies = [hashable2ent(ent) for ent in doc_movies | ner_movies]
    return possible_movies

def keep_longest_matching(strings):
    res = []
    input = sorted(strings, key=len, reverse = True)
    while (input):
        s = input.pop()
        if any([True for i in input if s in i]):
            continue
        res.append(s)
    return res
def get_actors_candidates(sent):
    doc = nlp(sent)
    ners = ner(sent)
    doc_actors = set([ent2hashable(ent) for ent in doc.ents if ent.label_ in ["PERSON"]])
    ner_actors = set([ent2hashable(ent) for ent in ners.ents if ent.label_ in ["PER"]])
    possible_actors = [hashable2ent(ent) for ent in doc_actors | ner_actors]
    names_to_keep = keep_longest_matching([act['text'] for act in possible_actors])
    return [act for act in possible_actors if act['text'] in names_to_keep]
def split_connected(entity):
    text = entity['text']
    if "&" in text:
        start = entity['start']
        names = [s.strip() for s in text.split('&')]
        return [{'start': start + text.index(name), 'end': start + text.index(name) + len(name), 'text': name} for name in names]
    return [entity]
all_verbs = set()
stop_verbs = set(["direct", "don", "drop", "reject", "turn"])
def get_related_ents(sent):
    res = []
    doc = nlp(sent)

    poss_movies = get_movies_candidates(sent)
    movies = []
    for movie in poss_movies or ():
        if movie['text'] == 'Oscar' or movie['text'] == 'Emmy':
            continue
        movie.update({'type': 'REL_MOV'})
        movies.append(movie)
    poss_actors = get_actors_candidates(sent)
    actors = []
    poss_actors = flatten([split_connected(actor) for actor in poss_actors])
    for actor in poss_actors or ():
        verbs = set()
        for tok in doc:
            if tok.tag_ == "NNP" and str(tok) in actor['text']:
                related_verb = None
                cur = tok
                for cur in [tok.head, tok.head and tok.head.head, tok.head.head and tok.head.head.head]:
                    if related_verb:
                        continue
                    if cur.tag_ == 'VBD':
                        related_verb = cur.lemma_
                        verbs.update([related_verb])

        if actor['text'] == 'Oscar' or actor['text'] == 'Emmy':
            continue
        if "'s" in actor['text']:
            continue
        actor.update({'type': 'REL_ACT'})
        if len(stop_verbs & verbs) > 0:
            actor.update({'type': 'MISC_AC'})
        if len(movies) == 0:
            actor.update({'type': 'MISC_AC'})
        actors.append(actor)
#         if len(verbs):
#             all_verbs.update(list(verbs))
    if (len([actor for actor in actors if actor['type'] == 'REL_ACT']) == 0):
        for movie in movies:
            movie.update({'type': 'MISC_MO'})
    return movies + actors

In [97]:
def split_related_ents(sent, entities):
    # Well, actually, this doesn't enforce 'same movie' <-> 'different actors' or 'same actor' <-> 'different movies'
    semicolon_re = re.compile(";")
    actor = None
    movie = None
    statements = list()

    for ent in sorted(entities, key=lambda k: k['start']):
        prev_semi = None
        if actor:
            prev_semi = prev_semi or semicolon_re.search(sent, actor['end'])
        if movie:
            prev_semi = prev_semi or semicolon_re.search(sent, movie['end'])
        if prev_semi and prev_semi.start() < ent['start']:
            actor = None
            movie = None
        if ent['type'] == 'REL_MOV':
            movie = ent
        if ent['type'] == 'REL_ACT':
            actor = ent
        # yes we can get same entity marked as person and movie
        if movie and actor and not (movie['start'] == actor['start'] or movie['end'] == actor['end']):
            statements.append((movie, actor))
    return statements

def splitted_ents_to_fact_checks(statements):
    res = list()
    for statement in statements:
        movie, actor = statement
        res.append(((movie['text'], actor['text']), fact_check(actor['text'], movie['text'])))
    return res

def format_single(check_result):
    (movie, actor), (valid, relations) = check_result
    res = '{} starred in {} -> '.format(actor, movie)
    if valid:
        res += "Yes, it's true."
        if len(relations) == 1:
            imdb_id = original_movie_df.get_value(str(relations[0][0]), 'imdb_id')
            res += " See https://www.imdb.com/title/{}".format(imdb_id)
        if len(relations) > 1:
            for rel in relations:
                movie_id, movie_name, actor_id, actor_name = rel
                if movie == movie_name and actor == actor_name:
                    imdb_id = original_movie_df.get_value(str(movie_id), 'imdb_id')
                    res += " See https://www.imdb.com/title/{}".format(imdb_id)
                    return res
                imdb_id = original_movie_df.get_value(str(movie_id), 'imdb_id')
                res += " Supposedly, you mean {} was in {} (https://www.imdb.com/title/{})".format(actor_name, movie_name, imdb_id)
    else:
        res += "This might be not true."
    return res
## split sentence doc = nlp(text, parse=True) | doc.sents - end2end
def end2end_check(paragraph, debug = False):
    # couldn't make this work with , parse=False, tag=False, entity=False
    sents = nlp(paragraph).sents
    found = list()
    for sent in sents:
        sent = str(sent)
        statements = split_related_ents(sent, get_related_ents(sent))
        if len(statements) == 0:
            continue
        if (debug):
            print(statements)
        checked = splitted_ents_to_fact_checks(statements)
        if (debug):
            print(checked)
        found = found + [valid for _, (valid, relations) in checked]
        verdict = "\n".join([format_single(res) for res in checked])
        print("{}: \n{}".format(sent, verdict))
    return any(found)

In [98]:
evaluate_all(all_data.values(), get_related_ents)
# print_top5(all_data.values(), get_related_ents, by_what='tp', from_top=False)
print_top5(all_data.values(), get_related_ents, by_what='tp')

{'precision': 0.5166240409207161, 'recall': 0.7038327526132404, 'f1': 0.5958702064896756}
ORIG: Peter Fonda was in "Ulee's Gold"; "Fool's Gold" stars Kate Hudson daughter of Goldie Hawn
GOLD: [Peter Fonda]<-REL_ACT was in "[Ulee's Gold]<-REL_MOV"; "[Fool's Gold]<-REL_MOV" stars [Kate Hudson]<-REL_ACT daughter of [Goldie Hawn
OURS: [Peter Fonda]<-REL_ACT was in "[Ulee's Gold]<-REL_MOV"; "[Fool's Gold]<-REL_MOV" stars [Kate Hudson]<-REL_ACT daughter of [Goldie Hawn
ORIG: "High Society" is a musical version of The Philadelphia Story Cary Grant-Jimmy Stewart-Katharine Hepburn classic
GOLD: "[High Society]<-MISC_MO" is a musical version of [The Philadelphia Story]<-REL_MOV [Cary Grant]<-REL_ACT-[Jimmy Stewart]<-REL_ACT-[Katharine Hepburn]<-REL_ACT classic
OURS: "High Society" is a musical version of [The Philadelphia Story]<-REL_MOV Cary Grant-Jimmy Stewart-Katharine Hepburn]<-REL_MOV classic
ORIG: Before "The Blues Brothers", Dan Aykroyd & John Belushi starred in 1941 WWII farce
GOLD: Befo

## Validate end-to-end

In [91]:
# Taken from
# http://trivia.fyi/category/movie-trivia
# https://www.sporcle.com/games/rockinpink3/movie-trivia-true-or-false
# https://en.wikipedia.org/wiki/Platoon_(film)
validation_set = [
    ['Michael Keaton played Spiderman villain in the 2017 superhero movie "Spiderman: Homecoming"', True],
    ['In the 1997 American science fiction comedy Men in Black, Tommy Lee Jones played Agent K', True],
    ['Jim Carrey and Renee Zellweger starred in Me, Myself & Irene, released in 2000, about a cop with dissociative identity disorder', True],
    ['Emma Watson is known for playing which character in Harry Potter?', True],
    ['Every character in "The Wizard of Oz" wanted something different from the Wizard.', False],
    ["Faye Dunaway was considered for Elaine Robinson in 'The Graduate' (1967), but she had to turn it down in order to star in Bonnie and Clyde (1967)", False],
    ["John Travolta and Olivia Newton-John did not get along during the filming of 'Grease' (1978)", True],
    ["Kate Hudson turned down the role of Mary-Jane in 'Spider-Man' (2002)", False],
    ["Harrison Ford originally had a cameo in 'E.T' (1982) but it was later cut from the film", False],
    ["Jason Bateman was considered for the role of Chris Chambers in 'Stand By Me' (1986)", False],
    ["Heath Ledger dated his 'Brokeback Mountain' (2005) co-star Michelle Williams", True],
    ["Dave Chappelle turned down the role of Bubba in 'Forrest Gump' (1994)", False],
    ["Ben Affleck was originally going to direct 'Gone Girl' (2014) along with starring in it", True],
    ["Cameron Diaz did her own singing in 'The Mask' (1994)", True],
    ["Vivian Leigh disliked kissing Clark Gable in 'Gone With the Wind' (1939) because he had bad breath", True],
    ["Daniel Radcliffe was offered a role in 'Les Miserables' (2012)", False],
    ["Christopher Plummer and Julie Andrews had an affair during the filming of 'The Sound of Music' (1965)", True],
    ["To break the ice for the nude scene, Kate Winslet flashed Leonardo Dicaprio before shooting began on 'Titanic' (1997)", True],
    ["'American Sniper' (2014) is Clint Eastwood's highest grossing film that he's directed", False],
    ["Denzel Washington was originally going to play Jules in 'Pulp Fiction' (1994) but dropped out due to creative differences with director Quentin Tarantino", False],
    ["Angelina Jolie kept the horns she wore in 'Maleficent' (2014)", True],
    ["Jennifer Aniston was originally supposed to do a nude scene in 'Horrible Bosses' (2011) but the director decided not to film it at the last minute", True],
    ["Michael Bay had Megan Fox gain 10 pounds for her role in 'Transformers' (2007)", True],
    ["Mia Farrow auditioned for the part of Kay in 'The Godfather' (1972)", False],
    ["In 1967, U.S. Army volunteer Chris Taylor arrives in South Vietnam and is assigned to an infantry platoon of the 25th Infantry Division near the Cambodian border.", False],
    ["After his tour of duty in the Vietnam War ended in 1968, Oliver Stone wrote a screenplay called Break, a semi-autobiographical account detailing his experiences with his parents and his time in the Vietnam War.", False],
    ["Platoon was filmed on the island of Luzon in the Philippines starting in February 1986", False],
    ["James Woods, who had starred in Stone's film Salvador", True],
    ["Denzel Washington expressed interest in playing the role of Elias", False],
    ["Informed by director Oliver Stone's personal experiences in Vietnam, Platoon forgoes easy sermonizing in favor of a harrowing, ground-level view of war, bolstered by no-holds-barred performances from Charlie Sheen and Willem Dafoe", True],
]

In [92]:
X = [entry[0] for entry in validation_set]
Y = [entry[1] for entry in validation_set]
Predicted = [end2end_check(sent) for sent in X]

Michael Keaton played Spiderman villain in the 2017 superhero movie "Spiderman: Homecoming": 
Spiderman starred in Spiderman: Homecoming -> This might be not true.
In the 1997 American science fiction comedy Men in Black, Tommy Lee Jones played Agent K: 
Tommy Lee Jones starred in Men in Black -> Yes, it's true. See https://www.imdb.com/title/tt0119654
Tommy Lee Jones starred in Agent K -> This might be not true.
Jim Carrey and Renee Zellweger starred in Me, Myself & Irene, released in 2000, about a cop with dissociative identity disorder: 
Renee Zellweger starred in Me -> This might be not true.
Renee Zellweger starred in Myself & Irene -> This might be not true.
Emma Watson is known for playing which character in Harry Potter?: 
Emma Watson starred in Harry Potter -> Yes, it's true. Supposedly, you mean Emma Watson was in Harry Potter and the Philosopher's Stone (https://www.imdb.com/title/tt0241527) Supposedly, you mean Emma Watson was in Harry Potter and the Chamber of Secrets (htt

In [14]:
from sklearn import metrics

In [93]:
print('Accuracy: ', metrics.accuracy_score(Y, Predicted))
print('Precision: ', metrics.precision_score(Y, Predicted))
print('Recall: ', metrics.recall_score(Y, Predicted))
print('F1: ', metrics.f1_score(Y, Predicted))


Accuracy:  0.733333333333
Precision:  0.9
Recall:  0.5625
F1:  0.692307692308


# Afterthoughts
- no fancy things done in actually working with deps on stage of finding relatedness of movie <-> actor
- data labelling should be done differently: stick to performer <-> media relations (it's inconsistent in labeled data due to change of though in the middle), + add REL_MOVIE_A + REL_ACTOR_A / REL_MOVIE_B + REL_MOVIE_B to help distinguish cases with multiple movies /actors better
- would be great to get a proper lecture on organizing code, datasets and experiments