In [178]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np

import spacy
from spacy import displacy
from sklearn.model_selection import train_test_split

from rules import *
from rule_model import RuleModel, build_dataframe, score_row

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [119]:
# Directory whose files are handed to build_dataframe below.
data = './out'
# os.listdir returns entries in arbitrary, platform-dependent order. Sort them so the
# seeded train_test_split that consumes `files` picks the same files on every run/machine.
files = sorted(os.listdir(data))

# Load the English spaCy pipeline.
# NOTE(review): the 'en' shortcut only exists on spaCy 2.x; spaCy 3 removed shortcut
# links and needs the full package name (e.g. 'en_core_web_sm') — confirm the pinned version.
nlp = spacy.load('en')

In [120]:
# Split the file list 50/50; the fixed seed makes the partition repeatable
# for a given ordering of `files`.
train_files, test_files = train_test_split(
    files,
    train_size=0.5,
    random_state=1234,
)

In [121]:
def _load_split(file_names):
    """Build the frame for `file_names` and turn its `cast` column into lists.

    Missing values are replaced with the string 'no' before the comma-separated
    `cast` string of every row is split into a list of names.
    """
    frame = build_dataframe(data, file_names).fillna('no')
    frame['cast'] = frame['cast'].apply(lambda names: names.split(','))
    return frame


train = _load_split(train_files)
test = _load_split(test_files)

In [122]:
# Show the first ten training rows as a sanity check of the loaded frame.
train.head(10)

Unnamed: 0,cast,paragraph,movie
0,"[Matt Damon, Jeff Daniels, Jessica Chastain, K...",The Martian is a 2015 science fiction film dir...,The_Martian_(film).txt
0,"[Lily James, Kevin Spacey, Ansel Elgort, Eiza ...",Baby Driver is a 2017 action crime film writte...,Baby_Driver_(film).txt
0,"[Ted Levine, Ruby Dee, Denzel Washington, Josh...",American Gangster is a 2007 American biographi...,American_Gangster_(film).txt
0,"[Eric Bana, Ewan McGregor, Tom Sizemore, Sam S...",Black Hawk Down is a 2001 war film produced an...,Black_Hawk_Down_(film).txt
0,"[Alan Tudyk, Bruce Greenwood, Chi McBride, Jam...","I, Robot (stylized as i,robot) is a 2004 Ameri...","I,_Robot_(film).txt"
0,"[Audrey Tautou, Mathieu Kassovitz]",Amélie (also known as Le Fabuleux Destin d'Amé...,Amélie.txt
0,"[Alan Rickman, Colin Firth, Hugh Grant, Rowan ...",Love Actually is a 2003 Christmas-themed roman...,Love_Actually.txt
0,"[Robert Carlyle, Ewan McGregor, Kevin McKidd, ...",Trainspotting is a 1996 British black comedy f...,Trainspotting_(film).txt
0,"[Jean Reno, Natalie Portman, Danny Aiello, Gar...",Léon: The Professional (French: Léon; original...,Léon:_The_Professional.txt
0,"[Martina Gedeck, Ulrich Tukur, Sebastian Koch,...",The Lives of Others (German: Das Leben der And...,The_Lives_of_Others.txt


In [124]:
# Baseline: the initial three extraction rules wrapped in a RuleModel.
rules = [
    ActorPerformanceMentionRule(),
    ActorCastRule(),
    FilmStartsRule(),
]
model = RuleModel(rules)

In [125]:
# Run the slow analysis (spaCy) step once per split and keep the result in column 'X',
# so that later predict() calls can pass analyse=False and reuse it.
for frame in (train, test):
    frame['X'] = model.analyse(frame['paragraph'])

In [8]:
def _predict_and_score(frame):
    """Store the model's predictions in frame['pred'] and return one score per row.

    Predictions are made from the pre-analysed column 'X' (analyse=False), then each
    row's `cast` and `pred` are compared with score_row.
    """
    frame['pred'] = model.predict(frame['X'], analyse=False)
    return frame[['cast', 'pred']].apply(
        lambda row: score_row(row['cast'], row['pred']), axis=1
    )


# Same evaluation for both splits; labels are formatted identically
# (the test label previously lacked the trailing colon).
scores = _predict_and_score(train)
print('Train F1 score:', scores.mean())

scores = _predict_and_score(test)
print('Test F1 score:', scores.mean())

  'precision', 'predicted', average, warn_for)


Train F1 score: 0.40944665104764544
Test F1 score 0.3774518937262914


In [9]:
# Compare the labelled cast with the predicted names for the first ten training rows.
train[['cast', 'pred', 'movie']].head(10)

Unnamed: 0,cast,pred,movie
0,"[Matt Damon, Jeff Daniels, Jessica Chastain, K...","{Jessica Chastain, Matt Damon, Sebastian Stan,...",The_Martian_(film).txt
0,"[Lily James, Kevin Spacey, Ansel Elgort, Eiza ...","{Kevin Spacey, Ansel Elgort, Jon Bernthal, Jam...",Baby_Driver_(film).txt
0,"[Ted Levine, Ruby Dee, Denzel Washington, Josh...","{Russell Crowe, Denzel Washington, Crowe}",American_Gangster_(film).txt
0,"[Eric Bana, OBE, Ewan McGregor, Tom Sizemore, ...",{},Black_Hawk_Down_(film).txt
0,"[Alan Tudyk, Bruce Greenwood, Chi McBride, Jam...",{},"I,_Robot_(film).txt"
0,"[Audrey Tautou, Mathieu Kassovitz]",{},Amélie.txt
0,"[Alan Rickman, Colin Firth, Hugh Grant, Rowan ...",{},Love_Actually.txt
0,"[Robert Carlyle, OBE, Ewan McGregor, Kevin McK...","{Jonny Lee Miller, Robert Carlyle, Ewen Bremne...",Trainspotting_(film).txt
0,"[Jean Reno, Natalie Portman, Danny Aiello, Gar...","{Jean Reno, Oldman, Gary Oldman}",Léon:_The_Professional.txt
0,"[Martina Gedeck, Ulrich Tukur, Sebastian Koch,...","{Dreyman, Ulrich Mühe}",The_Lives_of_Others.txt


It looks like some of the movies don't have any actors detected. For example, the `Black Hawk Down` article contains the following line:

_The film features a large ensemble cast, including Josh Hartnett, Ewan McGregor, Eric Bana, Tom Sizemore, William Fichtner, Jason Isaacs, Tom Hardy in his feature film debut, and Sam Shepard._

which is not covered by any of the rules.

Also, the `I, Robot` article has a line that is covered by the rules, but for some reason it was not detected:

_The film stars Will Smith, Bridget Moynahan, Bruce Greenwood, James Cromwell, Chi McBride and Alan Tudyk._

In [10]:
# Reuse the `nlp` pipeline loaded in the setup cell rather than loading the same
# model a second time.
doc = nlp("The film stars Will Smith, Bridget Moynahan, Bruce Greenwood, James Cromwell, Chi McBride and Alan Tudyk.")
# Render the dependency parse inline to inspect how the sentence is analysed.
displacy.render(doc, style='dep', jupyter=True)

It looks like _stars_ is not tagged as a verb, so let's add a rule for nouns.

I also did a similar analysis and added other rules, such as:
- extracting actors from phrases like "played by somebody"
- removing detected actors whose names are longer than 3 words (e.g. "The BAFTA Award for Best Screenplay" was detected, among others)

I also changed the approach from checking each sentence separately to applying the rules to the entire doc at once.

In [84]:
# Second iteration: the baseline rules plus the noun-"stars", "including" and
# "played by" rules.
rules = [
    ActorPerformanceMentionRule(),
    ActorCastRule(),
    FilmStartsRule(),
    FilmStarsRuleAsNoun(),
    IncludingRule(),
    PlayedByRule(),
]
model = RuleModel(rules)

# Predict from the cached analysis in 'X' (analyse=False), then score each row.
predictions = model.predict(train['X'], analyse=False)
train['pred'] = predictions
labelled_pairs = train[['cast', 'pred']]
scores = labelled_pairs.apply(lambda row: score_row(row['cast'], row['pred']), axis=1)

train_f1 = scores.mean()
print('Train F1 score:', train_f1)

Train F1 score: 0.4631426613993872


  'precision', 'predicted', average, warn_for)


The F1 score improved a bit, but it is still low.

### Let's look at the articles with the lowest F1 scores:

In [82]:
# Place each row's score next to its labels and predictions, worst rows first.
# `scores` is an unnamed Series, so its column is labelled 0 after the concat.
# Assumes `scores` still holds the train scores computed in the previous cell.
scored = pd.concat([train[['cast', 'pred', 'movie']], scores], axis=1)
scored = scored.reset_index(drop=True)
results = scored.sort_values([0], ascending=True)
results.head(20)

Unnamed: 0,cast,pred,movie,0
12,"[Tom Cruise, Timothy Spall, Koyuki, Billy Conn...","[Golden Satellite Awards, Academy Awards, Gold...",The_Last_Samurai.txt,0.0
89,"[Dave Franco, Woody Harrelson, Morgan Freeman,...",[Jack Wilder],Now_You_See_Me_(film).txt,0.0
78,"[Leonardo DiCaprio, Ray Winstone, Matt Damon, ...","[Raging Bull, J. Hoberman, Taxi Driver]",The_Departed.txt,0.0
74,"[Rory Culkin, William Mapother, Robin Tunney, ...",[],The_Zodiac_(film).txt,0.0
64,"[Malin Åkerman, Matthew Goode, Jeffrey Dean Mo...","[Leonid Brezhnev, Richard Nixon, John Lennon, ...",Watchmen_(film).txt,0.0
88,"[Nicolas Cage, Christopher Mintz-Plasse, Aaron...","[Universal Studios, Christopher Tookey, Andrew...",Kick-Ass_(film).txt,0.0
56,"[Brian d'Arcy James, Liev Schreiber, Rachel Mc...",[Gary Galone],Spotlight_(film).txt,0.0
45,"[Rip Torn, Linda Fiorentino, Vincent D'Onofrio...",[],Men_in_Black_(film).txt,0.0
63,"[Daisy Ridley""]","[John Boyega, Andy Serkis, Mark Hamill, Lupita...",Star_Wars:_The_Force_Awakens.txt,0.0
6,"[Alan Rickman, Colin Firth, Hugh Grant, Rowan ...",[Julia Davis],Love_Actually.txt,0.0


After further analysis I found that the labeling method was wrong. It pulls the cast from a prepared database (a JSON file created from DBpedia),
but the crawled HTML files may not mention some of those actors.
That's why I changed the `label_data.py` script to add only actors who are present in the text.
This improved the F1 score by a couple of percent:

In [186]:
# Final rule set used for the reported scores.
rules = [
    ActorPerformanceMentionRule(),
    ActorCastRule(),
    FilmStartsRule(),
    FilmStartsWithRule(),
    FilmStarsRuleAsNoun(),
    IncludingRule(),
    PlayedByRule(),
    IncludesRuleDObjWithCompount(),
    AnsembleOfCastOfRule(),
    InBrackets(),
]
model = RuleModel(rules)

# Predict from the cached analysis in 'X' (analyse=False), then score each row.
predictions = model.predict(train['X'], analyse=False)
train['pred'] = predictions
labelled_pairs = train[['cast', 'pred']]
scores = labelled_pairs.apply(lambda row: score_row(row['cast'], row['pred']), axis=1)

train_f1 = scores.mean()
print('Train F1 score:', train_f1)

Train F1 score: 0.5501887400760639


In [187]:
# Rank the training rows by score, lowest first, to see which articles still fail.
# The unnamed `scores` Series becomes column 0, which is the sort key.
# Assumes `scores` holds the train scores from the cell directly above.
ranked = pd.concat([train[['cast', 'pred', 'movie']], scores], axis=1)
ranked = ranked.reset_index(drop=True)
results = ranked.sort_values([0], ascending=True)
results.head(20)

Unnamed: 0,cast,pred,movie,0
56,"[Brian d'Arcy James, Liev Schreiber, Rachel Mc...",[Gary Galone],Spotlight_(film).txt,0.0
27,"[Finn Wittrock, Rosemarie DeWitt, Ryan Gosling...","[Best Production Design, Best Film Editing, Be...",La_La_Land_(film).txt,0.133333
32,"[Tom Hanks, Matt Damon, Tom Sizemore, Edward B...","[Ranger Battalion, Vin Diesel, Inglourious Bas...",Saving_Private_Ryan.txt,0.136364
61,"[Tom Hanks, Hugh Grant, Hugo Weaving, Halle Be...","[Cala Tuent, Jim Broadbent, Music Magazine, Su...",Cloud_Atlas_(film).txt,0.155844
50,"[Samuel L. Jackson, Chris Hemsworth, Cobie Smu...","[Infinity Stones, Claudia Kim, MCU Phase, Marv...",Avengers:_Age_of_Ultron.txt,0.16
67,"[Alec Baldwin, Rebecca Ferguson, Simon Pegg, V...","[Rebecca Ferguson, Tom Cruise Productions, Tom...",Mission:_Impossible_–_Rogue_Nation.txt,0.166667
44,"[Chris Pine, Robin Wright, Gal Gadot, Connie N...","[Geoff Johns, Steve Trevor, Joel Silver, Elena...",Wonder_Woman_(2017_film).txt,0.166667
77,"[Jaeden Lieberher, Bill Skarsgård]","[The Invasion (2007, Tania McGowan, Bill Skars...",It_(2017_film).txt,0.173913
63,"[Harrison Ford, Oscar Isaac, Adam Driver, John...","[Starkiller Base, Ridley, Fisher, The Detroit ...",Star_Wars:_The_Force_Awakens.txt,0.180556
80,"[Eddie Redmayne, Emily Watson, David Thewlis, ...","[Eddie Redmayne, Eddie Redmayne[2][1, Anthony ...",The_Theory_of_Everything_(2014_film).txt,0.2


In [189]:
# Evaluate the final rule set on the held-out split, reusing the cached analysis in 'X'.
test_predictions = model.predict(test['X'], analyse=False)
test['pred'] = test_predictions
test_pairs = test[['cast', 'pred']]
scores = test_pairs.apply(lambda row: score_row(row['cast'], row['pred']), axis=1)

test_f1 = scores.mean()
print('Test F1 score:', test_f1)

Test F1 score: 0.48334154968984683
