# Fact checker

In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import json
import wikipedia
import datetime
import re
import spacy
from nltk.corpus import wordnet as wn
import numpy as np

In [3]:
pd.set_option('display.max_colwidth', 400)

## DBPedia

На сторінках Вікіпедії 
- поле рік релізу буває заповненим, але не обов'язково
- я помітила, що є rdf_type тип, вигляду yago:WikicatXXXXFilms, де XXXX - рік виходу фільму, витягуємо рядки такого вигялду і приводимо до вигляду дати, але також не проставляє дату для усіх фільмів
- поле, вигляду resource/Category:XXXX_films надає майже всім фільмам рік

In [8]:
def sparql_to_df(query):
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    cols = results['head']['vars']
    out = []
    for row in results['results']['bindings']:
            item = []
            for c in cols:
                item.append(row.get(c, {}).get('value'))
            out.append(item)

    return pd.DataFrame(out, columns=cols)        

In [238]:
query = """
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX yago: <http://dbpedia.org/class/yago/>
PREFIX dct:    <http://purl.org/dc/terms/>
SELECT ?film ?released ?rdf_type ?subject_year 
WHERE {
?film rdf:type dbo:Film .
?film dbo:starring dbr:Dakota_Fanning.
OPTIONAL{?film dbo:releaseDate ?released }.
OPTIONAL{?film dct:subject ?subject_year 
FILTER (regex (?subject_year, '\\\d+_films', 'i') ) }.
OPTIONAL{?film rdf:type ?rdf_type 
Filter (regex (?rdf_type , 'Wikicat\\\d+Films', 'i') ) }.
}
"""

In [239]:
df = sparql_to_df(query)

In [243]:
df.head()

Unnamed: 0,film,released,rdf_type,subject_year,date_from_cat_2
0,http://dbpedia.org/resource/Dreamer_(2005_film),2005-09-10,http://dbpedia.org/class/yago/Wikicat2005Films,http://dbpedia.org/resource/Category:2005_films,2005
1,http://dbpedia.org/resource/Dreamer_(2005_film),2005-10-21,http://dbpedia.org/class/yago/Wikicat2005Films,http://dbpedia.org/resource/Category:2005_films,2005
2,http://dbpedia.org/resource/Trapped_(2002_film),2002-09-20,http://dbpedia.org/class/yago/Wikicat2002Films,http://dbpedia.org/resource/Category:2002_films,2002
3,http://dbpedia.org/resource/Very_Good_Girls,2014-06-24,http://dbpedia.org/class/yago/Wikicat2013Films,http://dbpedia.org/resource/Category:2013_films,2013
4,http://dbpedia.org/resource/Nine_Lives_(2005_film),,http://dbpedia.org/class/yago/Wikicat2005Films,http://dbpedia.org/resource/Category:2005_films,2005


In [242]:
df['date_from_cat_2'] = df['subject_year'].map(lambda x : x[-10:-6] if bool(x) else '')

In [244]:
df['date_from_cat'] = df['rdf_type'].map(lambda x : x[-9:-5] if bool(x) else '')

In [245]:
df['film_name']= df['film'].map(lambda x: (x.rsplit('/', 1)[-1]).rsplit('(', 1)[0].rstrip('_'))\
                .map(lambda x: x.replace('_', ' '))

In [247]:
df[['film_name', 'date_from_cat', 'date_from_cat_2', 'released']]

Unnamed: 0,film_name,date_from_cat,date_from_cat_2,released
0,Dreamer,2005.0,2005.0,2005-09-10
1,Dreamer,2005.0,2005.0,2005-10-21
2,Trapped,2002.0,2002.0,2002-09-20
3,Very Good Girls,2013.0,2013.0,2014-06-24
4,Nine Lives,2005.0,2005.0,
5,Winged Creatures,2008.0,2008.0,
6,Father Xmas,2001.0,2001.0,
7,Cutlass,2007.0,2007.0,
8,The Cat in the Hat,2003.0,2003.0,
9,Uptown Girls,2003.0,2003.0,


## Wiki 
витягую сторінку акторки Дакоти Фанінг

In [109]:
wikipedia.set_lang("en")
page = wikipedia.page("Dakota_Fanning")

# Get content before the "See also" section
content = page.content.split('== See also ==')[0]

# disable=["tagger"]
nlp = spacy.load("en")
doc = nlp(content)
doc[:100]

Hannah Dakota Fanning (born February 23, 1994) is an American actress and model. She rose to prominence at the age of seven for her performance as Lucy Dawson in the drama film I Am Sam (2001), for which she received a Screen Actors Guild Award nomination at age eight, making her the youngest nominee in SAG history. Fanning played major roles in the films Uptown Girls (2003), The Cat in the Hat (2003), Man on Fire (2004), War of the Worlds (

### The occurrence of dbpedia text in row text

In [110]:
sentences = list(doc.sents)

In [248]:
def get_stat(row):
    found = [
                 sent.text\
                 for sent in sentences\
                 if row["film_name"] in sent.text
            ]
    
    film_occurrence = len(found)
    year_occurrence = len([1 for sent_text in found if row["date_from_cat_2"] in sent_text])
    return (film_occurrence, year_occurrence, found)

df['stat'] = df.apply(lambda row: get_stat(row), axis=1)

In [327]:
df_stat = df.merge(df['stat'].apply(lambda s: pd.Series({'film_cnt':s[0], 'year_cnt':s[1], 'sents':s[2]})), 
    left_index=True, right_index=True)\
    .drop('stat', axis=1)\
    [['film_name', 'date_from_cat_2', 'film_cnt', 'year_cnt', 'sents']]

In [250]:
df_stat

Unnamed: 0,film_name,date_from_cat_2,film_cnt,year_cnt,sents
0,Dreamer,2005.0,4,2,"[Fanning played major roles in the films Uptown Girls (2003), The Cat in the Hat (2003), Man on Fire (2004), War of the Worlds (2005), Dreamer (2005), and Charlotte's Web (2006).\n, Fanning completed filming on Dreamer, While promoting her role in Dreamer, Fanning became a registered member of Girl Scouts of the USA at a special ceremony, which was followed by a screening of the film for membe..."
1,Dreamer,2005.0,4,2,"[Fanning played major roles in the films Uptown Girls (2003), The Cat in the Hat (2003), Man on Fire (2004), War of the Worlds (2005), Dreamer (2005), and Charlotte's Web (2006).\n, Fanning completed filming on Dreamer, While promoting her role in Dreamer, Fanning became a registered member of Girl Scouts of the USA at a special ceremony, which was followed by a screening of the film for membe..."
2,Trapped,2002.0,1,0,"[By this time, she had received positive notices from several film critics, including Tom Shales of The Washington Post, who wrote that Fanning ""has the perfect sort of otherworldly look about her, an enchanting young actress called upon ... to carry a great weight.""In the same year, Fanning appeared in three films: as a kidnap victim who proves to be more than her abductors bargained for in T..."
3,Very Good Girls,2013.0,1,1,"[Now Is Good (2012) and Night Moves (2013), the comedy-drama Very Good Girls (2013), and the biographical film Effie Gray (2014).]"
4,Nine Lives,2005.0,1,1,"[She also had a small part in the Rodrigo García film Nine Lives (released in October 2005), in which she shared an unbroken nine-minute scene with actress Glenn Close, who had her own praise for Fanning: ""She's definitely an old soul.]"
5,Winged Creatures,2008.0,1,0,"[Later that year, she was ranked 4th in Forbes list of ""Top-Earning Stars Aged Under 21"", having earned an estimated $4 million in 2006.In the spring of 2007, she filmed Fragments – Winged Creatures alongside Kate Beckinsale, Guy Pearce, Josh Hutcherson, and Academy Award winners Forest Whitaker and Jennifer Hudson.]"
6,Father Xmas,2001.0,0,0,[]
7,Cutlass,2007.0,2,0,"[In July, Fanning appeared on a short film titled Cutlass, one of Glamour's ""Reel Moments"" based on readers' personal essays., Cutlass was directed by Kate Hudson.\n]"
8,The Cat in the Hat,2003.0,2,1,"[Fanning played major roles in the films Uptown Girls (2003), The Cat in the Hat (2003), Man on Fire (2004), War of the Worlds (2005), Dreamer (2005), and Charlotte's Web (2006).\n, A year later, she starred in two prominent films: playing the uptight child to an immature nanny played by Brittany Murphy in Uptown Girls, and as Sally in The Cat in the Hat.]"
9,Uptown Girls,2003.0,2,1,"[Fanning played major roles in the films Uptown Girls (2003), The Cat in the Hat (2003), Man on Fire (2004), War of the Worlds (2005), Dreamer (2005), and Charlotte's Web (2006).\n, A year later, she starred in two prominent films: playing the uptight child to an immature nanny played by Brittany Murphy in Uptown Girls, and as Sally in The Cat in the Hat.]"


### From raw wiki page text

**extract film names from raw text**

In [114]:
works_of_art = [X.text for X in doc.ents if X.label_ == 'WORK_OF_ART']
print(len(works_of_art))
works_of_art

17


['I Am Sam (2001',
 'Man on Fire',
 'The Twilight Saga',
 'Once Upon a Time',
 'ER',
 'I Am Sam',
 'the Best Young Actor/Actress',
 'Allie"',
 'Sweet Home Alabama',
 'Kids Stuff',
 'Man on Fire as Pita',
 'a True Story',
 'Reel Moments',
 'The Secret Life of Bees',
 'Most Valuable Young Stars',
 'Night Moves',
 'Viena']

In [117]:
words =['series', 'movie','drama', 'cartoon']

synonyms = []
for word in words:
    for syn in wn.synsets(word, pos=wn.NOUN):
        for l in syn.lemmas():
            synonyms.append(l.name().replace('_', ' '))

synonyms =set(synonyms)
synonyms

{'animated cartoon',
 'cartoon',
 'drama',
 'dramatic event',
 'dramatic play',
 'film',
 'flick',
 'motion picture',
 'motion-picture show',
 'movie',
 'moving picture',
 'moving-picture show',
 'pic',
 'picture',
 'picture show',
 'play',
 'serial',
 'serial publication',
 'series',
 'sketch',
 'toon'}

In [295]:
def extract_year_from_sent(sent):
        reg_exp_year = re.compile(r'\d{4}?')
        re_year = reg_exp_year.findall(sent.text)
        if re_year:
             return re_year[0]
        else:
            return None   

In [306]:
films_0 = []
years_0 = []
for syn in synonyms:
    for token in doc:
        if syn == token.lemma_ and token.dep_ == 'pobj':
            info = [ [token.lemma_, token.dep_, child, child.pos_, child.dep_, child.ent_iob_] for child in token.children\
                   if child.pos_ in('PROPN', 'NOUN') and child.dep_=='appos'] 
            if info:
                films_0.append(info[0][2].text)
                year = extract_year_from_sent(info[0][2].sent)     
                years_0.append(year)
                print(info)

[['series', 'pobj', Guy, 'PROPN', 'appos', 'O']]
[['film', 'pobj', Girls, 'NOUN', 'appos', 'I']]
[['film', 'pobj', Lilo, 'PROPN', 'appos', 'B']]
[['film', 'pobj', Lives, 'NOUN', 'appos', 'O']]
[['film', 'pobj', Hounddog, 'PROPN', 'appos', 'B']]
[['film', 'pobj', Keeper, 'PROPN', 'appos', 'B']]
[['film', 'pobj', Brimstone, 'PROPN', 'appos', 'O']]
[['movie', 'pobj', Yellowbird, 'PROPN', 'appos', 'B']]


In [339]:
df_from_raw_text_0 = pd.DataFrame({'film_extracted' : films_0,
                                'year_extracted' : years_0 })\
                    .drop_duplicates()
df_from_raw_text_0    

Unnamed: 0,film_extracted,year_extracted
0,Guy,
1,Girls,2003.0
2,Lilo,
3,Lives,2005.0
4,Hounddog,2006.0
5,Keeper,2008.0
6,Brimstone,2015.0
7,Yellowbird,2015.0


З гарних результатів:
    Hounddog, Brimstone, Yellowbird (назви фільмів довжиною в 1 слово)
іншим бракує цілих власних назв

In [292]:
films = []
years = []
for syn in synonyms:
    for sent in sentences:
        for token in sent:
    
            if syn == token.lemma_: #token.text
#                 print(sent)
                if sent.ents:
#                     print(sent.ents)
                    for ent in sent.ents: 
#                         print([ent for i in ent if i.ent_type_ in ('ORG', 'WORK_OF_ART') and i.dep_ in ('pobj', 'appos')])
                       info = [(ent, i, i.pos_, i.ent_type_, i.dep_ )\
                               for i in ent if i.ent_type_ in ('ORG', 'WORK_OF_ART')\
                               and i.dep_ in ('pobj', 'appos')\
                              ]

                       if info:
                            print(info)
                            films.append(ent.text)
                            year = extract_year_from_sent(sent)
                            years.append(year)
  

[(My Neighbor Totoro, Totoro, 'PROPN', 'ORG', 'pobj')]
[(Kids Stuff, Stuff, 'PROPN', 'WORK_OF_ART', 'appos')]
[(Justice League Unlimited, Unlimited, 'PROPN', 'ORG', 'pobj')]
[(My Neighbor Totoro, Totoro, 'PROPN', 'ORG', 'pobj')]
[(Kids Stuff, Stuff, 'PROPN', 'WORK_OF_ART', 'appos')]
[(Justice League Unlimited, Unlimited, 'PROPN', 'ORG', 'pobj')]
[(Oats Studios, Studios, 'PROPN', 'ORG', 'pobj')]
[(I Am Sam (2001, Am, 'VERB', 'WORK_OF_ART', 'appos'), (I Am Sam (2001, 2001, 'NUM', 'WORK_OF_ART', 'appos')]
[(ER, ER, 'PROPN', 'WORK_OF_ART', 'pobj')]
[(Uptown Girls, Girls, 'NOUN', 'ORG', 'appos')]
[(Man on Fire, Fire, 'PROPN', 'WORK_OF_ART', 'pobj')]
[(The Secret Life of Bees, Life, 'PROPN', 'ORG', 'pobj'), (The Secret Life of Bees, Bees, 'PROPN', 'ORG', 'pobj')]
[(Coraline, Coraline, 'PROPN', 'ORG', 'pobj')]
[(The Twilight Saga, Saga, 'NOUN', 'WORK_OF_ART', 'pobj')]
[(ER, ER, 'PROPN', 'WORK_OF_ART', 'pobj')]
[(Uptown Girls, Girls, 'PROPN', 'ORG', 'pobj')]
[(Uptown Girls, Girls, 'PROPN', 'OR

In [337]:
df_from_raw_text = pd.DataFrame({'film_extracted' : films,
                                'year_extracted' : years })\
                    .drop_duplicates()

**З'єдную два dataframes, знайдених фільмів з роками двом способами (б, в)**

In [340]:
frames = [df_from_raw_text_0, df_from_raw_text]

df_from_raw_text_full = pd.concat(frames)
df_from_raw_text_full

Unnamed: 0,film_extracted,year_extracted
0,Guy,
1,Girls,2003.0
2,Lilo,
3,Lives,2005.0
4,Hounddog,2006.0
5,Keeper,2008.0
6,Brimstone,2015.0
7,Yellowbird,2015.0
0,My Neighbor Totoro,
1,Kids Stuff,


## Metrics

In [341]:
df_stat.drop('sents', axis=1)

Unnamed: 0,film_name,date_from_cat_2,film_cnt,year_cnt
0,Dreamer,2005.0,4,2
1,Dreamer,2005.0,4,2
2,Trapped,2002.0,1,0
3,Very Good Girls,2013.0,1,1
4,Nine Lives,2005.0,1,1
5,Winged Creatures,2008.0,1,0
6,Father Xmas,2001.0,0,0
7,Cutlass,2007.0,2,0
8,The Cat in the Hat,2003.0,2,1
9,Uptown Girls,2003.0,2,1


In [342]:
def get_stat(row):

    return int(bool(row["film_name"] in list(df_from_raw_text_full['film_extracted'])))

df_stat['in_extracted_film_list'] = df_stat.apply(lambda row: get_stat(row), axis=1)

In [344]:
df_evaluation = df_stat.drop('sents', axis=1 )\
    .merge(df_from_raw_text_full, how='outer', left_on='film_name', right_on='film_extracted')

In [346]:
df_evaluation['is_extracted_film_year'] = (df_evaluation['date_from_cat_2'] == df_evaluation['year_extracted'])\
                                            .map(lambda x: int(x))

In [376]:
df_evaluation

Unnamed: 0,film_name,date_from_cat_2,film_cnt,year_cnt,in_extracted_film_list,film_extracted,year_extracted,is_extracted_film_year
0,Dreamer,2005,4.0,2.0,0.0,,,0
1,Dreamer,2005,4.0,2.0,0.0,,,0
2,Trapped,2002,1.0,0.0,0.0,,,0
3,Very Good Girls,2013,1.0,1.0,0.0,,,0
4,Nine Lives,2005,1.0,1.0,0.0,,,0
5,Winged Creatures,2008,1.0,0.0,0.0,,,0
6,Father Xmas,2001,0.0,0.0,0.0,,,0
7,Cutlass,2007,2.0,0.0,0.0,,,0
8,The Cat in the Hat,2003,2.0,1.0,0.0,,,0
9,Uptown Girls,2003,2.0,1.0,1.0,Uptown Girls,2003,1


In [396]:
set1 = set(df_evaluation['film_name'].dropna()) 
set2 = set(df_evaluation['film_extracted'].dropna()) 
intersection_set = list(set1 & set2) 
print('We have {} films from dbpedia and {} films extracted from the raw text. The intersection is {}'.format(\
    len(set1), len(set2), len(intersection_set)
) ) 

We have 33 films from dbpedia and 38 films extracted from the raw text. The intersection is 9


In [367]:
df_evaluation[df_evaluation['in_extracted_film_list']==1]\
[['is_extracted_film_year']].sum()

is_extracted_film_year    2
dtype: int64

In [409]:
intersection_score = len(intersection_set)/ np.mean([ len(set1),  len(set2)])

In [410]:
intersection_score

0.2535211267605634

Наскільки наші 'правдиві' фільми з роками потрапили у текст?

In [413]:
facts_in_raw_text_score = df_evaluation[(df_evaluation['film_cnt'] > 0) &\
                  ( df_evaluation['year_cnt'] > 0)]['film_name'].count()\
/ df_evaluation['film_name'].nunique()
facts_in_raw_text_score

0.6363636363636364

In [415]:
extracted_entities_equals_true_ent_score = len(intersection_set)/df_evaluation['film_extracted'].nunique()
extracted_entities_equals_true_ent_score

0.23684210526315788

**Result metrics**

In [422]:
np.mean([intersection_score, facts_in_raw_text_score, extracted_entities_equals_true_ent_score])

0.3755756227957859

# Project metrics

In [433]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

In [434]:
y_true = [(0, 0), (0, 1), (1, 1), (1, 0), (0, 0)]
y_pred = [(0, 0), (0, 0), (1, 1), (0, 1), (1, 0)]

m = MultiLabelBinarizer().fit(y_true)

In [437]:
accuracy_score(m.transform(y_true),
               m.transform(y_pred))

0.6

In [435]:
f1_score(m.transform(y_true),
         m.transform(y_pred),
         average='macro')

0.8333333333333333

In [438]:
# Calculate metrics for each instance, and find their average 
f1_score(m.transform(y_true),
         m.transform(y_pred),
         average='samples')

0.8666666666666666