### Aкторка Emma Stone і фільми, в яких вона знімалась

- https://en.wikipedia.org/wiki/Emma_Stone

## Data from DBpedia

In [121]:
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import re
import spacy
import nltk
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setQuery("""
    PREFIX dbo: <http://dbpedia.org/ontology/>
    SELECT ?film ?released
    WHERE {
    ?film rdf:type dbo:Film .
    ?film dbo:starring dbr:Emma_Stone .
    OPTIONAL{?film dbo:releaseDate ?released .}
    }
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

result_list = []
for result in results["results"]["bindings"]:

    res = result.get("film").get("value")
    
    date = None
    if result.get("released"):
        date = result.get("released").get("value")
    
    cut_str = res.split('/')[-1]
    cut_str = cut_str.replace('_', ' ')
    
    movie_name = cut_str.split('(')[0]
    if re.search('\(\d{4}', cut_str):
        movie_year = re.search('\(\d{4}', cut_str).group(0)[1:]
    elif date:
        movie_year = date.split('-')[0]
    else:
        movie_year = ''
         
    result_list.append((movie_name, movie_year))
               
    
df = pd.DataFrame(result_list, columns=['movie', 'year'])
# df.loc[:, 'movie'] = pd.Series(df['raw'].map(lambda x: x.split('/')[-1]))

df

Unnamed: 0,movie,year
0,The House Bunny,2008.0
1,Easy A,
2,Paper Man,2009.0
3,Aloha,
4,The Amazing Spider-Man,2012.0
5,La La Land,
6,Gangster Squad,
7,Battle of the Sexes,
8,"Crazy, Stupid, Love",
9,Marmaduke,


## Data from Wikipedia API

In [64]:
import wikipedia
wikipedia.set_lang("en")
emma_page = wikipedia.page("Emma_Stone")

# Get content before the "See also" section
emma_content = emma_page.content.split('== See also ==')[0]

nlp = spacy.load("en") #, disable=['parser', 'textcat'])
tokens = nlp(emma_content)
tokens[:100]

Emily Jean "Emma" Stone (born November 6, 1988) is an American actress. The recipient of numerous accolades, including an Oscar, a BAFTA Award, and a Golden Globe Award, she was the highest-paid actress in the world in 2017. Stone has appeared in Forbes Celebrity 100 in 2013 and 2017, and was featured by Time as one of the 100 most influential people in the world.
Born and raised in Scottsdale, Arizona, Stone began acting as a child, in a theater production

In [56]:
re_movie = re.compile('.*(movie|film|comedy|drama|role|musical) ([A-Z][a-z]+).*')
re_movie.search('A brief appearance in the sex comedy Friends with Benefits (2011) reunited her with Gluck.').group(2)

'Friends'

In [63]:
# tokens.ents
movie_year = []
for sent in tokens.sents:
    
    sent_start = sent[0].idx
    
    re_movie = re.compile('.*(movie|film|comedy|drama|role|musical) ([A-Z][a-z]*).*')
    re_year = re.compile('\d{4}')
    year = ''
    start_pos = 0
    
    if re_movie.match(sent.string):
#         print(sent.string)
        
        if re_movie.search(sent.string).group(2):
            ent_pos, _ = re_movie.search(sent.string).span(2)
#             print(ent_pos)
        
        if sent.ents:
            for ent in sent.ents:
                for token in ent:
#                     print(token.text, token.idx)
#                     print(ent_pos - sent_start)
                    if ent_pos == token.idx - sent_start:
                        entity = ent
                
        if re_year.search(sent.string):
            year = re_year.search(sent.string).group(0)
            
        if entity and year:
            if not entity.string in [ent for ent, _ in movie_year]: # first mention
                movie_year.append([entity.string, year])
            
        
df_wiki =  pd.DataFrame(movie_year, columns=['movie', 'year'])
df_wiki
# print([ent for ent, _ in movie_year])

Unnamed: 0,movie,year
0,Easy A,2010
1,The Amazing Spider-Man,2012
2,Birdman,2014
3,Cabaret,2014
4,La La Land,2016
5,The Favourite,2017
6,Heroes,2007
7,Superbad,2007
8,Saturday,2010
9,Friends with Benefits,2011


In [110]:
# Join all movie names
reference = ' '.join(df['movie'].tolist())
hypothesis = ' '.join(df_wiki['movie'].tolist())
print('Reference:\n', reference)
print('Hypothesis:\n', hypothesis)

Reference:
 The House Bunny Easy A Paper Man  Aloha  The Amazing Spider-Man  La La Land  Gangster Squad Battle of the Sexes  Crazy, Stupid, Love Marmaduke  The Croods Irrational Man  Magic in the Moonlight Ghosts of Girlfriends Past Movie 43
Hypothesis:
 Easy A  The Amazing Spider-Man Birdman  Cabaret  La La Land  The Favourite  Heroes  Superbad  Saturday  Friends with Benefits  Crazy Birdman Battle  Love 


In [70]:
BLEUscore = nltk.translate.bleu_score.sentence_bleu(reference, hypothesis, weights=(0.25, 0.25, 0.25, 0.25))
f'BLEU4: {BLEUscore}'

'BLEU4: 0.6643548861507491'

In [None]:
# https://github.com/cocodataset/cocoapi
# https://stackoverflow.com/questions/49311195/how-to-install-coco-pythonapi-in-python3

In [103]:
# https://pypi.org/project/textmetrics/
# https://github.com/mbforbes/textmetrics/
# from textmetrics.main import meteor
# meteor('olya', 'galya')

##### Rouge = overlapping_ngrams_count / all_ngrams_count

In [183]:
from nltk import ngrams

def rouge(reference, hypothesis, ngrams_count):
    
    def get_ngrams(line, n=1):
        ngrams_list = []
        for n in range(1, n + 1):
            for gram in ngrams(line.split(), n):
                if not ' '.join(gram).lower() in stopwords:
                    ngrams_list.append(' '.join(gram))
        return set(ngrams_list)
    
    ref_ngrams = get_ngrams(reference, n=ngrams_count)
    hypo_ngrams = get_ngrams(hypothesis, n=ngrams_count)
    
    overlapping_count = 0
    for r_gram in ref_ngrams:
        for h_gram in hypo_ngrams:
            if r_gram == h_gram:
                overlapping_count += 1
    
    all_ngrams_count = round((len(ref_ngrams) + len(hypo_ngrams))/2)
    
#     print(overlapping_count)
#     print(len(hypo_ngrams))
#     print(len(ref_ngrams))
#     print(all_ngrams_count)
    
    return overlapping_count/(all_ngrams_count + 1)

rouge1 = rouge(reference, hypothesis, ngrams_count=1)
rouge2 = rouge(reference, hypothesis, ngrams_count=2)
rouge3 = rouge(reference, hypothesis, ngrams_count=3)
rouge4 = rouge(reference, hypothesis, ngrams_count=4)
pd.DataFrame([rouge1, rouge2, rouge3, rouge4], columns = ['rouge_n'], index=[1,2,3,4])

Unnamed: 0,rouge_n
1,0.304348
2,0.235294
3,0.177215
4,0.130841


## Висновки:
Після того як дані отримано з двох джерел, назви фільмів з'єднала в рядки. Відповдні рядки з двох джерел порівнювала за допомогою метрик. Побудувала дві метрики для оцінки отриманих даних: BLEU i ROUNGE.<br>
1. BLEU з кількістю взятих n-grams = 4 p з вагами відповідних n-grams (0.25, 0.25, 0.25, 0.25).<br>
Результат = 0.664<br>
2. ROUNGE з кількістю n-grams 1, 2, 3, 4.<br>
Зрозуміло, що кількість перекриття n-grams для метрики rouge1 (де враховується тільки unigram) найбільша по відношенню до загальної кількості.<br>

#### Для проекту
Мій проект полягає у написанні узагальнення для документа. Візьму датасет вже з розміченими даними у вигляді узагальнень.
Метрики:
1. BLEU4
2. Використаю ROUNGE, але трохи модифікувану, по-перше, використаю не слова, а леми слів і поставлю ваги юніграм, біграм, триграм, чотири-грам.
3. METEOR метрика<br>
Вона враховує повне співпадіння слів, стеми, синоніми (за WordNet, ConceptNet наприклад).