In [None]:
# set the environment path to find Recommenders
import sys
sys.path.append("../../")

import pandas as pd
from reco_utils.dataset.wikidata import (
    find_wikidataID,
    query_entity_links,
    read_linked_entities,
    query_entity_description
)

## 1. Create a KG from linked entities in Wikidata from a list of entity names

In [None]:
# Entity names to resolve against Wikidata ("fake movie" presumably exercises the not-found path).
name = ["The Godfather", "Al Pacino", "Tom Hanks", "Forrest Gump", "Julia Roberts", "fake movie", "My Best Friend's Wedding"]

# Collect one frame per resolved entity and concatenate once at the end:
# calling pd.concat inside the loop re-copies every accumulated row on each pass.
linked_frames = []

for n in name:
    entity_id = find_wikidataID(n)
    # The string "entityNotFound" is treated as the failed-lookup sentinel; skip those names.
    if entity_id == "entityNotFound":
        continue
    json_links = query_entity_links(entity_id)
    related_entities, related_names = read_linked_entities(json_links)
    # One row per (source entity -> linked entity) edge.
    linked_frames.append(
        pd.DataFrame({"original_entity": [entity_id] * len(related_entities),
                      "linked_entities": related_entities,
                      "name_linked_entities": related_names})
    )

# ignore_index gives every edge a unique row label instead of restarting at 0 per entity.
# pd.concat raises on an empty list, so fall back to an empty frame with the expected columns.
result_linked = (
    pd.concat(linked_frames, ignore_index=True)
    if linked_frames
    else pd.DataFrame(columns=["original_entity", "linked_entities", "name_linked_entities"])
)

In [None]:
# Preview the rows at positions 1 through 9 of the linked-entity table
result_linked.iloc[1:10]

### Visualize KG using networkx

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Build a graph whose edges run from each original entity to each of its linked entities
G = nx.from_pandas_edgelist(
    result_linked,
    source='original_entity',
    target='linked_entities',
)

In [None]:
# Draw the knowledge graph on a 12x12 figure with labelled nodes and thin edges
plt.figure(figsize=(12, 12))
draw_options = {
    "with_labels": True,
    "node_size": 60,
    "font_size": 9,
    "width": 0.3,
}
nx.draw(G, **draw_options)
plt.show()

## 2. Create an item description with short description and related entities

In [None]:
# Create one row per entity holding its short description and a comma-separated
# string of the names of its linked entities
import pandas as pd
name = ["The Godfather", "Al Pacino", "Tom Hanks", "Forrest Gump", "Julia Roberts", "fake movie", "My Best Friend's Wedding"]

# Accumulate plain dict records and build the frame once; growing a DataFrame with
# pd.concat inside the loop re-copies all previous rows and (with index=[0]) labels
# every row 0.
description_records = []

for n in name:
    entity_id = find_wikidataID(n)
    # The string "entityNotFound" is treated as the failed-lookup sentinel; skip those names.
    if entity_id == "entityNotFound":
        continue
    json_links = query_entity_links(entity_id)
    entity_description = query_entity_description(entity_id)
    related_entities, related_names = read_linked_entities(json_links)
    description_records.append({
        "name": n,
        "original_entity": entity_id,
        "description": entity_description,
        "related_names": ', '.join(related_names),
    })

# Passing columns explicitly keeps the schema stable even when no entity was resolved.
result_description = pd.DataFrame(
    description_records,
    columns=["name", "original_entity", "description", "related_names"],
)

In [None]:
# Show at most the first ten entity descriptions
result_description.iloc[:10]

## 3. Real Example with Movielens Dataset

In [None]:
# Load the MovieLens 100k ratings (requesting title, genres and year columns),
# then reduce them to the distinct (Title, ItemId) pairs
from reco_utils.dataset import movielens

rating_columns = ('UserId', 'ItemId', 'Rating', 'Timestamp')
df = movielens.load_pandas_df(
    '100k',
    rating_columns,
    title_col='Title',
    genres_col='Genres',
    year_col='Year',
)
movies = df.loc[:, ["Title", "ItemId"]].drop_duplicates()

In [None]:
# Peek at the titles at positions 1 through 4
movies["Title"].iloc[1:5]

In [None]:
# (rows, columns) of the deduplicated Title/ItemId table
movies.shape

In [None]:
# Clean titles: drop parenthesised parts (e.g. the year) and undo the "Title, Article" ordering
import re
def format_title(title):
    """Normalise a MovieLens-style title into a plain search string.

    Steps:
      1. Remove every parenthesised segment, e.g. "(1972)" or an alternate title.
      2. If a comma remains, move the text after the LAST comma to the front,
         so "Godfather, The" becomes "The Godfather".
      3. Collapse runs of whitespace and trim both ends.

    Note: whatever follows the last comma is moved, article or not, so a title
    such as "Paris, Texas" becomes "Texas Paris".

    Args:
        title (str): Raw title, e.g. "Godfather, The (1972)".

    Returns:
        str: Cleaned title, e.g. "The Godfather".
    """
    title = re.sub(r'\([^)]*\)', "", title)
    # rpartition always yields a 3-tuple, so test the separator itself to know
    # whether a comma was actually present.
    head, comma, tail = title.rpartition(",")
    if comma:
        # Join with an explicit space: relying on whitespace left over from a
        # removed "(year)" glued the parts together when no year was present.
        title = tail.strip() + " " + head.strip()
    # Removing parentheticals can leave doubled spaces; normalise them.
    return " ".join(title.split())

In [None]:
# Get KG dataset for movies in Movielens
%%time
result_linked = pd.DataFrame()

for index, n in movies.iterrows():
    entity_id = find_wikidataID(format_title(n["Title"]))
    if entity_id != "entityNotFound":
        json_links = query_entity_links(entity_id)
        (related_entities,related_names) = read_linked_entities(json_links)
        d = pd.DataFrame({"original_entity":[entity_id]* len(related_entities),
                          "linked_entities":related_entities,
                          "name_linked_entities":related_names,
                          "movielens_title": n["Title"],
                          "movielens_id": n["ItemId"],
                         })
        result_linked = pd.concat([result_linked, d])

In [None]:
# Count how many linked-entity rows were collected for each MovieLens title
result_linked.loc[:, "movielens_title"].value_counts()