# DBpedia Mining to Get Movie Metadata


We use SPARQL queries to fetch DBpedia metadata for the movies released in each year. The plot summaries for these movies are then scraped from Wikipedia.


In [7]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

# SPARQL template selecting every dbpont:Film resource whose dcterms:subject is
# the "<year>_films" category. The %(year)s placeholder is filled in with
# Python %-formatting from a {'year': ...} mapping before the query is sent.
# Title, comment and abstract are required and filtered to English ('en');
# released, country, gross, language and the Wikipedia link are OPTIONAL, so
# they may be missing from an individual result binding.
# NOTE(review): the rdf: and rdfs: prefixes are used but never declared here,
# and the group_concat aggregates appear without a GROUP BY clause --
# presumably the target endpoint tolerates both; confirm before reusing this
# query against a different SPARQL endpoint.
movies_for_year_query = """
                PREFIX dbpont: <http://dbpedia.org/ontology/>
                PREFIX dbpprop: <http://dbpedia.org/property/>
                PREFIX dbres: <http://dbpedia.org/resource/>
                PREFIX dbc: <http://dbpedia.org/resource/Category:>
                PREFIX foaf: <http://xmlns.com/foaf/0.1/>
                PREFIX dcterms: <http://purl.org/dc/terms/>

                SELECT DISTINCT ?title 
                       (group_concat(distinct ?language;separator=", ") as ?languages) 
                       (group_concat(distinct ?country;separator=", ") as ?countries) 
                       ?released
                       ?gross
                       ?comment
                       ?abstract
                       (?movie as ?dbpediaLink)
                       ?wikipediaLink
                WHERE
                {
                        ?movie rdf:type dbpont:Film .
                        ?movie rdfs:label ?title .
                        ?movie rdfs:comment ?comment .
                        ?movie dbpont:abstract ?abstract .
                        ?movie dcterms:subject dbc:%(year)s_films . 
                        OPTIONAL { ?movie dbpprop:released ?released }
                        OPTIONAL { ?movie dbpprop:country ?country } 
                        OPTIONAL { ?movie dbpont:gross ?gross }
                        OPTIONAL { ?movie dbpprop:language ?language }
                        OPTIONAL { ?movie foaf:isPrimaryTopicOf ?wikipediaLink }

                        FILTER (lang(?title) = 'en')
                        FILTER (lang(?abstract) = 'en')
                        FILTER (lang(?comment) = 'en')
                }
                """

def sparql_json_to_df(results, year):
    """Flatten a SPARQL JSON result set into a pandas DataFrame.

    Args:
        results: Parsed SPARQL JSON response; ``results['results']['bindings']``
            is iterated for rows and ``results['head']['vars']`` supplies the
            column order.
        year: Value stored in the ``year`` column of every row.

    Returns:
        A DataFrame with one row per binding and columns ``['year']`` followed
        by the variables listed in the response head.

    Raises:
        KeyError: If a binding has no ``title`` entry.
    """
    # Every field except the title may be absent from a binding.
    optional_fields = ('languages', 'countries', 'released', 'gross',
                       'comment', 'abstract', 'dbpediaLink', 'wikipediaLink')

    rows = []
    for binding in results['results']['bindings']:
        row = {'year': year, 'title': binding['title']['value']}
        for field in optional_fields:
            cell = binding.get(field, None)
            # A missing (or otherwise falsy) cell is stored as-is; only a
            # populated cell is unwrapped to its 'value' payload.
            row[field] = cell['value'] if cell else cell
        rows.append(row)

    column_order = ['year'] + results['head']['vars']
    return pd.DataFrame(rows, columns=column_order)

def get_dbpedia_data_for_year(year):
    """Query the DBpedia SPARQL endpoint for one year's films.

    Args:
        year: Year substituted into ``movies_for_year_query`` (converted with
            ``str`` before formatting).

    Returns:
        The DataFrame built by ``sparql_json_to_df`` from the JSON response.
    """
    query_text = movies_for_year_query % {'year': str(year)}

    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setReturnFormat(JSON)
    endpoint.setQuery(query_text)

    response = endpoint.query().convert()
    return sparql_json_to_df(response, year)


In [8]:
# Years to mine: 2000 through 2015 inclusive (the range end is exclusive).
years = range(2000, 2016)

In [13]:
import time

# Fetch each year's metadata and save it to its own file.
for year in years:
    dbp_df = get_dbpedia_data_for_year(year)
    # NOTE: the file is written tab-delimited even though it is named .csv;
    # the name is left unchanged in case later steps look these files up by name.
    csv_filename = "dbpedia_meta_" + str(year) + ".csv"
    dbp_df.to_csv(csv_filename, sep='\t', encoding='utf-8')
    # Format a single string: passing several arguments to print(...) under
    # Python 2 prints a tuple repr instead of a sentence, and the original
    # pieces were also missing a space before "written".
    print("%d records for the year %s written to %s"
          % (len(dbp_df), year, csv_filename))
    # Pause for a second between requests (presumably to go easy on the
    # public endpoint).
    time.sleep(1)


('1171', ' records for the year ', '2000', 'written to ', 'dbpedia_meta_2000.csv')
('1212', ' records for the year ', '2001', 'written to ', 'dbpedia_meta_2001.csv')
('1257', ' records for the year ', '2002', 'written to ', 'dbpedia_meta_2002.csv')
('1374', ' records for the year ', '2003', 'written to ', 'dbpedia_meta_2003.csv')
('1499', ' records for the year ', '2004', 'written to ', 'dbpedia_meta_2004.csv')
('1622', ' records for the year ', '2005', 'written to ', 'dbpedia_meta_2005.csv')
('1900', ' records for the year ', '2006', 'written to ', 'dbpedia_meta_2006.csv')
('1970', ' records for the year ', '2007', 'written to ', 'dbpedia_meta_2007.csv')
('2012', ' records for the year ', '2008', 'written to ', 'dbpedia_meta_2008.csv')
('2135', ' records for the year ', '2009', 'written to ', 'dbpedia_meta_2009.csv')
('1991', ' records for the year ', '2010', 'written to ', 'dbpedia_meta_2010.csv')
('1960', ' records for the year ', '2011', 'written to ', 'dbpedia_meta_2011.csv')
('20