# Movie Genre Classification on DBpedia

In [1]:
import time
from IPython.display import Code

from rdfframes.knowledge_graph import KnowledgeGraph
from rdfframes.utils.constants import JoinType
from rdfframes.client.http_client import HttpClientDataFormat, HttpClient

## Choose the graph and define the SPARQL endpoint URI

In [2]:
# SPARQL endpoint to query, and the format in which results are requested
# (PANDAS_DF, reused later when executing the query).
endpoint = 'http://10.161.202.101:8890/sparql/'
output_format = HttpClientDataFormat.PANDAS_DF
client = HttpClient(return_format=output_format, endpoint_url=endpoint)

# Handle on the DBpedia knowledge graph from which all dataframes below are built.
graph = KnowledgeGraph(graph_name='dbpedia')

## Build a dataframe of movies from the graph 

In [3]:
# Predicates to expand for each actor and each movie, as (predicate, new column) pairs.
actor_predicates = [('dbpp:birthPlace', 'actor_country'), ('rdfs:label', 'actor_name')]
movie_predicates = [
    ('rdfs:label', 'movie_name'),
    ('dcterms:subject', 'subject'),
    ('dbpp:country', 'movie_country'),
    # NOTE(review): the trailing True presumably marks genre as optional,
    # so movies without a genre are kept -- confirm against the RDFframes docs.
    ('dbpo:genre', 'genre', True),
]

# Start from every (movie, actor) pair linked by dbpp:starring, attach the
# actor and movie attributes, and cache the result because it is reused three
# times below.
dataset = (
    graph.feature_domain_range('dbpp:starring', 'movie', 'actor')
    .expand('actor', actor_predicates)
    .expand('movie', movie_predicates)
    .cache()
)

# Rows whose actor birth place matches "USA".
american_actors = dataset.filter({'actor_country': ['regex(str(?actor_country), "USA")']})

# Actors appearing in at least 20 distinct movies.
prolific_actors = (
    dataset.group_by(['actor'])
    .count('movie', 'movie_count', unique=True)
    .filter({'movie_count': ['>= 20']})
)

# Outer-join the two actor sets on 'actor', join the result back onto the
# full dataset, and keep only the columns used for classification.
movies = (
    american_actors.join(prolific_actors, join_col_name1='actor', join_type=JoinType.OuterJoin)
    .join(dataset, join_col_name1='actor')
    .select_cols(["movie_name", "movie_country", "subject", "actor_name", "actor_country", "genre"])
)

  "join columns".format(warn_cols))


## Execute RDFframes code to get the result in a dataframe

In [None]:
# Execute the `movies` pipeline through `client`, requesting the result in
# `output_format` (set to HttpClientDataFormat.PANDAS_DF above).
df = movies.execute(client, return_format=output_format)

In [None]:
# Display 10 randomly sampled rows of the result as a quick sanity check.
df.sample(n=10)

## Show the SPARQL query generated by RDFframes

In [None]:
# Render the SPARQL text produced by `movies.to_sparql()` with syntax highlighting.
sparql_query = movies.to_sparql()
display(Code(sparql_query, language='SPARQL'))

## Visualize the top 20 genres in the dataframe

In [None]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(font_scale=1.5)

# Keep only the text after the last '/' of each genre value (presumably a
# DBpedia resource URI -- confirm). Non-string values (missing genres) are
# passed through unchanged.
df['genre'] = df['genre'].apply(lambda x: x[x.rfind('/') + 1:] if isinstance(x, str) else x)

# Count genre occurrences, skipping rows without a genre so that the missing
# marker cannot show up as one of the "top genres".
all_genres = nltk.FreqDist(df['genre'].dropna().values)
all_genres_df = pd.DataFrame({'Genre': list(all_genres.keys()), 'Count': list(all_genres.values())})

# The 20 most frequent genres, drawn as a horizontal bar chart.
g = all_genres_df.nlargest(columns='Count', n=20)
plt.figure(figsize=(12, 8))
ax = sns.barplot(data=g, x='Count', y='Genre')
# The y-axis lists genres, so label it accordingly (it was labelled 'Movie').
ax.set(xlabel='Count', ylabel='Genre')

## Encode the data as integers, split into train and test sets, and re-scale the features

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

features = ["movie_name", "movie_country", "subject", "actor_name", "actor_country"]

# Drop unlabeled rows BEFORE encoding: pd.factorize maps missing values to the
# code -1, so a dropna() performed after factorizing never removes anything
# and the unlabeled rows would end up as a bogus class -1.
df = df.dropna(subset=['genre'])

# Replace every column by integer codes (sorted unique values -> 0..k-1).
# Missing feature values become -1.
df = df.apply(lambda col: pd.factorize(col, sort=True)[0])
x = df[features]
y = df['genre']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

# Fit the scaler on the training split only and reuse its statistics for the
# test split; re-fitting on the test data would scale the two splits
# inconsistently and leak test-set statistics into the evaluation.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

## Train an off-the-shelf RandomForest classifier and evaluate its performance

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import numpy as np

# Genre is a categorical label whose integer codes carry no numeric meaning,
# so train a classifier rather than a regressor: averaging codes across trees
# and measuring absolute error between them is not meaningful.
classifier = RandomForestClassifier(n_estimators=20, random_state=0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Previous variable name, kept so any later cell that refers to it still works.
regressor = classifier

# Fraction of test rows whose genre was predicted exactly.
print('Accuracy = ', metrics.accuracy_score(y_test, y_pred))