In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ast

Construction of the Movielens dataset starting from the Movielens completed database
---
Data in the Movielens database are spread across different files, which were combined into a single csv file to be used in 
ER and SM tasks. To do so, multiple preprocessing and cleaning steps were required; they are reported in this notebook. 

There are two datasets of interest that will be worked on and merged, `movies_metadata.csv`, which contains metainformation 
about each movie, such as year of release, country, production company, language; and `credits.csv`, which contains the 
cast of each movie, as well as information about the crew. 

The two combined datasets contain a large amount of information that is not present in the `imdb` dataset against which
this database should be compared. For this reason, it was necessary to drop most of the columns when preparing the 
experimental version. 


### Preparing the metadata file



In [4]:
# Load the raw movie metadata table; the Python parsing engine is requested explicitly.
df = pd.read_csv(
    '../pipeline/experiments/movies_metadata.csv',
    engine='python',
)


In [5]:
# Report, for every column, how many values are missing and which percentage
# of the rows that represents.
n_lines = len(df)
for col in df.columns:
    countna = df[col].isnull().sum()
    # Percentage of rows for which this column is null.
    frac = countna / n_lines * 100
    print('Column {:30}: {} missing values, {:.2f}% of the total'.format(col, countna, frac))


Column adult                         : 0 missing values, 0.00% of the total
Column belongs_to_collection         : 40972 missing values, 90.12% of the total
Column budget                        : 0 missing values, 0.00% of the total
Column genres                        : 0 missing values, 0.00% of the total
Column homepage                      : 37684 missing values, 82.88% of the total
Column id                            : 0 missing values, 0.00% of the total
Column imdb_id                       : 17 missing values, 0.04% of the total
Column original_language             : 11 missing values, 0.02% of the total
Column original_title                : 0 missing values, 0.00% of the total
Column overview                      : 954 missing values, 2.10% of the total
Column popularity                    : 5 missing values, 0.01% of the total
Column poster_path                   : 386 missing values, 0.85% of the total
Column production_companies          : 3 missing values, 0.01% of the to

In [6]:
# Remove a first batch of columns from the metadata table (several of them are
# mostly null, as shown by the missing-value report).
df = df.drop(
    columns=[
        'belongs_to_collection',
        'homepage',
        'imdb_id',
        'overview',
        'spoken_languages',
        'tagline',
        'poster_path',
        'popularity',
    ]
)


In [7]:
# Rows without a title are discarded, then the id column is cast to integer.
df = (
    df
    .dropna(subset=['title'])
    .astype({'id': int})
)


Additional custom functions were defined to handle missing values and badly formatted entries. Errors are handled by 
setting the value to a placeholder: "UKN" for production companies and countries, "Unknown" for genres and directors. 
These values can then be replaced as needed depending on the null value strategy. 

In [9]:
# When cleaning genres, only the first value is chosen (multiple genres are possible). If no genre is specified, then 
# 'Unknown' is given. 
def clean_genres(ll):
    """Return the name of the first genre listed in a raw ``genres`` cell.

    Parameters
    ----------
    ll : str
        String holding a Python literal, expected to be a list of dicts that
        each carry a ``'name'`` key.

    Returns
    -------
    str
        The ``'name'`` of the first entry, or ``'Unknown'`` when the cell
        cannot be parsed, is empty, or does not have the expected structure.
    """
    try:
        parsed = ast.literal_eval(ll)
    except (ValueError, SyntaxError):
        # Non-string input (e.g. NaN) or a malformed literal.
        return 'Unknown'
    try:
        return parsed[0]['name']
    except (IndexError, TypeError, KeyError):
        # Empty list, a payload that is not a list of dicts, or no 'name' key.
        return 'Unknown'

# The same approach is given to production companies and countries: only the first value in each list is chosen, missing
# values are set to unknown. 
def clean_production_companies(ll):
    """Return the name of the first production company in a raw cell.

    Parameters
    ----------
    ll : str
        String holding a Python literal, expected to be a list of dicts that
        each carry a ``'name'`` key.

    Returns
    -------
    str
        The ``'name'`` of the first entry, or ``'UKN'`` when the cell cannot
        be parsed, is empty, or does not have the expected structure.
    """
    try:
        parsed = ast.literal_eval(ll)
    except (ValueError, SyntaxError):
        # A SyntaxError previously fell through with the parsed value unbound,
        # which raised UnboundLocalError; both failures now map to the
        # placeholder.
        return 'UKN'
    try:
        return parsed[0]['name']
    except (IndexError, TypeError, KeyError):
        # Empty list, a payload that is not a list of dicts, or no 'name' key.
        return 'UKN'

def clean_production_country(ll):
    """Return the ISO 3166-1 code of the first production country in a raw cell.

    Parameters
    ----------
    ll : str
        String holding a Python literal, expected to be a list of dicts that
        each carry an ``'iso_3166_1'`` key.

    Returns
    -------
    str
        The ``'iso_3166_1'`` value of the first entry, or ``'UKN'`` when the
        cell cannot be parsed, is empty, or does not have the expected
        structure.
    """
    try:
        parsed = ast.literal_eval(ll)
    except (ValueError, SyntaxError):
        # Non-string input (e.g. NaN) or a malformed literal.
        return 'UKN'
    try:
        return parsed[0]['iso_3166_1']
    except (IndexError, TypeError, KeyError):
        # Empty list, a payload that is not a list of dicts, or a missing key.
        return 'UKN'

In [10]:
# After defining the cleaning functions, they are applied to the proper columns. 
# Each nested column is collapsed to a single value by its cleaning function.
df['genres'] = df['genres'].apply(clean_genres)
df['production_companies'] = df['production_companies'].apply(clean_production_companies)
df['production_countries'] = df['production_countries'].apply(clean_production_country)

### Preparing the crew dataset
To reflect the structure of the imdb dataset, we are interested in extracting only a limited subset of the cast, namely
the director and the first three billed actors. 

In [11]:
# clean_cast extracts the first 3 actor names contained by the column main actors.
def clean_cast(ll):
    """Return the names of the first three cast entries of a raw ``cast`` cell.

    ``ll`` is a string holding a Python literal: a list of dicts, each with a
    ``'name'`` key. The returned list has at most three names and is shorter
    when fewer entries are listed.
    """
    billed = ast.literal_eval(ll)
    names = []
    for member in billed[:3]:
        names.append(member['name'])
    return names

# clean_crew extracts only the director from the crew. If the director is missing, it is set as Unknown.
def clean_crew(ll):
    """Return the name of the first crew member whose job is ``'Director'``.

    ``ll`` is a string holding a Python literal: a list of dicts, each with
    ``'job'`` and ``'name'`` keys. ``'Unknown'`` is returned when no entry has
    the ``'Director'`` job.
    """
    crew_members = ast.literal_eval(ll)
    directors = (
        member['name'] for member in crew_members if member['job'] == 'Director'
    )
    return next(directors, 'Unknown')

In [15]:
df_cast = pd.read_csv('../pipeline/experiments/credits.csv')

# The leading actors are extracted and spread over three dedicated columns,
# which are appended to the credits table.
cast = df_cast['cast'].apply(clean_cast)
c = pd.DataFrame(cast.tolist(), columns=['actor_1', 'actor_2', 'actor_3'])
df_cast = pd.concat([df_cast, c], axis=1)

# The crew column is reduced to the director's name and exposed as 'director'.
df_cast['crew'] = df_cast['crew'].apply(clean_crew)
df_cast['director'] = df_cast['crew']
df_cast = df_cast.drop(columns='cast')

# Only the identifier, the three actors and the director are kept, in this order.
df_cast = df_cast.reindex(columns=['id', 'actor_1', 'actor_2', 'actor_3', 'director'])

# Merging metadata and cast
To obtain the complete dataset, it is then necessary to merge metadata and cast datasets. Some additional cleaning steps are performed here. It is possible to merge the two datasets thanks to the presence of an "id" column. 

In [16]:
# Merge metadata and cast on the shared "id" column (default inner join).
df_movies = pd.merge(df, df_cast, on='id')

# An explicit unit is required here: pandas >= 2.0 rejects the unit-less
# 'datetime64' dtype in astype with a TypeError.
df_movies['release_date'] = df_movies['release_date'].astype('datetime64[ns]')
df_movies['budget'] = df_movies['budget'].astype('float64')

# This is the complete dataset, which includes many attributes we are not interested in.
df_movies.to_csv('movielens.csv', index=False)

In [17]:
# Attributes that are not needed in the reduced dataset are removed.
df_reduced = df_movies.drop(
    columns=['adult', 'budget', 'id', 'original_title', 'video', 'vote_count', 'revenue', 'runtime']
)
# The release date is additionally stored as a year-month string, and the
# average vote is rounded to the nearest integer value.
df_reduced['release_date_rounded'] = df_reduced['release_date'].dt.strftime('%Y-%m')
df_reduced['vote_average'] = df_reduced['vote_average'].round()

df_reduced.to_csv('movieslens-reduced.csv', index=False)