In [1]:
import numpy as np
import pandas as pd
import azureml.dataprep as dprep



# Load and Explore Movie Data
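Load `data/ml-latest-small/movies.csv` with the Microsoft Data Preparation SDK (`azureml.dataprep`, in public preview at the time of writing), then profile the columns and preview the first rows.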

In [2]:
# Build a Data Prep dataflow over the movies CSV and show its column profile.
# NOTE(review): current_culture() presumably makes type inference follow the
# machine's locale settings -- confirm against the SDK docs.
movies_csv_path = 'data/ml-latest-small/movies.csv'
culture_inference = dprep.InferenceArguments.current_culture()
movies = dprep.read_csv(path=movies_csv_path,
                        inference_arguments=culture_inference)
# Last expression of the cell: per-column type, min/max, counts and quartiles.
movies.get_profile()

Unnamed: 0,Type,Min,Max,Count,Missing Count,Error Count,Lower Quartile,Median,Upper Quartile,Standard Deviation,Mean
movieId,FieldType.DECIMAL,1,193609,9742.0,0.0,0.0,3239.69,7288.78,76386.3,52160.5,42200.4
title,FieldType.STRING,'71 (2014),À nous la liberté (Freedom for Us) (1931),9742.0,0.0,0.0,,,,,
genres,FieldType.STRING,(no genres listed),Western,9742.0,0.0,0.0,,,,,


In [3]:
# Preview the first five rows of the loaded dataflow as a sanity check.
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2.0,Jumanji (1995),Adventure|Children|Fantasy
2,3.0,Grumpier Old Men (1995),Comedy|Romance
3,4.0,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5.0,Father of the Bride Part II (1995),Comedy


# Extract Genres
Collect the set of distinct genre names by splitting each movie's pipe-delimited `genres` string.

In [4]:
# Pull the dataflow into pandas so the genre strings can be scanned in Python.
df = movies.to_pandas_dataframe()

# Gather every distinct genre label. Each row stores its labels joined by '|',
# so split on that delimiter and trim surrounding whitespace from each piece.
g = set()
for item in df['genres']:
    labels = str(item).split('|')  # str() guards against non-string cells
    g.update(label.strip() for label in labels)
print(g)

{'Western', 'Fantasy', 'Animation', 'Drama', 'Musical', 'Horror', 'Crime', 'Documentary', 'Action', 'Film-Noir', 'Thriller', 'Comedy', 'IMAX', '(no genres listed)', 'Children', 'War', 'Mystery', 'Adventure', 'Romance', 'Sci-Fi'}


In [5]:
# Genre labels to encode, matching the 20 distinct values printed by the
# extraction cell. The order is significant: the encoding cell inserts each new
# column directly after `genres`, so the resulting columns appear in the
# reverse of this order.
genres = [
    'Documentary', 'Children', 'Action', 'Film-Noir',
    'Drama', 'Adventure', 'Fantasy', 'IMAX',
    'Mystery', 'Musical', 'Sci-Fi', 'War',
    'Crime', '(no genres listed)', 'Romance', 'Comedy',
    'Thriller', 'Western', 'Animation', 'Horror',
]

# Add Column Per Genre
Add one boolean column per genre (a one-hot-style encoding): each column is `True` when the movie's `genres` string contains that genre name.

In [6]:
# Append one boolean indicator column per genre label.
# Each column is placed immediately after `genres`, so the final column order
# is the reverse of the order in `genres`.
for item in genres:
    # True where the row's pipe-delimited genre string contains this label.
    has_genre = dprep.col('genres').contains(item)
    movies = movies.add_column(expression=has_genre,
                               new_column_name=item,
                               prior_column='genres')
# NOTE: this cell rebinds `movies`; re-running it without re-running the load
# cell would try to add the same columns a second time.
movies.head(5)

Unnamed: 0,movieId,title,genres,Horror,Animation,Western,Thriller,Comedy,Romance,(no genres listed),...,Musical,Mystery,IMAX,Fantasy,Adventure,Drama,Film-Noir,Action,Children,Documentary
0,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,False,True,False,False,True,False,False,...,False,False,False,True,True,False,False,False,True,False
1,2.0,Jumanji (1995),Adventure|Children|Fantasy,False,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,True,False
2,3.0,Grumpier Old Men (1995),Comedy|Romance,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
3,4.0,Waiting to Exhale (1995),Comedy|Drama|Romance,False,False,False,False,True,True,False,...,False,False,False,False,False,True,False,False,False,False
4,5.0,Father of the Bride Part II (1995),Comedy,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


# Save Data Preparation Steps
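Name the dataflow `movies`, wrap it in a Data Prep package, and save the package to `movies.dprep` so the preparation steps can be reused.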

In [7]:
# Give the dataflow a name so it can be identified inside the package.
movies = movies.set_name(name='movies')
# Wrap the named dataflow in a package and write it to disk in one step;
# `package` ends up bound to whatever save() returns, as before.
package = dprep.Package(arg=movies).save(file_path='movies.dprep')