In [6]:
import neo4j
import pandas as pd

## =========================== Data Prep ===========================

In [8]:
df = pd.read_csv('data/netflix_titles.csv')
print(df.shape)
df.head(3)

(6234, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."


### Rename columns to more accurate headers

In [9]:
df.rename({'show_id': 'title_id', 'listed_in': 'genre'}, axis=1, inplace=True)

### The 'director', 'cast', 'country', and 'genre' columns contain lists of values per title. So the individual values must be extracted in order to create nodes.

In [10]:
def extract_values(df, starting_col, target_col):
    temp_df = pd.DataFrame(columns=['title_id', target_col])
    for index, row in df[['title_id', starting_col]].dropna().iterrows():
        list_of_values = row[1].split(', ')
        title_id = row[0]
        temp_df = pd.concat([temp_df, pd.DataFrame({'title_id': title_id, target_col: list_of_values})])
    return temp_df

In [11]:
directors_df = extract_values(df, 'director', 'director')
actors_df = extract_values(df, 'cast', 'actor')
countries_df = extract_values(df, 'country', 'country')
genres_df = extract_values(df, 'genre', 'genre')

print(directors_df.head(3), f'\n{directors_df.shape}\n--------------------')
print(actors_df.head(3), f'\n{actors_df.shape}\n--------------------')
print(countries_df.head(3), f'\n{countries_df.shape}\n--------------------')
print(genres_df.head(3), f'\n{genres_df.shape}\n--------------------')

   title_id          director
0  81145628      Richard Finn
1  81145628        Tim Maltby
0  80125979  Fernando Lebrija 
(4852, 2)
--------------------
   title_id          actor
0  81145628  Alan Marriott
1  81145628    Andrew Toth
2  81145628   Brian Dobson 
(44311, 2)
--------------------
   title_id        country
0  81145628  United States
1  81145628          India
2  81145628    South Korea 
(7179, 2)
--------------------
   title_id                     genre
0  81145628  Children & Family Movies
1  81145628                  Comedies
0  80117401           Stand-Up Comedy 
(13670, 2)
--------------------


In [12]:
df.head(5)

Unnamed: 0,title_id,type,title,director,cast,country,date_added,release_year,rating,duration,genre,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


### Reformat date for easier importing

In [13]:
df.date_added = pd.to_datetime(df.date_added).dt.strftime('%m/%d/%Y')
df[['title', 'date_added']].head()

Unnamed: 0,title,date_added
0,Norm of the North: King Sized Adventure,09/09/2019
1,Jandino: Whatever it Takes,09/09/2016
2,Transformers Prime,09/08/2018
3,Transformers: Robots in Disguise,09/08/2018
4,#realityhigh,09/08/2017


### Exstract numeric values from 'duration'

In [14]:
df.duration = df.duration.str.split(' ', expand=True)[0]
df[['title', 'duration']].head()

Unnamed: 0,title,duration
0,Norm of the North: King Sized Adventure,90
1,Jandino: Whatever it Takes,94
2,Transformers Prime,1
3,Transformers: Robots in Disguise,1
4,#realityhigh,99


In [15]:
df.head(5)

Unnamed: 0,title_id,type,title,director,cast,country,date_added,release_year,rating,duration,genre,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China",09/09/2019,2019,TV-PG,90,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,09/09/2016,2016,TV-MA,94,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,09/08/2018,2013,TV-Y7-FV,1,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,09/08/2018,2016,TV-Y7,1,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,09/08/2017,2017,TV-14,99,Comedies,When nerdy high schooler Dani finally attracts...


### Remove the 'director', 'cast', 'country', and 'genre' columns

In [16]:
df = df[['title_id', 'title', 'date_added', 'release_year', 'rating', 'duration',
                'description', 'type']]
df.head()

Unnamed: 0,title_id,title,date_added,release_year,rating,duration,description,type
0,81145628,Norm of the North: King Sized Adventure,09/09/2019,2019,TV-PG,90,Before planning an awesome wedding for his gra...,Movie
1,80117401,Jandino: Whatever it Takes,09/09/2016,2016,TV-MA,94,Jandino Asporaat riffs on the challenges of ra...,Movie
2,70234439,Transformers Prime,09/08/2018,2013,TV-Y7-FV,1,"With the help of three human allies, the Autob...",TV Show
3,80058654,Transformers: Robots in Disguise,09/08/2018,2016,TV-Y7,1,When a prison ship crash unleashes hundreds of...,TV Show
4,80125979,#realityhigh,09/08/2017,2017,TV-14,99,When nerdy high schooler Dani finally attracts...,Movie


### Since we plan to import Movies and TV shows as two different labels, split them into two files for easier importing

In [17]:
movies_df = df[df.type == 'Movie']
del movies_df['type']
movies_df.head()

Unnamed: 0,title_id,title,date_added,release_year,rating,duration,description
0,81145628,Norm of the North: King Sized Adventure,09/09/2019,2019,TV-PG,90,Before planning an awesome wedding for his gra...
1,80117401,Jandino: Whatever it Takes,09/09/2016,2016,TV-MA,94,Jandino Asporaat riffs on the challenges of ra...
4,80125979,#realityhigh,09/08/2017,2017,TV-14,99,When nerdy high schooler Dani finally attracts...
6,70304989,Automata,09/08/2017,2014,R,110,"In a dystopian future, an insurance adjuster f..."
7,80164077,Fabrizio Copano: Solo pienso en mi,09/08/2017,2017,TV-MA,60,Fabrizio Copano takes audience participation t...


In [18]:
shows_df = df[df.type == 'Movie']
del shows_df['type']
shows_df.head()

Unnamed: 0,title_id,title,date_added,release_year,rating,duration,description
0,81145628,Norm of the North: King Sized Adventure,09/09/2019,2019,TV-PG,90,Before planning an awesome wedding for his gra...
1,80117401,Jandino: Whatever it Takes,09/09/2016,2016,TV-MA,94,Jandino Asporaat riffs on the challenges of ra...
4,80125979,#realityhigh,09/08/2017,2017,TV-14,99,When nerdy high schooler Dani finally attracts...
6,70304989,Automata,09/08/2017,2014,R,110,"In a dystopian future, an insurance adjuster f..."
7,80164077,Fabrizio Copano: Solo pienso en mi,09/08/2017,2017,TV-MA,60,Fabrizio Copano takes audience participation t...


## Final Dataframes after cleaning

In [19]:
movies_df.head(3)

Unnamed: 0,title_id,title,date_added,release_year,rating,duration,description
0,81145628,Norm of the North: King Sized Adventure,09/09/2019,2019,TV-PG,90,Before planning an awesome wedding for his gra...
1,80117401,Jandino: Whatever it Takes,09/09/2016,2016,TV-MA,94,Jandino Asporaat riffs on the challenges of ra...
4,80125979,#realityhigh,09/08/2017,2017,TV-14,99,When nerdy high schooler Dani finally attracts...


In [20]:
shows_df.head(3)

Unnamed: 0,title_id,title,date_added,release_year,rating,duration,description
0,81145628,Norm of the North: King Sized Adventure,09/09/2019,2019,TV-PG,90,Before planning an awesome wedding for his gra...
1,80117401,Jandino: Whatever it Takes,09/09/2016,2016,TV-MA,94,Jandino Asporaat riffs on the challenges of ra...
4,80125979,#realityhigh,09/08/2017,2017,TV-14,99,When nerdy high schooler Dani finally attracts...


In [21]:
actors_df.head(3)

Unnamed: 0,title_id,actor
0,81145628,Alan Marriott
1,81145628,Andrew Toth
2,81145628,Brian Dobson


In [22]:
directors_df.head(3)

Unnamed: 0,title_id,director
0,81145628,Richard Finn
1,81145628,Tim Maltby
0,80125979,Fernando Lebrija


In [23]:
genres_df.head(3)

Unnamed: 0,title_id,genre
0,81145628,Children & Family Movies
1,81145628,Comedies
0,80117401,Stand-Up Comedy


In [24]:
countries_df.head(3)

Unnamed: 0,title_id,country
0,81145628,United States
1,81145628,India
2,81145628,South Korea


In [25]:
movies_df.to_csv('data/movies.csv', index=False)
shows_df.to_csv('data/shows.csv', index=False)
actors_df.to_csv('data/actors.csv', index=False)
directors_df.to_csv('data/directors.csv', index=False)
countries_df.to_csv('data/countries.csv', index=False)
genres_df.to_csv('data/genres.csv', index=False)

## ========================= Cypher Import Script ==========================