In [1]:
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt  
from pathlib import Path
import re

In [2]:
PATH = Path('../../')

In [3]:
list(PATH.iterdir())

[WindowsPath('../../.git'),
 WindowsPath('../../.vscode'),
 WindowsPath('../../code'),
 WindowsPath('../../data'),
 WindowsPath('../../figures'),
 WindowsPath('../../products'),
 WindowsPath('../../README.md'),
 WindowsPath('../../requirements.txt')]

In [4]:
DATA= PATH/'data'
CODE= PATH/'code'
FIGURES=PATH/'figures'
PRODUCTS=PATH/'products'

In [5]:
WORKING_DATA= DATA/'working_data'

In [6]:
ratings = pd.read_csv(WORKING_DATA/'ratings_small.csv')
links = pd.read_csv(WORKING_DATA/'links_small.csv')
metadata = pd.read_csv(WORKING_DATA/'movies_metadata.csv')

In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [8]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [9]:
metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [10]:
# Percentage of null values in each columns
100*ratings.isnull().sum()/len(ratings)

userId       0.0
movieId      0.0
rating       0.0
timestamp    0.0
dtype: float64

In [11]:
# Percentage of null values in each columns
100*links.isnull().sum()/len(links)

movieId    0.000000
imdbId     0.000000
tmdbId     0.142466
dtype: float64

In [12]:
100*metadata.isnull().sum()/len(metadata)

adult                     0.000000
belongs_to_collection    90.115691
budget                    0.000000
genres                    0.000000
homepage                 82.883913
id                        0.000000
imdb_id                   0.037391
original_language         0.024194
original_title            0.000000
overview                  2.098271
popularity                0.010997
poster_path               0.848986
production_companies      0.006598
production_countries      0.006598
release_date              0.191352
revenue                   0.013197
runtime                   0.578454
spoken_languages          0.013197
status                    0.191352
tagline                  55.104914
title                     0.013197
video                     0.013197
vote_average              0.013197
vote_count                0.013197
dtype: float64

In [13]:
id_with_title = metadata[['id','original_title']]
id_with_title.rename(columns={'id':'movieId'}, inplace=True)
# movie id of date format YYYY-MM-DD
# search for hyphenated movie id 
drop_ids=[]
for id_num in id_with_title['movieId']:
    x=  re.findall(r'\w+(?:-\w+)+',id_num )
    if x:
       drop_ids.append(id_num)
# drop movie id

for to_drop in drop_ids:
    id_with_title=id_with_title[id_with_title['movieId']!=to_drop]

id_with_title['movieId']=id_with_title['movieId'].astype('int64')
ratings_with_title=ratings.merge(id_with_title)
ratings_with_title.head()

Unnamed: 0,userId,movieId,rating,timestamp,original_title
0,1,1371,2.5,1260759135,Rocky III
1,4,1371,4.0,949810302,Rocky III
2,7,1371,3.0,851869160,Rocky III
3,19,1371,4.0,855193404,Rocky III
4,21,1371,3.0,853852263,Rocky III


In [14]:
len(ratings)

100004

In [15]:
len(ratings_with_title)

44994

In [16]:
ratings_with_title[ratings_with_title['userId']==1]

Unnamed: 0,userId,movieId,rating,timestamp,original_title
0,1,1371,2.5,1260759135,Rocky III
47,1,1405,1.0,1260759203,Greed
93,1,2105,4.0,1260759139,American Pie
140,1,2193,2.0,1260759198,My Tutor
182,1,2294,2.0,1260759108,Jay and Silent Bob Strike Back
235,1,2455,2.5,1260759113,Vivement dimanche!


In [17]:
ratings_with_title[ratings_with_title['movieId']==1371]

Unnamed: 0,userId,movieId,rating,timestamp,original_title
0,1,1371,2.5,1260759135,Rocky III
1,4,1371,4.0,949810302,Rocky III
2,7,1371,3.0,851869160,Rocky III
3,19,1371,4.0,855193404,Rocky III
4,21,1371,3.0,853852263,Rocky III
5,22,1371,2.0,1131662302,Rocky III
6,41,1371,3.5,1093886662,Rocky III
7,78,1371,4.0,1344470332,Rocky III
8,118,1371,3.0,951009005,Rocky III
9,130,1371,3.0,1138999999,Rocky III
