In [1]:
import os
# Repository coordinates consumed by the git bootstrap cell.
project_name = "reco-tut-mll"
branch = "main"
account = "sparsh-ai"

# Where the checkout lives on the local filesystem.
project_path = os.path.join('/content', project_name)

In [2]:
if not os.path.exists(project_path):
    !cp /content/drive/MyDrive/mykeys.py /content
    import mykeys
    !rm /content/mykeys.py
    path = "/content/" + project_name; 
    !mkdir "{path}"
    %cd "{path}"
    import sys; sys.path.append(path)
    !git config --global user.email "recotut@recohut.com"
    !git config --global user.name  "reco-tut"
    !git init
    !git remote add origin https://"{mykeys.git_token}":x-oauth-basic@github.com/"{account}"/"{project_name}".git
    !git pull origin "{branch}"
    !git checkout main
else:
    %cd "{project_path}"

/content/reco-tut-mll
Initialized empty Git repository in /content/reco-tut-mll/.git/
remote: Enumerating objects: 24, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 24 (delta 3), reused 21 (delta 1), pack-reused 0[K
Unpacking objects: 100% (24/24), done.
From https://github.com/sparsh-ai/reco-tut-mll
 * branch            main       -> FETCH_HEAD
 * [new branch]      main       -> origin/main
Branch 'main' set up to track remote branch 'main' from 'origin'.
Switched to a new branch 'main'


In [17]:
# Inspect the working tree of the checkout (tracked changes and untracked files).
!git status

On branch main
Your branch is up to date with 'origin/main'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mdata/silver/movie_ratings.parquet.gzip[m

nothing added to commit but untracked files present (use "git add" to track)


In [18]:
# Stage everything, commit, and push to the configured branch in one chained
# shell command; `&&` stops the chain at the first step that fails.
# NOTE(review): this cell's execution count (In [18]) is higher than those of
# the data cells further down, i.e. it was run after them. On a top-to-bottom
# run it executes before those cells write their output files.
!git add . && git commit -m 'commit' && git push origin "{branch}"

[main 941b303] commit
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 data/silver/movie_ratings.parquet.gzip
Counting objects: 5, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 1.25 MiB | 4.07 MiB/s, done.
Total 5 (delta 2), reused 0 (delta 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/sparsh-ai/reco-tut-mll.git
   5c63131..941b303  main -> main


---

## Setup

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

---

## Data Loading

In [5]:
# Read the bronze-layer movies table and preview the first rows.
movies = pd.read_parquet(path='./data/bronze/movies.parquet.gzip')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Summarise the movies frame: column dtypes, non-null counts and memory usage.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [7]:
# Read the bronze-layer ratings table and preview the first rows.
ratings = pd.read_parquet(path='./data/bronze/ratings.parquet.gzip')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# Summarise the ratings frame: column dtypes, non-null counts and memory usage.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


---

## Wrangling

Organize ratings

In [9]:
# Order the ratings by movie and renumber the rows from zero, discarding the
# old index.
ratings = ratings.sort_values(by='movieId').reset_index(drop=True)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,517,1,4.0,1487954343
2,213,1,3.5,1316196157
3,514,1,4.0,1533872400
4,214,1,3.0,853937855


Convert the rating timestamps from Unix epoch seconds to datetime values

In [10]:
# Interpret the integer timestamps as seconds since the Unix epoch and store
# them back as datetimes in the same column.
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s', origin='unix')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30 18:45:03
1,517,1,4.0,2017-02-24 16:39:03
2,213,1,3.5,2011-09-16 18:02:37
3,514,1,4.0,2018-08-10 03:40:00
4,214,1,3.0,1997-01-22 12:57:35


Split the title and the release year into separate columns in the movies dataframe. The year is parsed as a date and then stored as a numeric year.

In [11]:
# Pull the four-digit release year out of the "(YYYY)" part of the title.
# The pattern is a raw string so that "\(" and "\d" reach the regex engine
# as written instead of being treated as (invalid) Python string escapes.
movies['year'] = movies.title.str.extract(r"\((\d{4})\)", expand=True)
movies.year = pd.to_datetime(movies.year, format='%Y')
movies.year = movies.year.dt.year # As there are some NaN years, resulting type will be float (decimals)
# Remove only a trailing "(YYYY)" (plus surrounding whitespace). A fixed
# `str[:-7]` slice would also chop seven characters off titles that have no
# year suffix or that end in extra whitespace.
movies.title = movies.title.str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True)
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji,Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men,Comedy|Romance,1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II,Comedy,1995.0


List the unique movie genres and one-hot encode them as boolean columns

In [12]:
# Break each pipe-separated genre string into its labels, flatten them into
# one long series, and keep each distinct label once, in order of appearance.
genre_labels = movies.genres.str.split('|').explode().unique()
genres_unique = pd.DataFrame({'genres': genre_labels}) # Format into DataFrame to store later
genres_unique.head()

Unnamed: 0,genres
0,Adventure
1,Animation
2,Children
3,Comedy
4,Fantasy


In [13]:
# Build one boolean indicator column per genre from the pipe-separated
# strings, then swap the original text column for those indicators.
genre_flags = movies.genres.str.get_dummies(sep='|').astype(bool)
movies = movies.drop(columns='genres').join(genre_flags)
movies.head()

Unnamed: 0,movieId,title,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,1995.0,False,False,True,True,True,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False
1,2,Jumanji,1995.0,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
2,3,Grumpier Old Men,1995.0,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False
3,4,Waiting to Exhale,1995.0,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False
4,5,Father of the Bride Part II,1995.0,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


Check and clean NaN values

In [14]:
# Report how many missing values each frame holds, then drop incomplete rows.
# `isnull().sum()` is a per-column count; summing it again gives the total for
# the whole frame. Taking max() of the per-column counts would under-report
# as soon as more than one column contains missing values.
print ("Number of movies Null values: ", movies.isnull().sum().sum())
print ("Number of ratings Null values: ", ratings.isnull().sum().sum())
# NOTE(review): each frame is cleaned independently, so ratings that refer to
# a movie dropped here stay in `ratings` — confirm that is intended.
movies = movies.dropna()
ratings = ratings.dropna()

Number of movies Null values:  13
Number of ratings Null values:  0


In [None]:
# After dropping rows, put both frames back in movieId order with a fresh
# zero-based index.
movies = movies.sort_values(by='movieId').reset_index(drop=True)
ratings = ratings.sort_values(by='movieId').reset_index(drop=True)

In [None]:
!mkdir ./data/silver

In [None]:
# Persist the cleaned frames to the silver layer, both with the same
# compression setting.
parquet_opts = dict(compression='gzip')
movies.to_parquet('./data/silver/movies.parquet.gzip', **parquet_opts)
ratings.to_parquet('./data/silver/ratings.parquet.gzip', **parquet_opts)

In [15]:
### Creating joined dataset
# Start again from the bronze files: this cell reloads both frames, so it
# overwrites the `movies` and `ratings` variables built by the cells above.
movies = pd.read_parquet('./data/bronze/movies.parquet.gzip')
ratings = pd.read_parquet('./data/bronze/ratings.parquet.gzip')

# Rename by column name rather than by position, so a change in the column
# order of the source file cannot silently mislabel the data.
ratings = ratings.rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
ratings = ratings.sort_values(by='item_id').reset_index(drop=True)
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s', origin='unix')

movies = movies.rename(columns={'movieId': 'item_id'})
# Raw-string pattern: "\(" and "\d" are regex escapes, not Python string escapes.
movies['year'] = movies.title.str.extract(r"\((\d{4})\)", expand=True)
movies.year = pd.to_datetime(movies.year, format='%Y')
movies.year = movies.year.dt.year  # float when some titles carry no year (NaN)
# Strip only a trailing "(YYYY)". No rows are dropped in this cell, so a fixed
# `str[:-7]` slice would leave truncated titles in the joined output for
# movies without a year suffix.
movies.title = movies.title.str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True)

# Default (inner) merge: keeps ratings whose item_id has a matching movie row.
movie_ratings = pd.merge(ratings, movies, on='item_id')
movie_ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,year
0,1,1,4.0,2000-07-30 18:45:03,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,517,1,4.0,2017-02-24 16:39:03,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
2,213,1,3.5,2011-09-16 18:02:37,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
3,514,1,4.0,2018-08-10 03:40:00,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
4,214,1,3.0,1997-01-22 12:57:35,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0


In [16]:
# Write the joined ratings + movies frame to the silver layer as
# gzip-compressed parquet.
movie_ratings.to_parquet('./data/silver/movie_ratings.parquet.gzip', compression='gzip')