In [1]:
import numpy as np
import pandas as pd
import os
import sys
import pickle
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from importlib import reload
%matplotlib inline
from IPython.core.display import display, HTML, clear_output
display(HTML("<style>.container { width:80% !important; }</style>")) 

In [2]:
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 999

### Import custom module/class

In [3]:
cwd = os.getcwd()
path = os.path.join(cwd, '..', 'src')
if not path in sys.path:
    sys.path.append(path)
del cwd, path

In [4]:
from MovieRecommender import MovieFilter

### Load data

In [5]:
cwd = os.getcwd()
movies = pd.read_csv(os.path.join(cwd, "..", "data", "movies.csv"))

In [6]:
movies.shape

(27278, 3)

In [7]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
ratings = pd.read_csv(os.path.join(cwd, "..", "data", "ratings_filtered.csv"))

In [10]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [11]:
ratings.groupby('movieId')['rating'.count()

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,49695,49695,49695
2,22243,22243,22243
3,12735,12735,12735
4,2756,2756,2756
5,12161,12161,12161
6,23899,23899,23899
7,12961,12961,12961
8,1415,1415,1415
9,3960,3960,3960
10,29005,29005,29005


## Filter movies

In [8]:
mf = MovieFilter(movies)

In [9]:
len('The Lord of the Rings: The Fellowship of the Ring')

49

In [10]:
mf.filter_string_length(length=60)

filter_string_length filtered out 1094 movies. Num before: 27278. Num after: 26184


In [11]:
mf.filter_english_words(num_allow=2)

filter_english_words filtered out 3683 movies. Num before: 26184. Num after: 22501


In [12]:
mf.filter_release_year(min_year=1990)

filter_release_year filtered out 8619 movies. Num before: 22501. Num after: 13882


In [13]:
cwd = os.getcwd()
ratings = pd.read_csv(os.path.join(cwd, "..", "data", "ratings.csv"))
freq = ratings['movieId'].value_counts()

In [14]:
freq.describe()

count    26744.000000
mean       747.841123
std       3085.818268
min          1.000000
25%          3.000000
50%         18.000000
75%        205.000000
max      67310.000000
Name: movieId, dtype: float64

In [None]:
mf.filter_rating_freq()

In [16]:
mf.movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Persist filtered list of moves

In [19]:
mf.movies.sample(frac=1).to_csv('Ryans_Movie_Ratings_original.csv', index=False)

## Reduce ratings dataset based on filtered movies 

In [14]:
cwd = os.getcwd()
ratings = pd.read_csv(os.path.join(cwd, "..", "data", "ratings.csv"))

In [15]:
movies = mf.movies['movieId'].tolist()

In [19]:
ratings.shape

(20000263, 4)

In [17]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [21]:
mask = ratings['movieId'].isin(mf.movies['movieId'])

In [22]:
mask.sum()

12391806