In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import KFold
from surprise.model_selection.validation import cross_validate

# Data

In [None]:
# Load the first two comma-separated fields of the raw ratings file.
#
# Column 0 ('user_id') is not purely numeric: later cells treat rows with a
# missing rating as per-movie header rows and only cast 'user_id' to int after
# removing them. The column is therefore read explicitly as text. Without a
# dtype, pandas infers the type chunk by chunk and can hand back a mix of int
# and str objects in the same column, which makes value-based operations such
# as nunique() unreliable.
data = pd.read_csv(
    '/kaggle/input/netflix-prize-data/combined_data_1.txt',
    header=None,
    names=['user_id', 'rating'],
    usecols=[0, 1],
    dtype={'user_id': str, 'rating': float}
)

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

# Data exploration

In [None]:
# Rows with a missing rating are counted as movies (one such row per movie).
# The 'rating' column is selected by name: indexing the result of
# data.isnull().sum() with the integer 1 relies on positional fallback for a
# label-indexed Series, which recent pandas versions deprecate.
movie_count = data['rating'].isnull().sum()

print(f"There are {movie_count} movies in the dataset.")

In [None]:
# Distinct values in 'user_id', minus the rows that were counted as movies.
# NOTE(review): this presumably relies on every movie header row carrying its
# own unique token in the 'user_id' column - confirm against the raw file.
distinct_id_values = data['user_id'].nunique()
user_count = distinct_id_values - movie_count

print(f"There are {user_count} users in the dataset.")

In [None]:
# Every row with a non-null 'user_id', minus the rows that were counted as
# movies, leaves the number of actual ratings.
non_null_id_rows = data['user_id'].count()
rating_count = non_null_id_rows - movie_count

print(f"There are {rating_count} ratings in the dataset.")

In [None]:
# Number of occurrences of each rating value, kept as a one-column frame
# ('count') indexed by rating, then drawn as horizontal bars.
ratings = data.groupby('rating')['rating'].count().to_frame('count')
ax = ratings.plot.barh(figsize=(15, 10))

# Data cleaning

In [None]:
# Build, for every rating row, the sequential number of the movie it belongs to.
#
# Rows with a missing rating act as movie headers: all rating rows that follow
# a header (up to the next header) belong to that movie. Movies are numbered
# 1, 2, 3, ... in the order their headers appear.
# NOTE(review): this numbering assumes the headers themselves are sequential
# ids starting at 1 - confirm against the raw file before loading other parts
# of the dataset.
is_header = data['rating'].isnull()

# Header rows only, with their original position exposed as the 'index' column.
df_nan = is_header[is_header].to_frame().reset_index()

# Every rating needs a header somewhere above it; fail early with a clear
# message instead of producing misaligned movie ids.
if data.empty or not is_header.iloc[0]:
    raise ValueError(
        "Expected the first row of `data` to be a movie header (missing rating)."
    )

# A running count of headers seen so far is exactly the movie number of each
# row. Restricting it to the non-header rows yields one id per rating. This
# replaces repeated np.append calls in a loop, which re-copied the growing
# array for every movie. The result is an integer NumPy array.
movie_np = is_header.cumsum()[~is_header].to_numpy()

In [None]:
# Keep only the actual rating rows. The explicit copy makes the result an
# independent frame, so the column assignments below do not write into a
# slice of the original (which triggers SettingWithCopyWarning).
data = data[data['rating'].notnull()].copy()

# movie_np holds one movie id per rating row, in row order.
if len(movie_np) != len(data):
    raise ValueError(
        f"movie_np has {len(movie_np)} entries but there are {len(data)} rating rows."
    )

data['movie_id'] = movie_np.astype(int)
data['user_id'] = data['user_id'].astype(int)

# Data selection

In [None]:
# Users with fewer ratings than this are dropped from the data.
USER_MIN_REVIEWS = 100
# Movies with fewer ratings than this are dropped from the data.
MOVIE_MIN_REVIEWS = 5000

In [None]:
# Statistics computed per movie and per user: number of ratings and mean rating.
f = ['count', 'mean']

# --- Movies ---------------------------------------------------------------
# Fixed threshold; an earlier variant used the 0.8 quantile of the counts.
movie_benchmark = MOVIE_MIN_REVIEWS
df_movie_summary = data.groupby('movie_id')['rating'].agg(f)
df_movie_summary.index = df_movie_summary.index.map(int)
# Movies rated fewer times than the benchmark are marked for removal.
movie_below_benchmark = df_movie_summary['count'] < movie_benchmark
drop_movie_list = df_movie_summary.loc[movie_below_benchmark].index

# --- Users ----------------------------------------------------------------
# Fixed threshold; an earlier variant used the 0.8 quantile of the counts.
cust_benchmark = USER_MIN_REVIEWS
df_cust_summary = data.groupby('user_id')['rating'].agg(f)
df_cust_summary.index = df_cust_summary.index.map(int)
# Users who rated fewer times than the benchmark are marked for removal.
cust_below_benchmark = df_cust_summary['count'] < cust_benchmark
drop_cust_list = df_cust_summary.loc[cust_below_benchmark].index

In [None]:
# Report the size before and after removing every rating that belongs to a
# dropped movie or was given by a dropped user.
print('Original Shape: {}'.format(data.shape))
in_dropped_movie = data['movie_id'].isin(drop_movie_list)
by_dropped_user = data['user_id'].isin(drop_cust_list)
data = data[~(in_dropped_movie | by_dropped_user)]
print('After Trim Shape: {}'.format(data.shape))

In [None]:
# User x movie matrix of ratings (rows: user_id, columns: movie_id).
# NOTE(review): `pivot` is not referenced by any later cell visible here.
pivot = data.pivot_table(values='rating', index='user_id', columns='movie_id')

# Data mapping

In [None]:
# Movie catalogue: one row per movie with its release year and name, indexed
# by movie_id so it can be joined onto rating data. 'year' uses the nullable
# integer dtype so missing years do not force the column to float.
title_columns = ['movie_id', 'year', 'name']
titles = pd.read_csv(
    '/kaggle/input/netflix-prize-data/movie_titles.csv',
    encoding='ISO-8859-1',
    header=None,
    names=title_columns,
    dtype={'year': 'Int64'},
).set_index('movie_id')

In [None]:
# Preview the first rows of the movie catalogue.
titles.head()

# Recommendations

In [None]:
# data = data[:100000]

In [None]:
# Seed shared by the model initialisation and the fold split, so the reported
# metrics are the same on every "Restart & Run All".
RANDOM_SEED = 42
# Number of leading rows used for the quick cross-validation run.
CV_SAMPLE_ROWS = 100000

# Reader is used with the library's default settings.
reader = Reader()

# Only the first CV_SAMPLE_ROWS rows are evaluated, for faster run time.
# NOTE(review): movie ids were assigned in file order, so these leading rows
# presumably cover only the first few movies - the scores below are a smoke
# test rather than an estimate for the whole dataset.
dataset = Dataset.load_from_df(
    data[['user_id', 'movie_id', 'rating']][:CV_SAMPLE_ROWS],
    reader
)

svd = SVD(random_state=RANDOM_SEED)

# 5 folds is what cross_validate uses when no `cv` is given; the splitter is
# passed explicitly only so that it can be seeded.
cross_validate(
    svd,
    dataset,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=RANDOM_SEED)
)

In [None]:
# Names of the movies that user 785314 rated with 5 stars.
is_user_785314 = data['user_id'] == 785314
is_five_stars = data['rating'] == 5
df_785314 = (
    data[is_user_785314 & is_five_stars]
    .set_index('movie_id')
    .join(titles)['name']
)
df_785314.head()

In [None]:
# User for whom recommendations are produced.
TARGET_USER_ID = 785314
# Upper bound on the number of rating rows used to fit the model.
# NOTE: this is a cap on the leading rows, not necessarily the full dataset.
MAX_TRAIN_ROWS = 10000000

# Fit the model on the (capped) ratings.
train_ratings = data[['user_id', 'movie_id', 'rating']][:MAX_TRAIN_ROWS]
dataset = Dataset.load_from_df(train_ratings, reader)

trainset = dataset.build_full_trainset()
svd.fit(trainset)

# Candidate movies: catalogue entries that were not dropped for having too few
# ratings AND that actually occur in the training ratings. The catalogue can
# list movies the model has never seen; a score for such a movie is not backed
# by any learned information about it, so those entries are left out.
trained_movie_ids = train_ratings['movie_id'].unique()
user_785314 = titles.reset_index()
is_candidate = (
    ~user_785314['movie_id'].isin(drop_movie_list)
    & user_785314['movie_id'].isin(trained_movie_ids)
)
# Explicit copy: the new column below must not be written into a slice.
user_785314 = user_785314[is_candidate].copy()

# Predicted rating of the target user for every candidate movie.
user_785314['estimate_score'] = user_785314['movie_id'].apply(
    lambda x: svd.predict(TARGET_USER_ID, x).est
)

user_785314 = user_785314.drop('movie_id', axis=1)

# Highest predicted ratings first.
user_785314 = user_785314.sort_values('estimate_score', ascending=False)
user_785314.head()