In [1]:
# Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("Amazon - Movies and TV Ratings.csv")
df.head(2)

Unnamed: 0,user_id,Movie1,Movie2,Movie3,Movie4,Movie5,Movie6,Movie7,Movie8,Movie9,...,Movie197,Movie198,Movie199,Movie200,Movie201,Movie202,Movie203,Movie204,Movie205,Movie206
0,A3R5OBKS7OM2IR,5.0,5.0,,,,,,,,...,,,,,,,,,,
1,AH3QC2PC1VTGP,,,2.0,,,,,,,...,,,,,,,,,,


## Exploratory Data Analysis

In [50]:
# Which movies have maximum views?

df.drop('user_id',axis=1).count().sort_values(ascending=False).to_frame().head()

Unnamed: 0,0
Movie127,2313
Movie140,578
Movie16,320
Movie103,272
Movie29,243


In [51]:
# Which movies have maximum ratings?

df.drop('user_id',axis=1).sum().sort_values(ascending=False).to_frame().head()

Unnamed: 0,0
Movie127,9511.0
Movie140,2794.0
Movie16,1446.0
Movie103,1241.0
Movie29,1168.0


In [54]:
# What is the average rating for each movie?

df.drop('user_id',axis=1).mean().sort_values(ascending=False).to_frame().head()

Unnamed: 0,0
Movie1,5.0
Movie66,5.0
Movie76,5.0
Movie75,5.0
Movie74,5.0


In [66]:
# Define the top 5 movies with the maximum ratings

z = df.drop('user_id',axis=1).mean().sort_values(ascending=False).to_frame()
list(z[0][z[0]==5].index)

['Movie1',
 'Movie66',
 'Movie76',
 'Movie75',
 'Movie74',
 'Movie143',
 'Movie72',
 'Movie145',
 'Movie70',
 'Movie147',
 'Movie68',
 'Movie148',
 'Movie65',
 'Movie78',
 'Movie149',
 'Movie63',
 'Movie150',
 'Movie61',
 'Movie152',
 'Movie153',
 'Movie157',
 'Movie57',
 'Movie56',
 'Movie55',
 'Movie77',
 'Movie79',
 'Movie164',
 'Movie123',
 'Movie106',
 'Movie101',
 'Movie112',
 'Movie116',
 'Movie98',
 'Movie118',
 'Movie96',
 'Movie120',
 'Movie122',
 'Movie93',
 'Movie128',
 'Movie142',
 'Movie131',
 'Movie132',
 'Movie133',
 'Movie87',
 'Movie135',
 'Movie85',
 'Movie84',
 'Movie136',
 'Movie82',
 'Movie139',
 'Movie54',
 'Movie165',
 'Movie2',
 'Movie189',
 'Movie181',
 'Movie23',
 'Movie22',
 'Movie21',
 'Movie183',
 'Movie186',
 'Movie18',
 'Movie187',
 'Movie188',
 'Movie15',
 'Movie13',
 'Movie180',
 'Movie12',
 'Movie11',
 'Movie10',
 'Movie9',
 'Movie8',
 'Movie7',
 'Movie191',
 'Movie198',
 'Movie4',
 'Movie199',
 'Movie25',
 'Movie27',
 'Movie167',
 'Movie40',
 'Movie5

In [67]:
# Define the top 5 movies with the least audience.

df.drop('user_id',axis=1).count().sort_values(ascending=True).to_frame().head()

Unnamed: 0,0
Movie1,1
Movie71,1
Movie145,1
Movie69,1
Movie68,1


## Recommendation Model

In [68]:
#importing libiraies
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise import SVD
from surprise.model_selection import train_test_split

In [70]:
# rearranging data

movie_data = df.melt(id_vars = df.columns[0],value_vars=df.columns[1:],var_name="Movies",value_name="Rating")
movie_data.head()

Unnamed: 0,user_id,Movies,Rating
0,A3R5OBKS7OM2IR,Movie1,5.0
1,AH3QC2PC1VTGP,Movie1,
2,A3LKP6WPMP9UKX,Movie1,
3,AVIY68KEPQ5ZD,Movie1,
4,A1CV1WROP5KTTW,Movie1,


In [71]:
# Divide the data into training and test data

rd = Reader(rating_scale=(-1,10))
data = Dataset.load_from_df(movie_data.fillna(0),reader=rd)
train_data,test_data = train_test_split(data,test_size=0.25)

In [73]:
# building recommender system
svd = SVD()
svd.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1764c1ddcd0>

In [80]:
# Make predictions on the test data

pred = svd.test(test_data)
accuracy.rmse(pred)

RMSE: 0.2807


0.2806615821400472

In [82]:
pred[:5]

[Prediction(uid='A2WOPJ88JJN43X', iid='Movie98', r_ui=0.0, est=0.021625586363591257, details={'was_impossible': False}),
 Prediction(uid='A3NQU1649SH0Q4', iid='Movie91', r_ui=0.0, est=0.031243767864821165, details={'was_impossible': False}),
 Prediction(uid='APASOUCF2UD42', iid='Movie157', r_ui=0.0, est=-0.0002891684836278264, details={'was_impossible': False}),
 Prediction(uid='A236ZKW9KLRLJJ', iid='Movie45', r_ui=0.0, est=-0.016336983122809214, details={'was_impossible': False}),
 Prediction(uid='A2WNGZZVOFR40Y', iid='Movie25', r_ui=0.0, est=0.0026618093043789684, details={'was_impossible': False})]

## Plotting

In [None]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, n_iter=500, verbose=3, random_state=1)
books_embedding = tsne.fit_transform(svd.qi)
projection = pd.DataFrame(columns=['x', 'y'], data=books_embedding)
projection['title'] = list(df.columns)[1:]

In [93]:
import plotly.express as px

fig = px.scatter(projection, x='x', y='y', text='title')
fig.show()