# Import data

In [8]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
import warnings

warnings.filterwarnings('ignore')

In [9]:
!gdown 1Q9UJtrN_v_dS-garl5gQ1I_SotGhye_1
!gdown 1HOFWUAMFlYbd-gk1B2IyV2-hXDZI7gKR
!gdown 1b7_yRRBs3s3atp1WQHN2GU577vxY8u_h

Downloading...
From: https://drive.google.com/uc?id=1Q9UJtrN_v_dS-garl5gQ1I_SotGhye_1
To: /content/movies.csv
100% 516k/516k [00:00<00:00, 8.09MB/s]
Downloading...
From: https://drive.google.com/uc?id=1HOFWUAMFlYbd-gk1B2IyV2-hXDZI7gKR
To: /content/ratings.csv
100% 2.48M/2.48M [00:00<00:00, 17.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1b7_yRRBs3s3atp1WQHN2GU577vxY8u_h
To: /content/users.csv
100% 16.8k/16.8k [00:00<00:00, 39.5MB/s]


Movies: all movies details

Users: All user details

Ratings: Ratings for the movies given by the users

# Preprocessing

In [10]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [11]:
movies.shape , ratings.shape

((10329, 3), (105339, 4))

In [12]:
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [13]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246


In [14]:
ratings['movieId'].value_counts()

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
296,325
356,311
318,308
480,294
593,290
...,...
133583,1
132961,1
132883,1
132800,1


Get movies with an ample amounts of ratings while ignoring unpopular movies

In [15]:
selected_movies = ratings['movieId'].value_counts()[:1000].index.to_list()
movies = movies.loc[movies.movieId.isin(selected_movies)]
movies.shape

(1000, 3)

Similarly filter ratings df with the selected movies

In [16]:
ratings = ratings.loc[ratings.movieId.isin(selected_movies)]
ratings.shape

(63250, 4)

In [17]:
m = movies.copy()
m.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


# Content Based Filtering

Splitting on the basis of pipe available on the genre column

In [18]:
m['genres'] = m['genres'].str.split('|')
m.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"


Create rows keeping individual elements w.r.t genres column

In [19]:
m = m.explode('genres')
m.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy


In [20]:
m = m.pivot(index='movieId', columns='genres', values='title')
m.head()

genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,,Toy Story (1995),Toy Story (1995),Toy Story (1995),Toy Story (1995),,,,Toy Story (1995),,,,,,,,,,
2,,Jumanji (1995),,Jumanji (1995),,,,,Jumanji (1995),,,,,,,,,,
3,,,,,Grumpier Old Men (1995),,,,,,,,,,Grumpier Old Men (1995),,,,
5,,,,,Father of the Bride Part II (1995),,,,,,,,,,,,,,
6,Heat (1995),,,,,Heat (1995),,,,,,,,,,,Heat (1995),,


In [21]:
m = ~m.isna()
m=m.astype(int)

m.head()

genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0


In [22]:
m.shape

(1000, 19)

Hamming distance? can be used for finding the similarity between two movies

hamming distance should be small for movies to be similar

In [23]:
def hamming_distance(a,b):
  return (a!=b).sum()

In [24]:
hamming_distance(m.iloc[1].values, m.loc[10].values)

np.int64(4)

Create a item similarity matrix with the hamming distance

In [25]:
ranks = []
for i in m.index:
  for j in m.index:
    if i==j:
      continue
    ranks.append([i, j, hamming_distance(m.loc[i].values, m.loc[j].values)])

In [26]:
ranks = pd.DataFrame(ranks, columns=['query', 'candidate', 'distance'])
ranks.head()

Unnamed: 0,query,candidate,distance
0,1,2,2
1,1,3,5
2,1,5,4
3,1,6,8
4,1,7,5


In [27]:
ranks = ranks.merge(movies[['movieId', 'title']], left_on='query', right_on='movieId').rename(columns={'title': 'query_tittle'}).drop(columns=['movieId'])
ranks = ranks.merge(movies[['movieId', 'title']], left_on='candidate', right_on='movieId').rename(columns={'title': 'candidate_tittle'}).drop(columns=['movieId'])
ranks = ranks.sort_values(by=['query', 'distance'])
ranks.head()

Unnamed: 0,query,candidate,distance,query_tittle,candidate_tittle
541,1,2294,0,Toy Story (1995),Antz (1998)
671,1,3114,0,Toy Story (1995),Toy Story 2 (1999)
796,1,4886,0,Toy Story (1995),"Monsters, Inc. (2001)"
185,1,673,1,Toy Story (1995),Space Jam (1996)
553,1,2355,1,Toy Story (1995),"Bug's Life, A (1998)"


Recommendation for who have watched titanic

In [28]:
ranks[ranks['query']==1721].head()

Unnamed: 0,query,candidate,distance,query_tittle,candidate_tittle
441567,1721,17,0,Titanic (1997),Sense and Sensibility (1995)
441572,1721,25,0,Titanic (1997),Leaving Las Vegas (1995)
441590,1721,105,0,Titanic (1997),"Bridges of Madison County, The (1995)"
441707,1721,509,0,Titanic (1997),"Piano, The (1993)"
441708,1721,515,0,Titanic (1997),"Remains of the Day, The (1993)"


✈--------------------------------------------------------------------------

✈--------------------------------------------------------------------------

# Regression Problem

In [29]:
users = pd.read_csv('users.csv')
users.shape

(668, 3)

In [30]:
users.head(3)

Unnamed: 0,userId,age,time_spent_per_day
0,1,16,3.976315
1,2,24,1.891303
2,3,20,4.521478


In [31]:
r = ratings.copy()
r['hour'] = r['timestamp'].apply(lambda x: dt.fromtimestamp(x).hour)
r.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,hour
0,1,16,4.0,1217897793,0
1,1,24,1.5,1217895807,0
2,1,32,4.0,1217896246,0


In [32]:
users = users.merge(r.groupby('userId').rating.mean().reset_index(), on='userId')
users = users.merge(r.groupby('userId').hour.mean().reset_index(), on='userId')

users.head()

Unnamed: 0,userId,age,time_spent_per_day,rating,hour
0,1,16,3.976315,3.691589,0.0
1,2,24,1.891303,3.923077,16.0
2,3,20,4.521478,3.806452,9.0
3,4,23,2.095284,4.147059,2.058824
4,5,35,1.75986,2.864865,19.0


In [33]:
u = users.copy()
u = u.set_index('userId')
u.columns = ['age', 'time_spent_per_day', 'u_avg_rating', 'hour']
u.head()

Unnamed: 0_level_0,age,time_spent_per_day,u_avg_rating,hour
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,16,3.976315,3.691589,0.0
2,24,1.891303,3.923077,16.0
3,20,4.521478,3.806452,9.0
4,23,2.095284,4.147059,2.058824
5,35,1.75986,2.864865,19.0


In [34]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
u = pd.DataFrame(scaler.fit_transform(u), columns=u.columns, index=u.index)
u.head()

Unnamed: 0_level_0,age,time_spent_per_day,u_avg_rating,hour
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-1.470292,0.341073,-0.071853,-1.900589
2,-0.135616,-1.079947,0.42859,0.610169
3,-0.802954,0.712624,0.176463,-0.488288
4,-0.30245,-0.940926,0.912806,-1.577514
5,1.699565,-1.169532,-1.859109,1.080936


In [35]:
data = ratings[['movieId','userId','rating']].copy()
data = data.merge(u.reset_index(), on='userId', how='right')
data = data.merge(m.reset_index(), on='movieId', how='right')
data.head()

Unnamed: 0,movieId,userId,rating,age,time_spent_per_day,u_avg_rating,hour,Action,Adventure,Animation,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,2,5.0,-0.135616,-1.079947,0.42859,0.610169,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,5,4.0,1.699565,-1.169532,-1.859109,1.080936,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,8,5.0,0.364888,0.298545,0.162516,0.453247,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,11,4.0,-1.303458,0.513712,-0.379135,-0.424484,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,14,4.0,-0.30245,1.251552,-0.377947,-0.488288,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [36]:
data.drop(columns=['movieId','userId'], inplace=True)
data.head()

Unnamed: 0,rating,age,time_spent_per_day,u_avg_rating,hour,Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,5.0,-0.135616,-1.079947,0.42859,0.610169,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,4.0,1.699565,-1.169532,-1.859109,1.080936,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,5.0,0.364888,0.298545,0.162516,0.453247,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,4.0,-1.303458,0.513712,-0.379135,-0.424484,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,4.0,-0.30245,1.251552,-0.377947,-0.488288,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [37]:
y  =  data['rating']
X = data.drop(columns=['rating'])

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as mse

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
model  = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [41]:
y_train_pred = model.predict(X_train)

In [42]:
mse(y_test, y_pred)**0.5

0.8776167796245963

on an avg our predictions are deviating by 0.88 units

Can we improve it further.

I see the dates of release also included in the movie names. Lets try including them as movie features as well

In [57]:
m_date = movies[['movieId','title']].copy()
m_date['year'] = m_date['title'].apply(lambda x: x[-5:-1])
m_date.drop(columns=['title'], inplace=True)

In [58]:
data2 = ratings[['movieId','userId','rating']].copy()
data2 = data2.merge(u.reset_index(), on='userId', how='right')
data2 = data2.merge(m.reset_index(), on='movieId', how='right')
data2 = data2.merge(m_date, on='movieId', how='right')
data2.head()

Unnamed: 0,movieId,userId,rating,age,time_spent_per_day,u_avg_rating,hour,Action,Adventure,Animation,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,1,2,5.0,-0.135616,-1.079947,0.42859,0.610169,0,1,1,...,0,0,0,0,0,0,0,0,0,1995
1,1,5,4.0,1.699565,-1.169532,-1.859109,1.080936,0,1,1,...,0,0,0,0,0,0,0,0,0,1995
2,1,8,5.0,0.364888,0.298545,0.162516,0.453247,0,1,1,...,0,0,0,0,0,0,0,0,0,1995
3,1,11,4.0,-1.303458,0.513712,-0.379135,-0.424484,0,1,1,...,0,0,0,0,0,0,0,0,0,1995
4,1,14,4.0,-0.30245,1.251552,-0.377947,-0.488288,0,1,1,...,0,0,0,0,0,0,0,0,0,1995


In [63]:
data2.drop(columns=['movieId','userId'], inplace=True)
y = data2['rating']
X = data2.drop('rating', axis=1)
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
model_dt = GradientBoostingRegressor()
model_dt.fit(X_train_dt, y_train_dt)
y_pred_dt = model_dt.predict(X_test_dt)

In [66]:
mse(y_test_dt, y_pred_dt)**0.5

0.8680221330892266

We improved very little

Can we use other models as well

In [69]:
from xgboost import XGBRegressor

In [73]:
xgb_model = XGBRegressor()
xgb_model_dt = XGBRegressor()

xgb_model.fit(X_train, y_train)
# xgb_model_dt.fit(X_train_dt, y_train_dt)

y_pred_xgb = xgb_model.predict(X_test)
# y_pred_xgb_dt = xgb_model.predict(X_test_dt)

In [74]:
mse(y_test, y_pred_xgb)**5

0.251622260071034

# Recommendation

In [75]:
# should u3 watch m1 or not ?
u3m1 = pd.concat((u.loc[3], m.loc[1]))
model.predict(u3m1.values.reshape(1, -1))

array([3.80759019])

In [76]:
u3m1 = pd.concat((u.loc[3], m.loc[1]))
xgb_model.predict(u3m1.values.reshape(1, -1))

array([4.0015087], dtype=float32)

We look at drastic better predictions