<a href="https://colab.research.google.com/github/silvhua/Netflix-Recommender-Engines-Challenge/blob/main/recommender_engines_II_Colab_version.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
! pip install scikit-surprise
# import SVD from surprise
from surprise import SVD

# # import dataset from surprise
from surprise import Dataset
from surprise import Reader

# import accuracy from surprise
from surprise import accuracy

# import train_test_split from surprise.model_selection
from surprise.model_selection import train_test_split
# import GridSearchCV from surprise.model_selection
from surprise.model_selection import GridSearchCV
# import cross_validate from surprise.model_selection
from surprise.model_selection import cross_validate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[K     |████████████████████████████████| 771 kB 12.6 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=2626455 sha256=da26e84575245300219108ed6dd2a057e20debf79ec812fe64325ac72c6d4035
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


# Load Data

In [None]:
movies = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/movie_titles.csv', header=None, 
    encoding = "ISO-8859-1", # As per https://www.kaggle.com/code/laowingkin/netflix-movie-recommendation
    usecols=[0, 1, 2], # Required because some movie titles (column 2) have commas, causing parser error otherwise
    names=['Movie_Id', 'Year', 'Name'])
movies.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [None]:
df1 = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/combined_data_1.txt', header=None, names=['Customer', 'Rating', 'Date'], usecols = [0,1,2])
df1.head()

Unnamed: 0,Customer,Rating,Date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26


Data Prep

In [None]:
# Used this as an example: https://www.kaggle.com/code/morrisb/how-to-recommend-anything-deep-recommender
from collections import deque 
def reshape_df(df):
    tmp_movies = df[df['Rating'].isna()]['Customer'].reset_index()
    movie_index = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

    # Shift the movie_indices by one to get start and endpoints of all movies
    shifted_movie_index = deque(movie_index)
    shifted_movie_index.rotate(-1)

    user_data = []
    for [df_id1, movie_id1], [df_id2, movie_id2] in zip(movie_index, shifted_movie_index):
        # check if last movie in the file
        if df_id1 < df_id2:
            tmp_df = df.loc[df_id1+1: df_id2-1].copy()
        else:
            tmp_df = df.loc[df_id1+1:].copy()
        # create movie_id column
        tmp_df['Movie_ID'] = movie_id1
        user_data.append(tmp_df)
    df2 = pd.concat(user_data)
    del user_data, df, tmp_df
    print('Shape:', df2.shape)
    return df2

In [None]:
# tmp_movies = df1[df1['Rating'].isna()]['Customer'].reset_index()
# movie_index = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]
df1 = reshape_df(df1)
df1.head()

Shape: (24053764, 4)


Unnamed: 0,Customer,Rating,Date,Movie_ID
1,1488844,3.0,2005-09-06,1
2,822109,5.0,2005-05-13,1
3,885013,4.0,2005-10-19,1
4,30878,4.0,2005-12-26,1
5,823519,3.0,2004-05-03,1


In [None]:
df1['Movie_ID'].unique()

array([   1,    2,    3, ..., 4497, 4498, 4499])

# Surprise

In [None]:
reader = Reader(rating_scale=(1, 5))# Loads Pandas dataframe
data = Dataset.load_from_df(df1[['Customer', 'Movie_ID', 'Rating']], reader)
trainset, testset = train_test_split(data, test_size=.15)
model1 = SVD()
fit_model1 = model1.fit(trainset)

In [None]:
testset[0:5]

[('140174', 3165, 3.0),
 ('1132410', 3427, 2.0),
 ('796093', 1975, 3.0),
 ('2229253', 3925, 1.0),
 ('1226844', 1369, 2.0)]

In [None]:
# Make predictions for first 5 users in test set
predictions1 = model1.test(testset[0:5])
predictions1
# `est` refers to predicted rating from that user (uid) for that movie (iid, or item id)

[Prediction(uid='140174', iid=3165, r_ui=3.0, est=3.4407710472274458, details={'was_impossible': False}),
 Prediction(uid='1132410', iid=3427, r_ui=2.0, est=2.812289905220415, details={'was_impossible': False}),
 Prediction(uid='796093', iid=1975, r_ui=3.0, est=2.5541376180429203, details={'was_impossible': False}),
 Prediction(uid='2229253', iid=3925, r_ui=1.0, est=3.129383801780239, details={'was_impossible': False}),
 Prediction(uid='1226844', iid=1369, r_ui=2.0, est=2.020837887324352, details={'was_impossible': False})]

In [None]:
rmse = accuracy.rmse(predictions1)
print(f'Model RMSE on test set: {rmse:.2f}')

RMSE: 1.0571
Model RMSE on test set: 1.06


## Recommend a new movie

In [None]:
def recommend_movie(customer_id, model, ratings=df1, movies=movies):
  """2022-12-05 19:36
  Recommend 1 movie for a given customer.
  Parameters:
    - customer_id: Customer_id
    - model: A surprise model that has been fit.
    - ratings: DataFrame containing customer ratings.
    - movies: DataFrame containing Movie_Id and title.

  """
  rated = ratings.loc[ratings['Customer']==customer_id, 'Movie_ID'].values
  unrated = np.setdiff1d(movies['Movie_Id'], rated)
  testset = [[customer_id, movie, 1] for movie in unrated]
  predictions = model.test(testset)
  predicted_ratings = np.array([pred.est for pred in predictions])
  recommendation_index = predicted_ratings.argmax()
  recommendation_movie_id = unrated[recommendation_index]
  recommendation = movies.loc[movies['Movie_Id']==recommendation_movie_id, 'Name'].values
  return recommendation


def batch_recommend(customers_df, model=model1, movies=movies):
  recommendations = pd.DataFrame(index=customers_df.index)
  rated_movie = [movies[movies['Movie_Id'] == id]['Name'].values[0] for id in customers_df['Movie_ID'].values]
  recommendations['Rated Movie'] = rated_movie
  recommendations['Recommendation'] = customers_df['Customer'].apply(lambda x: recommend_movie(x, model))
  recommendations.index.name = 'Customer'
  return recommendations


batch_recommend(df1.head(5))

Unnamed: 0_level_0,Rated Movie,Recommendation
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Dinosaur Planet,[Stargate SG-1: Season 7]
2,Dinosaur Planet,[Elfen Lied]
3,Dinosaur Planet,[Sex and the City: Season 4]
4,Dinosaur Planet,[Six Feet Under: Season 4]
5,Dinosaur Planet,[Smallville: Season 1]


In [None]:
batch_recommend(df1.tail(5))

Unnamed: 0_level_0,Rated Movie,Recommendation
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1
24058258,In My Skin,[The West Wing: Season 3]
24058259,In My Skin,[Aqua Teen Hunger Force: Vol. 1]
24058260,In My Skin,[Michael Moore's The Awful Truth: Season 2]
24058261,In My Skin,[Seven Samurai]
24058262,In My Skin,[Spirited Away]


## Test with select movie titles

In [None]:
movies.head(30)

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
5,6,1997.0,Sick
6,7,1992.0,8 Man
7,8,2004.0,What the #$*! Do We Know!?
8,9,1991.0,Class of Nuke 'Em High 2
9,10,2001.0,Fighter


In [None]:
df1[(df1['Movie_ID'] == 28) & (df1['Rating'] > 4)].head()

Unnamed: 0,Customer,Rating,Date,Movie_ID
52553,1990901,5.0,2004-02-16,28
52554,2626356,5.0,2005-07-08,28
52572,1456155,5.0,2005-07-21,28
52578,1632018,5.0,2005-07-25,28
52579,577397,5.0,2003-05-19,28


In [None]:
# Movie_id 28 is for Lilo and Stitch; what are the recommendations for people who like that movie?
batch_recommend(df1[(df1['Movie_ID'] == 28) & (df1['Rating'] > 4)].head())

Unnamed: 0_level_0,Rated Movie,Recommendation
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1
52553,Lilo and Stitch,[Chappelle's Show: Season 1]
52554,Lilo and Stitch,[Aqua Teen Hunger Force: Vol. 1]
52572,Lilo and Stitch,[The Wire: Season 1]
52578,Lilo and Stitch,[SpongeBob SquarePants: Season 2]
52579,Lilo and Stitch,[I Love Lucy: Season 2]


In [None]:
# What are the recommendations for people who like Sesame Street: Elmo's World: ?
batch_recommend(df1[(df1['Movie_ID'] == 27) & (df1['Rating'] > 3)].head())

Unnamed: 0_level_0,Rated Movie,Recommendation
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1
52277,Sesame Street: Elmo's World: The Street We Liv...,[CSI: Season 1]
52278,Sesame Street: Elmo's World: The Street We Liv...,[Ghosts of Rwanda: Frontline]
52279,Sesame Street: Elmo's World: The Street We Liv...,[Coupling: Season 1]
52281,Sesame Street: Elmo's World: The Street We Liv...,[Stargate SG-1: Season 7]
52282,Sesame Street: Elmo's World: The Street We Liv...,[Lonesome Dove]


In [None]:
# What are the recommendations for people who like Something's Gotta Give
batch_recommend(df1[(df1['Movie_ID'] == 30) & (df1['Rating'] > 4)].head())

Unnamed: 0_level_0,Rated Movie,Recommendation
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1
92828,Something's Gotta Give,[Stargate SG-1: Season 3]
92829,Something's Gotta Give,[The Recruit]
92837,Something's Gotta Give,[Friends: Season 6]
92850,Something's Gotta Give,[Aqua Teen Hunger Force: Vol. 1]
92851,Something's Gotta Give,[The O.C.: Season 1]


In [None]:
# Recommendations for those who liked My Bloody Valentine
batch_recommend(df1[(df1['Movie_ID'] == 24) & (df1['Rating'] > 4)].head())

Unnamed: 0_level_0,Rated Movie,Recommendation
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1
43896,My Bloody Valentine,[Lord of the Rings: The Return of the King: Ex...
43922,My Bloody Valentine,[Lord of the Rings: The Return of the King: Ex...
43949,My Bloody Valentine,[The Rise and Fall of ECW]
43956,My Bloody Valentine,[Dinosaur Planet]
43976,My Bloody Valentine,[Isle of Man TT 2004 Review]


In [None]:
# Recommendations for those who liked Nature Antarctica
batch_recommend(df1[(df1['Movie_ID'] == 14) & (df1['Rating'] > 4)].head())

Unnamed: 0_level_0,Rated Movie,Recommendation
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1
21239,Nature: Antarctica,[Reno 911: Season 2]
21246,Nature: Antarctica,[Isle of Man TT 2004 Review]
21251,Nature: Antarctica,[The Best of Friends: Vol. 4]
21256,Nature: Antarctica,[The West Wing: Season 3]
21262,Nature: Antarctica,[Princess Mononoke]


# User-user memory-based collaborative filtering

In [None]:
import pandas as pd
import numpy as np

In [None]:
movies = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/movie_titles.csv', header=None, 
    encoding = "ISO-8859-1", # As per https://www.kaggle.com/code/laowingkin/netflix-movie-recommendation
    usecols=[0, 1, 2], # Required because some movie titles (column 2) have commas, causing parser error otherwise
    names=['Movie_Id', 'Year', 'Name'])
movies.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [None]:
df1 = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/combined_data_1.txt', 
    header=None, names=['Customer', 'Rating', 'Date'], usecols = [0,1,2]).truncate(after=100000)
df1.head()

Unnamed: 0,Customer,Rating,Date
0,1:,,
1,1488844,3.0,2005-09-06
2,822109,5.0,2005-05-13
3,885013,4.0,2005-10-19
4,30878,4.0,2005-12-26


In [None]:
# Used this as an example: https://www.kaggle.com/code/morrisb/how-to-recommend-anything-deep-recommender
def reshape_df(df, movie_index, shifted_movie_index):
    user_data = []
    for [df_id1, movie_id1], [df_id2, movie_id2] in zip(movie_index, shifted_movie_index):
        # check if last movie in the file
        if df_id1 < df_id2:
            tmp_df = df.loc[df_id1+1: df_id2-1].copy()
        else:
            tmp_df = df.loc[df_id1+1:].copy()
        # create movie_id column
        tmp_df['Movie_ID'] = movie_id1
        user_data.append(tmp_df)
    df2 = pd.concat(user_data)
    del user_data, df, tmp_df
    print('Shape:', df2.shape)
    return df2

tmp_movies = df1[df1['Rating'].isna()]['Customer'].reset_index()
movie_index = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

# Shift the movie_indices by one to get start and endpoints of all movies
from collections import deque 
shifted_movie_indices = deque(movie_index)
shifted_movie_indices.rotate(-1)

df1 = reshape_df(df1, movie_index, shifted_movie_indices)
del tmp_movies
df1.head()

Shape: (99971, 4)


Unnamed: 0,Customer,Rating,Date,Movie_ID
1,1488844,3.0,2005-09-06,1
2,822109,5.0,2005-05-13,1
3,885013,4.0,2005-10-19,1
4,30878,4.0,2005-12-26,1
5,823519,3.0,2004-05-03,1


In [None]:
df1['Movie_ID'].value_counts()

28    39752
8     14910
18    10722
30     7173
17     7108
26     5861
16     2699
3      2012
24     1333
25     1207
5      1140
6      1019
23      615
1       547
12      546
19      539
29      523
15      290
27      273
10      249
21      218
22      203
11      198
2       145
4       142
13      125
14      118
20      116
9        95
7        93
Name: Movie_ID, dtype: int64

## Cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
customers = df1.sample(frac=0.05, random_state=0).pivot_table(index='Customer', columns='Movie_ID', values='Rating')
print(customers.shape)
customers.head()

(4939, 30)


Movie_ID,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,30
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000868,,,,,,,,1.0,,,...,,,,,,,,,,
1001129,,,,,,,,,,,...,,,,3.0,,,,,,
1002494,,,,,,,,,,,...,,,,,,,,3.0,,
1002870,,,,,,,,,,,...,,,,,,3.0,,,,
1003172,,,,,,,,,,,...,,,,,,,,5.0,,


In [None]:
customers = customers.fillna(0)
cosine_sim = cosine_similarity(customers, customers)


In [None]:
print(customers.shape)
print(cosine_sim.shape)

(1000, 8)
(1000, 1000)


In [None]:
cosine_sim[0]

array([1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
       0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 0.,
       1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 0., 1.,
       1., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1.,
       0., 1., 1., 1., 1.

In [None]:
# I realize now that cosine similarity won't work given that most users haven't rated most movies.

## KNeighbors

In [None]:
customers = df1.pivot_table(index='Customer', columns='Movie_ID', values='Rating')
print(customers.shape)
customers.head()

(81473, 30)


Movie_ID,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,30
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100006,,,3.0,,,,,,,,...,,,,,,,,,,
1000079,,,,,,,,,,,...,,,,,,,,2.0,,
1000105,,,,,,,,,,,...,,,,,,,,4.0,,
1000158,,,,,,,,,,,...,,,,,,,,3.0,,
1000192,,,,,,,,,,,...,,,,,,,,2.0,,


In [None]:
movies.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [None]:
from sklearn.neighbors import NearestNeighbors
customers = customers.fillna(0)
neigh = NearestNeighbors(n_neighbors=5)
neigh.fit(customers)

NearestNeighbors()

In [None]:
def uu_kn_recommend(customer, ratings_table=customers, model=neigh, movies=movies):
  """
  Recommend movies for a given user. 
  Parameters:
    - customer (int): Customer ID for a single customer. Must by an array, i.e. put in square brackets.
    - ratings_table: Dense matrix of movie ratings with customers as rows and movies as columns.
    - model: NearestNeighbors instance that has been fit.
    - movies: DataFrame containing movie info.
  """
  
  # Find similar users based on movie ratings
  neighbors = model.kneighbors(ratings_table.loc[customer], return_distance=False).ravel()
  # print('Neighbors:', neighbors)

  # Filter the ratings table to only show rows with most similar customers
  filtered_table = ratings_table.iloc[neighbors]
  # Transpose the table so each row is for a movie. 
  movie_ratings = filtered_table.transpose()
  # Get the sum of all similar users' ratings per movie
  movie_ratings['sum'] = movie_ratings.sum(axis=1)

  # Sort movies by sum
  sorted_movies_list = movie_ratings.sort_values('sum', ascending=False).index.to_list()

  # Remove movies that the user has already rated
  rated_movies = [column for column, value in ratings_table.loc[customer].items() if value[0] > 0]
  new_movies = [movie for movie in sorted_movies_list if movie not in rated_movies]
  recommended_movie = movies[movies['Movie_Id'] == new_movies[0]]['Name'].values[0]
  # print('Rated movies:',rated_movies)
  # print('Sorted movies:', sorted_movies_list)
  # print('Unrated movies:', new_movies)
  # print('Recommended movie: ',recommended_movie)
  # return movies[movies['Movie_Id'] == new_movies[0]]
  return rated_movies, recommended_movie #rated_movies #movie_ratings.sort_values('sum', ascending=False).head()


uu_kn_recommend(customers.iloc[100:101].index)
  

([28], 'Dinosaur Planet')

In [None]:
customers.iloc[100:101].transpose()[customers.iloc[100:101].transpose()[customers.iloc[100:101].index[0]] > 0]

Customer,1002876
Movie_ID,Unnamed: 1_level_1
17,2.0


In [None]:
uu_kn_recommend(customers.iloc[500:501].index)

Neighbors: [17635 17640 70432 17632 70430]
Rated movies: [28]
Sorted movies: [28, 1, 2, 29, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 30]
Unrated movies: [1, 2, 29, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 30]
Recommended movie:  ['Dinosaur Planet']


Customer,1517010,1517290,67992,151692,679860,sum
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
28,4.0,4.0,4.0,4.0,4.0,20.0
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
29,0.0,0.0,0.0,0.0,0.0,0.0
27,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
uu_kn_recommend(customers.iloc[9000:9001].index)

Neighbors: [58838 78773 44726 28622 12864]
Rated movies: [26]
Sorted movies: [26, 1, 2, 29, 28, 27, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 30]
Unrated movies: [1, 2, 29, 28, 27, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 30]
Recommended movie:  ['Dinosaur Planet']


Customer,342533,92341,2311187,1836533,1377486,sum
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
26,3.0,3.0,3.0,3.0,3.0,15.0
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
29,0.0,0.0,0.0,0.0,0.0,0.0
28,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
customers.sum()

Movie_ID
1       2051.0
2        516.0
3       7326.0
4        389.0
5       4468.0
6       3143.0
7        198.0
8      47560.0
9        249.0
10       792.0
11       600.0
12      1866.0
13       569.0
14       357.0
15       953.0
16      8363.0
17     20636.0
18     40576.0
19      1792.0
20       365.0
21       755.0
22       456.0
23      2187.0
24      3991.0
25      4792.0
26     16374.0
27       963.0
28    151982.0
29      1882.0
30     26958.0
dtype: float64

In [None]:
uu_kn_recommend(['1246015'])

Neighbors: [2250 1543 3078 4059 1885]
Rated movies: [26]
Sorted movies: [26, 1, 2, 29, 28, 27, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 30]
Unrated movies: [1, 2, 29, 28, 27, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 30]
Recommended movie:  0    Dinosaur Planet
Name: Name, dtype: object


Customer,206138,1738346,2458394,566142,1887468,sum
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
26,1.0,1.0,1.0,1.0,1.0,5.0
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
29,0.0,0.0,0.0,0.0,0.0,0.0
28,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def run_uu_kn_recommend(customers_df, ratings_table=customers, model=neigh, movies=movies):
  """
  Make recommendations for several customers.

  Parameters:
    - customers_df: Any DataFrame where the index is the customer ID.
    - customer_matrix: Dense matrix of movie ratings with customers as rows and movies as columns.
    - model: NearestNeighbors instance that has been fit.
    - movies: DataFrame containing movie info.
  """
  customers_list = customers_df.index.to_list()
  recommendations = [uu_kn_recommend([customer], ratings_table, model, movies) for customer in customers_list]
  recommendations_df = pd.DataFrame(recommendations, columns=['Previously Rated','Recommendations'])
  return recommendations_df

In [None]:
# run the recommendation on half of the customers to see the distribution of recommendations
recommend = run_uu_kn_recommend(customers.sample(frac=0.5))
recommend['Recommendations'].value_counts()

Dinosaur Planet               2196
Isle of Man TT 2004 Review     265
What the #$*! Do We Know!?       4
Never Die Alone                  2
Lilo and Stitch                  1
Sick                             1
Screamers                        1
Name: Recommendations, dtype: int64

In [None]:
# user-to-user memory-based collaborative filtering doesn't work because most users have only rated 1 movie, 
# thus the same is with the similar neighbors. 
# This causes recommender to simply recommend movies also not seen by similar users, just based on sort order.

# Item-item memory-based collaborative

## KNeighbors

In [None]:
import pandas as pd
import numpy as np
movies = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/movie_titles.csv', header=None, 
    encoding = "ISO-8859-1", # As per https://www.kaggle.com/code/laowingkin/netflix-movie-recommendation
    usecols=[0, 1, 2], # Required because some movie titles (column 2) have commas, causing parser error otherwise
    names=['Movie_Id', 'Year', 'Name'])
movies.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [None]:
df1 = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/combined_data_1.txt', 
    header=None, names=['Customer', 'Rating', 'Date'], usecols = [0,1,2]).truncate(after=100000)

In [None]:
# Used this as an example: https://www.kaggle.com/code/morrisb/how-to-recommend-anything-deep-recommender
def reshape_df(df, movie_index, shifted_movie_index):
    user_data = []
    for [df_id1, movie_id1], [df_id2, movie_id2] in zip(movie_index, shifted_movie_index):
        # check if last movie in the file
        if df_id1 < df_id2:
            tmp_df = df.loc[df_id1+1: df_id2-1].copy()
        else:
            tmp_df = df.loc[df_id1+1:].copy()
        # create movie_id column
        tmp_df['Movie_ID'] = movie_id1
        user_data.append(tmp_df)
    df2 = pd.concat(user_data)
    del user_data, df, tmp_df
    print('Shape:', df2.shape)
    return df2

tmp_movies = df1[df1['Rating'].isna()]['Customer'].reset_index()
movie_index = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

# Shift the movie_indices by one to get start and endpoints of all movies
from collections import deque 
shifted_movie_indices = deque(movie_index)
shifted_movie_indices.rotate(-1)

df1 = reshape_df(df1, movie_index, shifted_movie_indices)
del tmp_movies
df1.head()

Shape: (99971, 4)


Unnamed: 0,Customer,Rating,Date,Movie_ID
1,1488844,3.0,2005-09-06,1
2,822109,5.0,2005-05-13,1
3,885013,4.0,2005-10-19,1
4,30878,4.0,2005-12-26,1
5,823519,3.0,2004-05-03,1


In [None]:
movies_matrix = df1.pivot_table(index='Movie_ID', columns='Customer', values='Rating')
print(movies_matrix.shape)
movies_matrix.head()

(30, 81473)


Customer,100006,1000079,1000105,1000158,1000192,1000232,100029,1000301,1000303,1000328,...,99970,999743,999756,999768,999894,999895,999901,999907,999913,999935
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,3.0,,,,,,5.0,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [None]:
import pandas as pd
import numpy as np
movies = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/movie_titles.csv', header=None, 
    encoding = "ISO-8859-1", # As per https://www.kaggle.com/code/laowingkin/netflix-movie-recommendation
    usecols=[0, 1, 2], # Required because some movie titles (column 2) have commas, causing parser error otherwise
    names=['Movie_Id', 'Year', 'Name'])
movies.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [None]:
df1 = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/combined_data_1.txt', 
    header=None, names=['Customer', 'Rating', 'Date'], usecols = [0,1,2]).truncate(after=100000)

In [None]:
# Used this as an example: https://www.kaggle.com/code/morrisb/how-to-recommend-anything-deep-recommender
def reshape_df(df, movie_index, shifted_movie_index):
    user_data = []
    for [df_id1, movie_id1], [df_id2, movie_id2] in zip(movie_index, shifted_movie_index):
        # check if last movie in the file
        if df_id1 < df_id2:
            tmp_df = df.loc[df_id1+1: df_id2-1].copy()
        else:
            tmp_df = df.loc[df_id1+1:].copy()
        # create movie_id column
        tmp_df['Movie_ID'] = movie_id1
        user_data.append(tmp_df)
    df2 = pd.concat(user_data)
    del user_data, df, tmp_df
    print('Shape:', df2.shape)
    return df2

tmp_movies = df1[df1['Rating'].isna()]['Customer'].reset_index()
movie_index = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

# Shift the movie_indices by one to get start and endpoints of all movies
from collections import deque 
shifted_movie_indices = deque(movie_index)
shifted_movie_indices.rotate(-1)

df1 = reshape_df(df1, movie_index, shifted_movie_indices)
del tmp_movies
df1.head()

Shape: (99971, 4)


Unnamed: 0,Customer,Rating,Date,Movie_ID
1,1488844,3.0,2005-09-06,1
2,822109,5.0,2005-05-13,1
3,885013,4.0,2005-10-19,1
4,30878,4.0,2005-12-26,1
5,823519,3.0,2004-05-03,1


In [None]:
movies_matrix = df1.pivot_table(index='Movie_ID', columns='Customer', values='Rating')
print(movies_matrix.shape)
movies_matrix.head()

(30, 81473)


Customer,100006,1000079,1000105,1000158,1000192,1000232,100029,1000301,1000303,1000328,...,99970,999743,999756,999768,999894,999895,999901,999907,999913,999935
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,3.0,,,,,,5.0,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [None]:
from sklearn.neighbors import NearestNeighbors
movies_matrix = movies_matrix.fillna(0)
neigh2 = NearestNeighbors(n_neighbors=3) # n_neighbors=3 to only recommend 3 movies.
neigh2.fit(movies_matrix)

NearestNeighbors(n_neighbors=3)

In [None]:
def uu_kn_recommend2(ratings, ratings_table, model, movies=movies):
  """
  Recommend movies most similar to a given movie. 
  Parameters:
    - ratings: DataFrame with Customer and Movie_ID
    - ratings_table: Dense matrix of movie ratings with movies as rows and customers as columns.
    - model: NearestNeighbors instance that has been fit.
    - movies: DataFrame containing movie info.
  """
  movie_id = ratings.loc['Movie_ID']
  customer = ratings.loc['Customer']
  rated_movies = ratings_table[ratings_table[customer] !=0].index
  # unrated_movies = np.setdiff1d(ratings_table.index, rated_movies)

  # Find similar movies based on movie ratings
  neighbors = model.kneighbors(ratings_table.loc[[movie_id]], return_distance=False).ravel()
  # print('Neighbors:', neighbors)

  # Filter the ratings table to only show rows with most similar movies; index is the movie_id
  similar_movies = ratings_table.iloc[neighbors].index

  recommended_movies_id = [movie for movie in similar_movies if movie not in rated_movies]
  recommended_movies = [movies[movies['Movie_Id'] == id]['Name'].values[0] for id in recommended_movies_id]
  return recommended_movies # Return an array the length of the recommended movies

  # return rated_movies, recommended_movie
def run_uu_kn_recommend2(ratings_df, ratings_table=movies_matrix, model=neigh2, movies=movies):
  """
  Parameters:
    - ratings_df: DataFrame containing Customers and their movie ratings.
    - ratings_table: Dense matrix of movie ratings with movies as rows and customers as columns.
    - model: NearestNeighbors instance that has been fit.
    - movies: DataFrame containing movie info.
  Returns:
    - DataFrame containing a list of up to 5 new movie recommendations for each customer.
  """
  results = ratings_df.transpose().apply(lambda x: uu_kn_recommend2(x, ratings_table, model)
    ).transpose().values
  list = []
  list.append([result for result in results])
  return pd.DataFrame(list, index=['Recommendations'], columns=ratings_df['Customer']).transpose()


run_uu_kn_recommend2(df1.head(2))


Unnamed: 0_level_0,Recommendations
Customer,Unnamed: 1_level_1
1488844,"[8 Man, Class of Nuke 'Em High 2]"
822109,"[8 Man, Class of Nuke 'Em High 2]"


In [None]:
run_uu_kn_recommend2(df1.tail())

Unnamed: 0_level_0,Recommendations
Customer,Unnamed: 1_level_1
254710,"[8 Man, Class of Nuke 'Em High 2]"
865725,"[8 Man, Class of Nuke 'Em High 2]"
568153,"[8 Man, Class of Nuke 'Em High 2]"
2502775,"[8 Man, Class of Nuke 'Em High 2]"
1732588,"[8 Man, Class of Nuke 'Em High 2]"


In [None]:
def uu_kn_recommend3(ratings, ratings_table, model, movies=movies):
  """
  Recommend movies most similar to a given movie. 
  Parameters:
    - ratings: DataFrame with Customer and Movie_ID
    - ratings_table: Dense matrix of movie ratings with movies as rows and customers as columns.
    - model: NearestNeighbors instance that has been fit.
    - movies: DataFrame containing movie info.
  """
  movie_id = ratings.loc['Movie_ID']
  customer = ratings.loc['Customer']
  rated_movies = ratings_table[ratings_table[customer] !=0].index

  # Find similar movies based on movie ratings
  neighbors = model.kneighbors(ratings_table.loc[[movie_id]], return_distance=False).ravel()

  # Filter the ratings table to only show rows with most similar movies; index is the movie_id
  similar_movies = ratings_table.iloc[neighbors].index
  try:
    recommended_movies_id = [movie for movie in similar_movies if movie not in rated_movies][0]
    recommended_movies = movies[movies['Movie_Id'] == recommended_movies_id]['Name'].values
    return recommended_movies 
  except:
    return None

def run_uu_kn_recommend3(ratings_df, ratings_table=movies_matrix, model=neigh2, movies=movies):
  """
  Recommend one movie.
  Parameters:
    - ratings_df: DataFrame containing Customers and their movie ratings.
    - ratings_table: Dense matrix of movie ratings with movies as rows and customers as columns.
    - model: NearestNeighbors instance that has been fit.
    - movies: DataFrame containing movie info.
  Returns:
    - DataFrame containing a single new movie recommendation for each customer.
  """
  results = ratings_df.transpose().apply(lambda x: uu_kn_recommend3(x, ratings_table, model)
    ).transpose()
  results.columns=['Recommendation']
  return results


run_uu_kn_recommend3(df1.head(4))


Unnamed: 0,Recommendation
1,8 Man
2,8 Man
3,8 Man
4,8 Man


In [None]:
run_uu_kn_recommend3(df1.tail(4))

Unnamed: 0,Recommendation
99997,8 Man
99998,8 Man
99999,8 Man
100000,8 Man


In [None]:
# run the recommendation on many customers to see the distribution of recommendations
recommend2 = run_uu_kn_recommend3(df1.sample(frac=0.001, random_state=0))
recommend2['Recommendation'].value_counts()

8 Man                                                                   65
Clifford: Clifford Saves the Day! / Clifford's Fluffiest Friend Cleo    33
Class of Nuke 'Em High 2                                                 1
Name: Recommendation, dtype: int64

### Filter out movies with ratings below threshold

In [None]:
import pandas as pd
import numpy as np
movies = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/movie_titles.csv', header=None, 
    encoding = "ISO-8859-1", # As per https://www.kaggle.com/code/laowingkin/netflix-movie-recommendation
    usecols=[0, 1, 2], # Required because some movie titles (column 2) have commas, causing parser error otherwise
    names=['Movie_Id', 'Year', 'Name'])
movies.head()

In [None]:
df1 = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/combined_data_1.txt', 
    header=None, names=['Customer', 'Rating', 'Date'], usecols = [0,1,2]).truncate(after=200000)
print(df1.shape)

(200001, 3)


In [None]:
# Used this as an example: https://www.kaggle.com/code/morrisb/how-to-recommend-anything-deep-recommender
def reshape_df(df, movie_index, shifted_movie_index):
    user_data = []
    for [df_id1, movie_id1], [df_id2, movie_id2] in zip(movie_index, shifted_movie_index):
        # check if last movie in the file
        if df_id1 < df_id2:
            tmp_df = df.loc[df_id1+1: df_id2-1].copy()
        else:
            tmp_df = df.loc[df_id1+1:].copy()
        # create movie_id column
        tmp_df['Movie_ID'] = movie_id1
        user_data.append(tmp_df)
    df2 = pd.concat(user_data)
    del user_data, df, tmp_df
    print('Shape:', df2.shape)
    return df2

tmp_movies = df1[df1['Rating'].isna()]['Customer'].reset_index()
movie_index = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

# Shift the movie_indices by one to get start and endpoints of all movies
from collections import deque 
shifted_movie_indices = deque(movie_index)
shifted_movie_indices.rotate(-1)

df1 = reshape_df(df1, movie_index, shifted_movie_indices)
del tmp_movies
df1.head()

Shape: (199971, 4)


Unnamed: 0,Customer,Rating,Date,Movie_ID
1,1488844,3.0,2005-09-06,1
2,822109,5.0,2005-05-13,1
3,885013,4.0,2005-10-19,1
4,30878,4.0,2005-12-26,1
5,823519,3.0,2004-05-03,1


In [None]:
movies_matrix2 = df1.pivot_table(index='Movie_ID', columns='Customer', values='Rating')
print(movies_matrix2.shape)
movies_matrix2.head()

(30, 150677)


Customer,1000033,1000038,100006,1000062,1000079,1000094,1000104,1000105,1000153,1000158,...,999836,999892,999894,999895,999901,999907,999913,99993,999935,999988
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,3.0,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Filter out movies with average rating < 3 and fewer than a certain number of ratings
movies_matrix2 = movies_matrix2[(movies_matrix2.mean(axis=1) > 3.5) & (movies_matrix2.count(axis=1) > 500)]

In [None]:
movies_matrix2.shape

(9, 150677)

In [None]:
from sklearn.neighbors import NearestNeighbors
movies_matrix2 = movies_matrix2.fillna(0)
neigh3 = NearestNeighbors(n_neighbors=3) # n_neighbors=3 to only recommend 3 movies.
neigh3.fit(movies_matrix2)

NearestNeighbors(n_neighbors=3)

In [None]:
def uu_kn_recommend4(ratings, ratings_table, model, movies=movies):
  """
  Recommend movies most similar to a given movie. 
  Parameters:
    - ratings: DataFrame with Customer and Movie_ID
    - ratings_table: Dense matrix of movie ratings with movies as rows and customers as columns.
    - model: NearestNeighbors instance that has been fit.
    - movies: DataFrame containing movie info.
  """
  movie_id = ratings.loc['Movie_ID']
  customer = ratings.loc['Customer']
  rated_movies = ratings_table[ratings_table[customer] !=0].index

  # Find similar movies based on movie ratings
  try:
    neighbors = model.kneighbors(ratings_table.loc[[movie_id]], return_distance=False).ravel()
  except:
    return None

  # Filter the ratings table to only show rows with most similar movies; index is the movie_id
  similar_movies = ratings_table.iloc[neighbors].index
  try:
    recommended_movies_id = [movie for movie in similar_movies if movie not in rated_movies][0]
    recommended_movies = movies[movies['Movie_Id'] == recommended_movies_id]['Name'].values
    return recommended_movies 
  except:
    return None

def run_uu_kn_recommend4(ratings_df, ratings_table=movies_matrix, model=neigh2, movies=movies):
  """
  Recommend one movie.
  Parameters:
    - ratings_df: DataFrame containing Customers and their movie ratings.
    - ratings_table: Dense matrix of movie ratings with movies as rows and customers as columns.
    - model: NearestNeighbors instance that has been fit.
    - movies: DataFrame containing movie info.
  Returns:
    - DataFrame containing a single new movie recommendation for each customer.
  """
  results = ratings_df.transpose().apply(lambda x: uu_kn_recommend4(x, ratings_table, model)
    ).transpose()
  results.columns=['Recommendation']
  return results

# run the recommendation on many customers to see the distribution of recommendations
recommend3 = run_uu_kn_recommend4(df1.sample(frac=0.0005, random_state=0), ratings_table=movies_matrix2, model=neigh3)
recommend3['Recommendation'].value_counts()

Boycott                                                                 55
Clifford: Clifford Saves the Day! / Clifford's Fluffiest Friend Cleo    28
Name: Recommendation, dtype: int64

In [None]:
recommend3

Unnamed: 0,Recommendation
86167,Clifford: Clifford Saves the Day! / Clifford's...
180210,Boycott
54827,Clifford: Clifford Saves the Day! / Clifford's...
79030,Clifford: Clifford Saves the Day! / Clifford's...
67526,Clifford: Clifford Saves the Day! / Clifford's...
...,...
10085,
83775,Clifford: Clifford Saves the Day! / Clifford's...
140683,Boycott
20806,


## Cosine similarity

In [None]:
import pandas as pd
import numpy as np
movies = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/movie_titles.csv', header=None, 
    encoding = "ISO-8859-1", # As per https://www.kaggle.com/code/laowingkin/netflix-movie-recommendation
    usecols=[0, 1, 2], # Required because some movie titles (column 2) have commas, causing parser error otherwise
    names=['Movie_Id', 'Year', 'Name'])
movies.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [None]:
df1 = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/combined_data_1.txt', 
    header=None, names=['Customer', 'Rating', 'Date'], usecols = [0,1,2]).truncate(after=300000)

In [None]:
# Used this as an example: https://www.kaggle.com/code/morrisb/how-to-recommend-anything-deep-recommender
def reshape_df2(df, movie_index, shifted_movie_index):
    print('Original shape:', df.shape)
    user_data = []
    for [df_id1, movie_id1], [df_id2, movie_id2] in zip(movie_index, shifted_movie_index):
        # check if last movie in the file
        if df_id1 < df_id2:
            tmp_df = df.loc[df_id1+1: df_id2-1].copy()
        else:
            tmp_df = df.loc[df_id1+1:].copy()
        # create movie_id column
        tmp_df['Movie_ID'] = movie_id1
        user_data.append(tmp_df)
    df2 = pd.concat(user_data)
    del user_data, df, tmp_df
    print('Shape:', df2.shape)
    return df2

tmp_movies = df1[df1['Rating'].isna()]['Customer'].reset_index()
movie_index = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

# Shift the movie_indices by one to get start and endpoints of all movies
from collections import deque 
shifted_movie_indices = deque(movie_index)
shifted_movie_indices.rotate(-1)

df1 = reshape_df2(df1, movie_index, shifted_movie_indices)
del tmp_movies
df1.head()

Original shape: (300001, 3)
Shape: (299924, 4)


Unnamed: 0,Customer,Rating,Date,Movie_ID
1,1488844,3.0,2005-09-06,1
2,822109,5.0,2005-05-13,1
3,885013,4.0,2005-10-19,1
4,30878,4.0,2005-12-26,1
5,823519,3.0,2004-05-03,1


In [None]:
movies_matrix3 = df1.pivot_table(index='Movie_ID', columns='Customer', values='Rating')
print(movies_matrix3.shape)
# Filter out movies with average rating < 3 and fewer than a certain number of ratings
movies_matrix4 = movies_matrix3[(movies_matrix3.mean(axis=1) > 3.5) & (movies_matrix3.count(axis=1) > 500)]
print(movies_matrix4.shape)
movies_matrix4.head()

(77, 180710)
(22, 180710)


Customer,1000033,1000035,1000038,100006,1000062,1000079,1000094,1000104,1000105,1000122,...,999892,999894,999895,999901,999907,999913,99993,999935,999944,999988
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
3,,,,3.0,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
18,,,,,,,,,,,...,,,,,,2.0,,,,
23,,,,,,,,,,,...,,,,,,,,,,


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
movies_matrix4 = movies_matrix4.fillna(0)
cosine_sim2 = cosine_similarity(movies_matrix4, movies_matrix4)
cosine_sim2.shape

(22, 22)

In [None]:
cosine_sim2

array([[1.        , 0.00920994, 0.02416584, 0.02889091, 0.04014232,
        0.01424255, 0.04391108, 0.01605614, 0.02596063, 0.02134985,
        0.01506128, 0.01898541, 0.0162999 , 0.04039192, 0.01381236,
        0.00603037, 0.02702619, 0.04767221, 0.02008003, 0.0115393 ,
        0.04013059, 0.02964798],
       [0.00920994, 1.        , 0.00589936, 0.06602712, 0.00708683,
        0.03067514, 0.02805881, 0.01910036, 0.05626365, 0.00807406,
        0.0102784 , 0.04648459, 0.0151716 , 0.01137473, 0.05417322,
        0.06988679, 0.07345499, 0.01718364, 0.00752899, 0.05699341,
        0.01221804, 0.01875201],
       [0.02416584, 0.00589936, 1.        , 0.01240213, 0.01665952,
        0.00599493, 0.03093303, 0.00708779, 0.02651988, 0.01263261,
        0.05113718, 0.00768291, 0.01142958, 0.0339696 , 0.0170954 ,
        0.00620731, 0.00717472, 0.02688431, 0.02050645, 0.00484752,
        0.01476316, 0.0209552 ],
       [0.02889091, 0.06602712, 0.01240213, 1.        , 0.01996644,
        0.0446086

In [None]:
def recommend_similar(ratings_input, similarity_matrix, movies, all_ratings, movie_indices):
  """

  """
  movie_id = ratings_input.loc['Movie_ID']
  customer = ratings_input.loc['Customer']
  matrix_index = movie_indices[movie_indices == movie_id]
  sorted_indices = similarity_matrix[matrix_index].argsort()
  closest_indices = sorted_indices[-3:] # 3 most similar
  rated_movies = all_ratings[all_ratings['Customer'] == customer]['Movie_ID'].values
  recommend_movie_id = [movie_indices[closest_index] for closest_index in closest_indices if movie_indices[closest_index] not in rated_movies]
  recommend_title = str(recommend_movie_id[0])+': '+movies[movies['Movie_Id'] == recommend_movie_id[0]]['Name'].values
  return recommend_title


def batch_recommend_similar(ratings_input, similarity_matrix=cosine_sim2,
        movies=movies, all_ratings=df1, movie_indices=movies_matrix4.index.to_list()):
  """
  Recommend one movie.
  Parameters:
    - ratings_df: DataFrame containing Customers and their movie ratings.
    - similarity_matrix: Cosine similarity matrix
    - movies: DataFrame containing movie info.
  Returns:
    - DataFrame containing a single new movie recommendation for each customer.
  """
  ratings_input['Recommendation'] = ratings_input.transpose().apply(lambda x: recommend_similar(x, 
    similarity_matrix, movies, all_ratings, movie_indices)).transpose()
  # results.columns=['Recommendation']
  return ratings_input

batch_recommend_similar(df1.sample(5, random_state=0))

Unnamed: 0,Customer,Rating,Date,Movie_ID,Recommendation
70407,653936,4.0,2005-08-16,28,56: Carandiru
233835,1536419,2.0,2004-04-05,45,56: Carandiru
82121,1856701,3.0,2004-11-13,28,56: Carandiru
23002,1739440,2.0,2001-05-25,16,56: Carandiru
56225,2152506,4.0,2003-03-30,28,56: Carandiru


In [None]:
recommend4 = batch_recommend_similar(df1.sample(100, random_state=0))
recommend4['Recommendation'].value_counts()

56: Carandiru      99
57: Richard III     1
Name: Recommendation, dtype: int64

In [None]:
recommend4['Movie_ID'].value_counts()

30    35
28    15
46     6
58     5
44     4
45     4
77     4
8      4
33     3
17     3
16     3
70     2
76     2
18     2
9      1
48     1
52     1
57     1
65     1
56     1
26     1
15     1
Name: Movie_ID, dtype: int64

In [None]:
movies.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [None]:
df1['Movie_ID'].value_counts()

30    118413
28     39752
58     17405
8      14910
18     10722
       ...  
9         95
64        95
7         93
41        93
51        90
Name: Movie_ID, Length: 77, dtype: int64

# apyori

In [None]:
import pandas as pd
import numpy as np
movies = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/movie_titles.csv', header=None, 
    encoding = "ISO-8859-1", # As per https://www.kaggle.com/code/laowingkin/netflix-movie-recommendation
    usecols=[0, 1, 2], # Required because some movie titles (column 2) have commas, causing parser error otherwise
    names=['Movie_Id', 'Year', 'Name'])
movies.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [None]:
df1 = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/combined_data_1.txt', 
    header=None, names=['Customer', 'Rating', 'Date'], usecols = [0,1,2]).truncate(after=200000)

In [None]:
# Used this as an example: https://www.kaggle.com/code/morrisb/how-to-recommend-anything-deep-recommender
def reshape_df2(df, movie_index, shifted_movie_index):
    print('Original shape:', df.shape)
    user_data = []
    for [df_id1, movie_id1], [df_id2, movie_id2] in zip(movie_index, shifted_movie_index):
        # check if last movie in the file
        if df_id1 < df_id2:
            tmp_df = df.loc[df_id1+1: df_id2-1].copy()
        else:
            tmp_df = df.loc[df_id1+1:].copy()
        # create movie_id column
        tmp_df['Movie_ID'] = movie_id1
        user_data.append(tmp_df)
    df2 = pd.concat(user_data)
    del user_data, df, tmp_df
    print('Shape:', df2.shape)
    return df2

tmp_movies = df1[df1['Rating'].isna()]['Customer'].reset_index()
movie_index = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

# Shift the movie_indices by one to get start and endpoints of all movies
from collections import deque 
shifted_movie_indices = deque(movie_index)
shifted_movie_indices.rotate(-1)

df1 = reshape_df2(df1, movie_index, shifted_movie_indices)
del tmp_movies
df1.head()

Original shape: (200001, 3)
Shape: (199971, 4)


Unnamed: 0,Customer,Rating,Date,Movie_ID
1,1488844,3.0,2005-09-06,1
2,822109,5.0,2005-05-13,1
3,885013,4.0,2005-10-19,1
4,30878,4.0,2005-12-26,1
5,823519,3.0,2004-05-03,1


In [None]:
df1['Movie_ID'].value_counts()

30    107173
28     39752
8      14910
18     10722
17      7108
26      5861
16      2699
3       2012
24      1333
25      1207
5       1140
6       1019
23       615
1        547
12       546
19       539
29       523
15       290
27       273
10       249
21       218
22       203
11       198
2        145
4        142
13       125
14       118
20       116
9         95
7         93
Name: Movie_ID, dtype: int64

In [None]:
rated_movies = [df1[df1['Customer'] == id]['Movie_ID'].to_list() for id in df1['Customer'].unique()]
print(len(rated_movies))

150677


In [None]:
rated_movies[0:5]

[[1, 8, 17, 30], [1], [1, 5], [1, 5, 18, 28, 30], [1, 8, 17, 28, 30]]

In [None]:
pd.Series(rated_movies[0:5])

0        [1, 8, 17, 30]
1                   [1]
2                [1, 5]
3    [1, 5, 18, 28, 30]
4    [1, 8, 17, 28, 30]
dtype: object

In [None]:
# def get_movies(df):
#   """
#   To be used with .apply with transposed dataframe. Returns the Movie_ID if the rating is greater than zero.
#   Returns a Pandas series.
#   """
#   return df[df>0].index.tolist()

# def batch_get_movies(df):

#   # Call the function using a transposed version of the dataframe
#   data = df.transpose().apply(get_movies)
#   print(type(data))
#   print(len(data))
#   return data


In [None]:
! pip install apyori

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting apyori
  Downloading apyori-1.1.2.tar.gz (8.6 kB)
Building wheels for collected packages: apyori
  Building wheel for apyori (setup.py) ... [?25l[?25hdone
  Created wheel for apyori: filename=apyori-1.1.2-py3-none-any.whl size=5973 sha256=0d855f8edeeb42289dc37ce9aa6508e4ea5009c5fb80929c071ad19f264c2e20
  Stored in directory: /root/.cache/pip/wheels/1b/02/6c/a45230be8603bd95c0a51cd2b289aefdd860c1a100eab73661
Successfully built apyori
Installing collected packages: apyori
Successfully installed apyori-1.1.2


In [None]:
# * instantiate apriori and set the input params based on following constraints:
#     * we want only movies that are rated at least 500 times
#     * the minimum confidence for the rules is 20%
#     * the minumum lift is 3
from apyori import apriori

min_support = 100/len(rated_movies)
min_confidence = 0.2
min_lift = 1.3

association_rules = apriori(rated_movies, min_length=2, min_support=min_support,
    min_confidence=min_confidence, min_lift=min_lift
)
association_results = list(association_rules)
print(f'Number of association rules: {len(association_results)}')

Number of association rules: 27


In [None]:
def make_apriori_df(association_results):
  """
  Make apriori association_results list into a dataframe
  """
  results = []
  for item in association_results:
      # first index of the inner list contains base item and add item.
      # A pair may contain more than 2 items because each "item" in the pair can contain more than 1 movie.
      pair = item[0]
      items = [item for item in pair]
      title1 = str(items[0])
      title2 = str(items[1])
      # support = (item[1]).str.replace('\w+=(\d*.\d*)', '\\1', regex=True)
      support = item[1]
      #third index of the list located at 0th of the third index of the inner list
      confidence = item[2][0][2]
      lift = item[2][0][3]
      rows = (title1, title2, support, confidence, lift)
      results.append(rows)
  labels = ['Title 1','Title 2','Support','Confidence','Lift']
  return pd.DataFrame.from_records(results, columns = labels)

apriori_df = make_apriori_df(association_results)
apriori_df.head()


Unnamed: 0,Title 1,Title 2,Support,Confidence,Lift
0,1,28,0.00148,0.407678,1.545274
1,18,12,0.000843,0.232601,3.268754
2,12,28,0.001394,0.384615,1.457856
3,16,28,0.00669,0.373472,1.415617
4,28,23,0.002363,0.578862,2.194133


In [None]:
def recommend_apriori(ratings_input, apriori_df, movies):
  
  movie_id = ratings_input.loc['Movie_ID']
  rated_movie_title = movies.loc[movies['Movie_Id'] == movie_id, 'Name'].values[0]
  customer = ratings_input.loc['Customer']
  try:
    recommendation_id = int(apriori_df.loc[apriori_df['Title 1'] == str(movie_id), 'Title 2'].values[0])
    recommendation = movies.loc[movies['Movie_Id'] == recommendation_id, 'Name'].values[0]
    return pd.Series([rated_movie_title, recommendation], index=['Rated Movie', 'Recommended Movie'])
  except:
    return pd.Series([rated_movie_title, None], index=['Rated Movie', 'Recommended Movie'])

def batch_recommend_aprior(ratings_input, apriori_df=apriori_df.sort_values('Lift', ascending=False), 
                           movies=movies):
  """
  Default is to sort apriori_df by 'Lift'
  """
  results = ratings_input.transpose().apply(lambda x: recommend_apriori(x, 
    apriori_df, movies)).transpose()
  recommendations = pd.concat([ratings_input['Customer'], results], axis=1)
  print(f"**Input movies count**: {len(recommendations['Rated Movie'].value_counts())}\n{recommendations['Rated Movie'].value_counts()}")
  print(f"\n**Recommended movies count**: {len(recommendations['Recommended Movie'].value_counts())}\n{recommendations['Recommended Movie'].value_counts()}")
  return recommendations

batch_recommend_aprior(df1.head())

**Input movies count**: 1
Dinosaur Planet    5
Name: Rated Movie, dtype: int64

**Recommended movies count**: 1
Lilo and Stitch    5
Name: Recommended Movie, dtype: int64


Unnamed: 0,Customer,Rated Movie,Recommended Movie
1,1488844,Dinosaur Planet,Lilo and Stitch
2,822109,Dinosaur Planet,Lilo and Stitch
3,885013,Dinosaur Planet,Lilo and Stitch
4,30878,Dinosaur Planet,Lilo and Stitch
5,823519,Dinosaur Planet,Lilo and Stitch


In [None]:
df1.head()

Unnamed: 0,Customer,Rating,Date,Movie_ID
1,1488844,3.0,2005-09-06,1
2,822109,5.0,2005-05-13,1
3,885013,4.0,2005-10-19,1
4,30878,4.0,2005-12-26,1
5,823519,3.0,2004-05-03,1


In [None]:
movies.head(1)

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet


In [None]:
apriori_recommendations = batch_recommend_aprior(df1.sample(100))
apriori_recommendations

**Input movies count**: 14
Something's Gotta Give                           51
Lilo and Stitch                                  18
What the #$*! Do We Know!?                        8
Immortal Beloved                                  6
Never Die Alone                                   4
7 Seconds                                         4
My Favorite Brunette                              2
My Bloody Valentine                               1
Character                                         1
Sick                                              1
Screamers                                         1
Inspector Morse 31: Death Is Now My Neighbour     1
Fighter                                           1
Boycott                                           1
Name: Rated Movie, dtype: int64

**Recommended movies count**: 6
Something's Gotta Give    18
Screamers                  8
Character                  6
Never Die Alone            4
Immortal Beloved           3
Lilo and Stitch            2
Name: 

Unnamed: 0,Customer,Rated Movie,Recommended Movie
97231,2300312,Something's Gotta Give,
185993,14080,Something's Gotta Give,
48914,2202167,Never Die Alone,
14590,2143457,What the #$*! Do We Know!?,Screamers
29888,397815,7 Seconds,Never Die Alone
...,...,...,...
37395,1203567,Immortal Beloved,Character
20760,1935937,My Favorite Brunette,Lilo and Stitch
11988,637929,What the #$*! Do We Know!?,Screamers
20801,347605,My Favorite Brunette,Lilo and Stitch


In [None]:
# Sort apriori recommendations by Confidence
apriori_recommendations2 = batch_recommend_aprior(df1.sample(100), 
        apriori_df=apriori_df.sort_values('Confidence', ascending=False))
apriori_recommendations2

**Input movies count**: 9
Something's Gotta Give        59
Lilo and Stitch               13
What the #$*! Do We Know!?    12
Immortal Beloved               6
Never Die Alone                3
7 Seconds                      3
Screamers                      2
The Rise and Fall of ECW       1
By Dawn's Early Light          1
Name: Rated Movie, dtype: int64

**Recommended movies count**: 4
Clifford: Clifford Saves the Day! / Clifford's Fluffiest Friend Cleo    13
Screamers                                                               12
Never Die Alone                                                          9
Immortal Beloved                                                         2
Name: Recommended Movie, dtype: int64


Unnamed: 0,Customer,Rated Movie,Recommended Movie
145824,2195651,Something's Gotta Give,
182807,809966,Something's Gotta Give,
196382,547041,Something's Gotta Give,
141159,2483114,Something's Gotta Give,
126969,713178,Something's Gotta Give,
...,...,...,...
191078,1499994,Something's Gotta Give,
14890,747719,What the #$*! Do We Know!?,Screamers
97848,553328,Something's Gotta Give,
175049,1317838,Something's Gotta Give,


# Word2Vec

In [None]:
import pandas as pd
import numpy as np
movies = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/movie_titles.csv', header=None, 
    encoding = "ISO-8859-1", # As per https://www.kaggle.com/code/laowingkin/netflix-movie-recommendation
    usecols=[0, 1, 2], # Required because some movie titles (column 2) have commas, causing parser error otherwise
    names=['Movie_Id', 'Year', 'Name'])
movies.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [None]:
df1 = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/combined_data_1.txt', 
    header=None, names=['Customer', 'Rating', 'Date'], usecols = [0,1,2]).truncate(after=100000)

In [None]:
# Used this as an example: https://www.kaggle.com/code/morrisb/how-to-recommend-anything-deep-recommender
def reshape_df2(df, movie_index, shifted_movie_index):
    print('Original shape:', df.shape)
    user_data = []
    for [df_id1, movie_id1], [df_id2, movie_id2] in zip(movie_index, shifted_movie_index):
        # check if last movie in the file
        if df_id1 < df_id2:
            tmp_df = df.loc[df_id1+1: df_id2-1].copy()
        else:
            tmp_df = df.loc[df_id1+1:].copy()
        # create movie_id column
        tmp_df['Movie_ID'] = movie_id1
        user_data.append(tmp_df)
    df2 = pd.concat(user_data)
    del user_data, df, tmp_df
    print('Shape:', df2.shape)
    return df2

tmp_movies = df1[df1['Rating'].isna()]['Customer'].reset_index()
movie_index = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

# Shift the movie_indices by one to get start and endpoints of all movies
from collections import deque 
shifted_movie_indices = deque(movie_index)
shifted_movie_indices.rotate(-1)

df1 = reshape_df2(df1, movie_index, shifted_movie_indices)
del tmp_movies
df1.head()

Original shape: (100001, 3)
Shape: (99971, 4)


Unnamed: 0,Customer,Rating,Date,Movie_ID
1,1488844,3.0,2005-09-06,1
2,822109,5.0,2005-05-13,1
3,885013,4.0,2005-10-19,1
4,30878,4.0,2005-12-26,1
5,823519,3.0,2004-05-03,1


In [None]:
rated_movies2 = [df1[df1['Customer'] == id]['Movie_ID'].to_list() for id in df1['Customer'].unique()]

In [None]:
pd.Series(rated_movies2)

0            [1, 8, 17, 30]
1                       [1]
2                    [1, 5]
3        [1, 5, 18, 28, 30]
4        [1, 8, 17, 28, 30]
                ...        
81468                  [30]
81469                  [30]
81470                  [30]
81471                  [30]
81472                  [30]
Length: 81473, dtype: object

In [None]:
rated_movies2[:5]

[[1, 8, 17, 30], [1], [1, 5], [1, 5, 18, 28, 30], [1, 8, 17, 28, 30]]

In [None]:
# Data needs to be converted from a list to a format that word2vec accepts
pd.Series([[str(movie) for movie in customer] for customer in rated_movies2])

0            [1, 8, 17, 30]
1                       [1]
2                    [1, 5]
3        [1, 5, 18, 28, 30]
4        [1, 8, 17, 28, 30]
                ...        
81468                  [30]
81469                  [30]
81470                  [30]
81471                  [30]
81472                  [30]
Length: 81473, dtype: object

In [None]:
from gensim.models import Word2Vec
word2vec = Word2Vec(pd.Series([[str(movie) for movie in customer] for customer in rated_movies2]),
  min_count=10)
word2vec.wv.vocab

{'1': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa354f0>,
 '8': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa35b50>,
 '17': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa359a0>,
 '30': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa35f70>,
 '5': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa35e20>,
 '18': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa35f10>,
 '28': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa34460>,
 '20': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa34130>,
 '26': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa342e0>,
 '11': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa34610>,
 '16': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa34160>,
 '24': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa34100>,
 '23': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa34910>,
 '3': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa34be0>,
 '6': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa34490>,
 '21': <gensim.models.keyedvectors.Vocab at 0x7f8ceaa34430>,
 '25': <gensim.models.keyedve

In [None]:
def recommend_w2v(ratings_input, word2vec, all_ratings, movies):
  """
  Recommend a movie based on the customer and the movie that the customer rated. 
  Recommendation excludes movies already rated by the customer.
  Parameters:
    - ratings_input: Input data with customer, movie_id.
    - word2vec: word2vec instance that has been fit.
    - all_rating: Dataframe with all ratings in data set.
    - movies: DataFrame with movie titles.
  """  
  movie_id = ratings_input.loc['Movie_ID']
  rated_movie_title = movies.loc[movies['Movie_Id'] == movie_id, 'Name'].values[0]
  customer = ratings_input.loc['Customer']
  rated_movies = all_ratings[all_ratings['Customer'] == customer]['Movie_ID']
  # print(rated_movies)
  try:
    recommendation_id_list = [item[0] for item in word2vec.wv.most_similar(str(movie_id), topn=5)]
    unseen_recommendations = [movie for movie in recommendation_id_list if movie not in rated_movies]
    recommendation = movies.loc[movies['Movie_Id'] == int(unseen_recommendations[0]), 'Name'].values[0]
    # print(recommendation_id_list)
    # print(unseen_recommendations)
    return pd.Series([rated_movie_title, recommendation], index=['Rated Movie', 'Recommended Movie'])
  except:
    return pd.Series([rated_movie_title, None], index=['Rated Movie', 'Recommended Movie'])

def batch_recommend_w2v(ratings_input, word2vec=word2vec, all_ratings=df1, 
                           movies=movies):
  """
  Parameters:
    - ratings_input: Dataframe with ratings.
    - word2vec: word2vec instance that has been fit.
    - movies: DataFrame with movie titles.
  """
  results = ratings_input.transpose().apply(lambda x: recommend_w2v(x, 
    word2vec, all_ratings, movies)).transpose()
  recommendations = pd.concat([ratings_input['Customer'], results], axis=1)
  print(f"**Input movies count**: {len(recommendations['Rated Movie'].value_counts())}\n{recommendations['Rated Movie'].value_counts()}")
  print(f"\n**Recommended movies count**: {len(recommendations['Recommended Movie'].value_counts())}\n{recommendations['Recommended Movie'].value_counts()}")
  print(f"\n**No recommendations count**: {recommendations['Recommended Movie'].isna().sum()}")
  return recommendations

batch_recommend_w2v(df1.head())

**Input movies count**: 1
Dinosaur Planet    5
Name: Rated Movie, dtype: int64

**Recommended movies count**: 1
Neil Diamond: Greatest Hits Live    5
Name: Recommended Movie, dtype: int64

**No recommendations count**: 0


Unnamed: 0,Customer,Rated Movie,Recommended Movie
1,1488844,Dinosaur Planet,Neil Diamond: Greatest Hits Live
2,822109,Dinosaur Planet,Neil Diamond: Greatest Hits Live
3,885013,Dinosaur Planet,Neil Diamond: Greatest Hits Live
4,30878,Dinosaur Planet,Neil Diamond: Greatest Hits Live
5,823519,Dinosaur Planet,Neil Diamond: Greatest Hits Live


In [None]:
batch_recommend_w2v(df1.sample(100, random_state=0))

**Input movies count**: 17
Lilo and Stitch                                       33
What the #$*! Do We Know!?                            13
Immortal Beloved                                      11
Something's Gotta Give                                 9
Never Die Alone                                        8
7 Seconds                                              8
The Rise and Fall of ECW                               3
Character                                              2
Sick                                                   2
Inspector Morse 31: Death Is Now My Neighbour          2
By Dawn's Early Light                                  2
Screamers                                              2
Neil Diamond: Greatest Hits Live                       1
Fighter                                                1
My Bloody Valentine                                    1
Dinosaur Planet                                        1
Sesame Street: Elmo's World: The Street We Live On     1
Name

Unnamed: 0,Customer,Rated Movie,Recommended Movie
47918,2534534,Never Die Alone,Neil Diamond: Greatest Hits Live
90184,703724,Lilo and Stitch,Boycott
6130,1530767,What the #$*! Do We Know!?,8 Man
68061,98602,Lilo and Stitch,Boycott
58818,345568,Lilo and Stitch,Boycott
...,...,...,...
97302,800136,Something's Gotta Give,Dinosaur Planet
35824,795211,Immortal Beloved,Sesame Street: Elmo's World: The Street We Liv...
93909,2540855,Something's Gotta Give,Dinosaur Planet
14998,1343090,What the #$*! Do We Know!?,8 Man


# Randomized SVD

In [None]:
import pandas as pd
import numpy as np
movies = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/movie_titles.csv', header=None, 
    encoding = "ISO-8859-1", # As per https://www.kaggle.com/code/laowingkin/netflix-movie-recommendation
    usecols=[0, 1, 2], # Required because some movie titles (column 2) have commas, causing parser error otherwise
    names=['Movie_Id', 'Year', 'Name'])

In [None]:
df1 = pd.read_csv('/content/drive/MyDrive/data exercises/W10/netflix-challenge/combined_data_1.txt', 
    header=None, names=['Customer', 'Rating', 'Date'], usecols = [0,1,2]).truncate(after=100000)

# Used this as an example: https://www.kaggle.com/code/morrisb/how-to-recommend-anything-deep-recommender
def reshape_df(df, movie_index, shifted_movie_index):
    user_data = []
    for [df_id1, movie_id1], [df_id2, movie_id2] in zip(movie_index, shifted_movie_index):
        # check if last movie in the file
        if df_id1 < df_id2:
            tmp_df = df.loc[df_id1+1: df_id2-1].copy()
        else:
            tmp_df = df.loc[df_id1+1:].copy()
        # create movie_id column
        tmp_df['Movie_ID'] = movie_id1
        user_data.append(tmp_df)
    df2 = pd.concat(user_data)
    del user_data, df, tmp_df
    print('Shape:', df2.shape)
    return df2

tmp_movies = df1[df1['Rating'].isna()]['Customer'].reset_index()
movie_index = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

# Shift the movie_indices by one to get start and endpoints of all movies
from collections import deque 
shifted_movie_indices = deque(movie_index)
shifted_movie_indices.rotate(-1)

df1 = reshape_df(df1, movie_index, shifted_movie_indices)
del tmp_movies
df1.head()

Shape: (99971, 4)


Unnamed: 0,Customer,Rating,Date,Movie_ID
1,1488844,3.0,2005-09-06,1
2,822109,5.0,2005-05-13,1
3,885013,4.0,2005-10-19,1
4,30878,4.0,2005-12-26,1
5,823519,3.0,2004-05-03,1


In [None]:
utility_matrix = df1.pivot_table(index='Customer', columns='Movie_ID', values='Rating')
print(utility_matrix .shape)
utility_matrix .head()

(81473, 30)


Movie_ID,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,30
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100006,,,3.0,,,,,,,,...,,,,,,,,,,
1000079,,,,,,,,,,,...,,,,,,,,2.0,,
1000105,,,,,,,,,,,...,,,,,,,,4.0,,
1000158,,,,,,,,,,,...,,,,,,,,3.0,,
1000192,,,,,,,,,,,...,,,,,,,,2.0,,


## Perform SVD

Predict rating based on [Sci-kit Surprise's SVD algorithm](https://surprise.readthedocs.io/en/stable/matrix_factorization.html#matrix-factorization-based-algorithms): 
rui = mu + bu + bi + qi'*pu

In [None]:
# Based on W10d2 example
from sklearn.utils.extmath import randomized_svd
from scipy.sparse import csr_matrix


def run_svd2(matrix, n_components):
  U, Sigma, VT = randomized_svd(matrix, n_components=n_components, random_state=0)
  print('U shape: ',U.shape)
  print('Sigma shape: ', Sigma.shape)
  print('V shape: ',VT.shape)
  return U, Sigma, VT

users_matrix2, sigma2, movies_matrix2 = run_svd2(csr_matrix(utility_matrix.fillna(0)), 10)

U shape:  (81473, 10)
Sigma shape:  (10,)
V shape:  (10, 30)


In [None]:
np.dot(users_matrix2, movies_matrix2)

array([[ 6.17521483e-05,  2.69028133e-05,  1.77686571e-02, ...,
        -6.05668772e-05,  1.79000673e-04, -6.67262227e-05],
       [ 1.29678436e-06, -1.29633275e-06, -4.03763786e-05, ...,
         2.56718428e-03, -3.38443950e-06, -1.19880096e-04],
       [ 2.59356872e-06, -2.59266551e-06, -8.07527572e-05, ...,
         5.13436857e-03, -6.76887901e-06, -2.39760191e-04],
       ...,
       [ 7.29956391e-06,  4.85570673e-06, -2.25188542e-04, ...,
        -1.91089097e-04,  5.58457683e-06, -1.39577862e-04],
       [ 1.07926356e-05,  1.09707612e-06, -2.15031197e-04, ...,
        -1.92351358e-04,  1.80861839e-05, -7.49337778e-05],
       [ 5.47467293e-06,  3.64178005e-06, -1.68891407e-04, ...,
        -1.43316823e-04,  4.18843262e-06, -1.04683396e-04]])

In [None]:
# Reconstruct the decomposed matrix
reconst = users_matrix2.dot(np.diag(sigma2)).dot(movies_matrix2)

In [None]:
reconst[0][2]

2.9965347622865552

In [None]:
reconst[0][4]

-0.004542261038681522

In [None]:
reconst[0][0]

0.012026183342596244

In [None]:
# mean rating per user
print(utility_matrix.mean(axis=1).shape)
utility_matrix.mean(axis=1)

(81473,)


Customer
100006     3.0
1000079    2.0
1000105    4.0
1000158    3.0
1000192    2.0
          ... 
999895     3.0
999901     5.0
999907     4.0
999913     2.0
999935     3.0
Length: 81473, dtype: float64

In [None]:
# mean rating per movie
print(utility_matrix.mean().shape)
utility_matrix.mean()

(30,)


Movie_ID
1     3.749543
2     3.558621
3     3.641153
4     2.739437
5     3.919298
6     3.084396
7     2.129032
8     3.189805
9     2.621053
10    3.180723
11    3.030303
12    3.417582
13    4.552000
14    3.025424
15    3.286207
16    3.098555
17    2.903208
18    3.784369
19    3.324675
20    3.146552
21    3.463303
22    2.246305
23    3.556098
24    2.993998
25    3.970174
26    2.793721
27    3.527473
28    3.823254
29    3.598470
30    3.758260
dtype: float64

In [None]:
print(reconst[:2].shape)
reconst[:2]

(2, 30)


array([[ 1.20261833e-02,  4.85243549e-03,  2.99653476e+00,
         6.47062675e-03, -4.54226104e-03,  7.16412794e-02,
         4.58261743e-03, -9.24113530e-04,  4.00563154e-03,
         3.59142060e-02,  1.28908137e-02,  2.62856268e-02,
         4.57780394e-03,  2.34733137e-03,  6.66559504e-03,
        -6.51514039e-03, -5.78842108e-04, -1.46162858e-03,
         2.02410514e-02,  7.56596900e-03,  1.40436595e-02,
         5.88715987e-03,  8.76317797e-03,  2.93962305e-02,
        -3.69998369e-03, -2.13631985e-03,  9.00457743e-03,
        -5.59724439e-04,  3.14700214e-02, -1.74821914e-04],
       [ 7.76585227e-03,  1.79371104e-04, -3.72900406e-04,
         1.18754914e-03, -1.22729088e-03,  1.91868015e-03,
         1.49021545e-04, -8.07462691e-05,  7.60302131e-05,
         7.47309086e-04,  4.21118487e-04,  5.94987245e-03,
         1.41847236e-03,  7.08359046e-04,  1.69443929e-03,
        -1.81621221e-03, -1.68846774e-04, -2.40242764e-04,
         2.97154376e-03,  7.00991190e-04,  9.10623670e-

In [None]:
print(utility_matrix.mean().to_numpy().reshape(1,-1).shape)
utility_matrix.mean().to_numpy().reshape(1,-1)

(1, 30)


array([[3.74954296, 3.55862069, 3.64115308, 2.73943662, 3.91929825,
        3.08439647, 2.12903226, 3.1898055 , 2.62105263, 3.18072289,
        3.03030303, 3.41758242, 4.552     , 3.02542373, 3.2862069 ,
        3.09855502, 2.90320765, 3.78436859, 3.32467532, 3.14655172,
        3.46330275, 2.24630542, 3.55609756, 2.9939985 , 3.97017399,
        2.79372121, 3.52747253, 3.82325418, 3.59847036, 3.75826014]])

In [None]:
# add mean rating per movie
reconst[:2]+utility_matrix.mean().to_numpy().reshape(1,-1)

array([[3.76156914, 3.56347313, 6.63768784, 2.74590725, 3.91475598,
        3.15603775, 2.13361488, 3.18888139, 2.62505826, 3.2166371 ,
        3.04319384, 3.44386804, 4.5565778 , 3.02777106, 3.29287249,
        3.09203988, 2.90262881, 3.78290696, 3.34491638, 3.15411769,
        3.47734641, 2.25219258, 3.56486074, 3.02339473, 3.966474  ,
        2.79158489, 3.5364771 , 3.82269445, 3.62994038, 3.75808532],
       [3.75730881, 3.55880006, 3.64078018, 2.74062417, 3.91807095,
        3.08631515, 2.12918128, 3.18972475, 2.62112866, 3.1814702 ,
        3.03072415, 3.42353229, 4.55341847, 3.02613209, 3.28790134,
        3.09673881, 2.90303881, 3.78412835, 3.32764687, 3.14725272,
        3.46421338, 2.24674031, 3.5708986 , 3.00326806, 3.96921808,
        2.79324505, 3.53342577, 5.82301602, 3.60198118, 3.75823643]])

In [None]:
# Bias for movie: mean rating per movie minus mean rating for all movies
print(utility_matrix.mean().shape)
utility_matrix.mean() - utility_matrix.mean().mean()

(30,)


Movie_ID
1     0.445777
2     0.254854
3     0.337387
4    -0.564330
5     0.615532
6    -0.219370
7    -1.174734
8    -0.113961
9    -0.682714
10   -0.123044
11   -0.273463
12    0.113816
13    1.248234
14   -0.278343
15   -0.017560
16   -0.205211
17   -0.400559
18    0.480602
19    0.020909
20   -0.157215
21    0.159536
22   -1.057461
23    0.252331
24   -0.309768
25    0.666408
26   -0.510045
27    0.223706
28    0.519488
29    0.294704
30    0.454494
dtype: float64

In [None]:
# Bias for user: mean rating per user minus mean rating for all users
print(utility_matrix.mean(axis=1).shape)
utility_matrix.mean(axis=1) - utility_matrix.mean(axis=1).mean()

(81473,)


Customer
100006    -0.563703
1000079   -1.563703
1000105    0.436297
1000158   -0.563703
1000192   -1.563703
             ...   
999895    -0.563703
999901     1.436297
999907     0.436297
999913    -1.563703
999935    -0.563703
Length: 81473, dtype: float64

In [None]:
# add mean rating per user
print((reconst[:3] + (utility_matrix.mean(axis=1) - utility_matrix.mean(axis=1).mean())[:3].to_numpy().reshape(-1,1)).shape)
reconst[:3] + (utility_matrix.mean(axis=1) - utility_matrix.mean(axis=1).mean())[:3].to_numpy().reshape(-1,1)

(3, 30)


array([[-0.55167714, -0.55885089,  2.43283144, -0.5572327 , -0.56824559,
        -0.49206205, -0.55912071, -0.56462744, -0.55969769, -0.52778912,
        -0.55081251, -0.5374177 , -0.55912552, -0.561356  , -0.55703773,
        -0.57021847, -0.56428217, -0.56516496, -0.54346228, -0.55613736,
        -0.54965967, -0.55781617, -0.55494015, -0.5343071 , -0.56740331,
        -0.56583965, -0.55469875, -0.56426305, -0.53223331, -0.56387815],
       [-1.55593747, -1.56352396, -1.56407623, -1.56251578, -1.56493062,
        -1.56178465, -1.5635543 , -1.56378407, -1.5636273 , -1.56295602,
        -1.56328221, -1.55775345, -1.56228485, -1.56299497, -1.56200889,
        -1.56551954, -1.56387217, -1.56394357, -1.56073178, -1.56300234,
        -1.5627927 , -1.56326844, -1.54890228, -1.55443377, -1.56465923,
        -1.56417948, -1.55775009,  0.43605851, -1.56019251, -1.56372704],
       [ 0.45182838,  0.43665542,  0.43555087,  0.43867177,  0.43384209,
         0.44013403,  0.43659472,  0.43613518,  0

In [None]:
utility_matrix.values.reshape(-1)[0]

nan

In [None]:
utility_matrix.values.reshape(-1)[(utility_matrix.values.reshape(-1) >0) | (utility_matrix.values.reshape(-1) <0)].shape

(99971,)

In [None]:
print(utility_matrix.values.reshape(-1)[~np.isnan(utility_matrix.values.reshape(-1))].shape)
utility_matrix.values.reshape(-1)[~np.isnan(utility_matrix.values.reshape(-1))].mean()

(99971,)


3.5321143131508137

In [None]:
df1.mean()

  df1.mean()


Customer          inf
Rating       3.532114
Movie_ID    21.176051
dtype: float64

In [None]:
# Based on W10d2 example
from sklearn.utils.extmath import randomized_svd
from scipy.sparse import csr_matrix

def make_predictions_matrix(utility_matrix, n_components):
  """
  Predict missing ratings in the utility matrix (customers as rows, movies as columns).
  """
  U, Sigma, VT = randomized_svd(csr_matrix(utility_matrix.fillna(0)), n_components=n_components, random_state=0)
  # Reconstruct the decomposed matrix
  reconst = U.dot(np.diag(Sigma)).dot(VT)
  print('Original array shape: ', utility_matrix.shape)
  print('Reconstructed array shape:', reconst.shape)
  print('U shape: ',U.shape)
  print('Sigma shape: ', Sigma.shape)
  print('V shape: ',VT.shape)
  # Mean rating in array (nan values excluded)
  mu = utility_matrix.values.reshape(-1)[~np.isnan(utility_matrix.values.reshape(-1))].mean()

  # Array with bias per user. Reshape to be array with same number of rows as customers.
  bu = (utility_matrix.mean(axis=1) - utility_matrix.mean(axis=1).mean()).to_numpy().reshape(-1,1)

  # Array with bias per item. Reshape to be array with same number of columns as movies.
  bi = (utility_matrix.mean() - utility_matrix.mean().mean()).to_numpy().reshape(1,-1)

  predicted_ratings = (mu + bu + bi + reconst) - utility_matrix.fillna(0).values
  print('Predictions matrix shape:', predicted_ratings.shape)
  return pd.DataFrame(predicted_ratings, index=utility_matrix.index, columns=utility_matrix.columns)

predicted_ratings = make_predictions_matrix(utility_matrix, 10)

Original array shape:  (81473, 30)
Reconstructed array shape: (81473, 30)
U shape:  (81473, 10)
Sigma shape:  (10,)
V shape:  (10, 30)
Predictions matrix shape: (81473, 30)


In [None]:
predicted_ratings

Movie_ID,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,30
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100006,3.426214,3.228118,3.302332,2.410552,3.579401,2.820682,1.798259,2.853526,2.289703,2.881282,...,3.141991,1.916837,3.229505,2.688039,3.631119,2.456229,3.201122,3.487339,3.294585,3.422730
1000079,2.421953,2.223445,2.305425,1.405269,2.582716,1.750960,0.793826,1.854369,1.285773,1.846115,...,2.128858,0.911385,2.235543,1.667913,2.633863,1.457890,2.198070,2.487661,2.266626,2.422881
1000105,4.429719,4.223624,4.305052,3.406456,4.581488,3.752878,2.793975,3.854289,3.285849,3.846862,...,4.129769,2.911820,4.250344,3.677182,4.632907,3.457413,4.204024,4.487422,4.270137,4.422857
1000158,3.425836,3.223534,3.305238,2.405863,3.582102,2.751919,1.793900,2.854329,2.285811,2.846488,...,3.129313,1.911602,3.242944,2.672547,3.633385,2.457652,3.201047,3.487542,3.268381,3.422869
1000192,2.421953,2.223445,2.305425,1.405269,2.582716,1.750960,0.793826,1.854369,1.285773,1.846115,...,2.128858,0.911385,2.235543,1.667913,2.633863,1.457890,2.198070,2.487661,2.266626,2.422881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999895,3.425836,3.223534,3.305238,2.405863,3.582102,2.751919,1.793900,2.854329,2.285811,2.846488,...,3.129313,1.911602,3.242944,2.672547,3.633385,2.457652,3.201047,3.487542,3.268381,3.422869
999901,5.423553,5.226829,5.304259,4.407315,5.582128,4.793048,3.794205,4.853962,4.287532,4.851432,...,5.134352,3.914968,5.223280,4.665046,5.633490,4.457663,5.195333,5.487697,5.271716,5.422833
999907,4.421680,4.226116,4.304567,3.406668,4.582491,3.784246,2.794100,3.854059,3.287165,3.850219,...,4.133071,2.914164,4.222772,3.663765,4.633756,3.457804,4.194690,4.487737,4.269996,4.422847
999913,2.422764,2.224459,2.304824,1.404859,2.582098,1.765779,0.794303,1.854197,1.287051,1.848821,...,2.132093,0.911956,2.224742,1.674403,2.633304,1.457578,2.194428,2.487659,2.273476,2.422842


In [None]:
utility_matrix

Movie_ID,1,2,3,4,5,6,7,8,9,10,...,21,22,23,24,25,26,27,28,29,30
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100006,,,3.0,,,,,,,,...,,,,,,,,,,
1000079,,,,,,,,,,,...,,,,,,,,2.0,,
1000105,,,,,,,,,,,...,,,,,,,,4.0,,
1000158,,,,,,,,,,,...,,,,,,,,3.0,,
1000192,,,,,,,,,,,...,,,,,,,,2.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999895,,,,,,,,,,,...,,,,,,,,3.0,,
999901,,,,,,,,5.0,,,...,,,,,,,,,,
999907,,,,,,,,4.0,,,...,,,,,,,,,,
999913,,,,,,,,,,,...,,,,,,,,,,


### Estimate ratings

In [None]:
def estimate_rating(ratings_input, predicted_ratings, movies, all_rating, movie_indices):
  movie_id = ratings_input.loc['Movie_ID']
  rated_movie_title = movies.loc[movies['Movie_Id'] == movie_id, 'Name'].values[0]
  customer = ratings_input.loc['Customer']
  predicted_rating = predicted_ratings.loc[customer, movie_id]
  return predicted_rating

def batch_estimate_rating(ratings_input, predicted_ratings=predicted_ratings, movies=movies, all_ratings=df1, 
                    movie_indices=utility_matrix.index.to_list()):
  results = ratings_input.transpose().apply(lambda x: estimate_rating(x, 
    predicted_ratings, movies, all_ratings, movie_indices)).transpose()
  results.name = 'Predicted Rating'
  recommendations = pd.concat([ratings_input[['Customer', 'Movie_ID', 'Rating']], results], axis=1)
  return recommendations

batch_estimate_rating(df1.sample(5, random_state=0))
  


Unnamed: 0,Customer,Movie_ID,Rating,Predicted Rating
47918,2534534,26,2.0,1.456276
90184,703724,28,5.0,5.487303
6130,1530767,8,4.0,3.854059
68061,98602,28,4.0,4.487422
58818,345568,28,4.0,4.487422


In [None]:
from sklearn.metrics import mean_squared_error
test_SVD = batch_estimate_rating(df1.sample(100, random_state=0))
mean_squared_error(test_SVD['Rating'], test_SVD['Predicted Rating'], squared=False)

0.9898581252789265

In [None]:
from sklearn.metrics import r2_score
r2_score(test_SVD['Rating'], test_SVD['Predicted Rating'])

0.08290985756204539

In [None]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(test_SVD['Rating'], test_SVD['Predicted Rating'])

0.6332266056943128

### Recommend a new movie

## Item-Item memory-based collaborative: Cosine Similarity
Use the movie matrix from matrix decomposition. 
Results were poor because the same movie was recommended to most users.

In [None]:
# Based on W10d2 example
from sklearn.utils.extmath import randomized_svd
from scipy.sparse import csr_matrix


def run_svd(matrix, n_components):
  U, Sigma, VT = randomized_svd(matrix, n_components=n_components, random_state=0)
  print('U shape: ',U.shape)
  print('V shape: ',VT.transpose().shape)
  return U, VT.transpose()

users_matrix, movies_matrix = run_svd(csr_matrix(movies_matrix3.fillna(0)), 10)

U shape:  (81473, 10)
V shape:  (30, 10)


### 10 components

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
def create_similarity_matrix(matrix):
  sim_matrix = cosine_similarity(matrix, matrix)
  print('Output shape: ', sim_matrix.shape)
  return sim_matrix

movies_sim_matrix = create_similarity_matrix(movies_matrix)
movies_matrix[0]

Output shape:  (30, 30)


array([0.00535033, 0.00450838, 0.00341365, 0.00030266, 0.00586002,
       0.00367197, 0.02032532, 0.01639532, 0.01450361, 0.02331527])

In [None]:
def recommend_similar2(ratings_input, similarity_matrix, movies, all_ratings, movie_indices):
  """

  """
  movie_id = ratings_input.loc['Movie_ID']
  rated_movie_title = movies.loc[movies['Movie_Id'] == movie_id, 'Name'].values[0]
  customer = ratings_input.loc['Customer']
  try:
    matrix_index = movie_indices[movie_indices == movie_id]
    sorted_indices = similarity_matrix[matrix_index].argsort()
    closest_indices = sorted_indices[-3:] # 3 most similar
    rated_movies = all_ratings[all_ratings['Customer'] == customer]['Movie_ID'].values
    recommend_movie_id = [movie_indices[closest_index] for closest_index in closest_indices if movie_indices[closest_index] not in rated_movies]
    recommend_title = str(recommend_movie_id[0])+': '+movies[movies['Movie_Id'] == recommend_movie_id[0]]['Name'].values
    return pd.Series([rated_movie_title, recommend_title], index=['Rated Movie', 'Recommended Movie'])
  except: 
    return pd.Series([rated_movie_title, None], index=['Rated Movie', 'Recommended Movie'])


def batch_recommend_similar2(ratings_input, similarity_matrix=movies_sim_matrix,
        movies=movies, all_ratings=df1, movie_indices=movies_matrix3.columns.to_list()):
  """
  Recommend one movie.
  Parameters:
    - ratings_df: DataFrame containing Customers and their movie ratings.
    - similarity_matrix: Cosine similarity matrix
    - movies: DataFrame containing movie info.
    - movie_indices: List containing movie IDs in same sequence as in the similarity matrix.
  Returns:
    - DataFrame containing a single new movie recommendation for each customer.
  """
  results = ratings_input.transpose().apply(lambda x: recommend_similar2(x, 
    similarity_matrix, movies, all_ratings, movie_indices)).transpose()
  recommendations = pd.concat([ratings_input['Customer'], results], axis=1)
  print(f"**Input movies count**: {len(recommendations['Rated Movie'].value_counts())}\n{recommendations['Rated Movie'].value_counts()}")
  print(f"\n**Recommended movies count**: {len(recommendations['Recommended Movie'].value_counts())}\n{recommendations['Recommended Movie'].value_counts()}")
  print(f"\n**No recommendations count**: {recommendations['Recommended Movie'].isna().sum()}")

  return recommendations

batch_recommend_similar2(df1.sample(5, random_state=0))

**Input movies count**: 3
Lilo and Stitch               3
Never Die Alone               1
What the #$*! Do We Know!?    1
Name: Rated Movie, dtype: int64

**Recommended movies count**: 1
[14: Nature: Antarctica]    5
Name: Recommended Movie, dtype: int64

**No recommendations count**: 0


Unnamed: 0,Customer,Rated Movie,Recommended Movie
47918,2534534,Never Die Alone,[14: Nature: Antarctica]
90184,703724,Lilo and Stitch,[14: Nature: Antarctica]
6130,1530767,What the #$*! Do We Know!?,[14: Nature: Antarctica]
68061,98602,Lilo and Stitch,[14: Nature: Antarctica]
58818,345568,Lilo and Stitch,[14: Nature: Antarctica]


In [None]:
batch_recommend_similar2(df1.sample(100, random_state=0))

**Input movies count**: 17
Lilo and Stitch                                       33
What the #$*! Do We Know!?                            13
Immortal Beloved                                      11
Something's Gotta Give                                 9
Never Die Alone                                        8
7 Seconds                                              8
The Rise and Fall of ECW                               3
Character                                              2
Sick                                                   2
Inspector Morse 31: Death Is Now My Neighbour          2
By Dawn's Early Light                                  2
Screamers                                              2
Neil Diamond: Greatest Hits Live                       1
Fighter                                                1
My Bloody Valentine                                    1
Dinosaur Planet                                        1
Sesame Street: Elmo's World: The Street We Live On     1
Name

Unnamed: 0,Customer,Rated Movie,Recommended Movie
47918,2534534,Never Die Alone,[14: Nature: Antarctica]
90184,703724,Lilo and Stitch,[14: Nature: Antarctica]
6130,1530767,What the #$*! Do We Know!?,[14: Nature: Antarctica]
68061,98602,Lilo and Stitch,[14: Nature: Antarctica]
58818,345568,Lilo and Stitch,[14: Nature: Antarctica]
...,...,...,...
97302,800136,Something's Gotta Give,[14: Nature: Antarctica]
35824,795211,Immortal Beloved,[14: Nature: Antarctica]
93909,2540855,Something's Gotta Give,[14: Nature: Antarctica]
14998,1343090,What the #$*! Do We Know!?,[14: Nature: Antarctica]


### 200 components

In [None]:
users_matrix200, movies_matrix200 = run_svd(csr_matrix(movies_matrix3.fillna(0)), 200)

U shape:  (81473, 30)
V shape:  (30, 30)


In [None]:
movies_sim_matrix200 = create_similarity_matrix(movies_matrix200)
movies_matrix200[0]

Output shape:  (30, 30)


array([ 5.35032761e-03,  4.50838435e-03,  3.41365281e-03,  3.02657099e-04,
        5.86002277e-03,  3.67195460e-03,  2.03248115e-02,  1.63955157e-02,
        1.45081488e-02,  2.33153499e-02,  3.18525439e-02,  1.95207905e-02,
        5.25064217e-01,  8.19677119e-01, -1.55354655e-01, -1.47095781e-01,
       -4.67583852e-02, -2.99607738e-02, -8.76894295e-04, -1.64058482e-02,
       -1.83123619e-03, -7.16207739e-03, -8.41095673e-03, -3.17202247e-03,
       -1.28217977e-02, -2.86844373e-03, -1.61136265e-03, -1.98211595e-03,
       -1.63129729e-03, -7.30747950e-03])

In [None]:
batch_recommend_similar2(df1.sample(100, random_state=0), similarity_matrix=movies_sim_matrix200)

**Input movies count**: 17
Lilo and Stitch                                       33
What the #$*! Do We Know!?                            13
Immortal Beloved                                      11
Something's Gotta Give                                 9
Never Die Alone                                        8
7 Seconds                                              8
The Rise and Fall of ECW                               3
Character                                              2
Sick                                                   2
Inspector Morse 31: Death Is Now My Neighbour          2
By Dawn's Early Light                                  2
Screamers                                              2
Neil Diamond: Greatest Hits Live                       1
Fighter                                                1
My Bloody Valentine                                    1
Dinosaur Planet                                        1
Sesame Street: Elmo's World: The Street We Live On     1
Name

Unnamed: 0,Customer,Rated Movie,Recommended Movie
47918,2534534,Never Die Alone,[19: By Dawn's Early Light]
90184,703724,Lilo and Stitch,[19: By Dawn's Early Light]
6130,1530767,What the #$*! Do We Know!?,[19: By Dawn's Early Light]
68061,98602,Lilo and Stitch,[19: By Dawn's Early Light]
58818,345568,Lilo and Stitch,[19: By Dawn's Early Light]
...,...,...,...
97302,800136,Something's Gotta Give,[19: By Dawn's Early Light]
35824,795211,Immortal Beloved,[19: By Dawn's Early Light]
93909,2540855,Something's Gotta Give,[19: By Dawn's Early Light]
14998,1343090,What the #$*! Do We Know!?,[19: By Dawn's Early Light]


### 500 components

In [None]:
users_matrix500, movies_matrix500 = run_svd(csr_matrix(movies_matrix3.fillna(0)), 500)
movies_sim_matrix500 = create_similarity_matrix(movies_matrix500)

U shape:  (81473, 30)
V shape:  (30, 30)
Output shape:  (30, 30)


In [None]:
batch_recommend_similar2(df1.sample(100, random_state=0), similarity_matrix=movies_sim_matrix500)

**Input movies count**: 17
Lilo and Stitch                                       33
What the #$*! Do We Know!?                            13
Immortal Beloved                                      11
Something's Gotta Give                                 9
Never Die Alone                                        8
7 Seconds                                              8
The Rise and Fall of ECW                               3
Character                                              2
Sick                                                   2
Inspector Morse 31: Death Is Now My Neighbour          2
By Dawn's Early Light                                  2
Screamers                                              2
Neil Diamond: Greatest Hits Live                       1
Fighter                                                1
My Bloody Valentine                                    1
Dinosaur Planet                                        1
Sesame Street: Elmo's World: The Street We Live On     1
Name

Unnamed: 0,Customer,Rated Movie,Recommended Movie
47918,2534534,Never Die Alone,[25: Inspector Morse 31: Death Is Now My Neigh...
90184,703724,Lilo and Stitch,[25: Inspector Morse 31: Death Is Now My Neigh...
6130,1530767,What the #$*! Do We Know!?,[25: Inspector Morse 31: Death Is Now My Neigh...
68061,98602,Lilo and Stitch,[25: Inspector Morse 31: Death Is Now My Neigh...
58818,345568,Lilo and Stitch,[25: Inspector Morse 31: Death Is Now My Neigh...
...,...,...,...
97302,800136,Something's Gotta Give,[25: Inspector Morse 31: Death Is Now My Neigh...
35824,795211,Immortal Beloved,[25: Inspector Morse 31: Death Is Now My Neigh...
93909,2540855,Something's Gotta Give,[25: Inspector Morse 31: Death Is Now My Neigh...
14998,1343090,What the #$*! Do We Know!?,[25: Inspector Morse 31: Death Is Now My Neigh...
