**Sistem Rekomendasi Hybrid Menggunakan Algoritma Content Based dan Collaborative Filtering.**

In [None]:
# Install the fuzzywuzzy package (used for fuzzy matching of movie titles).
!pip install fuzzywuzzy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
# Install the surprise package (pip pulls in scikit-surprise, see the log below).
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 1.9 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633954 sha256=13f305cc25ec57a7a2ef2ecc5ff4154ade17d66b094fcd94e6cd58d1da34c30e
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [None]:
#import module yang diperlukan
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz

from surprise import SVD, SVDpp, KNNBasic
from surprise import Dataset
from surprise.model_selection import cross_validate,train_test_split, GridSearchCV
from surprise import NormalPredictor
from surprise import Reader

import re 
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk



Membaca file dataset. Dataset yang digunakan adalah MovieLens Dataset dengan file "movies.csv" dan "ratings.csv" 

In [None]:
# Download the NLTK "stopwords" corpus, which the genre tokenizer reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the movie and rating tables from CSV files in the working directory.
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

In [None]:
# Show the first 5 rows of the ratings dataset.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [None]:
# Show the first 10 rows of the movies dataset.
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [None]:
# Collect the distinct rating values and record the two ends of the scale.
ratings_array = ratings['rating'].unique()
min_rating = ratings_array.min()
max_rating = ratings_array.max()
print(ratings_array)

[2.5 3.  2.  4.  3.5 1.  5.  4.5 1.5 0.5]


In [None]:
# Lookup tables between movie titles, movie ids and DataFrame row labels.

# title -> movieId, used for exact and fuzzy title matching. If two rows share
# a title, the later row's movieId is the one kept.
movie_map = pd.Series(movies.movieId.values,index=movies.title).to_dict()

# movieId -> title, built straight from the table. Inverting movie_map instead
# would drop every movieId whose title is duplicated, and looking such an id
# up later would raise KeyError.
reverse_movie_map = pd.Series(movies.title.values, index=movies.movieId).to_dict()

# movieId -> row label of the movies DataFrame (used to index the similarity matrix).
movieId_to_index_map = pd.Series(movies.index.values,index=movies.movieId).to_dict()

# All distinct movie ids.
movieId_all_array = movies['movieId'].unique()

In [None]:
# Mencocokkan input dengan database judul movie
def get_movieId( movie_name ):
    """
    Resolve a movie title typed by the user to its movieId.

    An exact key match in ``movie_map`` wins. Otherwise every known title is
    compared case-insensitively with ``fuzz.ratio``; the best-scoring title
    with a ratio of at least 60 is selected and reported on stdout.

    Parameters
    ----------
    movie_name: string, movie title with or without the release year

    Return
    ------
    the movieId of the matched movie, or None (after printing a notice)
    when no title reaches the similarity threshold
    """
    # Fast path: the input is exactly a title stored in the database.
    try:
        return movie_map[movie_name]
    except KeyError:
        pass

    # Slow path: score every title against the input and keep the close ones.
    scored = [
        (title, movie_id, fuzz.ratio(title.lower(), movie_name.lower()))
        for title, movie_id in movie_map.items()
    ]
    candidates = [entry for entry in scored if entry[2] >= 60]

    if not candidates:
        print("Maaf, movie ini tidak tersedia di dalam data.")
        return None

    # max() keeps the first maximum it meets; scanning in reverse therefore
    # selects the last of several equally scored titles in map order.
    best_title, best_id, best_ratio = max(reversed(candidates), key=lambda entry: entry[2])
    print( "Berikut adalah movie yang mungkin cocok :", best_title, ", dengan rasio kesamaan=", best_ratio )
    return best_id

**Membuat Algoritma Content Based Filtering menggunakan pendekatan Pairwise dalam TF-IDF Vector Space.**

In [None]:
def tokenizer(text):
    """
    Split a '|'-separated genre string into stemmed, lower-cased tokens.

    Pieces that appear in NLTK's English stopword list are dropped. The
    stemmer and the stopword set are created once per call rather than once
    per token, which avoids re-reading the stopword corpus for every genre.

    Parameters
    ----------
    text: string, genres joined by '|'

    Return
    ------
    list of stemmed tokens
    """
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    return [
        stemmer.stem(word).lower()
        for word in text.split('|')
        if word not in stop_words
    ]

In [None]:
# TF-IDF vectorizer that delegates tokenization to the custom `tokenizer` function.
tfid=TfidfVectorizer(analyzer='word', tokenizer=tokenizer)

In [None]:
# Fit the vectorizer on the genres column; each movie becomes one TF-IDF row.
tfidf_matrix = tfid.fit_transform(movies['genres'])

In [None]:
# Cosine similarity between every pair of movie TF-IDF rows.
cos_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)

In [None]:
# Sanity-check the shapes: TF-IDF matrix, pairwise similarity matrix, movies table.
print(tfidf_matrix.shape)
print(cos_sim.shape)
print(movies.shape)

(9125, 20)
(9125, 9125)
(9125, 3)


**Membuat Algoritma Collaborative Filtering dengan menggunakan Model Singular Value Decomposition (SVD) Matrix Factorization.**

In [None]:
# Columns handed to Surprise, in the order user id, item id, rating.
features = ['userId','movieId', 'rating']
# The rating scale bounds are the smallest and largest ratings seen in the data.
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(ratings[features], reader)
# Hyper-parameter grid for SVD: two candidate values for each of three parameters.
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
# Grid search with 3-fold cross-validation, scored by RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

In [None]:
# Run the grid search over the rating data.
gs.fit(data)

In [None]:
# Print the best RMSE found by the grid search and the parameters that produced it.
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])


0.9143720070497571
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [None]:
# SVD model: take the best-RMSE parameters and estimator from the grid search,
# then fit that estimator on the full dataset.
best_params = gs.best_params['rmse']
model_svd = gs.best_estimator['rmse']
model_svd.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f79cad84310>

In [None]:
def get_rating_from_prediction( prediction, ratings_array ):
    """
    Snap a continuous prediction onto the closest allowed rating value.

    Parameters
    ----------
    prediction: float, predicted value coming from the model

    ratings_array: 1D array of the discrete rating values

    Return
    ------
    the element of ratings_array nearest to the prediction; when two values
    are equally close, the one that appears first in ratings_array is returned
    """
    # Absolute distance from the prediction to every allowed rating.
    distances = np.abs(np.asarray(ratings_array) - prediction)
    nearest = np.argmin(distances)
    return ratings_array[nearest]

In [None]:
# Sanity check: predict user 1's rating for movie 1 and print it next to the
# rating stored in the data (the filter yields an empty Series if there is none).
prediction = model_svd.predict(1,1)
print("rating", ratings[(ratings.userId ==1 ) & (ratings.movieId ==1 ) ]['rating']  )
print("prediction",prediction.est)

rating Series([], Name: rating, dtype: float64)
prediction 3.2268718837286525


**Membuat sistem rekomendasi movie dengan metode item-based**

In [None]:
def make_recommendation_item_based( similarity_matrix ,movieId_all_array, ratings_data, id_to_movie_map, movieId_to_index_map, fav_movie_list, n_recommendations, userId=-99):
    """
    Return the top n movies most similar in content to the user's favorite movie.

    Only the first successfully matched entry of fav_movie_list is used as the
    similarity seed; all matched favorites are excluded from the result.

    Parameters
    ----------
    similarity_matrix: 2d array, the pairwise similarity matrix

    movieId_all_array: 1d array, all movie ids

    ratings_data: ratings data (needs 'userId' and 'movieId' columns)

    id_to_movie_map: map from movieId to movie title (accepted for interface
            compatibility; titles are read from the module-level `movies`
            DataFrame instead)

    movieId_to_index_map: map from movieId to the row label of the movies DataFrame

    fav_movie_list: list, the user's favorite movie titles

    n_recommendations: int, top n recommendations

    userId: int optional (default=-99), the user Id
            if userId = -99, a new user id (largest existing id + 1) is used
            if userId = -1, the largest existing user id is used

    Return
    ------
    list of the top n recommended movie titles; an empty list when no favorite
    title could be matched or when n_recommendations is not positive
    """

    if userId == -99:
        userId = np.amax( ratings_data['userId'].unique() ) + 1
    elif userId == -1:
        userId = np.amax( ratings_data['userId'].unique() )

    # get_movieId returns None for a title it cannot match; drop those so an
    # unmatched favorite yields an empty result instead of a KeyError below.
    resolved_ids = [ get_movieId(movie_name) for movie_name in fav_movie_list ]
    movieId_list = [ movie_id for movie_id in resolved_ids if movie_id is not None ]
    if not movieId_list or n_recommendations <= 0:
        return []

    # Movies the user has already rated, plus the favorites themselves, are
    # never recommended. Sets keep the membership tests O(1).
    excluded_ids = set( ratings_data[ ratings_data.userId==userId ]['movieId'].unique() )
    excluded_ids.update( movieId_list )
    candidate_indices = {
        movieId_to_index_map[movie_id]
        for movie_id in movieId_all_array
        if movie_id not in excluded_ids
    }

    # Rank every movie by similarity to the seed movie, most similar first.
    # sorted() is stable, so equally similar movies keep their row order.
    seed_index = movieId_to_index_map[movieId_list[0]]
    ranked = sorted( enumerate(similarity_matrix[seed_index]), key=lambda pair: pair[1], reverse=True )

    topn_movieIndex = []
    for movie_index, _score in ranked:
        if movie_index in candidate_indices:
            topn_movieIndex.append( movie_index )
            if len(topn_movieIndex) == n_recommendations:
                break

    return [ movies.loc[movie_index].title for movie_index in topn_movieIndex ]

**Membuat rekomendasi movie dengan metode user-based**

In [None]:
def make_recommendation_user_based(best_model_params, movieId_all_array, ratings_data, id_to_movie_map,
                        fav_movie_list, n_recommendations, userId=-99 ):
    """
    Return the top n movies an SVD model predicts the user will rate highest.

    Each matched favorite is added to the ratings as a maximum-rating row for
    the user, a fresh SVD model is trained on the result, and every movie the
    user has not rated yet is scored with it.

    Parameters
    ----------
    best_model_params: dict, keyword arguments passed to surprise.SVD
            (e.g. {'n_epochs': ..., 'lr_all': ..., 'reg_all': ...})

    movieId_all_array: array of all movie ids

    ratings_data: ratings data with 'userId', 'movieId' and 'rating' columns
            (any other columns are ignored)

    id_to_movie_map: map from movieId to movie title

    fav_movie_list: list, the user's favorite movie titles

    n_recommendations: int, top n recommendations

    userId: int optional (default=-99), the user Id
            if userId = -99, a new user id (largest existing id + 1) is used
            if userId = -1, the largest existing user id is used

    Return
    ------
    list of the top n recommended movie titles, best prediction first
    """
    rating_columns = ['userId', 'movieId', 'rating']

    # get_movieId returns None for a title it cannot match; such entries are
    # skipped so they do not end up as bogus rows in the training data.
    resolved_ids = [ get_movieId(movie_name) for movie_name in fav_movie_list ]
    movieId_list = [ movie_id for movie_id in resolved_ids if movie_id is not None ]

    if userId == -99:
        userId = np.amax( ratings_data['userId'].unique() ) + 1
    elif userId == -1:
        userId = np.amax( ratings_data['userId'].unique() )

    # Rating scale comes from the data passed in, not from a module-level table.
    ratings_array = ratings_data['rating'].unique()
    max_rating = np.amax( ratings_array )
    min_rating = np.amin( ratings_array )

    # Dataset.load_from_df expects exactly these three columns in this order.
    train_data = ratings_data[rating_columns]
    if movieId_list:
        # Treat every favorite as a top rating given by this user.
        favorite_rows = pd.DataFrame(
            [[userId, movie_id, max_rating] for movie_id in movieId_list],
            columns=rating_columns,
        )
        train_data = pd.concat([train_data, favorite_rows], ignore_index=True, sort=False)

    # Only movies the user has not rated (or named as favorite) are candidates.
    seen_ids = set( train_data.loc[ train_data.userId==userId, 'movieId' ].unique() )
    movieId_input = [ movie_id for movie_id in movieId_all_array if movie_id not in seen_ids ]

    reader = Reader(rating_scale=(min_rating, max_rating))
    data = Dataset.load_from_df(train_data, reader)

    model = SVD(**best_model_params)
    model.fit(data.build_full_trainset())

    # Score every candidate; the stable sort keeps candidate order among ties.
    scored = [ (model.predict(userId, movie_id).est, movie_id) for movie_id in movieId_input ]
    scored.sort(key=lambda pair: pair[0], reverse=True)

    # A non-positive request yields an empty list.
    top_scored = scored[:max(n_recommendations, 0)]
    return [ id_to_movie_map[movie_id] for _estimate, movie_id in top_scored ]

**Menjalankan Sistem Rekomendasi dengan menggunakan Gabungan Algoritma (Content based Filtering & Collaborative Filtering)**

In [None]:
my_favorite_movies = [input("Movie favoritku : ")]

# Number of titles requested from each recommender.
n_recommendations = 10

# Keyword arguments shared by both recommenders.
shared_arguments = dict(
    movieId_all_array=movieId_all_array,
    ratings_data=ratings[features],
    id_to_movie_map=reverse_movie_map,
    fav_movie_list=my_favorite_movies,
    n_recommendations=n_recommendations,
)

# Compute both recommendation lists before printing anything.
recommends_item_based = make_recommendation_item_based(
    similarity_matrix=cos_sim,
    movieId_to_index_map=movieId_to_index_map,
    **shared_arguments
)
recommends_user_based = make_recommendation_user_based(
    best_model_params=best_params,
    **shared_arguments
)


def _show_recommendations(banner, lead_in, trailer, titles):
    """Print one numbered recommendation list, plus a notice when it is short."""
    print(banner)
    print(lead_in, my_favorite_movies, trailer)
    for rank, title in enumerate(titles, start=1):
        print(rank, title)
    if len(titles) < n_recommendations:
        print("Maaf, batas rekomendasi telah mencapai maksimum.")


_show_recommendations(
    "\n-------------Pencarian berdasarkan kesamaan item content--------------------",
    'Daftar film yang sejenis dengan film',
    ':',
    recommends_item_based,
)
_show_recommendations(
    "\n--------------Pencarian berdasarkan kesamaan preferensi pengguna------------",
    'Pengguna yang menyukai',
    'juga menyukai film berikut:',
    recommends_user_based,
)

Movie favoritku : Toys Story
Berikut adalah movie yang mungkin cocok : Toy Story (1995) , dengan rasio kesamaan= 69
Berikut adalah movie yang mungkin cocok : Toy Story (1995) , dengan rasio kesamaan= 69

-------------Pencarian berdasarkan kesamaan item content--------------------
Daftar film yang sejenis dengan film ['Toys Story'] :
1 Antz (1998)
2 Toy Story 2 (1999)
3 Adventures of Rocky and Bullwinkle, The (2000)
4 Emperor's New Groove, The (2000)
5 Monsters, Inc. (2001)
6 Shrek the Third (2007)
7 Tale of Despereaux, The (2008)
8 Asterix and the Vikings (Astérix et les Vikings) (2006)
9 Turbo (2013)
10 Boxtrolls, The (2014)

--------------Pencarian berdasarkan kesamaan preferensi pengguna------------
Pengguna yang menyukai ['Toys Story'] juga menyukai film berikut:
1 Godfather, The (1972)
2 Shawshank Redemption, The (1994)
3 Modern Times (1936)
4 All About Eve (1950)
5 Treasure of the Sierra Madre, The (1948)
6 Maltese Falcon, The (1941)
7 Singin' in the Rain (1952)
8 Raging Bull (19