In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install -q kaggle

In [3]:
!cp '/content/gdrive/MyDrive/kaggle Competition/kaggle.json' 'kaggle.json'

In [4]:
!mkdir ~/.kaggle

In [5]:
!cp kaggle.json ~/.kaggle/

In [6]:
! chmod 600 ~/.kaggle/kaggle.json

In [7]:
! kaggle datasets download -d akshaypawar7/millions-of-movies

Downloading millions-of-movies.zip to /content
 98% 167M/170M [00:06<00:00, 31.4MB/s]
100% 170M/170M [00:06<00:00, 26.2MB/s]


In [8]:
! unzip /content/millions-of-movies.zip

Archive:  /content/millions-of-movies.zip
  inflating: movies.csv              


## Import Library

In [9]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

pd.options.mode.chained_assignment = None

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Data Understanding

In [97]:
data = pd.read_csv("/content/movies.csv")
data.head(2)

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations
0,610150,Dragon Ball Super: Super Hero,Animation-Science Fiction-Action,ja,The Red Ribbon Army an evil organization that ...,16585.915,Shueisha-Fuji Television Network-Toei Animatio...,2022-06-11,0.0,70000000.0,99.0,Released,A super awakening and calamity is born.,7.953,985.0,Masako Nozawa-Toshio Furukawa-Yuko Minaguchi-H...,android-sequel-attack-based on manga-fighting-...,/rugyJdeoJm7cSJL1q4jBpTNbxyU.jpg,/ugS5FVfCI3RV0ZwZtBV3HAV75OX.jpg,507086-629015-616037-810693-361743-438148-7788...
1,616037,Thor: Love and Thunder,Action-Adventure-Fantasy,en,After his retirement is interrupted by Gorr th...,5372.967,Marvel Studios-Kevin Feige Productions,2022-07-06,250000000.0,746900000.0,119.0,Released,The one is not the only.,6.755,2224.0,Chris Hemsworth-Natalie Portman-Christian Bale...,ex-girlfriend-hero-greek mythology-sequel-supe...,/pIkRyD18kl4FhoCNQuWxWu5cBLM.jpg,/vvObT0eIWGlArLQx3K5wZ0uT812.jpg,2-438148-45920-507086-361743-919355-748918-718...


In [98]:
data = data.loc[:5000, ["title", "genres", "overview"]]
data.head()

Unnamed: 0,title,genres,overview
0,Dragon Ball Super: Super Hero,Animation-Science Fiction-Action,The Red Ribbon Army an evil organization that ...
1,Thor: Love and Thunder,Action-Adventure-Fantasy,After his retirement is interrupted by Gorr th...
2,Samaritan,Action-Drama-Science Fiction,Thirteen year old Sam Cleary suspects that hi...
3,DC League of Super-Pets,Animation-Action-Family-Science Fiction-Comedy,When Superman and the rest of the Justice Leag...
4,Prey,Thriller-Action,When danger threatens her camp the fierce and ...


In [99]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     5001 non-null   object
 1   genres    4940 non-null   object
 2   overview  4940 non-null   object
dtypes: object(3)
memory usage: 117.3+ KB


In [100]:
data.isnull().sum()

title        0
genres      61
overview    61
dtype: int64

In [101]:
data.dropna(inplace=True)
data.isnull().sum()

title       0
genres      0
overview    0
dtype: int64

In [102]:
data[data["title"].duplicated()].count()

title       348
genres      348
overview    348
dtype: int64

In [103]:
data = data.drop_duplicates(subset=["title"])

In [104]:
data[data["title"].duplicated()].count()

title       0
genres      0
overview    0
dtype: int64

## Vectorize

In [105]:
def Vectorize(data, column, cosine):
  # Inisialisasi TfidfVectorizer
  tf = TfidfVectorizer()
  
  # Melakukan perhitungan idf pada data cuisine
  tf.fit(data[column])

  # Melakukan fit lalu ditransformasikan ke bentuk matrix
  tfidf_matrix = tf.fit_transform(data[column]) 
  
  if cosine == 0:
    cosine_dis =  cosine_distances(tfidf_matrix)
    cosine_dis_df = pd.DataFrame(cosine_dis, index=data['title'], columns=data['title'])
    return cosine_dis_df
  else :
    cosine_sim = cosine_similarity(tfidf_matrix)
    cosine_sim_df = pd.DataFrame(cosine_sim, index=data['title'], columns=data['title'])
    return cosine_sim_df

In [106]:
cosine_dis_overview = Vectorize(data, "overview", 0)
cosine_dis_overview

title,Dragon Ball Super: Super Hero,Thor: Love and Thunder,Samaritan,DC League of Super-Pets,Prey,Nope,Top Gun: Maverick,Jurassic World Dominion,RRR,Minions: The Rise of Gru,...,The Family,Old School,Secret Obsession,Meg 2: The Trench,Mr. Nobody,Toc Toc,Book of Shadows: Blair Witch 2,Wander,Red Eye,Problem Child
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dragon Ball Super: Super Hero,0.000000,0.937992,0.955249,0.966862,0.973043,0.988657,0.970605,0.916958,0.990217,0.964687,...,0.981224,0.984089,0.965720,0.990506,0.974979,0.965896,0.962312,0.975552,0.965107,0.984409
Thor: Love and Thunder,0.937992,0.000000,0.948120,0.936445,0.984319,0.993477,0.957560,0.949468,0.962779,0.960145,...,0.978954,0.992728,0.967119,0.973760,0.980313,0.973925,0.955188,0.954455,0.971530,0.984737
Samaritan,0.955249,0.948120,0.000000,0.957175,0.966454,0.992473,0.951112,0.950124,0.950857,0.955027,...,0.971214,0.996443,0.945900,0.974407,0.959565,0.975243,0.960385,0.950642,0.971487,0.966862
DC League of Super-Pets,0.966862,0.936445,0.957175,0.000000,0.972676,0.996032,0.962260,0.958085,0.976706,0.962225,...,0.973949,0.983991,0.957357,0.962596,0.982267,0.970134,0.956034,0.957295,0.957551,0.991540
Prey,0.973043,0.984319,0.966454,0.972676,0.000000,0.995836,0.992326,0.983229,0.996225,0.979848,...,0.991324,0.995764,0.934404,0.984250,0.989462,0.987714,0.959824,0.983363,0.956479,0.971602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Toc Toc,0.965896,0.973925,0.975243,0.970134,0.987714,0.993916,0.975448,0.982105,0.988558,0.973095,...,0.993143,1.000000,0.970998,0.987552,0.986646,0.000000,0.943474,0.949256,0.990929,0.995963
Book of Shadows: Blair Witch 2,0.962312,0.955188,0.960385,0.956034,0.959824,0.989063,0.973986,0.970146,0.968958,0.945731,...,0.982775,0.954057,0.949343,0.975444,0.985354,0.943474,0.000000,0.934361,0.951765,0.975581
Wander,0.975552,0.954455,0.950642,0.957295,0.983363,0.986336,0.959489,0.975487,0.981740,0.959363,...,0.959541,0.981451,0.951570,0.971739,0.970451,0.949256,0.934361,0.000000,0.972746,0.996257
Red Eye,0.965107,0.971530,0.971487,0.957551,0.956479,0.989949,0.985878,0.989081,0.989500,0.984723,...,0.981907,0.976307,0.866234,0.924065,0.993624,0.990929,0.951765,0.972746,0.000000,0.990193


In [107]:
cosine_sim_genres = Vectorize(data, "genres", 1)
cosine_sim_genres

title,Dragon Ball Super: Super Hero,Thor: Love and Thunder,Samaritan,DC League of Super-Pets,Prey,Nope,Top Gun: Maverick,Jurassic World Dominion,RRR,Minions: The Rise of Gru,...,The Family,Old School,Secret Obsession,Meg 2: The Trench,Mr. Nobody,Toc Toc,Book of Shadows: Blair Witch 2,Wander,Red Eye,Problem Child
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dragon Ball Super: Super Hero,1.000000,0.197896,0.787026,0.825777,0.265795,0.478657,0.280476,0.765498,0.280476,0.255065,...,0.189968,0.000000,0.000000,0.771704,0.510485,0.000000,0.000000,0.000000,0.000000,0.000000
Thor: Love and Thunder,0.197896,1.000000,0.206316,0.163417,0.328942,0.000000,0.347111,0.473761,0.347111,0.634955,...,0.235100,0.000000,0.000000,0.202299,0.292414,0.000000,0.000000,0.000000,0.000000,0.000000
Samaritan,0.787026,0.206316,1.000000,0.649908,0.277104,0.499024,0.594381,0.798070,0.594381,0.000000,...,0.198051,0.000000,0.284120,0.804540,0.679913,0.000000,0.000000,0.000000,0.000000,0.000000
DC League of Super-Pets,0.825777,0.163417,0.649908,1.000000,0.219487,0.395264,0.231610,0.632130,0.231610,0.369822,...,0.338602,0.355417,0.000000,0.637255,0.421547,0.355417,0.000000,0.000000,0.000000,0.355417
Prey,0.265795,0.328942,0.277104,0.219487,1.000000,0.270154,0.466207,0.269524,0.466207,0.000000,...,0.315764,0.000000,0.554208,0.615000,0.000000,0.000000,0.344506,0.331353,0.480481,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Toc Toc,0.000000,0.000000,0.000000,0.355417,0.000000,0.000000,0.000000,0.000000,0.000000,0.447911,...,0.511319,1.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,1.000000
Book of Shadows: Blair Witch 2,0.000000,0.000000,0.000000,0.000000,0.344506,0.784179,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.342045,0.211871,0.000000,0.000000,1.000000,0.671858,0.717003,0.000000
Wander,0.000000,0.000000,0.000000,0.000000,0.331353,0.526857,0.000000,0.000000,0.000000,0.000000,...,0.425969,0.000000,0.328985,0.203782,0.000000,0.000000,0.671858,1.000000,0.285220,0.000000
Red Eye,0.000000,0.000000,0.000000,0.000000,0.480481,0.562259,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.477047,0.295496,0.000000,0.000000,0.717003,0.285220,1.000000,0.000000


## Get Recomendation

In [108]:
def movie_recommendations(title, distance_data=cosine_dis_overview, similarity_data=cosine_sim_genres,items=data[["title", "genres", "overview"]], k=300):
 
    # Mengambil data dengan menggunakan argpartition untuk melakukan partisi secara tidak langsung sepanjang sumbu yang diberikan    
    # Dataframe diubah menjadi numpy
    # Range(start, stop, step)
    index_dis = distance_data.loc[:,title].to_numpy().argpartition(
        range(-1, -300, -1))

    index_sim = similarity_data.loc[:,title].to_numpy().argpartition(
        range(-1, -300, -1))
    
    # Mengambil data dengan similarity dan distance terbesar dari index yang ada
    closest_dis = distance_data.columns[index_dis[-1:-(300+2):-1]]
    closest_sim = similarity_data.columns[index_sim[-1:-(300+2):-1]]
    
    # Drop title agar nama resto yang dicari tidak muncul dalam daftar rekomendasi
    closest_dis = closest_dis.drop(title, errors='ignore')
    closest_sim = closest_sim.drop(title, errors='ignore')

    # Menggabungkan title yang di rekomendasikan berdasarkan overview dan genrenya
    closest = [x for x in tuple(closest_dis) if x in tuple(closest_sim)]
    
    # Pandas dataframe rekomendasi film
    rec_overview = data.set_index("title").loc[closest_dis, :].head()
    rec_genres = data.set_index("title").loc[closest_sim, :].head()
    full_rec = data.set_index("title").loc[closest, :].head()
    
    return rec_overview, rec_genres, full_rec

In [109]:
data[data.title.eq('Dragon Ball Super: Super Hero')]

Unnamed: 0,title,genres,overview
0,Dragon Ball Super: Super Hero,Animation-Science Fiction-Action,The Red Ribbon Army an evil organization that ...


In [110]:
rec_overview, rec_genres, full_rec = movie_recommendations('Dragon Ball Super: Super Hero')

In [111]:
rec_overview[]

Unnamed: 0_level_0,genres,overview
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Stranded,Drama,Tensions run high while food runs low as six i...
American Horror House,Horror,On Halloween night a sorority house is overrun...
The Legend of La Llorona,Family-Animation,"Leo San Juan el joven héroe de ""La Leyenda de ..."
Just Say Yes,Comedy-Romance,Incurable romantic Lotte finds her life upende...
mother!,Drama-Horror-Mystery,A couple's relationship is tested when uninvit...


In [112]:
rec_genres

Unnamed: 0_level_0,genres,overview
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Dragon Ball Z: Battle of Gods,Animation-Action-Science Fiction,The events of Battle of Gods take place some y...
Planet Hulk,Science Fiction-Animation-Action,When the Hulk's presence on Earth becomes too ...
Justice League: Doom,Action-Animation-Science Fiction,"An adaptation of Mark Waid's ""Tower of Babel"" ..."
Justice League vs. Teen Titans,Science Fiction-Action-Animation,Robin is sent by Batman to work with the Teen ...
Dragon Ball Z: Lord Slug,Action-Animation-Science Fiction,A Super Namekian named Slug comes to invade Ea...


In [113]:
full_rec

Unnamed: 0_level_0,genres,overview
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Dragon Ball & Dr. Slump Special,Animation-Action-Comedy-Science Fiction,A special continuous feature containing all of...
Altered Carbon: Resleeved,Animation-Science Fiction,On the planet Latimer Takeshi Kovacs must prot...
Suicide Squad: Hell to Pay,Science Fiction-Action-Animation,Task Force X targets a powerful mystical objec...
Judge Dredd,Science Fiction,In a dystopian future Dredd the most famous ju...
Meg 2: The Trench,Science Fiction-Action-Thriller,Plot unknown. Sequel to the 2018 film 'The Meg'.
