In [1]:
import pandas as pd
import numpy as np

# Show every column when a DataFrame is rendered instead of eliding the middle of wide frames.
pd.set_option('display.max_columns', None)

In [2]:
# Read the three source tables (movies, keywords, credits) from ./data.
movies, keywords, credits = (
    pd.read_parquet('./data/movies_data.parquet'),
    pd.read_parquet('./data/keywords_data.parquet'),
    pd.read_parquet('./data/credits_data.parquet'),
)


In [3]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,id,title,genres,overview,adult,release_year,poster_url
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...",False,2009,https://image.tmdb.org/t/p/w500/kyeqWdyUXW608q...
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...",False,2007,https://image.tmdb.org/t/p/w500/jGWpG4YhpQwVmj...
2,206647,Spectre,"[Action, Adventure, Thriller]",A cryptic message from Bond’s past sends him o...,False,2015,https://image.tmdb.org/t/p/w500/672kUEMtTHcaVY...
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...,False,2012,https://image.tmdb.org/t/p/w500/85cWkCVftiVs0B...
4,49529,John Carter,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca...",False,2012,https://image.tmdb.org/t/p/w500/lCxz1Yus07QCQQ...


In [4]:
# Preview the first rows of the credits table (cast and director per movie id).
credits.head()

Unnamed: 0,id,cast,director
0,19995,"[[Sam Worthington, /mflBcox36s9ZPbsZPVOuhf6axa...","[[James Cameron, /9NAZnTjBQ9WcXAQEzZpKy4vdQto...."
1,285,"[[Johnny Depp, /yZAoqNhBtlHnK8rC1n99bEzwOMF.jp...","[[Gore Verbinski, /rSQRdmLNAwdKxrtvBSSlBmWeSsj..."
2,206647,"[[Monica Bellucci, /pz3B128Uk3dgmiKB7eghidXdNQ...","[[Sam Mendes, /5z89X9rB76JDblqMQ52fviwXxAN.jpg]]"
3,49026,"[[Gary Oldman, /2v9FVVBUrrkW2m3QOcYkuhq9A6o.jp...","[[Christopher Nolan, /xuAIuYSmsUzKlUMBFGVZaWsY..."
4,49529,"[[Willem Dafoe, /ui8e4sgZAwMPi3hzEO53jyBJF9B.j...","[[Andrew Stanton, /tRwWuo06aN9vuXAPaswMN42x2ii..."


In [5]:
# Preview the first rows of the keywords table.
keywords.head()

Unnamed: 0,id,keywords
0,19995,"[culture clash, future, space war, space colon..."
1,285,"[exotic island, east india company, love of on..."
2,206647,"[spy, based on novel or book, secret agent, se..."
3,49026,"[crime fighter, terrorist, secret identity, bu..."
4,49529,"[based on novel or book, planet mars, medallio..."


In [6]:
# Print (rows, columns) for movies, keywords and credits, in that order.
for table in (movies, keywords, credits):
    print(table.shape)


(4770, 7)
(4495, 2)
(4767, 3)


In [50]:
# Full outer join of the three tables on the shared `id` column, so a movie
# missing from keywords or credits is kept with NaN in those columns.
combined = (
    movies
    .merge(keywords, how='outer', on='id')
    .merge(credits, how='outer', on='id')
)

In [51]:
# Count missing values per column after the outer joins.
combined.isna().sum()

id                0
title             0
genres            0
overview          0
adult             0
release_year      0
poster_url        0
keywords        275
cast              3
director          3
dtype: int64

In [52]:
# Drop movies that lack keywords, cast or director, then renumber the rows
# 0..n-1. Without the reset the index labels keep gaps where rows were removed,
# and labels would no longer coincide with row positions -- the similarity
# matrix built from this data is addressed by position.
# Reassigning (instead of inplace=True) also makes the cell safe to re-run.
combined = combined.dropna().reset_index(drop=True)

# Total number of remaining missing values; expected to be 0.
combined.isna().sum().sum()

0

In [53]:
# Preview two rows of the merged, NaN-free frame.
combined.head(2)

Unnamed: 0,id,title,genres,overview,adult,release_year,poster_url,keywords,cast,director
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...",False,2009,https://image.tmdb.org/t/p/w500/kyeqWdyUXW608q...,"[culture clash, future, space war, space colon...","[[Sam Worthington, /mflBcox36s9ZPbsZPVOuhf6axa...","[[James Cameron, /9NAZnTjBQ9WcXAQEzZpKy4vdQto...."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...",False,2007,https://image.tmdb.org/t/p/w500/jGWpG4YhpQwVmj...,"[exotic island, east india company, love of on...","[[Johnny Depp, /yZAoqNhBtlHnK8rC1n99bEzwOMF.jp...","[[Gore Verbinski, /rSQRdmLNAwdKxrtvBSSlBmWeSsj..."


In [54]:
# Row/column count after dropping incomplete movies.
combined.shape


(4495, 10)

In [23]:
# Derive the modelling frame from `combined`: drop the poster URL and shrink
# the numeric/boolean columns to compact integer dtypes.
df = (
    combined
    .drop(columns='poster_url')
    .astype({'id': 'int32', 'adult': 'int8', 'release_year': 'int16'})
)

# Multi-word genres/keywords become single tokens ("Science Fiction" -> "Science_Fiction").
for column in ('genres', 'keywords'):
    df[column] = df[column].apply(lambda labels: [label.replace(' ', '_') for label in labels])

# The overview text is split on whitespace into a list of words.
df['overview'] = df['overview'].apply(str.split)

# Keep only the first element of every cast/director entry (the name, judging
# by the previews above; the second element looks like an image path).
for column in ('cast', 'director'):
    df[column] = df[column].apply(lambda people: [person[0] for person in people])

df.head(2)

Unnamed: 0,id,title,genres,overview,adult,release_year,keywords,cast,director
0,19995,Avatar,"[Action, Adventure, Fantasy, Science_Fiction]","[In, the, 22nd, century,, a, paraplegic, Marin...",0,2009,"[culture_clash, future, space_war, space_colon...","[Sam Worthington, Zoe Saldaña, Michelle Rodrig...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...",0,2007,"[exotic_island, east_india_company, love_of_on...","[Johnny Depp, Keira Knightley, Bill Nighy, Ste...",[Gore Verbinski]


In [24]:
def _join_tokens(tokens):
    """Collapse one movie's tokens into a single lower-cased, space-separated string."""
    # Spaces inside a token (e.g. person names) become underscores so that
    # every token stays a single word.
    return ' '.join(token.replace(' ', '_') for token in tokens).lower()


# Row-wise list concatenation: all token columns end up in one list per movie.
df['tags'] = df['genres'] + df['overview'] + df['keywords'] + df['cast'] + df['director']

# From here on only the identifying columns and the combined tags are kept.
df = df[['id', 'title', 'adult', 'release_year', 'tags']]
df['tags'] = df['tags'].apply(_join_tokens)

df.head()

Unnamed: 0,id,title,adult,release_year,tags
0,19995,Avatar,0,2009,action adventure fantasy science_fiction in th...
1,285,Pirates of the Caribbean: At World's End,0,2007,"adventure fantasy action captain barbossa, lon..."
2,206647,Spectre,0,2015,action adventure thriller a cryptic message fr...
3,49026,The Dark Knight Rises,0,2012,action crime drama thriller following the deat...
4,49529,John Carter,0,2012,action adventure science_fiction john carter i...


### Stemming: e.g. ['creating', 'created', 'creates'] -> ['creat', 'creat', 'creat']

In [14]:
# Install NLTK into the running kernel's environment (provides the PorterStemmer imported below).
%pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [25]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()


def stem(text):
    """Return `text` with every whitespace-separated word replaced by its stem.

    The stemmed words are re-joined with single spaces.
    """
    stemmed_words = map(ps.stem, text.split())
    return ' '.join(stemmed_words)


# Sanity check on the first movie's tags.
stem(df['tags'].iloc[0])

'action adventur fantasi science_fict in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. culture_clash futur space_war space_coloni societi space_travel futurist romanc space alien tribe alien_planet marin soldier battl love_affair natur anti_war power_rel mind_and_soul sam_worthington zoe_saldaña michelle_rodriguez sigourney_weav stephen_lang james_cameron'

In [26]:
# Replace every tag string with its stemmed version.
df['tags'] = df['tags'].map(stem)

### Vectorization (Bag Of Words)

In [17]:
# Install scikit-learn into the running kernel's environment (provides CountVectorizer and cosine_similarity used below).
%pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [27]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the tag strings: English stop words are ignored and
# the vocabulary is capped at 6000 terms.
cv = CountVectorizer(
    stop_words='english',
    max_features=6000,
)

# Convert the fitted count matrix to a dense NumPy array for the cells that follow.
vectors = cv.fit_transform(df['tags']).toarray()
vectors.shape

(4495, 6000)

In [28]:
# Inspect the vocabulary terms that became the feature columns.
cv.get_feature_names_out()

array(['000', '10', '100', ..., 'zoo', 'zooey_deschanel', 'zoë_kravitz'],
      dtype=object)

In [29]:
# Distinct count values present anywhere in the matrix.
np.unique(vectors)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [30]:
# Re-check the matrix dimensions (rows, features).
vectors.shape

(4495, 6000)

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of `vectors`: entry [i, j] compares row i with row j.
similarity = cosine_similarity(vectors)

In [32]:
import pickle

# Persist the similarity matrix. The context manager closes the file handle
# as soon as the dump finishes, so the data is flushed to disk deterministically
# instead of whenever the anonymous file object happens to be collected.
with open('./data/similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

In [33]:
# Preview the frame after the tags were stemmed.
df.head()

Unnamed: 0,id,title,adult,release_year,tags
0,19995,Avatar,0,2009,action adventur fantasi science_fict in the 22...
1,285,Pirates of the Caribbean: At World's End,0,2007,"adventur fantasi action captain barbossa, long..."
2,206647,Spectre,0,2015,action adventur thriller a cryptic messag from...
3,49026,The Dark Knight Rises,0,2012,action crime drama thriller follow the death o...
4,49529,John Carter,0,2012,action adventur science_fict john carter is a ...


In [34]:
def recommend(movie, top_n=6, data=None, scores=None):
    """Print the titles of the movies whose tags are most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in the ``title`` column.
    top_n : int, default 6
        Number of titles to print. The queried movie itself normally comes
        first, because nothing is more similar to it than itself.
    data : pandas.DataFrame, optional
        Frame with a ``title`` column whose row order matches ``scores``.
        Defaults to the notebook-level ``df``.
    scores : array-like of shape (n_rows, n_rows), optional
        Pairwise similarity matrix. Defaults to the notebook-level
        ``similarity``.

    Raises
    ------
    IndexError
        If no row carries the requested title.
    """
    data = df if data is None else data
    scores = similarity if scores is None else scores

    # The similarity matrix is addressed by row *position*, while the frame's
    # index *labels* can have gaps (e.g. after rows were dropped). Resolve the
    # title to a position; using the label would pick the wrong row or run
    # past the end of the matrix.
    positions = np.flatnonzero((data['title'] == movie).to_numpy())
    if positions.size == 0:
        raise IndexError(f'no movie titled {movie!r}')
    movie_pos = positions[0]

    # Stable descending sort: ties keep ascending row order, exactly like
    # sorted(..., reverse=True) on (position, score) pairs.
    ranked = np.argsort(-np.asarray(scores[movie_pos]), kind='stable')[:top_n]

    titles = data['title']
    for pos in ranked:
        print(titles.iloc[pos])


In [39]:
# Example query; double quotes avoid escaping the apostrophe in the title.
recommend("Pirates of the Caribbean: At World's End")

Pirates of the Caribbean: At World's End
Pirates of the Caribbean: Dead Man's Chest
Pirates of the Caribbean: The Curse of the Black Pearl
Pirates of the Caribbean: On Stranger Tides
Cutthroat Island
The Pirates Who Don't Do Anything: A VeggieTales Movie


In [None]:
from dotenv import load_dotenv
from os import environ
import requests

# Presumably loads variables from a local .env file into the process environment (python-dotenv).
load_dotenv()

# Read the API key from the environment; a missing API_KEY raises KeyError
# here rather than at the first request.
api_key = environ['API_KEY']

def get_popularity_index(id):
    """Fetch a movie's ``popularity`` value from the TMDB movie endpoint.

    Parameters
    ----------
    id : int
        Movie id placed in the request path. (The name shadows the builtin
        ``id`` but is kept so existing callers are unaffected.)

    Returns
    -------
    The ``popularity`` field of the JSON response, or ``None`` when the
    response is an HTTP error or carries no such field.
    """
    # HTTPS keeps the API key out of plaintext traffic; passing the query via
    # `params` lets requests handle the URL encoding.
    url = f'https://api.themoviedb.org/3/movie/{id}'
    resp = requests.get(
        url,
        params={'api_key': api_key, 'language': 'en-US'},
        # Without a timeout one stalled connection would block the whole run.
        timeout=10,
    )
    if not resp.ok:
        # Error responses have no popularity; report "unknown" instead of
        # trying to parse whatever body came back.
        return None
    return resp.json().get('popularity')

# Calls get_popularity_index once per row, i.e. one HTTP request per movie, sequentially.
combined['popularity'] = combined['id'].apply(get_popularity_index)


In [37]:
# Persist the merged frame as parquet.
# NOTE(review): the cell that adds `popularity` shows no execution count, so
# confirm whether that column is expected to be present in this file.
combined.to_parquet('./data/combined.parquet')
