In [46]:
import pandas as pd
import numpy as np


In [6]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


In [32]:
df = pd.read_csv('./data/preprocessed_data.csv')
df.head()

Unnamed: 0,name,rating,genre,score,director,writer,star,company,year,country
0,The Shining,R,Drama,8.4,Stanley Kubrick,Stephen King,Jack Nicholson,Warner Bros.,1980,United States
1,The Blue Lagoon,R,Adventure,5.8,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,Columbia Pictures,1980,United States
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,8.7,Irvin Kershner,Leigh Brackett,Mark Hamill,Lucasfilm,1980,United States
3,Airplane!,PG,Comedy,7.7,Jim Abrahams,Jim Abrahams,Robert Hays,Paramount Pictures,1980,United States
4,Caddyshack,R,Comedy,7.3,Harold Ramis,Brian Doyle-Murray,Chevy Chase,Orion Pictures,1980,United States


#### Creating tags

In [33]:
# filling '_' with space ' ' charecters
def fill_(x):
    return [x.replace(" ", "_")]

df["rating"] = df["rating"].apply(fill_)
df["genre"] = df["genre"].apply(fill_)
df["director"] = df["director"].apply(fill_)
df["writer"] = df["writer"].apply(fill_)
df["star"] = df["star"].apply(fill_)
df["company"] = df["company"].apply(fill_)
df["country"] = df["country"].apply(fill_)

df.head()

Unnamed: 0,name,rating,genre,score,director,writer,star,company,year,country
0,The Shining,[R],[Drama],8.4,[Stanley_Kubrick],[Stephen_King],[Jack_Nicholson],[Warner_Bros.],1980,[United_States]
1,The Blue Lagoon,[R],[Adventure],5.8,[Randal_Kleiser],[Henry_De_Vere_Stacpoole],[Brooke_Shields],[Columbia_Pictures],1980,[United_States]
2,Star Wars: Episode V - The Empire Strikes Back,[PG],[Action],8.7,[Irvin_Kershner],[Leigh_Brackett],[Mark_Hamill],[Lucasfilm],1980,[United_States]
3,Airplane!,[PG],[Comedy],7.7,[Jim_Abrahams],[Jim_Abrahams],[Robert_Hays],[Paramount_Pictures],1980,[United_States]
4,Caddyshack,[R],[Comedy],7.3,[Harold_Ramis],[Brian_Doyle-Murray],[Chevy_Chase],[Orion_Pictures],1980,[United_States]


In [36]:
# create tags

df['tags'] = df['rating'] + df['genre'] + df['director'] + df['writer'] + df['star'] + df['company'] + df['country']
df['tags'] = df['tags'].apply(lambda x: ' '.join(x))

df.head()


Unnamed: 0,name,rating,genre,score,director,writer,star,company,year,country,tags
0,The Shining,[R],[Drama],8.4,[Stanley_Kubrick],[Stephen_King],[Jack_Nicholson],[Warner_Bros.],1980,[United_States],R Drama Stanley_Kubrick Stephen_King Jack_Nich...
1,The Blue Lagoon,[R],[Adventure],5.8,[Randal_Kleiser],[Henry_De_Vere_Stacpoole],[Brooke_Shields],[Columbia_Pictures],1980,[United_States],R Adventure Randal_Kleiser Henry_De_Vere_Stacp...
2,Star Wars: Episode V - The Empire Strikes Back,[PG],[Action],8.7,[Irvin_Kershner],[Leigh_Brackett],[Mark_Hamill],[Lucasfilm],1980,[United_States],PG Action Irvin_Kershner Leigh_Brackett Mark_H...
3,Airplane!,[PG],[Comedy],7.7,[Jim_Abrahams],[Jim_Abrahams],[Robert_Hays],[Paramount_Pictures],1980,[United_States],PG Comedy Jim_Abrahams Jim_Abrahams Robert_Hay...
4,Caddyshack,[R],[Comedy],7.3,[Harold_Ramis],[Brian_Doyle-Murray],[Chevy_Chase],[Orion_Pictures],1980,[United_States],R Comedy Harold_Ramis Brian_Doyle-Murray Chevy...


In [37]:
df_new = df[['name', 'score', 'year', 'tags']]
df_new.head()


Unnamed: 0,name,score,year,tags
0,The Shining,8.4,1980,R Drama Stanley_Kubrick Stephen_King Jack_Nich...
1,The Blue Lagoon,5.8,1980,R Adventure Randal_Kleiser Henry_De_Vere_Stacp...
2,Star Wars: Episode V - The Empire Strikes Back,8.7,1980,PG Action Irvin_Kershner Leigh_Brackett Mark_H...
3,Airplane!,7.7,1980,PG Comedy Jim_Abrahams Jim_Abrahams Robert_Hay...
4,Caddyshack,7.3,1980,R Comedy Harold_Ramis Brian_Doyle-Murray Chevy...


#### Vectorization (Bag Of Words)

In [126]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')

vectors_bow = cv.fit_transform(df['tags']).toarray()
vectors_bow.shape


(3827, 6517)

In [127]:
cv.get_feature_names_out()

array(['13', '1492_pictures', '17', ..., 'émilie_dequenne', 'éric_rohmer',
       'éva_gárdos'], dtype=object)

In [128]:
np.unique(vectors_bow)

array([0, 1, 2, 3])

In [129]:
vectors_bow = np.hstack((vectors_bow, df[['score', 'year']].to_numpy()))


In [130]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_bow = cosine_similarity(vectors_bow)


In [131]:
def recommend(movie, similarity):
    movie_index = df[df['name'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[:6]

    for m in movies_list:
        print(df.iloc[m[0]]['name'])


In [133]:
recommend('The Avengers', similarity_bow)

The Avengers
Batman & Robin
Battlefield Earth
Dungeons & Dragons
Speed 2: Cruise Control
Double Dragon


In [134]:
df.sample(5)


Unnamed: 0,name,rating,genre,score,director,writer,star,company,year,country,tags
2230,Matinee,[PG],[Comedy],6.9,[Joe_Dante],[Jerico_Stone],[John_Goodman],[Universal_Pictures],1993,[United_States],PG Comedy Joe_Dante Jerico_Stone John_Goodman ...
2123,The Public Eye,[R],[Crime],6.5,[Howard_Franklin],[Howard_Franklin],[Joe_Pesci],[Universal_Pictures],1992,[United_States],R Crime Howard_Franklin Howard_Franklin Joe_Pe...
2429,Chasers,[R],[Action],5.1,[Dennis_Hopper],[Joe_Batteer],[Tom_Berenger],[Morgan_Creek_Entertainment],1994,[United_States],R Action Dennis_Hopper Joe_Batteer Tom_Berenge...
65,Fatso,[PG],[Comedy],6.3,[Anne_Bancroft],[Anne_Bancroft],[Dom_DeLuise],[Brooksfilms],1980,[United_States],PG Comedy Anne_Bancroft Anne_Bancroft Dom_DeLu...
3128,Wild Things,[R],[Crime],6.5,[John_McNaughton],[Stephen_Peters],[Kevin_Bacon],[Mandalay_Entertainment],1998,[United_States],R Crime John_McNaughton Stephen_Peters Kevin_B...


#### Try with TF-IDF

In [137]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words="english")

vectors_tfidf = tfidf.fit_transform(df['tags']).toarray()
vectors_tfidf = np.hstack((vectors_tfidf, df[['score', 'year']].to_numpy()))
vectors_tfidf.shape


(3827, 6519)

In [138]:
similarity_tfidf = cosine_similarity(vectors_tfidf)

In [139]:
recommend('The Avengers', similarity_tfidf)

The Avengers
Batman & Robin
Dungeons & Dragons
Speed 2: Cruise Control
Double Dragon
Mortal Kombat: Annihilation
