In [1]:
import pandas as pd
import numpy as np


In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


In [4]:
df = pd.read_csv('../data/preprocessed_data.csv')
df.head()

Unnamed: 0,name,rating,genre,score,director,writer,star,company,year,country
0,The Shining,R,Drama,8.4,Stanley Kubrick,Stephen King,Jack Nicholson,Warner Bros.,1980,United States
1,The Blue Lagoon,R,Adventure,5.8,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,Columbia Pictures,1980,United States
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,8.7,Irvin Kershner,Leigh Brackett,Mark Hamill,Lucasfilm,1980,United States
3,Airplane!,PG,Comedy,7.7,Jim Abrahams,Jim Abrahams,Robert Hays,Paramount Pictures,1980,United States
4,Caddyshack,R,Comedy,7.3,Harold Ramis,Brian Doyle-Murray,Chevy Chase,Orion Pictures,1980,United States


### Creating tags

In [5]:
df['year'].unique()

array([1980, 1981, 1984, 1983, 1982, 1987, 1985, 1986, 1988, 1989, 1992,
       1990, 1991, 1993, 1999, 2016, 1997, 1994, 1995, 1996, 2001, 2000,
       1998, 2014, 2002, 2003, 2005])

In [6]:
def update_year(year):
    if (year >= 1980) and (year < 1990):
        return '80s'
    elif (year >= 1990) and (year < 2000):
        return '90s'
    elif (year >= 2000) and (year < 2010):
        return '2000s'
    elif (year >= 2010) and (year < 2020):
        return '2010s'

df['year'] = df['year'].apply(update_year)

In [7]:
# filling '_' with space ' ' charecters
def fill_(x):
    return [x.replace(" ", "_")]

df["rating"] = df["rating"].apply(fill_)
df["genre"] = df["genre"].apply(fill_)
df["director"] = df["director"].apply(fill_)
df["writer"] = df["writer"].apply(fill_)
df["star"] = df["star"].apply(fill_)
df["company"] = df["company"].apply(fill_)
df["country"] = df["country"].apply(fill_)
df["year"] = df["year"].apply(fill_)

df.head()

Unnamed: 0,name,rating,genre,score,director,writer,star,company,year,country
0,The Shining,[R],[Drama],8.4,[Stanley_Kubrick],[Stephen_King],[Jack_Nicholson],[Warner_Bros.],[80s],[United_States]
1,The Blue Lagoon,[R],[Adventure],5.8,[Randal_Kleiser],[Henry_De_Vere_Stacpoole],[Brooke_Shields],[Columbia_Pictures],[80s],[United_States]
2,Star Wars: Episode V - The Empire Strikes Back,[PG],[Action],8.7,[Irvin_Kershner],[Leigh_Brackett],[Mark_Hamill],[Lucasfilm],[80s],[United_States]
3,Airplane!,[PG],[Comedy],7.7,[Jim_Abrahams],[Jim_Abrahams],[Robert_Hays],[Paramount_Pictures],[80s],[United_States]
4,Caddyshack,[R],[Comedy],7.3,[Harold_Ramis],[Brian_Doyle-Murray],[Chevy_Chase],[Orion_Pictures],[80s],[United_States]


In [8]:
# create tags

df['tags'] = df['rating'] + df['genre'] + df['director'] + df['writer'] + df['star'] + df['company'] + df['country'] + df['year']
df['tags'] = df['tags'].apply(lambda x: ' '.join(x))

df.head()


Unnamed: 0,name,rating,genre,score,director,writer,star,company,year,country,tags
0,The Shining,[R],[Drama],8.4,[Stanley_Kubrick],[Stephen_King],[Jack_Nicholson],[Warner_Bros.],[80s],[United_States],R Drama Stanley_Kubrick Stephen_King Jack_Nich...
1,The Blue Lagoon,[R],[Adventure],5.8,[Randal_Kleiser],[Henry_De_Vere_Stacpoole],[Brooke_Shields],[Columbia_Pictures],[80s],[United_States],R Adventure Randal_Kleiser Henry_De_Vere_Stacp...
2,Star Wars: Episode V - The Empire Strikes Back,[PG],[Action],8.7,[Irvin_Kershner],[Leigh_Brackett],[Mark_Hamill],[Lucasfilm],[80s],[United_States],PG Action Irvin_Kershner Leigh_Brackett Mark_H...
3,Airplane!,[PG],[Comedy],7.7,[Jim_Abrahams],[Jim_Abrahams],[Robert_Hays],[Paramount_Pictures],[80s],[United_States],PG Comedy Jim_Abrahams Jim_Abrahams Robert_Hay...
4,Caddyshack,[R],[Comedy],7.3,[Harold_Ramis],[Brian_Doyle-Murray],[Chevy_Chase],[Orion_Pictures],[80s],[United_States],R Comedy Harold_Ramis Brian_Doyle-Murray Chevy...


In [9]:
df_new = df[['name', 'score', 'tags']]
df_new.head()


Unnamed: 0,name,score,tags
0,The Shining,8.4,R Drama Stanley_Kubrick Stephen_King Jack_Nich...
1,The Blue Lagoon,5.8,R Adventure Randal_Kleiser Henry_De_Vere_Stacp...
2,Star Wars: Episode V - The Empire Strikes Back,8.7,PG Action Irvin_Kershner Leigh_Brackett Mark_H...
3,Airplane!,7.7,PG Comedy Jim_Abrahams Jim_Abrahams Robert_Hay...
4,Caddyshack,7.3,R Comedy Harold_Ramis Brian_Doyle-Murray Chevy...


#### Vectorization (Bag Of Words)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')

vectors_bow = cv.fit_transform(df['tags']).toarray()
vectors_bow.shape


(3827, 6521)

In [11]:
cv.get_feature_names_out()

array(['13', '1492_pictures', '17', ..., 'émilie_dequenne', 'éric_rohmer',
       'éva_gárdos'], dtype=object)

In [12]:
np.unique(vectors_bow)

array([0, 1, 2, 3])

In [13]:
vectors_bow = np.hstack((vectors_bow, df[['score']].to_numpy()))


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_bow = cosine_similarity(vectors_bow)


In [15]:
def recommend(movie, similarity):
    movie_index = df[df['name'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[:6]

    for m in movies_list:
        print(df.iloc[m[0]]['name'])


In [16]:
recommend('The Avengers', similarity_bow)

The Avengers
If Looks Could Kill
Batman Forever
Sphere
Twister
U.S. Marshals


In [17]:
df.sample(5)


Unnamed: 0,name,rating,genre,score,director,writer,star,company,year,country,tags
2951,Scream 2,[R],[Horror],6.2,[Wes_Craven],[Kevin_Williamson],[Neve_Campbell],[Dimension_Films],[90s],[United_States],R Horror Wes_Craven Kevin_Williamson Neve_Camp...
106,Friday the 13th Part 2,[R],[Horror],6.1,[Steve_Miner],[Ron_Kurz],[Betsy_Palmer],[Georgetown_Productions_Inc.],[80s],[United_States],R Horror Steve_Miner Ron_Kurz Betsy_Palmer Geo...
1513,Speed Zone,[PG],[Action],4.5,[Jim_Drake],[Michael_Short],[Peter_Boyle],[Canadian_International_Studios_VIII],[80s],[United_States],PG Action Jim_Drake Michael_Short Peter_Boyle ...
115,Absence of Malice,[PG],[Drama],6.9,[Sydney_Pollack],[Kurt_Luedtke],[Paul_Newman],[Columbia_Pictures],[80s],[United_States],PG Drama Sydney_Pollack Kurt_Luedtke Paul_Newm...
2152,This Is My Life,[PG-13],[Drama],6.1,[Nora_Ephron],[Meg_Wolitzer],[Julie_Kavner],[Twentieth_Century_Fox],[90s],[United_States],PG-13 Drama Nora_Ephron Meg_Wolitzer Julie_Kav...


#### Try with TF-IDF

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words="english")

vectors_tfidf = tfidf.fit_transform(df['tags']).toarray()
vectors_tfidf = np.hstack((vectors_tfidf, df[['score']].to_numpy()))
vectors_tfidf.shape


(3827, 6522)

In [19]:
similarity_tfidf = cosine_similarity(vectors_tfidf)

In [20]:
recommend('The Avengers', similarity_tfidf)

The Avengers
National Lampoon's Christmas Vacation
Benny & Joon
Tall Tale
Diabolique
Quiz Show


### Saving the recommendation assets in parquet format

In [None]:
!pip install fastparquet
# or
# !pip install pyarrow


In [26]:
df[['name']].to_parquet('../data/df_name.parquet')

In [37]:
pd.DataFrame(similarity_bow, columns=[str(i) for i in range(similarity_bow.shape[1])]).to_parquet('../data/df_similarity_bow.parquet')
