This is a movie recommendation system using Content Based Recommendation

In [1]:
import pandas as pd
import numpy as np

Add an indicies column

In [2]:
dataset = pd.read_csv("Movies.csv", usecols=["title", "runtime", "adult", "genres","production_companies","spoken_languages",
         "keywords"])

In [3]:
dataset['title'] = dataset['title'].apply(lambda x: x.strip().lower() if isinstance(x, str) else x)

In [4]:
dataset = dataset.drop_duplicates(subset='title', keep='first')

In [5]:
dataset = dataset.reset_index(drop=True)

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
le = LabelEncoder()

In [8]:
dataset['adult'] = le.fit_transform(dataset["adult"])

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
def commma_tokenizer(text):
  return text.split(", ")

comma_tokenizer is used so that phrases are split up, rather than individual words

In [11]:
genres_vectorizer = CountVectorizer(tokenizer=commma_tokenizer)

In [12]:
dataset['genres'] = dataset['genres'].replace(np.nan, "UKNOWN")

In [13]:
genres = [x for x in dataset["genres"]]

In [14]:
gv = genres_vectorizer.fit_transform(genres)

In [15]:
dataset['genres'] = list(gv.toarray())

Creates every genre into a token

In [16]:
production_companies_vectorizer = CountVectorizer(tokenizer=commma_tokenizer, min_df = 1000)

In [17]:
dataset['production_companies'] = dataset['production_companies'].replace(np.nan, "UKNOWN")

In [18]:
production_companies = [x for x in dataset["production_companies"]]

In [19]:
pcv = production_companies_vectorizer.fit_transform(production_companies)

In [20]:
dataset['production_companies'] = list(pcv.toarray())

Creates every production company into a token

In [21]:
languages_spoken_vectorizer = CountVectorizer(tokenizer=commma_tokenizer, min_df=1000)

In [22]:
dataset['spoken_languages'] = dataset['spoken_languages'].replace(np.nan, "UKNOWN")

In [23]:
languages_spoken = [x for x in dataset["spoken_languages"]]

In [24]:
lsv = languages_spoken_vectorizer.fit_transform(languages_spoken)

In [25]:
dataset['spoken_languages'] = list(lsv.toarray())

Creates every language that appears at least 1000 times into a token

In [26]:
keywords_vectorizer = CountVectorizer(tokenizer=commma_tokenizer, min_df = 300) 


In [27]:
dataset['keywords'] = dataset['keywords'].replace(np.nan, "UKNOWN")

In [28]:
keywords = [x for x in dataset["keywords"]]

In [29]:
kv = keywords_vectorizer.fit_transform(keywords)

In [30]:
dataset['keywords'] = list(kv.toarray())

In [31]:
titles = dataset['title']

In [32]:
dataset = dataset.iloc[:,1:].values

Creates every keyword into a token

In [33]:
data = []

In [34]:
for row in dataset:
    flat_list = None
    for i in range(len(row)):
        flat_list = row.tolist()

        if(type(row[i]) == np.ndarray):

            for item in row[i]:
                flat_list.append(item)

    del flat_list[2:6]
    data.append(flat_list)


In [35]:
final_dataset = np.array(data)

In [36]:
from sklearn.neighbors import NearestNeighbors

In [37]:
neighbors = NearestNeighbors(metric = "cosine")

In [38]:
neighbors.fit(final_dataset)

This will fit the knn model according to the values

In [39]:
indices = neighbors.kneighbors(final_dataset[2].reshape(1,-1), n_neighbors=4, return_distance = False)

Ignoring the distances output, we will take the indices output and match it to index in the dataset and output

In [40]:
print(f"Movies that are similar to {titles[indices[0][0]]} that we think you make like: {titles[indices[0][1]]}, {titles[indices[0][2]]}, and {titles[indices[0][3]]}")

Movies that are similar to the dark knight that we think you make like: man of steel, batman & robin, and atom man vs. superman


Using joblib to export the knn model, the final dataset, and the titles dataset, to use in the Gradio Application

In [41]:
import joblib

In [42]:
joblib.dump(neighbors, "neighbors.pkl")

['neighbors.pkl']

In [43]:
joblib.dump(final_dataset, "final_dataset")

['final_dataset']

In [44]:
joblib.dump(titles, "titles")

['titles']