# Import the movie dataset

In [93]:
import pandas as pd
data = pd.read_csv('./data/movie_overviews.csv')
data = data.head()

# Pre-process the data

In [94]:
from nltk.corpus import stopwords
from nltk import word_tokenize

def process_overview(string):
    
    if type(string) == str:
        bow = word_tokenize(string)
        bow_lowercase = [t.lower() for t in bow if t.isalpha() and  t not in stopwords.words('english')]
        return ' '.join(bow_lowercase)

data['overview'] = data['overview'].apply(process_overview)
data.head()

Unnamed: 0,id,title,overview,tagline
0,862,Toy Story,led woody andy toys live happily room andy bir...,
1,8844,Jumanji,when siblings judy peter discover enchanted bo...,Roll the dice and unleash the excitement!
2,15602,Grumpier Old Men,a family wedding reignites ancient feud neighb...,Still Yelling. Still Fighting. Still Ready for...
3,31357,Waiting to Exhale,cheated mistreated stepped women holding breat...,Friends are the people who let you be yourself...
4,11862,Father of the Bride Part II,just george banks recovered daughter wedding r...,Just When His World Is Back To Normal... He's ...


# Stop words removal and Ti-idf vectorization

In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(data['overview'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


# Recommendation

In [96]:
indices = pd.Series(d.index, index=data['title']).drop_duplicates()

def get_recommendations(title, cosine_sim, indices):
    # Get index of movie that matches title
    idx = indices[title]
    # Sort the movies based on the similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores for 10 most similar movies
    sim_scores = sim_scores[1:11]
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar movies
    return data['title'].iloc[movie_indices]

print(get_recommendations('Toy Story', cosine_sim, indices))

2               Grumpier Old Men
1                        Jumanji
3              Waiting to Exhale
4    Father of the Bride Part II
Name: title, dtype: object
