In [None]:
#Connecting to Gdrive
# Mount Google Drive inside the Colab runtime so files stored there can be read.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
#Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from ast import literal_eval
import warnings; 
warnings.simplefilter('ignore')

In [None]:
#Reading the datasets
# The Drive is mounted at the absolute path /content/gdrive, so the data
# directory is addressed absolutely as well; a relative 'gdrive/...' path only
# resolves when the notebook's working directory happens to be /content.
DATA_DIR = '/content/gdrive/My Drive/CS419'

moviemetadata = pd.read_csv(DATA_DIR + '/movies_metadata.csv')
credits = pd.read_csv(DATA_DIR + '/credits.csv')
links = pd.read_csv(DATA_DIR + '/links_small.csv')
keywords = pd.read_csv(DATA_DIR + '/keywords.csv')


In [None]:
#Deciding the useful columns
# Keep only the metadata fields listed below; every other column is dropped.
useful_columns = ["genres", "id", "original_title", "vote_count", "vote_average"]
moviemetadata = moviemetadata[useful_columns]

In [None]:
#Converting the datatype of the ID
# All three frames are merged on "id" further down, so the key must have the
# same integer dtype everywhere.
keywords["id"] = keywords["id"].astype("int")
credits["id"] = credits["id"].astype("int")

# A plain astype("int") on the metadata ids raised ValueError in the recorded
# run of this cell (presumably some rows hold non-numeric ids -- verify against
# the CSV). Coerce unparseable ids to NaN, drop those rows, then cast.
moviemetadata = (
    moviemetadata
    .assign(id=pd.to_numeric(moviemetadata["id"], errors="coerce"))
    .dropna(subset=["id"])
    .astype({"id": "int"})
)


ValueError: ignored

In [None]:
#Merging the useful credits, keywords columns
# Chain both merges so the result carries the credits AND the keywords columns.
# Merging `moviemetadata` a second time (instead of the intermediate result)
# silently discarded everything that came from `credits`.
finalmoviemetadata = (
    moviemetadata
    .merge(credits, on="id")
    .merge(keywords, on="id")
)

# Non-null vote counts as integers; used to derive the minimum-votes cutoff m.
vote_counts = finalmoviemetadata["vote_count"].dropna().astype("int")
# Non-null average ratings, kept as floats: casting them to int would truncate
# every rating (e.g. 7.9 -> 7) and bias the mean C computed from this series.
vote_averages = finalmoviemetadata["vote_average"].dropna().astype("float")


ValueError: ignored

Using IMDB's weighted rating formula

**Weighted Rating** =  $\frac{v}{v+m} \cdot R + \frac{m}{v+m} \cdot C$

v: number of votes for the movie

R: average rating of the movie

C: mean vote across all movies

m: minimum votes required to be listed

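We set $m$ to the 50th percentile (the median) of the vote counts, which means that we keep only the movies with at least as many votes as the median movie; this filters out films that have too few votes for their average rating to be meaningful.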

In [None]:

# m: the vote count at the 50th percentile, used as the minimum-votes cutoff.
m = vote_counts.quantile(q=0.5)
# C: the mean of the per-movie vote averages.
C = vote_averages.mean()

NameError: ignored

In [None]:
#Taking the movies with non null values above the required threshold
# Build each condition separately, then keep the rows that satisfy all three.
has_enough_votes = finalmoviemetadata["vote_count"] >= m
has_vote_count = finalmoviemetadata["vote_count"].notnull()
has_vote_average = finalmoviemetadata["vote_average"].notnull()
finalmoviemetadata = finalmoviemetadata[has_enough_votes & has_vote_count & has_vote_average]

In [None]:

# Vote counts are whole numbers, so they are stored as integers.
finalmoviemetadata["vote_count"] = finalmoviemetadata["vote_count"].astype("int")
# Ratings stay floating point: an int cast would truncate the fractional part
# of every average (e.g. 7.9 -> 7) and lose information for no benefit.
finalmoviemetadata["vote_average"] = finalmoviemetadata["vote_average"].astype("float")

In [None]:
#converting string to object
# These columns hold the text form of Python literals; literal_eval turns each
# cell back into the corresponding Python object.
for column in ("keywords", "genres", "cast"):
    finalmoviemetadata[column] = finalmoviemetadata[column].apply(literal_eval)

#keeping only top 3 cast names per movie
# Slicing already returns the whole list when it has three entries or fewer.
finalmoviemetadata["cast"] = finalmoviemetadata["cast"].apply(lambda members: members[:3])

# A later cell reads finalmoviemetadata["director"], but no cell in this
# notebook creates that column. When it is missing and a "crew" column is
# available, derive it here.
# NOTE(review): assumes "crew" holds stringified lists of dicts with "job" and
# "name" keys -- confirm against credits.csv.
if "director" not in finalmoviemetadata.columns and "crew" in finalmoviemetadata.columns:

    def _director_name(crew_members):
        """Return the name of the first crew member whose job is 'Director', else NaN."""
        for member in crew_members:
            if member.get("job") == "Director":
                return member.get("name")
        return np.nan

    finalmoviemetadata["director"] = (
        finalmoviemetadata["crew"].apply(literal_eval).apply(_director_name)
    )

In [None]:
#Removing blank spaces
# Each column is converted to text, stripped of every space character and
# lower-cased, in the same column order as before.
for column in ["director", "cast", "genres", "keywords"]:
    as_text = finalmoviemetadata[column].astype("str")
    finalmoviemetadata[column] = as_text.str.replace(" ", "", regex=False).str.lower()

In [None]:
#Deciding the final features to work upon
# Concatenate the text columns row by row: genres, then cast, director, keywords.
features = finalmoviemetadata["genres"]
for column in ["cast", "director", "keywords"]:
    features = features + finalmoviemetadata[column]
finalmoviemetadata["features"] = features

In [None]:
# Preview only the first few feature strings instead of echoing the whole column.
finalmoviemetadata["features"].head()

In [None]:
#Making word matrices to compute similarity
# Count unigrams and bigrams of the combined feature text, ignoring English
# stop words. min_df is 1 rather than the integer 0: an integer min_df is an
# absolute document count that recent scikit-learn releases require to be >= 1,
# and since every vocabulary term occurs in at least one document the selected
# vocabulary is the same.
words = CountVectorizer(analyzer = "word", ngram_range = (1, 2), min_df = 1, stop_words = 'english')
word_matrix = words.fit_transform(finalmoviemetadata["features"])

Cosine similarity
$cosine(x,y) = \frac{x \cdot y}{\lVert x \rVert \, \lVert y \rVert}$

In [None]:
#We use cosine similarity for similarity computation
# With a single argument, cosine_similarity compares every row of the matrix
# against every other row of the same matrix.
cos_sim = cosine_similarity(word_matrix)

In [None]:
# Renumber the rows 0..n-1 so that a row's label equals its position, which is
# also its row in cos_sim. drop=True discards the old labels instead of adding
# them as an extra column, which also makes this cell safe to re-run.
finalmoviemetadata = finalmoviemetadata.reset_index(drop=True)

#getting titles and indices of the movies
titles = finalmoviemetadata["original_title"]
# Maps a title to its row position. When several movies share a title, only the
# first one is kept, so that indices[title] is always a single integer rather
# than a Series of positions.
indices = pd.Series(finalmoviemetadata.index, index = titles)
indices = indices[~indices.index.duplicated(keep="first")]

In [None]:
from flask import Flask, request, json
app = Flask(__name__)


def recommendations(title, count=5):
    """Return the titles of the movies most similar to `title`.

    Parameters:
        title: a value present in the index of `indices` (an original title).
        count: how many similar movies to return (default 5).

    Returns:
        A pandas Series with up to `count` titles, most similar first.

    Raises:
        KeyError: if `title` is not present in `indices`.
    """
    idx = indices[title]
    # A duplicated title yields several positions; use the first one.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = cos_sim[idx]
    # sorted() is stable, so equally scored movies keep their row order.
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    # Exclude the queried movie explicitly: another movie with the same score
    # may sort ahead of it, so dropping "the first result" is not reliable.
    movie_indices = [i for i in ranked if i != idx][:count]
    return titles.iloc[movie_indices]


@app.route("/")
def main():
    """Serve recommendations for the `movie` query parameter as a JSON list.

    Responds with 400 when the parameter is missing or empty and with 404 when
    the title is unknown, instead of failing with an unhandled KeyError.
    """
    title = request.args.get('movie')
    if not title:
        return json.dumps({"error": "missing required query parameter 'movie'"}), 400
    if title not in indices.index:
        return json.dumps({"error": "unknown movie title: " + title}), 404

    output = recommendations(title).tolist()
    return json.dumps(output)


if __name__ == "__main__":
    app.run()