# Loading Required Data

In [None]:
import pandas as pd
import numpy as np
import gc

from sklearn.preprocessing import MinMaxScaler
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_columns', 50)

In [None]:
# Load the movie metadata table. low_memory = False asks pandas to parse the
# file in a single pass instead of in chunks.
meta = pd.read_csv("../input/the-movies-dataset/movies_metadata.csv", low_memory = False)

# Preview the first rows; the cell output is the (rows, columns) shape.
display(meta.head())
meta.shape

In [None]:
# Load the keywords table (joined to the metadata on "id" further down).
keywords = pd.read_csv("../input/the-movies-dataset/keywords.csv")

# Preview the first rows; the cell output is the (rows, columns) shape.
display(keywords.head())
keywords.shape

In [None]:
# Load the credits table (joined to the metadata on "id" further down).
credits = pd.read_csv("../input/the-movies-dataset/credits.csv")

# Preview the first rows; the cell output is the (rows, columns) shape.
display(credits.head())
credits.shape

In [None]:
# Give all three tables an integer "id" so they can be merged on it.
# NOTE(review): presumably a few metadata ids are not numeric (the original
# code tolerated them with errors = "ignore"); such rows cannot match an
# integer key anyway, so they are coerced to NaN and dropped here instead of
# leaving a mixed str/int column. errors = "ignore" is also deprecated.
meta["id"] = pd.to_numeric(meta["id"], errors = "coerce")
meta = meta.dropna(subset = ["id"]).astype({"id": "int64"})

keywords["id"] = keywords["id"].astype("int64")
credits["id"] = credits["id"].astype("int64")

In [None]:
# Inner-join metadata with keywords, then with credits, on the shared "id".
df = pd.merge(meta, keywords, on = "id")
df = pd.merge(df, credits, on = "id")
df

# Checking Features, Cleaning, & Reducing Size

In [None]:
df.isnull().sum()

In [None]:
df.adult.value_counts()

We can remove duplicated movies:

In [None]:
df[df.duplicated(subset = ["title", "id"])].shape[0]

In [None]:
df[df.duplicated(subset = ["title", "id"], keep = False)].sort_values("id").head(4)

In [None]:
# Keep the first row of every (title, id) pair.
df = df.drop_duplicates(subset = ["title", "id"])

We can remove rows that have no title.

In [None]:
# Drop rows whose title is missing, then report the remaining shape.
df = df.loc[df["title"].notna()]
df.shape

Instead of using the whole dataset, we keep only the movies that have more than 20 votes.

In [None]:
# Keep only movies with strictly more than 20 votes, then report the shape.
df = df.loc[df["vote_count"] > 20]
df.shape

We can see a huge decrease: by requiring more than 20 votes, we eliminate about 30,000 movies.

In [None]:
# Columns carried forward into feature engineering; everything else is dropped.
columns = [
    "title", "genres", "original_language", "overview", "popularity",
    "release_date", "imdb_id", "runtime", "tagline",
    "vote_average", "vote_count", "keywords", "cast", "crew",
]

df = df.loc[:, columns]

In [None]:
# Discard rows that lack a release date or a runtime.
df = df.dropna(subset = ["release_date", "runtime"])

In [None]:
# Replace the release date with its calendar year: "release_year" is appended
# as the last column and "release_date" is removed.
df = (
    df
    .assign(release_year = pd.to_datetime(df["release_date"]).dt.year)
    .drop(columns = "release_date")
)

In [None]:
# Names of the two bucketed columns created below.
binned_features = ["vote_average_bins", "release_year_bins"]

# vote_average: 10 equal-width buckets; release_year: 10 quantile buckets.
# Both are labelled 1..10.
df = df.assign(
    vote_average_bins = pd.cut(df["vote_average"].astype(float), 10, labels = range(1, 11)),
    release_year_bins = pd.qcut(df["release_year"].astype(float), q = 10, labels = range(1, 11)),
)

These features should be scaled before they are used. If you don't scale them, they will probably end up being the most important features.

In [None]:
# Rescale each bucket label (1..10) to the [0, 1] range, one fresh scaler per column.
for col in binned_features:
    scaler = MinMaxScaler()
    # The categorical labels are cast to int and shaped as a single feature
    # column, which is the 2-D layout the scaler expects.
    df[col] = scaler.fit_transform(df[col].astype(int).to_numpy().reshape(-1, 1))

In [None]:
# Use the movie title as the row label from here on.
df = df.set_index("title")

In [None]:
df.sample(5)

# Languages

In [None]:
# One indicator column per distinct value of "original_language"; rows keep
# the index of df (the movie title).
languages = pd.get_dummies(df["original_language"])

languages

# Genres

In [None]:
# Parse the stringified list of genre dicts, keep only each "name", and store
# the result as the string form of that list (missing values become "[]").
df["genres"] = (
    df["genres"]
    .fillna("[]")
    .apply(literal_eval)
    .apply(lambda parsed: [entry["name"] for entry in parsed] if isinstance(parsed, list) else "")
    .astype(str)
)

In [None]:
# Bag-of-words over the genre strings; case is preserved.
# NOTE(review): the default token pattern splits on non-word characters, so a
# multi-word genre name would become several columns - confirm this is intended.
cv = CountVectorizer(lowercase = False)

genres = cv.fit_transform(df["genres"])

# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement. toarray() yields a plain ndarray rather than np.matrix.
genres_df = pd.DataFrame(genres.toarray(),
                         columns = cv.get_feature_names_out(),
                         index = df.index)

genres_df

# Overview, Tagline, & Keywords

In [None]:
stop_words = stopwords.words('english')

# Built once: a frozenset gives constant-time membership tests, and a single
# stemmer instance is reused instead of constructing one per token.
_STOP_WORD_SET = frozenset(stop_words)
_STEMMER = PorterStemmer()

def tokenizer(text):
    """Split ``text`` into Porter-stemmed word tokens.

    Tokens are produced by ``nltk.word_tokenize`` and kept only if they are
    not in the English stop-word list, are purely alphabetic, and are longer
    than one character. The stop-word comparison is case-sensitive.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Stems of the surviving tokens, in their original order.
    """
    return [
        _STEMMER.stem(token)
        for token in nltk.word_tokenize(text)
        if token not in _STOP_WORD_SET and token.isalpha() and len(token) > 1
    ]

In [None]:
# Parse the stringified list of keyword dicts and keep only each "name".
# The names are joined with spaces instead of storing str(list): the list repr
# would put brackets, quotes and commas into text that is later concatenated
# with the overview and tagline and fed to the word tokenizer.
df["keywords"] = (
    df["keywords"]
    .fillna("[]")
    .apply(literal_eval)
    .apply(lambda x: " ".join(i["name"] for i in x) if isinstance(x, list) else "")
)

In [None]:
# Replace missing text with "" by assigning the result back to the column.
# Calling fillna(inplace = True) on df[col] mutates a temporary object under
# pandas copy-on-write and would leave df unchanged.
df["tagline"] = df["tagline"].fillna("")
df["overview"] = df["overview"].fillna("")
df["keywords"] = df["keywords"].fillna("")

# Join the fields with a space so the last word of one field is not glued to
# the first word of the next (glued tokens are later discarded as non-alphabetic).
df["text"] = df["overview"] + " " + df["tagline"] + " " + df["keywords"]

# With binary = True, use_idf = False and norm = None this produces plain 0/1
# presence indicators for unigrams and bigrams. Terms must appear in at least
# 5 documents and in at most 80% of them.
tfidf = TfidfVectorizer(min_df = 5, max_df = 0.8, tokenizer = tokenizer, ngram_range = (1, 2),
                        binary = True, use_idf = False, norm = None)

tfidf_matrix = tfidf.fit_transform(df["text"])

# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement. toarray() yields a plain ndarray rather than np.matrix.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),
                        columns = tfidf.get_feature_names_out(),
                        index = df.index)

tfidf_df

# Cast

In [None]:
# Parse the stringified list of cast dicts and keep only each member's "name"
# (missing values become an empty list).
df["cast"] = (
    df["cast"]
    .fillna("[]")
    .apply(literal_eval)
    .apply(lambda parsed: [member["name"] for member in parsed] if isinstance(parsed, list) else "")
)

In [None]:
# Remove the spaces inside every name (so a full name is one token) and keep
# at most the first 15 cast members per movie.
df["cast"] = df["cast"].apply(lambda names: [name.replace(" ", "") for name in names][:15])

# "CC" is the string form of that list, used as the vectorizer input.
df["CC"] = df["cast"].astype(str)

In [None]:
# Count cast-name tokens, keeping only those that occur in at least 4 movies.
cv = CountVectorizer(lowercase = False, min_df = 4)

cast = cv.fit_transform(df["CC"])

# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement. toarray() yields a plain ndarray rather than np.matrix.
cast_df = pd.DataFrame(cast.toarray(),
                       columns = cv.get_feature_names_out(),
                       index = df.index)

cast_df

# Directors

In [None]:
def director(x):
    """Return the name of the first crew entry whose job is "Director".

    ``x`` is an iterable of dicts that each carry "job" and "name" keys.
    An empty string is returned when no entry has that job.
    """
    directors_found = (member["name"] for member in x if member["job"] == "Director")
    return next(directors_found, "")

# Parse each stringified crew list and store the first director's name ("" if none).
df["dir"] = df["crew"].apply(literal_eval).apply(director)

In [None]:
# One indicator column per distinct director name in df["dir"].
# NOTE(review): this frame is not part of the concatenation that builds
# `train` further down - confirm whether that omission is intended.
directors = pd.get_dummies(df["dir"])

directors

# Writers

In [None]:
def writer_screenplay(x):
    """Collect the names of crew entries credited with a writing job.

    ``x`` is an iterable of dicts that each carry "job" and "name" keys.
    An entry qualifies when its job is "Writer", "Screenplay" or "Author".
    The names are returned as a list in their original order.
    """
    writing_jobs = ("Writer", "Screenplay", "Author")
    return [member["name"] for member in x if member["job"] in writing_jobs]

# Parse each stringified crew list and store the names of its writing credits.
df["writer_screenplay"] = df["crew"].apply(literal_eval).apply(writer_screenplay)

In [None]:
# Remove the spaces inside every name, keep at most the first 3 names per
# movie, and store the string form of that list as the vectorizer input.
df["writer_screenplay"] = (
    df["writer_screenplay"]
    .apply(lambda names: [name.replace(" ", "") for name in names][:3])
    .astype(str)
)

In [None]:
# Count writer-name tokens, keeping only those that occur in at least 2 movies.
cv = CountVectorizer(lowercase = False, min_df = 2)

writing = cv.fit_transform(df["writer_screenplay"])

# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement. toarray() yields a plain ndarray rather than np.matrix.
writing_df = pd.DataFrame(writing.toarray(),
                          columns = cv.get_feature_names_out(),
                          index = df.index)

writing_df

# Final - Putting All Together

In [None]:
# Drop the names bound to the three raw input frames, then run a garbage
# collection pass.
del meta
del keywords
del credits

gc.collect()

In [None]:
# Place the language, genre, cast, writer and text indicator frames side by
# side, then store every value as an 8-bit integer.
train = pd.concat(
    [languages, genres_df, cast_df, writing_df, tfidf_df],
    axis = "columns",
).astype(np.int8)

In [None]:
train

# Recommendations

In [None]:
# Pairwise cosine similarity between all rows (movies) of `train`.
cosine_sim = cosine_similarity(train)

In [None]:
# Map each title to its row position in `train`.
# Series.drop_duplicates() compares the values, which are a unique range, so
# it removed nothing. The duplicates that matter are repeated titles in the
# index: keep the first position for each so indices[title] is always a scalar.
indices = pd.Series(range(len(train.index)), index = train.index)
indices = indices[~indices.index.duplicated(keep = "first")]

In [None]:
def get_recommendations(title, cosine_sim = cosine_sim, top_n = 15):
    """Return the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title; it is looked up in the module-level ``indices`` Series.
    cosine_sim : 2-D array
        Pairwise similarity matrix. Row ``i`` is read as the similarities of
        the ``i``-th row of ``df`` to every other row.
    top_n : int, default 15
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns "Movies" (title), "Id" (``imdb_id``) and "Similarity",
        ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``title`` is not a label of ``indices``.
    """
    # Row position of the requested movie. A title shared by several movies
    # yields a Series; use its first position instead of failing later.
    idx = indices[title]
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Similarity of every movie to the requested one.
    scores = np.asarray(cosine_sim[idx], dtype = float).ravel()

    # Positions sorted by descending similarity; the stable sort keeps ties in
    # row order.
    order = np.argsort(-scores, kind = "stable")

    # Drop the movie itself by position rather than assuming it sorts first,
    # then keep the requested number of neighbours.
    order = order[order != idx][:top_n]

    picked = df.iloc[order]
    recommendations = pd.DataFrame({"Movies": picked.index.tolist(),
                                    "Id": picked.imdb_id.tolist(),
                                    "Similarity": scores[order].tolist()})
    return recommendations

This recommender system works well on film series because those movies share the same genre, a mostly similar cast, and the same director and writer. Their overviews are probably similar as well.

In [None]:
get_recommendations("Star Wars: Episode I - The Phantom Menace")

Recommendations for **Star Wars Episode I**:

- Other movies in its series, or special episodes related to this movie

- Generally sci-fi movies

In [None]:
get_recommendations("Madagascar")

Recommendations for **Madagascar**:

- Other movies in its series, or special episodes related to this movie

- Generally animation movies

In [None]:
get_recommendations("The Godfather")

In [None]:
get_recommendations("The Departed")

The Departed is a remake of Infernal Affairs. Recommending Infernal Affairs shows us that the system works well.

In [None]:
get_recommendations("Modern Times")

Recommendations for **Modern Times**:

- Chaplin is the lead actor, director, and writer of his movies, so the recommendations are mostly Chaplin's movies.

In [None]:
get_recommendations("The Good, the Bad and the Ugly")

We can see lots of western movies.

In [None]:
get_recommendations("The Dark Knight")

Movies related to Batman and some revenge movies.

In [None]:
get_recommendations("2001: A Space Odyssey")

In [None]:
get_recommendations("Heat")

Generally heist and crime movies.

In [None]:
get_recommendations("Lock, Stock and Two Smoking Barrels")

Snatch is the most similar movie to **Lock, Stock and Two Smoking Barrels**; this would be the most successful recommendation for this movie.