In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# CountVectorizer - processes text features and converts them into vectors of token counts
from sklearn.feature_extraction.text import CountVectorizer

# cosine_similarity - Measures similarity between two vectors from 0-1 using the angle between them
from sklearn.metrics.pairwise import cosine_similarity

def get_title_from_index(index):
  """Look up the title stored under a given row label.

  Reads the notebook-level DataFrame ``df``, which must be loaded before
  this function is called.

  Args:
    index: Row label in ``df``.

  Returns:
    The value of the ``original_title`` column at that row.
  """
  titles = df["original_title"]
  return titles.loc[index]

def get_index_from_title(original_title):
  """Return the row label of the first movie whose title matches exactly.

  Reads the notebook-level DataFrame ``df``, which must be loaded before
  this function is called. The comparison against ``original_title`` is an
  exact, case-sensitive equality test.

  Args:
    original_title: Title to search for in the ``original_title`` column.

  Returns:
    The index label of the first matching row.

  Raises:
    IndexError: If no row carries that title. The exception type is the
      one the bare ``.index[0]`` lookup raised before; the message now
      names the title that could not be found.
  """
  matches = df.loc[df.original_title == original_title].index
  if len(matches) == 0:
    # Fail with an actionable message instead of pandas' generic
    # "index 0 is out of bounds" error.
    raise IndexError(f"No movie titled {original_title!r} found in the dataset")
  return matches[0]

In [None]:
# Import Dataset
# The CSV path is relative to the notebook's working directory.
df = pd.read_csv("../input/imdb-v2/IMDB Movies 2000 - 2020.csv")

# Remove spaces from the column names (later cells access columns as
# attributes, e.g. df.director, which requires space-free names)
df.columns = df.columns.str.replace(' ', '')

# Check which columns the dataset provides
print(df.columns)

# Last expression of the cell: displays the (rows, columns) shape
df.shape

In [None]:
# Feature columns to work with; each of these has its missing values
# filled in before the text features are combined.
features = [
    'actors',
    'genre',
    'language_1',
    'director',
    'language_2',
    'actors_f2',
    'actors_1',
    'desc35',
    'country',
]

In [None]:
def combine_features(row):
    """Join the selected feature values of one row into a single string.

    Only string values are allowed. ``actors_f2`` and ``genre`` are listed
    twice on purpose, to put more emphasis on them than on the other
    features.

    Args:
        row: Mapping-like row (for example a DataFrame row) that holds the
            feature values under their column names.

    Returns:
        The feature strings separated by single spaces.
    """
    ordered_columns = (
        'actors_f2', 'genre', 'language_1', 'director', 'country', 'desc35',
        # repeated for extra weight
        'actors_f2', 'genre',
    )
    return " ".join(row[column] for column in ordered_columns)

## Check and print error if need
#"+row['actors'] +" "+row['description']
#  except:
#    print("Error", row)

In [None]:
# Replace missing values in every selected feature column with a single space
df[features] = df[features].fillna(' ')

# Remove spaces between names (presumably so that each full name is
# treated as one token when the text is vectorized)
df["director"] = df["director"].str.replace(' ', '')
df["actors_f2"] = df["actors_f2"].str.replace(' ', '')

# Double check feature transform
print(df["actors_f2"])
print(df["director"])

In [None]:
# Combine chosen features into 1 column
# combine_features is called once per row (axis=1); the string it returns
# becomes that row's value in the new column.
df["combined_features"] = df.apply(combine_features,axis=1)
# Preview the first rows of the combined text
print("Combined Features:", df["combined_features"].head())

In [None]:
# Define CountVectorizer
cv = CountVectorizer()

# Learn the vocabulary from the combined text and build the token-count
# matrix, one row per DataFrame row. Nothing is printed in this cell.
count_matrix = cv.fit_transform(df["combined_features"])

In [None]:
# Pairwise cosine similarity between all rows of count_matrix:
# cosine_sim[a][b] is the similarity between row a and row b.
cosine_sim = cosine_similarity(count_matrix)

In [None]:
# Input movie to match with (e.g. the movie the user just watched)
movie_user_likes = "Jurassic World" 

movie_index = get_index_from_title(movie_user_likes)
# Compile similar movies based on cosine similarity:
# a list of (row position, similarity to the chosen movie) pairs.
# NOTE(review): movie_index is a DataFrame index label but is used here as a
# positional row of cosine_sim - assumes the default 0..n-1 index; confirm.
similar_movies = list(enumerate(cosine_sim[movie_index]))

In [None]:
# Rank the (index, score) pairs by similarity score, best match first.
# Work on a copy so similar_movies keeps its original order.
sorted_similar_movies = list(similar_movies)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
# Show recommendations based on movie input
# Number of recommendations to show (change as desired)
n_recommendations = 15

i = 0
for candidate_index, _score in sorted_similar_movies:
  # The chosen movie appears in its own ranking (normally at the top);
  # skip it so that only other titles are recommended.
  if candidate_index == movie_index:
    continue
  print(get_title_from_index(candidate_index))
  i = i + 1
  # Stop after exactly n_recommendations titles; the previous `i>15`
  # check let 16 rows through.
  if i >= n_recommendations:
    break