# Movie Recommendation Using Clustering and Content-Based Filtering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the movie dataset; the first CSV column is used as the DataFrame index.
# NOTE(review): later cells mix positional indexing (rows of the similarity
# matrix) with integer label lookups on this index, which presumes the index
# is 0..n-1 - confirm against the CSV.
df = pd.read_csv('IMDb_Movie_Dataset.csv', index_col=0)
# Preview the first rows.
df.head()

## K-Means Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from datetime import date
import tensorflow as tf
import random

In [None]:
# get columns with numerical value
# The columns at positions 0, 3, 5 and 6 are dropped by position, not by name,
# so this depends on the column order of the CSV.
# NOTE(review): presumably these are the text columns (Title, Genre, Director,
# Stars) - confirm against the CSV header.
data_num = df.drop(df.columns[[0, 3, 5, 6]], axis=1)
data_num.head()

In [None]:
# convert release year into movie's age
current_year = date.today().year
# Subtract on the column directly: the result keeps data_num's own index, so
# every age stays on its own row. (Building a separate DataFrame with a fresh
# 0..n-1 index and assigning it back relies on index alignment and produces
# NaN / shifted values whenever data_num's index is not 0..n-1.)
data_num['Year'] = current_year - data_num['Year']
data_num.head()

In [None]:
# Rescale every column of data_num with a MinMaxScaler (default settings);
# the scaler is fitted and applied in one step and the result is what the
# K-Means cells below cluster on.
scaler = MinMaxScaler()
features = scaler.fit_transform(data_num)

In [None]:
# Fit one K-Means model for each candidate cluster count (1 through 20) and
# record its inertia for the elbow plot.
cluster_counts = range(1, 21)
inertia = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=0).fit(features)
    inertia.append(kmeans.inertia_)

In [None]:
#Visualizing the ELBOW method to get the optimal value of K
# Inertia is plotted against the number of clusters (1 through 20).
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(range(1, 21), inertia)
ax.set_title('The Elbow Method')
ax.set_xlabel('number of clusters')
ax.set_ylabel('inertia')
plt.show()

In [None]:
# Seed TensorFlow's global random number generator.
# NOTE(review): the clustering in this notebook is done with scikit-learn's
# KMeans, whose centroid initialisation is controlled by its own
# `random_state` argument; this TensorFlow seed presumably has no effect on
# the resulting clusters - confirm before relying on it or removing it.
tf.compat.v1.random.set_random_seed(1234)

In [None]:
# K-Means clustering
# Final model: 10 clusters, k-means++ seeding, 10 initialisations, fixed seed.
kmeans_params = {
    'n_clusters': 10,
    'init': "k-means++",
    'n_init': 10,
    'tol': 1e-04,
    'random_state': 42,
}
kmeans = KMeans(**kmeans_params)
kmeans.fit(features)

# Store each movie's cluster id next to the original movie data.
df['Label'] = kmeans.labels_
df.head()

In [None]:
# Bar chart of how many movies were assigned to each cluster label.
sns.countplot(x='Label', data=df)

## Content-Based Filtering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# concatenate Title, Genre, Director, & Stars columns
# One space-separated string per movie, with commas stripped, in row order.
# The columns are iterated positionally (zip) instead of being looked up with
# a 0..n-1 counter, so this works whatever index df carries and the strings
# line up with the rows of the similarity matrix built from them.
data_string = [
    ' '.join(str(value) for value in row).replace(',', '')
    for row in zip(df['Title'], df['Genre'], df['Director'], df['Stars'])
]

In [None]:
# calculate text similarity
# Turn each movie's concatenated text into a TF-IDF vector (default settings).
tfidf_vectorizer = TfidfVectorizer()
matrix = tfidf_vectorizer.fit_transform(data_string)
# Pairwise dot products between all movie vectors: kernel[a][b] is the
# similarity score of movie a and movie b (rows follow data_string order).
# NOTE(review): with TfidfVectorizer's default L2 normalisation this dot
# product should equal the cosine similarity - confirm if the defaults change.
kernel = linear_kernel(matrix, matrix)

## Get Recommendation

In [None]:
def get_recommendation(movie_id):
    """Return up to ten movies most similar to ``movie_id`` from its own cluster.

    Candidates are ranked by their score in row ``movie_id`` of the global
    ``kernel`` similarity matrix (highest first, equal scores in ascending
    index order) and kept only when their ``df['Label']`` equals the label of
    ``movie_id``.

    Args:
        movie_id: Row of the movie in ``kernel``; the same value is used as
            the index label when reading ``df['Label']``.
            NOTE(review): this presumes ``df`` carries a 0..n-1 index -
            confirm against the loaded dataset.

    Returns:
        A list of at most ten movie indices, best match first. The queried
        movie itself is never part of the result.
    """
    labels = df['Label']
    movie_cluster = labels[movie_id]

    # sorted() is stable even with reverse=True, so ties keep index order.
    ranked = sorted(enumerate(kernel[movie_id]), key=lambda pair: pair[1], reverse=True)

    movie_rec = []
    for candidate, _score in ranked:
        # Skip the queried movie explicitly. It is not guaranteed to be ranked
        # first: a movie with identical text and a lower index ties with it
        # and sorts ahead, so dropping "the first hit" would drop the wrong
        # movie and recommend the queried movie to itself.
        if candidate == movie_id:
            continue
        if labels[candidate] == movie_cluster:
            movie_rec.append(candidate)
            if len(movie_rec) == 10:
                break

    return movie_rec

Test movie recommendation

In [None]:
# Look up the first row whose title matches and fetch its recommendations.
movie = 'Iron Man'
title_matches = df.index[df['Title'] == movie]
movie_index = title_matches.values[0]
movie_rec = get_recommendation(movie_index)

print("Movie recommendation:")
# One output row per recommended movie: its title, release year and cluster.
rec = pd.DataFrame(
    [[df[column][idx] for column in ('Title', 'Year', 'Label')] for idx in movie_rec],
    columns=['Title', 'Year', 'Label'],
)
rec

## Preprocess Movie Recommendation Database

In [None]:
# save recommendation on every movie
# Row i of the result lists up to 11 movies that share movie i's cluster,
# ordered by their similarity to movie i (best match first).
# NOTE(review): get_recommendation() returns at most 10 movies while this
# table keeps 11; the limit is left at 11 so the exported CSV keeps its
# current number of columns - confirm which count is intended.
cluster_labels = df['Label'].to_numpy()  # positional, like the rows of kernel

movie_rec = []
for i, scores in enumerate(kernel):
    # Stable descending order: equal scores keep ascending index order, the
    # same ranking sorted(..., reverse=True) gives.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the movie itself explicitly instead of assuming it is ranked first
    # (a movie with identical text and a lower index would sort ahead of it),
    # then keep only candidates from the same cluster.
    keep = (ranked != i) & (cluster_labels[ranked] == cluster_labels[i])
    movie_rec.append(ranked[keep][:11].tolist())

# convert to pandas dataframe
# Movies with fewer than 11 matches give shorter rows, which the DataFrame
# constructor pads with NaN.
movie_rec = pd.DataFrame(movie_rec)
movie_rec.head()

In [None]:
# save dataset to csv
# The row index is not written (index=False), so a movie's position in the
# file is the only link back to its row in the recommendation table.
movie_rec.to_csv('movie_recommendation_dataset.csv', index=False)