In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    paths = (os.path.join(folder, name) for name in names)
    for path in paths:
        print(path)

In [None]:
# Read the raw movie metadata CSV and preview its first rows.
metadata_csv = '../input/movie-metadatacsv/movie_metadata.csv'
data = pd.read_csv(metadata_csv)
data.head()

In [None]:
# Keep only the columns used in the rest of the notebook, then count the
# missing values left in each of them.
useful_columns = [
    'movie_title', 'title_year', 'genres', 'language', 'country', 'color',
    'director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name',
]
data = data[useful_columns]
data.isna().sum()

In [None]:
# Pass 1: drop rows that are identical in every retained column.
data_no_duplicates = data.drop_duplicates()
print((data.shape[0] - data_no_duplicates.shape[0]),' Perfect duplicates deleted.')
data = data_no_duplicates

# Pass 2: drop rows that repeat an already-seen (movie_title, title_year) pair.
# The index is renumbered 0..n-1 afterwards: the similarity matrices built
# later are addressed by row position, so index labels must match positions
# once rows have been removed.
data_no_duplicates = (
    data.drop_duplicates(subset=['movie_title', 'title_year'])
        .reset_index(drop=True)
)
print((data.shape[0] - data_no_duplicates.shape[0]),' duplicates by names')
data = data_no_duplicates

In [None]:
# Trim leading/trailing whitespace from every title.
# `.assign` returns a new frame instead of writing a column into the
# de-duplicated result in place, so pandas has no reason to warn about setting
# values on a derived frame and re-running the cell is harmless.
data = data.assign(movie_title=data['movie_title'].str.strip())

In [None]:

# Default used for each column that may contain missing values:
# the rounded mean year for title_year, fixed labels for language / country /
# color, and an empty string for the people columns.
fill_values = {
    'title_year': round(data['title_year'].mean()),
    'language': 'English',
    'country': 'USA',
    'color': 'Color',
    'director_name': '',
    'actor_1_name': '',
    'actor_2_name': '',
    'actor_3_name': '',
}

# One DataFrame-level fillna, reassigned. Calling
# `data[col].fillna(..., inplace=True)` mutates a temporary column selection,
# which is not guaranteed to write back into `data` (it does not under
# pandas copy-on-write).
data = data.fillna(value=fill_values)

data.head()

In [None]:
# Copy the text columns (every retained column except title_year) into their
# own frame.
word_columns = [
    'movie_title', 'genres', 'language', 'country', 'color',
    'director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name',
]
data_words = data[word_columns]

def replace_space(val):
    """Replace spaces with underscores and remove periods in a string Series.

    Parameters
    ----------
    val : pandas.Series
        Series of strings (one row or one column of text fields). Missing
        values are left as they are.

    Returns
    -------
    pandas.Series
        Series of the same length with ' ' replaced by '_' and every '.'
        removed.
    """
    # regex=False: both patterns are plain characters. '.' is a regex
    # metacharacter, so the literal intent is stated explicitly instead of
    # relying on the pandas default for `regex`, which differs between versions.
    return (
        val.str.replace(' ', '_', regex=False)
           .str.replace('.', '', regex=False)
    )

# replace_space only needs a Series of strings, so run it once per column
# (the default axis) rather than once per row: the resulting frame is the same
# but the function is called a handful of times instead of once for every movie.
data_words = data_words.apply(replace_space)
data_words.head()

In [None]:
# Build one space-separated "words" string per movie from the cleaned text
# columns.

# Genres are split on '|'; regex=False states that '|' is a literal character
# (it is the alternation operator in a regular expression).
genre_words = data_words['genres'].str.replace('|', ' ', regex=False)

other_fields = [
    genre_words,
    data_words['language'],
    data_words['country'],
    data_words['color'],
    data_words['director_name'],
    data_words['actor_1_name'],
    data_words['actor_2_name'],
    data_words['actor_3_name'],
]

# na_rep='' makes a missing field contribute an empty string. With plain `+`
# concatenation a single NaN (genres is never filled) turns the whole string
# into NaN, which the text vectorizers cannot process.
words = data_words['movie_title'].str.cat(other_fields, sep=' ', na_rep='')

data = data.assign(words=words)
data.head()

## Building the model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn a vocabulary from the per-movie "words" strings and encode every movie
# as a TF-IDF weighted vector; tokens on scikit-learn's built-in English
# stop-word list are ignored.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['words'])
# Display the shape of the resulting document-term matrix.
tfidf_matrix.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Same input text as the TF-IDF cell, but each movie is encoded with raw token
# counts; tokens on scikit-learn's built-in English stop-word list are ignored.
cv = CountVectorizer(stop_words='english')
cv_matrix = cv.fit_transform(data['words'])
# Display the shape of the resulting document-term matrix.
cv_matrix.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

# TfidfVectorizer is used with its default settings, which L2-normalise every
# row, so a plain dot product between rows already equals their cosine
# similarity and linear_kernel is sufficient.
cosine_sim_tfidf = linear_kernel(tfidf_matrix, tfidf_matrix)

# Raw count vectors are NOT normalised: their dot product grows with the number
# of tokens in a row and is not a cosine. cosine_similarity normalises the rows
# before taking the dot product.
cosine_sim_cv = cosine_similarity(cv_matrix, cv_matrix)

In [None]:
# Map each movie title to its row *position* in `data`.
# The similarity matrices are addressed by position, while `data.index` still
# carries the labels left over from the original file after rows were dropped,
# so positions are generated explicitly instead of reusing the index.
indices = pd.Series(np.arange(len(data)), index=data['movie_title'])

# De-duplication keys on (movie_title, title_year), so a title can still occur
# more than once. Keep the first occurrence so `indices[title]` is always a
# single integer rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def get_recommendations(title, cosine_sim):
    """Return the titles of up to ten movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact movie title, compared against ``data['movie_title']``.
    cosine_sim : array-like
        Square pairwise similarity matrix whose rows and columns follow the
        row order of ``data``.

    Returns
    -------
    pandas.Series
        ``movie_title`` values of the best matches, most similar first. The
        queried movie itself is never included.

    Raises
    ------
    KeyError
        If no row of ``data`` has the given title.
    """
    # Resolve the title to a row position directly against `data`: the
    # similarity matrix is positional, and index labels are not guaranteed to
    # equal positions once rows have been dropped from the frame.
    matches = np.flatnonzero((data['movie_title'] == title).to_numpy())
    if matches.size == 0:
        raise KeyError(title)
    # Several movies can share a title; use the first one.
    idx = int(matches[0])

    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()
    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query row explicitly instead of assuming it sorts first
    # (another row can tie with it at the maximum score).
    top = [int(pos) for pos in ranked if pos != idx][:10]
    return data['movie_title'].iloc[top]

In [None]:
# Closest matches to 'The Avengers' according to the TF-IDF similarity matrix.
get_recommendations(title='The Avengers', cosine_sim=cosine_sim_tfidf)

In [None]:
# Closest matches to 'The Avengers' according to the count-vector similarity matrix.
get_recommendations(title='The Avengers', cosine_sim=cosine_sim_cv)