In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the movie metadata; the path is relative to the notebook's working directory.
dataset = pd.read_csv('movie_dataset.csv')
# Preview the first row to see which columns are available.
dataset.head(1)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron


In [3]:
# Column dtypes and non-null counts, to see which columns contain missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
index                   4803 non-null int64
budget                  4803 non-null int64
genres                  4775 non-null object
homepage                1712 non-null object
id                      4803 non-null int64
keywords                4391 non-null object
original_language       4803 non-null object
original_title          4803 non-null object
overview                4800 non-null object
popularity              4803 non-null float64
production_companies    4803 non-null object
production_countries    4803 non-null object
release_date            4802 non-null object
revenue                 4803 non-null int64
runtime                 4801 non-null float64
spoken_languages        4803 non-null object
status                  4803 non-null object
tagline                 3959 non-null object
title                   4803 non-null object
vote_average            4803 non-null fl

In [4]:
# Summary statistics for the numeric columns.
dataset.describe()

Unnamed: 0,index,budget,id,popularity,revenue,runtime,vote_average,vote_count
count,4803.0,4803.0,4803.0,4803.0,4803.0,4801.0,4803.0,4803.0
mean,2401.0,29045040.0,57165.484281,21.492301,82260640.0,106.875859,6.092172,690.217989
std,1386.651002,40722390.0,88694.614033,31.81665,162857100.0,22.611935,1.194612,1234.585891
min,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
25%,1200.5,790000.0,9014.5,4.66807,0.0,94.0,5.6,54.0
50%,2401.0,15000000.0,14629.0,12.921594,19170000.0,103.0,6.2,235.0
75%,3601.5,40000000.0,58610.5,28.313505,92917190.0,118.0,6.8,737.0
max,4802.0,380000000.0,459488.0,875.581305,2787965000.0,338.0,10.0,13752.0


In [9]:
# List every column name so the text features can be picked from them.
dataset.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [5]:
def get_title(index):
    """Return the title of the first row whose DataFrame index label equals `index`.

    The comparison is against ``dataset.index`` (the frame's row labels), not
    the 'index' column. Raises IndexError when no row matches.
    """
    row_mask = dataset.index == index
    matching_titles = dataset.loc[row_mask, 'title']
    return matching_titles.values[0]

In [6]:
def get_index(title):
    """Return the 'index' column value of the first row titled exactly `title`.

    Parameters
    ----------
    title : str
        Title to look up; the match is an exact equality test on the
        'title' column, so case and punctuation must agree.

    Returns
    -------
    The 'index' value of the first matching row. If several rows share the
    title, only the first one is used.

    Raises
    ------
    IndexError
        If no row has this title. The message names the missing title
        instead of the bare "index 0 is out of bounds" numpy error.
    """
    # NOTE(review): the result is used as a row position into the similarity
    # matrix, which assumes the 'index' column equals the row position — confirm.
    matches = dataset.loc[dataset['title'] == title, 'index'].values
    if len(matches) == 0:
        raise IndexError('no movie titled {!r} in dataset'.format(title))
    return matches[0]

In [10]:
def combine_features(row):
    """Join the text fields of one movie row into a single space-separated string.

    `row` is anything indexable by column name (e.g. a DataFrame row). Every
    field must already be a string; a missing value (NaN) raises TypeError.
    """
    # NOTE(review): 'cast' is included twice, so its tokens are counted twice
    # in the combined text — confirm this extra weighting is intended.
    field_order = (
        'keywords',
        'cast',
        'genres',
        'director',
        'overview',
        'production_companies',
        'spoken_languages',
        'cast',
        'crew',
    )
    return ' '.join(row[field] for field in field_order)

In [11]:
# Text columns that combine_features concatenates; their missing values must be
# blanked first. Each column is listed once ('cast' was previously repeated,
# which only caused a redundant second fill).
features_to_use = ['keywords', 'genres', 'cast', 'director', 'overview',
                   'production_companies', 'spoken_languages', 'crew']

In [12]:
# Replace missing values with '' so combine_features can concatenate every
# field as a string.
for column_name in features_to_use:
    filled_column = dataset[column_name].fillna('')
    dataset[column_name] = filled_column


In [13]:
# Row count after filling missing text (no rows are dropped by the fill).
len(dataset)

4803

In [14]:
# Build one combined text string per movie by applying combine_features row-wise (axis=1).
dataset['combined_features'] = dataset.apply(combine_features, axis=1)

In [15]:
# Check that the new 'combined_features' column is present on the first row.
dataset.head(1)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director,combined_features
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron,culture clash future space war space colony so...


In [16]:
# Bag-of-words vectorizer with default settings.
cv = CountVectorizer()

In [17]:
# Learn the vocabulary from the combined text and build the movie-by-token count matrix.
count_matrix = cv.fit_transform(dataset['combined_features'])

In [18]:
# Pairwise cosine similarity between all rows of count_matrix;
# similarity_score[i][j] compares movie row i with movie row j.
similarity_score = cosine_similarity(count_matrix)

In [19]:
def get_recoomendations(my_fav, top_n=10):
    """Print the titles most similar to `my_fav`, best match first.

    Looks up the row for `my_fav` with ``get_index``, ranks every movie by its
    score in that row of ``similarity_score`` (highest first, ties kept in row
    order), and prints the leading ``top_n + 1`` titles. The extra line is
    there because the query movie usually ranks first against itself, so the
    default prints the same 11 lines as the previous hard-coded loop.

    Parameters
    ----------
    my_fav : str
        Exact title of the movie to find recommendations for.
    top_n : int, optional
        Number of titles to print after the top-ranked entry. Values below
        zero are treated as zero. Defaults to 10.

    Returns
    -------
    None. The titles are printed, one per line, prefixed with 'Movies: '.

    Raises
    ------
    IndexError
        Propagated from ``get_index`` when `my_fav` is not in the dataset.
    """
    movie_index = get_index(my_fav)
    scored_movies = enumerate(similarity_score[movie_index])
    ranked_movies = sorted(scored_movies, key=lambda pair: pair[1], reverse=True)

    # +1 keeps room for the top-ranked entry (normally the query movie itself).
    limit = max(top_n, 0) + 1
    for row_position, _score in ranked_movies[:limit]:
        print('Movies: ', get_title(row_position))

In [24]:
# Example: print the movies ranked most similar to 'Star Wars'.
my_fav = 'Star Wars'
get_recoomendations(my_fav)

Movies:  Star Wars
Movies:  The Empire Strikes Back
Movies:  Indiana Jones and the Temple of Doom
Movies:  Mad Max: Fury Road
Movies:  The Lord of the Rings: The Two Towers
Movies:  Lara Croft: Tomb Raider
Movies:  Alexander
Movies:  Land of the Dead
Movies:  The Chronicles of Narnia: Prince Caspian
Movies:  Around the World in 80 Days
Movies:  Transformers


In [25]:
# Example: print the movies ranked most similar to 'Avatar'.
my_fav = 'Avatar'
get_recoomendations(my_fav)

Movies:  Avatar
Movies:  The Hunger Games: Mockingjay - Part 1
Movies:  Exodus: Gods and Kings
Movies:  Ender's Game
Movies:  The Avengers
Movies:  Cloud Atlas
Movies:  The Day After Tomorrow
Movies:  Jupiter Ascending
Movies:  The Curious Case of Benjamin Button
Movies:  Captain America: The First Avenger
Movies:  The Martian


# Create pipeline

In [23]:
#up next