# Movie Recommendation
- Reference: https://towardsdatascience.com/using-cosine-similarity-to-build-a-movie-recommendation-system-ae7f20842599

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Read the movie metadata CSV (relative path) into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=r"../../data/movie_dataset.csv")

In [2]:
# Peek at 30 randomly chosen titles; the fixed seed makes the same rows come
# back on every Restart & Run All instead of a different sample each time.
df['original_title'].sample(30, random_state=42)

3892                                归来
1894                               War
1530                            Flight
3983                     Hustle & Flow
1727                    3 Days to Kill
571               Inglourious Basterds
1382                              TMNT
1108                         Pinocchio
950                     The Negotiator
1633                       Ultraviolet
3835                         The Witch
1132                   Red Riding Hood
3034                               Mud
3010             Employee of the Month
2150                    Eye for an Eye
1933                     Underclassman
180               The Bourne Ultimatum
1433                 The Four Feathers
111                       Transformers
208                   The 13th Warrior
1441    Walk Hard: The Dewey Cox Story
1421                             Hoffa
2546                        The Skulls
1778                     Freaky Friday
4197                 My Summer of Love
4452               جدایی 

In [3]:
# Show every row whose original title is exactly 'Cars'.
df[df['original_title'].eq('Cars')]

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
566,566,120000000,Animation Adventure Comedy Family,http://disney.go.com/disneyvideos/animatedfilm...,920,car race car journey village and town auto rou...,en,Cars,"Lightning McQueen, a hotshot rookie race car d...",82.643036,...,117.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Ahhh... it's got that new movie smell.,Cars,6.6,3877,Owen Wilson Paul Newman Bonnie Hunt Larry the ...,"[{'name': 'John Lasseter', 'gender': 2, 'depar...",John Lasseter


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(4803, 24)

In [5]:
# Columns used as text features; replace missing values in all of them with
# empty strings in a single assignment.
features = ['keywords', 'cast', 'genres', 'director']
df[features] = df[features].fillna('')

In [6]:
def combined_features(row):
    """Join the keywords, cast, genres and director fields of ``row`` with single spaces."""
    parts = (row[column] for column in ("keywords", "cast", "genres", "director"))
    return " ".join(parts)
# Apply combined_features to each row and store the result as a new column.
df["combined_features"] = df.apply(combined_features, axis="columns")

In [7]:
# Fit the vectorizer on the combined text column and build the term-count
# matrix (one row per row of df).
cv = CountVectorizer()
count_matrix = cv.fit_transform(df.combined_features)
# NOTE: toarray() converts the whole sparse matrix to a dense array just for
# this preview print.
print("Count Matrix:", count_matrix.toarray())

Count Matrix: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [8]:
# Dimensions of the fitted term-count matrix.
count_matrix.shape

(4803, 14845)

In [9]:
# Vocabulary size. The fitted vocabulary_ mapping has one entry per feature, so
# its length equals the number of feature names; this avoids
# get_feature_names(), which was removed in scikit-learn 1.2.
len(cv.vocabulary_)

14845

In [10]:
# Display the matrix's summary repr rather than its contents.
count_matrix

<4803x14845 sparse matrix of type '<class 'numpy.int64'>'
	with 97547 stored elements in Compressed Sparse Row format>

In [11]:
# Pairwise cosine similarity; called with a single argument, so the rows of
# count_matrix are compared against each other.
cosine_sim = cosine_similarity(count_matrix)

In [12]:
# Title of the movie to find recommendations for.
movie_user_likes = "Cars"
def get_index_from_title(title):
    """Return the ``index`` column value of the first row of ``df`` titled ``title``.

    Args:
        title: Exact value to match against the ``title`` column.

    Returns:
        The ``index`` column value of the first matching row.

    Raises:
        IndexError: If no row has that title. The exception type is unchanged
            from the bare ``.values[0]`` lookup, but the message now names the
            missing title instead of reporting an out-of-bounds array access.
    """
    matches = df.loc[df["title"] == title, "index"]
    if matches.empty:
        raise IndexError("no movie with title {!r} found in df".format(title))
    return matches.values[0]

# Look up the liked movie's row, pair every row position with its similarity
# to that row, then rank the pairs from most to least similar.
movie_index = get_index_from_title(movie_user_likes)
similar_movies = [(position, score) for position, score in enumerate(cosine_sim[movie_index])]
sorted_similar_movies = sorted(similar_movies, key=lambda pair: pair[1], reverse=True)

In [13]:
def get_features(movie_index):
    """Return the non-zero term counts of one row of ``count_matrix``.

    Args:
        movie_index: Row position in ``count_matrix``.

    Returns:
        A transposed DataFrame whose index holds the vocabulary terms with a
        non-zero count in that row and whose single column holds the counts.
    """
    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new accessor and fall back to the old
    # one so the notebook runs on either side of that change.
    names_getter = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
    row_frame = pd.DataFrame.sparse.from_spmatrix(count_matrix[movie_index], columns=names_getter())
    # Boolean Series over the vocabulary: True where this row has a count.
    has_count = (row_frame != 0).any()
    return row_frame[has_count.index[has_count]].T

In [14]:
def get_title_from_index(index):
    """Return the ``title`` of the first row of ``df`` whose index label equals ``index``."""
    label_matches = df.index == index
    return df.loc[label_matches, "title"].values[0]

features = []

# Walk the 16 highest-ranked entries: print each movie's position and title,
# and collect its non-zero feature counts.
for movie_id, _score in sorted_similar_movies[:16]:
    print(movie_id, get_title_from_index(movie_id))
    features.append(get_features(movie_id))

566 Cars
40 Cars 2
405 The Fast and the Furious: Tokyo Drift
500 2 Fast 2 Furious
935 Herbie Fully Loaded
1152 Back to the Future Part II
1186 The Final Destination
706 Days of Thunder
2285 Back to the Future
2426 Larry the Cable Guy: Health Inspector
503 The Adventures of Rocky & Bullwinkle
130 Bolt
1086 Aliens in the Attic
44 Furious 7
3318 Witless Protection
1983 Meet the Deedles


In [15]:
# Put the first three feature frames side by side and keep only the terms
# present in all three (rows without any missing value).
pd.concat(features[:3], axis="columns").dropna(axis="index", how="any")

Unnamed: 0,0,0.1,0.2
car,2.0,1.0,4.0
race,1.0,1.0,1.0
