# Feature Combination - Hybrid Model

#### Author: Sulekha Aloorravi

In [1]:
import pandas as pd

### Read Ratings data

In [2]:
df_ratings = pd.read_csv("ratings_small.csv")

In [3]:
df_ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
df_ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

### Create Pivot with movie id as row and user id as column

In [5]:
pivot_ratings = df_ratings.pivot_table(values = "rating", index = "movieId", columns = "userId")

In [6]:
pivot_ratings = pivot_ratings.fillna(0)

In [7]:
pivot_ratings.head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


### Read Movies Metadata

In [8]:
movies_metadata = pd.read_csv("movies_metadata.csv", low_memory =  False)

In [9]:
movies_metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [10]:
movies_metadata['tagline'] = movies_metadata['tagline'].fillna('')

In [11]:
movies_metadata['overview'] = movies_metadata['overview'].fillna('')

In [12]:
movies_metadata['description'] = movies_metadata['overview'] + movies_metadata['tagline']

In [13]:
movies_metadata['title'].drop_duplicates(inplace = True)

In [14]:
movies_metadata.reset_index(inplace = True)

In [15]:
movies_metadata.dtypes

index                      int64
adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
description               object
dtype: object

### Convert text into features using tfidf

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english',max_features = 100)
tfidf_matrix = tf.fit_transform(movies_metadata['description'])

In [18]:
cbr_features = pd.DataFrame(tfidf_matrix.todense(), columns = tf.get_feature_names(), index = movies_metadata["id"] )

In [19]:
cbr_features.reset_index(inplace = True)

In [20]:
### Create Hybrid Features by merging collaborative features and content based features

In [21]:
pivot_ratings.reset_index(inplace = True)

In [22]:
pivot_ratings["movieId"] = pivot_ratings["movieId"].astype("str")

In [23]:
pivot_ratings["movieId"] = pivot_ratings["movieId"].astype("object")

In [24]:
hybrid_features = pivot_ratings.merge(cbr_features, left_on = "movieId", right_on = "id")

In [25]:
hybrid_features.drop(["id"], axis = 1, inplace = True)

In [26]:
hybrid_features_id = hybrid_features.copy()

In [27]:
hybrid_features.drop(["movieId"], axis = 1, inplace = True)

### Convert hybrid features into a Sparse matrix

In [28]:
from scipy.sparse import csr_matrix

In [29]:
ratings_sparse = csr_matrix(hybrid_features.values)

In [30]:
ratings_sparse

<2831x771 sparse matrix of type '<class 'numpy.float64'>'
	with 58334 stored elements in Compressed Sparse Row format>

### Build a Recommendation Engine

In [31]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')

In [32]:
model_knn

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [33]:
model_knn.fit(ratings_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [34]:
import numpy as np

In [35]:
query_index = np.random.choice(hybrid_features.shape[0])

In [36]:
query_index

1308

In [37]:
distances, indices = model_knn.kneighbors(hybrid_features.iloc[query_index,:].values.reshape(1,-1), n_neighbors = 6)

In [44]:
distances, indices

(array([[ 0.        ,  0.15152979,  0.22710266,  0.25760766,  0.33196553,
          0.3827866 ]]),
 array([[1308,  388, 1411, 1273,  559,  917]], dtype=int64))

In [47]:
hybrid_features.index[indices.flatten()[1]]

388

In [48]:
distances.flatten()[1]

0.15152979377740383

In [54]:
hybrid_features_id.loc[hybrid_features_id.index[1308]][0]

'2899'

In [52]:
hybrid_features_id.loc[hybrid_features_id.index[388]][0]

'580'

In [57]:
hybrid_features_id.loc[hybrid_features_id.index[1411]][0]

'3145'

In [55]:
movies_metadata[movies_metadata["id"] == '2899'].title

9260    Asterix & Obelix: Mission Cleopatra
Name: title, dtype: object

In [56]:
movies_metadata[movies_metadata["id"] == '580'].title

3998    Jaws: The Revenge
Name: title, dtype: object

In [58]:
movies_metadata[movies_metadata["id"] == '3145'].title

33312    The Horror of Frankenstein
Name: title, dtype: object