# Mixed Models Recommendation Engine - Hybrid Model

#### Author: Sulekha Aloorravi

In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import numpy as np

### Build item-based collaborative filtering separately

### Read Ratings Data

In [2]:
# Load the user/movie ratings table from the working directory.
ratings_file = "ratings_small.csv"
df_ratings = pd.read_csv(ratings_file)

In [3]:
# Preview the first five ratings (five is head's default row count).
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
# Show the dtype of every ratings column.
df_ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

### Create Pivot with movie id as row and user id as column

In [5]:
# One row per movie, one column per user, cells hold the rating
# (movie/user pairs without a rating are NaN at this point).
pivot_ratings = df_ratings.pivot_table(
    index='movieId',
    columns='userId',
    values='rating',
)

In [6]:
# Missing ratings become 0 so every movie has a full numeric vector.
pivot_ratings = pivot_ratings.fillna(value=0)

### Build Content based Recommendation Engine separately

### Read Movies Metadata

In [7]:
# Load the movie metadata; low_memory=False makes pandas read the file in one pass
# instead of inferring column dtypes chunk by chunk.
metadata_file = "movies_metadata.csv"
movies_metadata = pd.read_csv(metadata_file, low_memory=False)

In [8]:
movies_metadata['tagline'].fillna('')
movies_metadata['description'] = movies_metadata['overview'] + movies_metadata['tagline']

In [9]:
movies_metadata.dropna(subset=['description'], inplace=True)
movies_metadata['title'].drop_duplicates(inplace=True)

In [10]:
movies_metadata["id"] = movies_metadata["id"].astype("int64")

In [11]:
# Renumber rows 0..n-1 after the row drops; the previous labels are kept in an "index" column.
movies_metadata = movies_metadata.reset_index()

### Content based featurization

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over 1- to 3-word n-grams of the descriptions, keeping only the 100 most frequent terms.
# min_df=1 ("appears in at least one document") selects the same vocabulary as the former
# min_df=0, but an integer 0 is rejected by scikit-learn's parameter validation in recent releases.
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    min_df=1,
    stop_words='english',
    max_features=100,
)
tfidf_matrix = tf.fit_transform(movies_metadata['description'])

In [13]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
tfidf_matrix

<20404x100 sparse matrix of type '<class 'numpy.float64'>'
	with 102663 stored elements in Compressed Sparse Row format>

In [14]:
# Dense content-feature table: one row per movie (indexed by movie id), one column per TF-IDF term.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out() and
# fall back to the old accessor on releases that predate it.
if hasattr(tf, "get_feature_names_out"):
    feature_names = tf.get_feature_names_out()
else:
    feature_names = tf.get_feature_names()
# toarray() yields a plain ndarray instead of the legacy np.matrix returned by todense().
cbr_features = pd.DataFrame(tfidf_matrix.toarray(), columns = feature_names, index = movies_metadata["id"])

### Item-based featurization

In [15]:
# Turn the movieId index into a regular column so it can be filtered on.
pivot_ratings = pivot_ratings.reset_index()

In [16]:
# Keep only the rated movies whose id also appears in the metadata table.
# NOTE(review): this compares ratings' movieId directly with metadata's id; if the two files
# use different ID schemes, a mapping table is needed before this filter — confirm.
has_metadata = pivot_ratings["movieId"].isin(movies_metadata["id"])
pivot_ratings = pivot_ratings[has_metadata]

In [17]:
# Renumber rows 0..n-1 after filtering so row labels equal row positions.
pivot_ratings = pivot_ratings.reset_index(drop=True)

In [18]:
# Independent copy that keeps the movieId column for position -> movie id lookups.
pivot_ratings_id = pivot_ratings.copy(deep=True)

In [19]:
# Make sure the lookup copy is labelled 0..n-1 as well.
pivot_ratings_id = pivot_ratings_id.reset_index(drop=True)

In [20]:
# Remove the id column so only the per-user rating columns remain as features.
pivot_ratings = pivot_ratings.drop(columns=["movieId"])

In [21]:
# Compressed sparse row form of the movie x user rating matrix.
ratings_matrix = pivot_ratings.values
ratings_sparse = csr_matrix(ratings_matrix)

### Recommend movies based on similar movies from both the models

In [22]:
# Pick the row position of the query movie at random.
# A fixed seed makes the draw, and therefore every output below, reproducible on re-run;
# change SEED to explore a different movie.
SEED = 42
query_index = np.random.RandomState(SEED).choice(pivot_ratings.shape[0])

In [23]:
# Show the chosen row position.
query_index

391

In [24]:
# Movie id stored at the chosen row (row labels equal positions after the earlier index reset).
movie = pivot_ratings_id["movieId"].iloc[query_index]

### Item based Recommendation

In [25]:
# Brute-force cosine nearest-neighbour index over the movie rating vectors (fit returns the estimator).
model_cbf = NearestNeighbors(algorithm = 'brute', metric = 'cosine').fit(ratings_sparse)

model_cbf

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [26]:
# Ask for one neighbour more than needed, because the query movie is normally among the results.
distances, indices = model_cbf.kneighbors(pivot_ratings.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)
recommendations = []
print('Check combined output')
for neighbor_index in indices.flatten():
    # Exclude the query by its row position rather than assuming it is returned first:
    # movies with proportional rating vectors tie at cosine distance 0, so the order is not guaranteed.
    if neighbor_index == query_index:
        continue
    # kneighbors returns row positions; translate position -> movie id -> title.
    neighbor_movie_id = pivot_ratings_id["movieId"].iloc[neighbor_index]
    recommendations.append(movies_metadata.loc[movies_metadata.id == neighbor_movie_id, "title"].values[0])
# Keep five titles even when the query itself was not among the six neighbours.
recommendations = recommendations[:5]

Check combined output


### Content based Recommendation

In [27]:
# Independent copy of the content features that will keep the movie id for lookups.
cbr_features_id = cbr_features.copy(deep=True)

In [28]:
# Move the movie id from the index into an "id" column; rows become 0..n-1.
cbr_features_id = cbr_features_id.reset_index()

In [29]:
# Discard the id index so the feature table holds only TF-IDF columns, labelled 0..n-1.
cbr_features = cbr_features.reset_index(drop=True)

In [30]:
# Row position of the query movie in the content-feature table (first match on its id).
is_query_movie = cbr_features_id.id == movie
cbr_index = cbr_features_id.loc[is_query_movie].index.values[0]

In [31]:
# Compressed sparse row form of the movie x term TF-IDF matrix.
cbr_matrix = cbr_features.values
cbr_sparse = csr_matrix(cbr_matrix)

In [32]:
# Brute-force cosine nearest-neighbour index over the content features (fit returns the estimator).
model_cbr = NearestNeighbors(algorithm = 'brute', metric = 'cosine').fit(cbr_sparse)
model_cbr

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [33]:
# Ask for one neighbour more than needed, because the query movie is normally among the results.
cbr_distances, cbr_indices = model_cbr.kneighbors(cbr_features.iloc[cbr_index, :].values.reshape(1, -1), n_neighbors = 6)
print('Check combined output')
cbr_recommendations = []
for neighbor_index in cbr_indices.flatten():
    # kneighbors returns row positions; translate position -> movie id -> title.
    neighbor_movie_id = cbr_features_id["id"].iloc[neighbor_index]
    # Exclude the query by id rather than assuming it is returned first: descriptions with
    # identical feature vectors tie on distance, and a duplicated id would otherwise
    # recommend the query movie to itself.
    if neighbor_movie_id == movie:
        continue
    cbr_recommendations.append(movies_metadata.loc[movies_metadata.id == neighbor_movie_id, "title"].values[0])
# Add at most five content-based titles to the combined list built by the item-based step.
recommendations.extend(cbr_recommendations[:5])

Check combined output


### List of recommendations from combined model

In [34]:
# Resolve the query movie's title once, then print it followed by the combined recommendations.
query_title = movies_metadata.loc[movies_metadata['id'] == movie, 'title'].values[0]
print ('Recommendations for: {0}\n'.format(query_title))
print(recommendations)

Recommendations for: Army of Darkness

['Brazil', 'Westworld', 'Down by Law', 'Open Water', 'About Schmidt', 'Bride of Re-Animator', 'Devil Doll', 'Evil Dead II', 'Jason Goes to Hell: The Final Friday', 'Scary Movie 5']
