# Load the dataset

In [1]:
pip install kaggle

Collecting kaggle
  Downloading kaggle-1.5.13.tar.gz (63 kB)
[K     |████████████████████████████████| 63 kB 709 kB/s eta 0:00:01
Collecting python-slugify
  Downloading python_slugify-8.0.1-py2.py3-none-any.whl (9.7 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 3.6 MB/s eta 0:00:011
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.13-py3-none-any.whl size=77734 sha256=82a2f8e7a8538fc3ae19feb45a9ae8c7420f1e46a3b53c28a4d88db58d23eb6e
  Stored in directory: /home/jovyan/.cache/pip/wheels/9c/45/15/6d6d116cd2539fb8f450d64b0aee4a480e5366bb11b42ac763
Successfully built kaggle
Installing collected packages: text-unidecode, python-slugify, kaggle
Successfully installed kaggle-1.5.13 python-slugify-8.0.1 text-unidecode-1.3
Note: you may need to restart the kernel to use updated packages.


In [19]:
!mkdir ~/.kaggle

In [20]:
# List the Kaggle configuration directory to confirm kaggle.json is in place.
!ls ~/.kaggle

kaggle.json


In [10]:
import os
import kaggle

# Download and unzip the TMDB dataset into ./movies, but only when the CSV is
# not already on disk, so re-running the notebook does not hit the network again.
if not os.path.exists(os.path.join('movies', 'top10K-TMDB-movies.csv')):
    kaggle.api.dataset_download_files('ahsanaseer/top-rated-tmdb-movies-10k', path='movies', unzip=True)



In [1]:
import pandas as pd

# Load the raw dataset and keep only the columns the recommender needs.
# .copy() detaches the subset from the full frame so the column assignment
# below writes to an independent object (no SettingWithCopyWarning).
movies = pd.read_csv("./movies/top10K-TMDB-movies.csv")
movies = movies[['id', 'title', 'overview', 'genre']].copy()

# Build one free-text field per movie from its overview and genre.
# - Missing values become '' so one missing column does not turn the whole tag into NaN.
# - The space separator stops the last overview word fusing with the first genre word.
movies['tags'] = movies['overview'].fillna('') + ' ' + movies['genre'].fillna('')

# Frame used by the models: id, title and the combined tags.
recommendation_movie = movies.drop(columns=['overview', 'genre'])

# Model Creation

## Singular Value Decomposition

In [149]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds

# Load the raw dataset and keep only the columns used below. .copy() makes
# `movies` an independent frame rather than a slice of the full CSV frame.
movies = pd.read_csv("./movies/top10K-TMDB-movies.csv")
movies = movies[['id', 'title', 'overview', 'genre']].copy()

# Build the modelling frame (columns: id, title, genre, tags) as a new object.
# `movies` is left unmodified; the earlier `recommendation_movie = movies` alias
# made the 'tags' assignment silently add a column to `movies` as well.
# Missing text becomes '' and everything is lower-cased for vectorisation.
recommendation_movie = movies[['id', 'title', 'genre']].assign(
    tags=movies['overview'].fillna('').str.lower(),
    genre=movies['genre'].fillna('').str.lower(),
)

In [363]:
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# Two independent TF-IDF vectorizers with identical settings: English stop
# words removed, single-character tokens allowed, unigrams plus bigrams,
# sublinear term-frequency scaling and smoothed IDF.
vectorizer = TfidfVectorizer(
    stop_words='english',
    token_pattern=r'\b\w+\b',
    ngram_range=(1, 2),
    sublinear_tf=True,
    smooth_idf=True,
)
genre_vectorizer = TfidfVectorizer(
    stop_words='english',
    token_pattern=r'\b\w+\b',
    ngram_range=(1, 2),
    sublinear_tf=True,
    smooth_idf=True,
)

# One sparse matrix per text column; each row corresponds to one movie.
tag_matrix = vectorizer.fit_transform(recommendation_movie['tags'])
genre_matrix = genre_vectorizer.fit_transform(recommendation_movie['genre'])

In [364]:
from scipy.sparse import hstack
# Stack the tag features and the genre features column-wise, so every movie row
# carries both feature sets in a single sparse matrix.
combined_matrix = hstack([tag_matrix, genre_matrix])

In [365]:
from sklearn.metrics.pairwise import cosine_similarity
# Number of singular values/vectors requested from the truncated SVD.
num_latent_factors = 400
# Truncated SVD of the combined feature matrix; U has one row per movie.
U, sigma, Vt = svds(combined_matrix, k=num_latent_factors)
# Pairwise cosine similarity between the rows of U.
# NOTE(review): only U is used here; sigma and Vt are not applied, so every
# latent factor carries equal weight - confirm this is intended.
similarity_matrix = cosine_similarity(U)

In [366]:
def recommend_movies_svds(movie_title, top_n=5):
    """Print the titles of the movies most similar to ``movie_title``.

    Similarity scores are read from the module-level ``similarity_matrix``,
    whose row/column ``i`` is taken to correspond to the ``i``-th row of
    ``movies``.

    Args:
        movie_title: Exact title to look up in ``movies['title']``. When the
            title occurs more than once, the first occurrence is used.
        top_n: Number of recommendations to print (default 5).

    Returns:
        None. The recommended titles are printed as a pandas Series.

    Raises:
        IndexError: If ``movie_title`` is not present in ``movies``.
    """
    # Locate the movie by row position, because the similarity matrix is
    # indexed by position and not by DataFrame label.
    matches = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie title not found: {movie_title!r}")
    movie_pos = matches[0]

    # Rank all movies from most to least similar to the query.
    ranked = np.argsort(similarity_matrix[movie_pos])[::-1]
    # Remove the query movie explicitly. Dropping "the top entry" is not enough:
    # another movie tied at the maximum score can be ranked above the query.
    ranked = ranked[ranked != movie_pos][:top_n]

    print(movies['title'].iloc[ranked])

In [368]:
# Example query: print the movies ranked closest to "The Dark Knight".
recommend_movies_svds("The Dark Knight")

825             Batman Begins
688     The Dark Knight Rises
8361         Reasonable Doubt
808                      Rush
9533                   Edison
Name: title, dtype: object


In [369]:
import pickle

# Persist the movie frame and the similarity matrix for later use.
# The context managers make sure each file is flushed and closed; a bare
# open(...) passed straight to pickle.dump leaves the handle open.
with open("movies.pkl", "wb") as movies_file:
    pickle.dump(recommendation_movie, movies_file)
with open("similarity.pkl", "wb") as similarity_file:
    pickle.dump(similarity_matrix, similarity_file)

## Bag of Words

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: vocabulary capped at 10,000 terms, English stop words removed.
cv=CountVectorizer(max_features=10000, stop_words='english')

In [3]:
# Dense bag-of-words count matrix, one row per movie.
# Missing tags are replaced with '' first: astype('U') on its own turns NaN into
# the literal string 'nan', which the vectorizer would then count as a real word.
model = cv.fit_transform(recommendation_movie['tags'].fillna('').values.astype('U')).toarray()

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of the bag-of-words matrix.
similarity=cosine_similarity(model)

In [5]:
def recommend(movie_title, top_n=5):
    """Print the titles of the movies most similar to ``movie_title``.

    Scores come from the module-level bag-of-words ``similarity`` matrix, whose
    row/column ``i`` is taken to correspond to the ``i``-th row of
    ``recommendation_movie``.

    Args:
        movie_title: Exact title to look up. When the title occurs more than
            once, the first occurrence is used.
        top_n: Number of titles to print (default 5).

    Returns:
        None. One title is printed per line, most similar first.

    Raises:
        IndexError: If ``movie_title`` is not present in ``recommendation_movie``.
    """
    # Find the movie by row position, because the similarity matrix is positional.
    is_match = (recommendation_movie['title'] == movie_title).tolist()
    if True not in is_match:
        raise IndexError(f"movie title not found: {movie_title!r}")
    movie_pos = is_match.index(True)

    # Rank (position, score) pairs from most to least similar.
    ranked = sorted(enumerate(similarity[movie_pos]), key=lambda pair: pair[1], reverse=True)
    # Remove the query movie by position. Skipping only the first entry is not
    # enough: a movie tied at the maximum score with a lower row position sorts
    # ahead of the query, which would then be "recommended" to itself.
    picks = [pos for pos, _ in ranked if pos != movie_pos][:top_n]

    for pos in picks:
        print(recommendation_movie['title'].iloc[pos])

In [6]:
import pickle

# Persist the movie frame and the bag-of-words similarity matrix.
# The context managers make sure each file is flushed and closed; a bare
# open(...) passed straight to pickle.dump leaves the handle open.
with open("movies.pkl", "wb") as movies_file:
    pickle.dump(recommendation_movie, movies_file)
with open("similarity.pkl", "wb") as similarity_file:
    pickle.dump(similarity, similarity_file)

## TF-IDF

In [7]:
# Replace missing tags with empty strings before vectorising.
recommendation_movie["tags"] = recommendation_movie["tags"].fillna('')

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: vocabulary capped at 10,000 terms, English stop words removed.
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
# fit_transform learns the vocabulary and IDF weights and vectorises the tags in
# a single pass; a separate fit() beforehand only repeats the same work.
tag_vectors = tfidf.fit_transform(recommendation_movie['tags'])
# fit() returns the vectorizer itself, so `tfmodel` is kept as an alias of the
# fitted `tfidf` for any code that still refers to it.
tfmodel = tfidf

In [9]:
# Pairwise cosine similarity between the TF-IDF rows.
# Note: this rebinds `similarity_matrix`, a name the SVD section also assigns.
similarity_matrix = cosine_similarity(tag_vectors)

In [10]:
def recommend_tfidf(movie_title, top_n=5):
    """Print the titles of the movies most similar to ``movie_title`` by TF-IDF.

    Scores come from the module-level ``similarity_matrix`` built from the
    TF-IDF tag vectors; row/column ``i`` is taken to correspond to the ``i``-th
    row of ``recommendation_movie``.

    Args:
        movie_title: Exact title to look up. When the title occurs more than
            once, the first occurrence is used.
        top_n: Number of titles to print (default 5).

    Returns:
        None. One title is printed per line, most similar first.

    Raises:
        IndexError: If ``movie_title`` is not present in ``recommendation_movie``.
    """
    # Find the movie by row position, because the similarity matrix is positional.
    is_match = (recommendation_movie['title'] == movie_title).tolist()
    if True not in is_match:
        raise IndexError(f"movie title not found: {movie_title!r}")
    movie_pos = is_match.index(True)

    # Rank against the TF-IDF matrix. Reading `similarity` here would silently
    # reuse the bag-of-words scores and ignore the TF-IDF model altogether.
    ranked = sorted(enumerate(similarity_matrix[movie_pos]), key=lambda pair: pair[1], reverse=True)
    # Remove the query movie by position rather than assuming it ranks first;
    # a tied duplicate at a lower row position would sort ahead of it.
    picks = [pos for pos, _ in ranked if pos != movie_pos][:top_n]

    for pos in picks:
        print(recommendation_movie['title'].iloc[pos])
    

In [11]:
import pickle

# Persist the movie frame and the TF-IDF similarity matrix.
# `similarity_matrix` is the matrix built from the TF-IDF vectors in this
# section; `similarity` is the earlier bag-of-words matrix, so saving it here
# would write the wrong model to disk.
# The context managers make sure each file is flushed and closed.
with open("movies.pkl", "wb") as movies_file:
    pickle.dump(recommendation_movie, movies_file)
with open("similarity.pkl", "wb") as similarity_file:
    pickle.dump(similarity_matrix, similarity_file)