In [None]:
import pandas as pd
import numpy as np
from IPython.display import display
import pickle

In [None]:
movies = pd.read_csv('movie_data_with_urls.csv')
ratings = pd.read_csv('ratings.csv')

In [None]:
movies

### Data Preprocessing

In [None]:
def _split_genres(value):
    """Return the genres of one movie as a list.

    A pipe-delimited string ("a|b|c") is split on '|'. A value that is already
    a list/tuple (this cell was executed before) is returned as a list, and
    anything else (e.g. a missing value) becomes an empty list, so the cell can
    be re-run without raising.
    """
    if isinstance(value, str):
        return value.split('|')
    if isinstance(value, (list, tuple)):
        return list(value)
    return []


# Replace the raw genre string of every movie with a list of genre labels.
movies['genres'] = movies['genres'].apply(_split_genres)
movies

In [None]:
movies['genres'] = movies['genres'].apply(lambda genre_list: [genre.lower() for genre in genre_list])
movies['title'] = movies['title'].str.lower()
movies

In [None]:
display(movies.head(), ratings.head())

In [None]:
mov_ratings = movies.merge(ratings, how='inner', on='movieId')
mov_ratings.drop('timestamp', axis=1, inplace=True)
mov_ratings.head()

### Content Based Filtering

In [None]:
# Titles were lower-cased during preprocessing, so the lookup key has to be
# lower-cased as well; the mixed-case literal alone never matches any row.
mov_ratings[mov_ratings['title'] == 'Cocoon (1985)'.lower()]

In [None]:
new_df = mov_ratings[['movieId', 'title', 'genres', 'Poster_URL']]


In [None]:
# remove square brackets
new_df['genres'] = new_df['genres'].apply(lambda x: ' '.join(x))
new_df.drop_duplicates(inplace=True)

In [None]:
new_df.shape

In [None]:
new_df = new_df.reset_index()
new_df.head()

In [None]:
# To transform given text into a vector on the basis of frequency count
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=23, stop_words='english')

In [None]:
vectors = cv.fit_transform(new_df['genres']).toarray()
vectors.shape

In [None]:
from nltk.stem.porter import PorterStemmer
pt = PorterStemmer()

In [None]:
def stem(text):
    y = []
    for i in text.split():
        y.append(pt.stem(i))
    
    return " ".join(y)

In [None]:
new_df['genres'] = new_df['genres'].apply(stem)
new_df['genres'][0]

In [None]:
# Save the array to a pickle file
with open('./PKL_Files/stemmed_df_content_based', 'wb') as file:
    pickle.dump(new_df, file)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [None]:
# Save the array to a pickle file
with open('./PKL_Files/similarity_content_based', 'wb') as file:
    pickle.dump(similarity, file)

In [None]:
def recommend(movie, top_n=5):
    """Return the movies most similar to `movie` by content (genre) similarity.

    Parameters
    ----------
    movie : str
        Title to look up; compared for exact equality against the 'title'
        column of the pickled frame.
    top_n : int, optional
        Maximum number of recommendations to return (default 5).

    Returns
    -------
    list of dict
        One ``{'title': ..., 'url': ...}`` dict per recommendation, ordered by
        decreasing similarity. The queried movie itself is never included.

    Raises
    ------
    IndexError
        If `movie` is not present in the pickled frame.
    """
    # NOTE: pickle.load can execute arbitrary code. These files are written by
    # this notebook; never point these paths at untrusted data.
    with open('./PKL_Files/stemmed_df_content_based', 'rb') as file:
        new_df = pickle.load(file)

    with open('./PKL_Files/similarity_content_based', 'rb') as file:
        similarity = pickle.load(file)

    # Resolve the title to a row *position*: both the similarity matrix and the
    # .iloc lookups below are positional, so index labels must not be used.
    matches = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError('movie {!r} not found in the content-based data'.format(movie))
    movie_index = int(matches[0])

    ranked = sorted(
        enumerate(similarity[movie_index]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Drop the query movie by position instead of assuming it is ranked first:
    # the sort is stable, so a movie with an identical score and a smaller
    # position would otherwise be dropped in its place.
    neighbours = [pos for pos, _ in ranked if pos != movie_index][:max(top_n, 0)]

    mov_list = []
    for pos in neighbours:
        row = new_df.iloc[pos]
        mov_list.append({'title': row['title'], 'url': row['Poster_URL']})

    return mov_list

In [None]:
# Demo: content-based recommendations for Toy Story, one dict per line.
recommended_mov_list = recommend('toy story (1995)')
for recommendation in recommended_mov_list:
    print(recommendation)

### Collaborative Filtering

In [None]:
# All ratings rows for one (lower-cased) title.
mov_ratings.loc[mov_ratings['title'] == 'nothing in common (1986)']

In [None]:
# remove square brackets
pd.set_option('display.max_colwidth', None)

mov_ratings['genres'] = mov_ratings['genres'].apply(lambda x: ' '.join(x))
mov_ratings.drop_duplicates(inplace=True)


In [None]:
# Save the array to a pickle file
with open('./PKL_Files/movie_rating_collaborative', 'wb') as file:
    pickle.dump(mov_ratings, file)

In [None]:
# users who have given more than 100 ratings are considered

x = mov_ratings.groupby('userId').count()['rating'] > 100
users = x[x].index
users

In [None]:
filtered_rating = mov_ratings[mov_ratings['userId'].isin(users)]
filtered_rating


In [None]:
# movies with more than 50 ratings
y = filtered_rating.groupby('title').count() > 50
famous_movies = y[y].index
famous_movies

In [None]:
final_ratings = filtered_rating[filtered_rating['title'].isin(famous_movies)]



In [None]:
pt = final_ratings.pivot_table(index='title',columns='userId',values='rating')
pt.fillna(0,inplace=True)
pt

In [None]:
# Save the array to a pickle file
with open('./PKL_Files/pivot_table_collaborative', 'wb') as file:
    pickle.dump(pt, file)

In [None]:
similarity_scores = cosine_similarity(pt)
similarity_scores

In [None]:
# Save the array to a pickle file
with open('./PKL_Files/similarity_scores_collaborative', 'wb') as file:
    pickle.dump(similarity_scores, file)

In [None]:
def collaborative_recommend(movie_name, top_n=4):
    """Return the movies whose rating patterns are most similar to `movie_name`.

    Parameters
    ----------
    movie_name : str
        Title to look up; compared for exact equality against the index of the
        pickled pivot table.
    top_n : int, optional
        Maximum number of recommendations to return (default 4).

    Returns
    -------
    list of dict
        One ``{'title': ..., 'url': ...}`` dict per recommendation, ordered by
        decreasing similarity. The queried movie itself is never included.

    Raises
    ------
    IndexError
        If `movie_name` is not in the pivot table, or a recommended title has
        no row in the pickled ratings frame.
    """
    # NOTE: pickle.load can execute arbitrary code. These files are written by
    # this notebook; never point these paths at untrusted data.
    with open('./PKL_Files/movie_rating_collaborative', 'rb') as file:
        mov_ratings = pickle.load(file)

    with open('./PKL_Files/similarity_scores_collaborative', 'rb') as file:
        similarity_scores = pickle.load(file)

    with open('./PKL_Files/pivot_table_collaborative', 'rb') as file:
        pt = pickle.load(file)

    # Position of the queried title in the pivot table (= row of the score matrix).
    positions = np.where(pt.index == movie_name)[0]
    if len(positions) == 0:
        raise IndexError('movie {!r} not found in the collaborative data'.format(movie_name))
    index = int(positions[0])

    ranked = sorted(
        enumerate(similarity_scores[index]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Drop the query movie by position instead of assuming it is ranked first:
    # the sort is stable, so a title with an identical score and a smaller
    # position would otherwise be dropped in its place.
    similar_items = [pos for pos, _ in ranked if pos != index][:max(top_n, 0)]

    data = []
    for pos in similar_items:
        title = pt.index[pos]
        # First poster URL recorded for this title in the ratings frame.
        posters = mov_ratings.loc[mov_ratings['title'] == title, 'Poster_URL']
        data.append({'title': title, 'url': posters.iloc[0]})

    return data

In [None]:
# Demo: collaborative-filtering recommendations for one title.
collaborative_recommend(movie_name='zulu (1964)')