# **About Project**
**This is the final project of the "End to End Deep Learning" training at Electro Pi for AI. It is a movie recommender system for Netflix, built with machine learning and deep learning tools.**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Lets import some libraries that we will use
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow.keras as tf
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# To shift lists
from collections import deque

In [None]:
# Load single data-file. Rows without a rating are per-movie header rows: their
# 'User' field holds the movie id followed by one trailing character. The rows
# after a header, up to the next header, are the ratings of that movie.
df_raw = pd.read_csv('../input/netflix-prize-data/combined_data_2.txt', header=None, names=['User', 'Rating', 'Date'], usecols=[0, 1, 2])

# Header rows are exactly the rows whose rating is missing
is_header = df_raw['Rating'].isna()

# Parse the movie id of every header row (strip the trailing character) and
# carry it forward onto the rating rows below it. This replaces slicing and
# copying the frame once per movie in a Python loop.
header_ids = df_raw.loc[is_header, 'User'].str[:-1].astype(int)
movie_ids = header_ids.reindex(df_raw.index, method='ffill')

# Keep rating rows only. Rows located before the first header have no movie id
# and are dropped, which is also what the per-movie slicing did.
is_rating = ~is_header & movie_ids.notna()
movie_info = df_raw.loc[is_rating].copy()
movie_info['Movie_Id'] = movie_ids.loc[is_rating].astype(int)

# Release the large intermediates before continuing
del df_raw, is_header, header_ids, movie_ids, is_rating
print('Shape User-Ratings:\t{}'.format(movie_info.shape))
movie_info.sample(5)

In [None]:
# Read the title file (no header row) and key the table by Movie_Id
movie_title = pd.read_csv(
    '../input/netflix-prize-data/movie_titles.csv',
    encoding="ISO-8859-1",
    header=None,
    names=['Movie_Id', 'Year', 'Name'],
).set_index('Movie_Id')
movie_title.head()

In [None]:
# Attach year and title to every rating row by joining on Movie_Id
data_merge = movie_info.merge(movie_title, on='Movie_Id')

In [None]:
# Preview the first rows of the merged ratings/titles table
data_merge.head()

In [None]:
# Inspect five randomly chosen rows of the merged table
data_merge.sample(5)

Let's get some information from the data.

In [None]:
# Titles with the highest mean rating
(
    data_merge
    .groupby('Name')['Rating']
    .mean()
    .sort_values(ascending=False)
    .head()
)


In [None]:
# Titles that received the most ratings
(
    data_merge
    .groupby('Name')['Rating']
    .count()
    .sort_values(ascending=False)
    .head()
)

In [None]:
# One row per title holding its mean rating in the 'Rating' column
ratings = data_merge.groupby('Name')['Rating'].mean().to_frame()
ratings.head()

In [None]:
# Number of ratings per title; the assignment aligns on the 'Name' index,
# so the order of the right-hand side does not matter
ratings['num of ratings'] = data_merge.groupby('Name')['Rating'].count()
ratings.head()

In [None]:
# Histogram of how many ratings each title received.
# The figure is created first so that the histogram is drawn on it.
plt.figure(figsize=(10,4))
ratings['num of ratings'].hist()


In [None]:
# Histogram of the mean rating per title, using 100 bins
plt.figure(figsize=(10,4))
ratings['Rating'].hist(bins=100)

In [None]:

# Mean rating against number of ratings, one point per row of `ratings`
sns.jointplot(x='Rating',y='num of ratings',data=ratings,alpha=0.5)


# **Let's prepare the data for our model**

In [None]:
# Keep only the columns the model consumes; Date, Year and Name are not used
data_model = data_merge.drop(['Date', 'Year', 'Name'], axis=1)
data_model.head()

In [None]:
# Size of the table, number of distinct users and movies, then missing values per column
print(data_model.shape)
print(data_model['User'].nunique())
print(data_model['Movie_Id'].nunique())
data_model.isna().sum()

In [None]:
# Check the dtype of every column
data_model.dtypes

In [None]:
# Cast the user ids to integers; the other columns keep their dtypes
data_model = data_model.astype({'User': int})
data_model.dtypes

In [None]:
# Put the columns in (movie, user, rating) order
columns_titles = ["Movie_Id", 'User', "Rating"]
data_model = data_model.loc[:, columns_titles]
data_model.head()

In [None]:
# Work on a small random subset so the model can be tested quickly.
# random_state makes the draw repeatable, so a Restart & Run All trains on the
# same rows every time. min() keeps the cell from raising when the table holds
# fewer than 20000 rows.
data_model = data_model.sample(n=min(20000, len(data_model)), random_state=1)

In [None]:
# Preview the sampled rows
data_model.head()

In [None]:
# Hold out 30% of the sampled rows as the test set (fixed seed for a repeatable split)
from sklearn.model_selection import train_test_split
Xtrain, Xtest = train_test_split(data_model, random_state=1, test_size=0.3)
print(f"Shape of train data: {Xtrain.shape}")
print(f"Shape of test data: {Xtest.shape}")

In [None]:
# Count the distinct movies and users present in the sampled data
nmovies_id = data_model['Movie_Id'].nunique()
nuser_id = data_model['User'].nunique()

In [None]:

# Two-tower embedding model: a movie id and a user id are each mapped to a
# 15-dimensional embedding, the two vectors are concatenated and passed through
# a small dense head that outputs one predicted rating.
#
# The embeddings are indexed directly by the raw ids, so each table needs
# (largest id + 1) rows.
# Movies: sized from the ids present in the merged data, which covers both the
# training rows and every movie scored at prediction time. Previously this
# table reused the user-sized bound of 2700000 + 1 rows.
n_movie_rows = int(data_merge['Movie_Id'].max()) + 1
# Users: kept at the original hard-coded bound.
# NOTE(review): assumes no user id exceeds 2700000 - confirm against the data.
n_user_rows = 2700000 + 1

#Movie input network
input_movies = tf.layers.Input(shape=[1])
# Explicit name: the embedding-extraction cell fetches this layer with
# get_layer('embedding'), and auto-generated layer names change when this cell
# is re-run in the same kernel.
embed_movies = tf.layers.Embedding(n_movie_rows, 15, name='embedding')(input_movies)
movies_out = tf.layers.Flatten()(embed_movies)

#user input network
input_users = tf.layers.Input(shape=[1])
embed_users = tf.layers.Embedding(n_user_rows, 15, name='user_embedding')(input_users)
users_out = tf.layers.Flatten()(embed_users)

conc_layer = tf.layers.Concatenate()([movies_out, users_out])
x = tf.layers.Dense(4, activation='relu')(conc_layer)
# NOTE(review): a ReLU on the regression output yields zero gradient whenever
# its pre-activation is negative; a linear output may be the safer choice.
# Left unchanged here.
x_out = tf.layers.Dense(1, activation='relu')(x)
# Input order is [movies, users]; fit/predict calls must pass inputs in that order
model = tf.Model([input_movies, input_users], x_out)

In [None]:
# Adam optimiser with a mean-squared-error loss on the predicted rating
opt = tf.optimizers.Adam(learning_rate=1e-3)
model.compile(loss='mean_squared_error', optimizer=opt)
model.summary()

In [None]:
# Train on (Movie_Id, User) pairs to predict Rating. The inputs are passed in
# the same [movies, users] order in which the model's inputs were declared.
# The held-out split serves as validation data, and the per-epoch losses are
# kept in `hist` for plotting.
hist = model.fit([Xtrain.Movie_Id, Xtrain.User], Xtrain.Rating, 
                 batch_size=64, 
                 epochs=10, 
                 verbose=1,
                 validation_data=([Xtest.Movie_Id, Xtest.User], Xtest.Rating))

In [None]:
# Per-epoch training and validation loss recorded by model.fit
train_loss, val_loss = hist.history['loss'], hist.history['val_loss']
plt.plot(train_loss, label='Train Loss', color='r')
plt.plot(val_loss, label='Validation Loss', color='b')
plt.title("Train and Validation Loss Curve")
plt.legend()
plt.show()

In [None]:
# Save the trained model under the path 'model' in the working directory.
# NOTE(review): some Keras releases require the path to end in `.keras` or
# `.h5` - confirm against the installed version.
model.save('model')

In [None]:
# Fetch the layer named 'embedding' and take its first weight array.
# NOTE(review): presumably this is the movie embedding - the lookup only works
# while a layer in the current model carries exactly that name.
movie_em = model.get_layer(name='embedding')
movie_em_weights = movie_em.get_weights()[0]
movie_em_weights.shape

In [None]:
# Separate copy of the merged table, indexed by Movie_Id (set_index returns a new frame)
data_copy = data_merge.set_index("Movie_Id")

In [None]:
# Every distinct movie id in the merged data, in order of first appearance
m_id = list(data_merge['Movie_Id'].unique())

**To visualize the data on embedding projector of Tensorflow >> 
[Embedding Projector](https://projector.tensorflow.org/)**

In [None]:

# dict_map = {}
# for i in m_id:
#     dict_map[i] = data_copy.iloc[i]['Name']
    
# out_v = open('vecs.tsv', 'w')
# out_m = open('meta.tsv', 'w')
# for i in m_id:
#     book = dict_map[i]
#     embeddings = movie_em_weights[i]
#     out_m.write(book + "\n")
#     out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
    
# out_v.close()
# out_m.close()

In [None]:
#Making recommendations for user 50
movie_arr = np.array(m_id)      # every distinct movie id
user = np.full(len(m_id), 50)   # the same user id, repeated once per movie
# Score each (movie, user 50) pair; inputs follow the model's [movies, users] order
pred = model.predict([movie_arr, user])
pred

In [None]:
# Flatten the predictions to one dimension
pred = pred.ravel()
# Positions of the 7 highest predicted ratings, best first
pred_ids = np.argsort(-pred)[:7]
pred_ids


**Below, we show the top 7 predictions for user number 50 (the user ID passed to the model above).
Note that the values in `pred_ids` are positions in the list of unique movie IDs (`m_id`), so they identify movies rather than users.**

In [None]:
# pred_ids are positions in m_id (the list of unique movie ids that was scored),
# not row positions in data_merge. Indexing data_merge with them by position
# returns unrelated rating rows, so translate them to movie ids first and look
# the titles up by Movie_Id.
top_movie_ids = np.asarray(m_id)[pred_ids]
# movie_title is indexed by Movie_Id; .loc keeps the best-first order of pred_ids.
# reshape(-1) guards against pred still being a column vector.
movie_title.loc[top_movie_ids].assign(
    Predicted_Rating=np.asarray(pred).reshape(-1)[pred_ids]
)