In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import os
import warnings # Be careful!

from keras.models import load_model
from sklearn.model_selection import train_test_split
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from keras.models import Model
from keras.optimizers import Adam

import plotly.graph_objs as go


warnings.filterwarnings('ignore') # Be careful!
%matplotlib inline

In [None]:
# Ratings file: tab-separated, read without a header row, one rating per line.
data_header = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv(
    '../input/moviestarea/movies/u.data',
    sep='\t',
    header=None,
    names=data_header,
)
data.head()

In [None]:
# Movie metadata: five descriptive columns followed by one 0/1 flag column per genre.
item_header = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
item = pd.read_csv(
    '../input/moviestarea/movies/u.item',
    sep='|',
    header=None,
    names=item_header,
)
item.head()

In [None]:
# User demographics: pipe-separated, read without a header row.
user_header = ['user id', 'age', 'gender', 'occupation', 'zip code']
user = pd.read_csv(
    '../input/moviestarea/movies/u.user',
    sep='|',
    header=None,
    names=user_header,
)
user.head()

In [None]:
# Hold out 20% of the ratings for testing; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of distinct movies and users; used below to size the embedding tables.
unique_movies = data.item_id.nunique()
unique_users = data.user_id.nunique()

print(f"unique_movies: {unique_movies}, unique_users: {unique_users}")

In [None]:
# Baseline model: embed the movie id and the user id (5 dimensions each),
# concatenate both vectors and regress the rating through two dense layers.

# Movie branch. The "+1" leaves room for an extra index beyond the distinct-id count.
movies_input = Input(shape=(1,), name="Movies-Input")
movies_embedding = Embedding(
    input_dim=unique_movies + 1, output_dim=5, name="Movies-Embedding"
)(movies_input)
movies_vec = Flatten(name="Movies-Flatten")(movies_embedding)

# User branch, built the same way.
users_input = Input(shape=(1,), name="Users-Input")
users_embedding = Embedding(
    input_dim=unique_users + 1, output_dim=5, name="Users-Embedding"
)(users_input)
users_vec = Flatten(name="Users-Flatten")(users_embedding)

# Joint representation followed by the fully-connected regression head.
conc = Concatenate()([movies_vec, users_vec])
fc1 = Dense(units=128, activation='relu')(conc)
fc2 = Dense(units=32, activation='relu')(fc1)
out = Dense(units=1)(fc2)

# Inputs are ordered (users, movies); callers must feed arrays in that order.
model2 = Model(inputs=[users_input, movies_input], outputs=out)
model2.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Train the baseline on the training split; inputs follow the model's (users, movies) order.
history = model2.fit(
    x=[train.user_id, train.item_id],
    y=train.rating,
    epochs=5,
    verbose=1,
)

In [None]:
# Compare the baseline's predictions with the true ratings on the first test rows.
preview_rows = 10
predictions = model2.predict(
    [test.user_id.head(preview_rows), test.item_id.head(preview_rows)]
)

for row in range(preview_rows):
    print(predictions[row], test.rating.iloc[row])

In [None]:
# Collapse the genre flag columns (position 5 onwards) into one label:
# idxmax returns the name of the first column holding the row maximum.
genre_flags = item.iloc[:, 5:]
genre = genre_flags.idxmax(axis=1)

# Remove the flag columns in place, then attach the single-label column.
item.drop(columns=genre_flags.columns, inplace=True)
item['genre'] = genre

In [None]:
# Bucket ages by decade: edges 0, 10, ..., 100 and integer labels 1..10
# (label 3 covers ages above 20 up to and including 30).
cut_bins = list(range(0, 101, 10))
cut_labels = list(range(1, 11))
user['age'] = pd.cut(user['age'], bins=cut_bins, labels=cut_labels)

In [None]:
# Attach each movie's genre to its ratings, then drop the duplicated key column.
data = (
    data
    .merge(item[['movie id', 'genre']], left_on='item_id', right_on='movie id')
    .drop(columns=['movie id'])
)

In [None]:
# Attach each user's age bucket and gender to the ratings, then drop the duplicated key column.
data = (
    data
    .merge(user[['user id', 'age', 'gender']], left_on='user_id', right_on='user id')
    .drop(columns=['user id'])
)
data

In [None]:
# Preview the merged ratings table.
data.head(n=5)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# One encoder per categorical column; both are kept so the codes can be decoded later.
genre_enc = OrdinalEncoder()
gender_enc = OrdinalEncoder()

# Fit each encoder on its column and overwrite the column with the numeric codes.
for column, encoder in (('genre', genre_enc), ('gender', gender_enc)):
    encoder.fit(data[[column]])
    data[column] = encoder.transform(data[[column]])

In [None]:
# Preview the table after encoding genre and gender.
data.head(n=5)

In [None]:
# Distinct-value counts for every model feature; they size the embedding tables below.
unique_movies = data.item_id.nunique()
unique_users = data.user_id.nunique()
unique_genres = data.genre.nunique()
unique_ages = data.age.nunique()
unique_genders = data.gender.nunique()

cardinality_report = [
    f"unique_movies: {unique_movies}",
    f"unique_users: {unique_users}",
    f"unique_genres: {unique_genres}",
    f"unique_ages: {unique_ages}",
    f"unique_genders: {unique_genders}",
]
print(*cardinality_report)

In [None]:
# Full model: one embedding branch per feature (movie, user, genre, age bucket,
# gender), concatenated and fed to a two-layer dense regression head.

# Movie id -> 10-dimensional vector.
movies_input = Input(shape=(1,), name="Movies-Input")
movies_embedding = Embedding(
    input_dim=unique_movies + 1, output_dim=10, name="Movies-Embedding"
)(movies_input)
movies_vec = Flatten(name="Movies-Flatten")(movies_embedding)

# User id -> 10-dimensional vector.
users_input = Input(shape=(1,), name="Users-Input")
users_embedding = Embedding(
    input_dim=unique_users + 1, output_dim=10, name="Users-Embedding"
)(users_input)
users_vec = Flatten(name="Users-Flatten")(users_embedding)

# Genre code -> 5-dimensional vector.
genres_input = Input(shape=(1,), name="Genres-Input")
genres_embedding = Embedding(
    input_dim=unique_genres + 1, output_dim=5, name="Genres-Embedding"
)(genres_input)
genres_vec = Flatten(name="Genres-Flatten")(genres_embedding)

# Age bucket -> 5-dimensional vector.
ages_input = Input(shape=(1,), name="Ages-Input")
ages_embedding = Embedding(
    input_dim=unique_ages + 1, output_dim=5, name="Ages-Embedding"
)(ages_input)
ages_vec = Flatten(name="Ages-Flatten")(ages_embedding)

# Gender code -> 2-dimensional vector.
genders_input = Input(shape=(1,), name="Genders-Input")
genders_embedding = Embedding(
    input_dim=unique_genders + 1, output_dim=2, name="Genders-Embedding"
)(genders_input)
genders_vec = Flatten(name="Genders-Flatten")(genders_embedding)

# Joint representation and fully-connected head with a single linear output.
conc = Concatenate()([movies_vec, users_vec, genres_vec, ages_vec, genders_vec])
fc1 = Dense(units=512, activation='relu')(conc)
fc2 = Dense(units=256, activation='relu')(fc1)
out = Dense(units=1)(fc2)

# Inputs are ordered (users, movies, genres, ages, genders); every fit/predict
# call must pass its arrays in exactly this order.
model = Model(
    inputs=[users_input, movies_input, genres_input, ages_input, genders_input],
    outputs=out,
)

adam = Adam(learning_rate=0.001)
model.compile(
    optimizer=adam,
    loss="mean_squared_error",
    metrics=['mae', 'mse'],
)

In [None]:
# Train on the full merged table; Keras holds out a 20% validation fraction.
# Feature order matches the model's (users, movies, genres, ages, genders) inputs.
fit_inputs = [data.user_id, data.item_id, data.genre, data.age, data.gender]
history = model.fit(
    x=fit_inputs,
    y=data.rating,
    batch_size=64,
    epochs=10,
    verbose=1,
    validation_split=0.2,
    validation_batch_size=64,
)

In [None]:
# Score the model on the whole table (this includes the rows it was trained on).
# The model is compiled with an MSE loss and ['mae', 'mse'] metrics, so the
# first entry is the loss (MSE) and the second is the MAE.
metrics = model.evaluate(
    x=[data.user_id, data.item_id, data.genre, data.age, data.gender],
    y=data.rating,
)
mse, mae = metrics[0], metrics[1]

In [None]:
import matplotlib.pyplot as plt

# Per-epoch training and validation loss recorded by the last `fit` call.
train_loss = history.history['loss']
val_loss = history.history['val_loss']

loss_curves = (
    (train_loss, 'r', 'Train Loss'),
    (val_loss, 'b', 'Validation Loss'),
)
for values, colour, label in loss_curves:
    plt.plot(values, color=colour, label=label)

plt.title("Train and Validation Loss Curve")
plt.legend()
plt.show()

In [None]:
# Per-movie mean of the real ratings. Only `rating` is aggregated: averaging
# every column would also try to average the categorical `age` column, which
# is meaningless and can raise on recent pandas versions.
groupbyitem = data.groupby('item_id')[['rating']].mean()
x = groupbyitem.index
y = groupbyitem['rating']

# Band of +/- one mean absolute error around the real per-movie averages.
y_upper = y + mae
y_lower = y - mae

# The model yields one prediction per rating row. Average them per movie so
# the predicted curve has exactly one point for every entry of `x`; plotting
# the raw row-level predictions against the per-movie axis would misalign them.
row_pred = model.predict(
    [data.user_id, data.item_id, data.genre, data.age, data.gender]
).reshape(-1)
y_pred = (
    pd.Series(row_pred, index=data.index)
    .groupby(data['item_id'])
    .mean()
    .reindex(x)  # same movie order as the real-rating curve
)

# Plain lists for plotting.
x = x.to_list()
y = y.to_list()
y_pred = y_pred.to_list()
y_upper = y_upper.to_list()
y_lower = y_lower.to_list()

In [None]:

# One line trace per series, all sharing the same x axis.
trace_specs = [
    (y, 'real'),
    (y_pred, 'prediction'),
    (y_upper, 'avg max error'),
    (y_lower, 'avg min error'),
]
fig = go.Figure([
    go.Scatter(x=x, y=series, mode='lines', name=label)
    for series, label in trace_specs
])
fig.show()

In [None]:
# Preview the table used for the recommendations below.
data.head(n=5)

In [None]:

# Candidate set: every distinct movie id that appears in the ratings.
movies_ids = np.array(list(data.item_id.unique()))
n_candidates = len(movies_ids)

# Placeholder "unknown user" id, repeated once per candidate movie.
# NOTE(review): `unique_users` is a count of distinct ids; if user ids are
# 1-based and contiguous this value is a real user's id rather than an unseen
# one - confirm against the data.
users_ids = np.array([unique_users] * n_candidates)

# Placeholder "unknown genre" code, repeated once per candidate movie.
genres_ids = np.array([unique_genres] * n_candidates)

In [None]:
def top5movies(users_ids, movies_ids, genres_ids, ages_ids, women_ids, top_n=5):
    """Return the ids of the candidate movies with the highest predicted rating.

    All id arguments are equally long arrays, one entry per candidate, and are
    passed to the global `model` in its (users, movies, genres, ages, genders)
    input order.

    Parameters
    ----------
    users_ids, movies_ids, genres_ids, ages_ids : array-like
        Feature values for each candidate row.
    women_ids : array-like
        Gender code for each candidate row. Despite the name it is used for
        whichever gender code the caller passes.
    top_n : int, default 5
        Number of movies to return.

    Returns
    -------
    pandas.Series
        The `top_n` movie ids, ordered from highest to lowest predicted rating.
    """
    # predict() returns shape (n, 1); flatten to one score per candidate.
    pred = model.predict([users_ids, movies_ids, genres_ids, ages_ids, women_ids]).reshape(-1)

    ranked = pd.DataFrame({'movies': list(movies_ids), 'rating': pred}).sort_values(
        by='rating', ascending=False
    )
    return ranked['movies'][:top_n]

In [None]:
n_candidates = len(movies_ids)

# Placeholder "unknown age" id for every candidate.
# NOTE(review): age labels are 1-based, so `unique_ages` may coincide with a
# real age bucket - confirm.
ages_ids = np.array([unique_ages] * n_candidates)

# Gender code 0 (F) for every candidate.
women_ids = np.array([0] * n_candidates)

top5movies_women = top5movies(users_ids, movies_ids, genres_ids, ages_ids, women_ids).to_list()


In [None]:
# Metadata of the movies recommended for women (shown in catalogue order, not rank order).
item.loc[item['movie id'].isin(top5movies_women)]

In [None]:
# Placeholder "unknown age" id for every candidate movie.
# NOTE(review): age labels are 1-based, so `unique_ages` may coincide with a
# real age bucket - confirm.
ages_ids = np.array([unique_ages for i in range(len(movies_ids))])

# Gender code 1 (M) for every candidate movie. Stored under `men_ids`: the
# previous name `women_ids` contradicted the value it held.
men_ids = np.array([1 for i in range(len(movies_ids))])

top5movies_men = top5movies(users_ids, movies_ids, genres_ids, ages_ids, men_ids).to_list()

In [None]:
# Metadata of the movies recommended for men (shown in catalogue order, not rank order).
item.loc[item['movie id'].isin(top5movies_men)]

In [None]:
n_candidates = len(movies_ids)

# Age bucket 3 (ages 20 - 30) for every candidate movie.
ages_ids = np.array([3] * n_candidates)

# Placeholder "unknown gender" code (one past the encoded genders) for every candidate.
genders_ids = np.array([unique_genders] * n_candidates)

top5movies_20_30 = top5movies(users_ids, movies_ids, genres_ids, ages_ids, genders_ids).to_list()

In [None]:
# Metadata of the movies recommended for the 20 - 30 age bucket (catalogue order, not rank order).
item.loc[item['movie id'].isin(top5movies_20_30)]

In [None]:

# Candidate set: every distinct encoded genre that appears in the ratings.
genres_ids = np.array(list(data.genre.unique()))
n_candidates = len(genres_ids)

# Placeholder "unknown user" id, repeated once per candidate genre.
# NOTE(review): if user ids are 1-based and contiguous, `unique_users` is a
# real user's id rather than an unseen one - confirm against the data.
users_ids = np.array([unique_users] * n_candidates)

# Placeholder "unknown movie" id, repeated once per candidate genre.
# NOTE(review): the same caveat applies to `unique_movies` - confirm.
movies_ids = np.array([unique_movies] * n_candidates)

In [None]:
def top3genres(users_ids, movies_ids, genres_ids, ages_ids, women_ids, top_n=3):
    """Return the encoded genres with the highest predicted rating.

    All id arguments are equally long arrays, one entry per candidate genre,
    and are passed to the global `model` in its (users, movies, genres, ages,
    genders) input order.

    Parameters
    ----------
    users_ids, movies_ids, genres_ids, ages_ids : array-like
        Feature values for each candidate row.
    women_ids : array-like
        Gender code for each candidate row. Despite the name it is used for
        whichever gender code the caller passes.
    top_n : int, default 3
        Number of genres to return. The function previously returned five
        rows, contradicting its name.

    Returns
    -------
    pandas.Series
        The `top_n` genre codes, ordered from highest to lowest predicted rating.
    """
    # predict() returns shape (n, 1); flatten to one score per candidate.
    pred = model.predict([users_ids, movies_ids, genres_ids, ages_ids, women_ids]).reshape(-1)

    ranked = pd.DataFrame({'genres': list(genres_ids), 'rating': pred}).sort_values(
        by='rating', ascending=False
    )
    return ranked['genres'][:top_n]

In [None]:
# `movies_ids` now holds one placeholder per candidate genre, so its length
# is the number of candidates.
n_candidates = len(movies_ids)

# Placeholder "unknown age" id for every candidate.
# NOTE(review): age labels are 1-based, so `unique_ages` may coincide with a
# real age bucket - confirm.
ages_ids = np.array([unique_ages] * n_candidates)

# Gender code 0 (F) for every candidate.
women_ids = np.array([0.0] * n_candidates)

top3genres_women = top3genres(users_ids, movies_ids, genres_ids, ages_ids, women_ids).to_list()


In [None]:
# Decode the recommended genre codes back to genre names and print one per line.
women_genre_codes = np.array(top3genres_women).reshape(-1, 1)
for decoded_row in genre_enc.inverse_transform(women_genre_codes):
    print(decoded_row[0])

In [None]:
# Placeholder "unknown age" id for every candidate genre.
# NOTE(review): age labels are 1-based, so `unique_ages` may coincide with a
# real age bucket - confirm.
ages_ids = np.array([unique_ages for i in range(len(movies_ids))])

# Gender code 1 (M) for every candidate genre.
men_ids = np.array([1.0 for i in range(len(movies_ids))])

# Pass `men_ids`: the previous call passed `women_ids`, so the "men" result was
# just a repeat of the women's recommendation.
top3genres_men = top3genres(users_ids, movies_ids, genres_ids, ages_ids, men_ids).to_list()

In [None]:
# Decode the recommended genre codes back to genre names and print one per line.
men_genre_codes = np.array(top3genres_men).reshape(-1, 1)
for decoded_row in genre_enc.inverse_transform(men_genre_codes):
    print(decoded_row[0])

In [None]:
n_candidates = len(movies_ids)

# Age bucket 3 (ages 20 - 30) for every candidate genre.
ages_ids = np.array([3] * n_candidates)

# Placeholder "unknown gender" code (one past the encoded genders) for every candidate.
genders_ids = np.array([unique_genders] * n_candidates)

top3genres_20_30 = top3genres(users_ids, movies_ids, genres_ids, ages_ids, genders_ids).to_list()

In [None]:
# Decode the recommended genre codes back to genre names and print one per line.
age_genre_codes = np.array(top3genres_20_30).reshape(-1, 1)
for decoded_row in genre_enc.inverse_transform(age_genre_codes):
    print(decoded_row[0])