<h1> Neural User Based Collaborative Filtering </h1>
<h2> Author:- Souvik Chakraborty </h2>

**References**

https://www.comp.nus.edu.sg/~xiangnan/papers/ncf.pdf - Neural collaborative filtering paper

Dataset Reference:-https://www.kaggle.com/code/kwonjeongmin/clothing-fit-dataset-for-size-recommendation/input

In [44]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Model, Sequential
from pathlib import Path
import matplotlib.pyplot as plt

In [45]:

df = pd.read_excel("Fit Recommendation.xlsx")


In [46]:
df.head()

Unnamed: 0,item_id,size,quality,length,fit,user_id
0,123373,7,5.0,just right,small,991571
1,123373,13,3.0,just right,small,587883
2,123373,7,2.0,slightly long,small,395665
3,123373,21,5.0,just right,fit,875643
4,123373,18,5.0,slightly long,small,944840


In [47]:
df[df.user_id==409547]

Unnamed: 0,item_id,size,quality,length,fit,user_id
7022,152702,4,3.0,just right,fit,409547
27873,399074,4,4.0,just right,fit,409547
41534,454030,4,5.0,slightly long,large,409547
46064,492279,15,4.0,slightly short,fit,409547
54190,630114,4,5.0,just right,fit,409547
68781,715662,8,5.0,just right,fit,409547
77171,766618,5,5.0,just right,fit,409547


In [48]:
df_1=df[["item_id","user_id","length"]]

In [49]:
df_1

Unnamed: 0,item_id,user_id,length
0,123373,991571,just right
1,123373,587883,just right
2,123373,395665,slightly long
3,123373,875643,just right
4,123373,944840,slightly long
...,...,...,...
82785,807722,727820,just right
82786,807722,197040,slightly long
82787,807722,102493,just right
82788,807722,756491,just right


In [50]:
df_1.isnull().sum()

item_id     0
user_id     0
length     35
dtype: int64

In [51]:
df_1=df_1.dropna()

In [52]:
df_1.isnull().sum()

item_id    0
user_id    0
length     0
dtype: int64

In [53]:
mappings = {
    'just right': 5,
    'slightly long': 4,
    'slightly short': 3,
    'very long': 2,
    'very short': 1
}

In [54]:
df_1["rating"]=df_1["length"].map(mappings)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1["rating"]=df_1["length"].map(mappings)


In [55]:
df_1=df_1.drop(["length"],axis=1)
df_1

Unnamed: 0,item_id,user_id,rating
0,123373,991571,5
1,123373,587883,5
2,123373,395665,4
3,123373,875643,5
4,123373,944840,4
...,...,...,...
82785,807722,727820,5
82786,807722,197040,4
82787,807722,102493,5
82788,807722,756491,5


In [56]:
df_1.isnull().sum()

item_id    0
user_id    0
rating     0
dtype: int64

In [57]:
user_ids = df_1["user_id"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
item_ids = df_1["item_id"].unique().tolist()
item2item_encoded = {x: i for i, x in enumerate(item_ids)}
item_encoded2item = {i: x for i, x in enumerate(item_ids)}
df_1["user"] = df_1["user_id"].map(user2user_encoded)
df_1["item"] = df_1["item_id"].map(item2item_encoded)


In [58]:
min_rating = min(df_1["rating"])
max_rating = max(df_1["rating"])
df_1["rating"]=df_1["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values

In [59]:
df_1

Unnamed: 0,item_id,user_id,rating,user,item
0,123373,991571,1.00,0,0
1,123373,587883,1.00,1,0
2,123373,395665,0.75,2,0
3,123373,875643,1.00,3,0
4,123373,944840,0.75,4,0
...,...,...,...,...,...
82785,807722,727820,1.00,1528,1373
82786,807722,197040,0.75,47922,1373
82787,807722,102493,1.00,47923,1373
82788,807722,756491,1.00,3538,1373


In [60]:
from sklearn.model_selection import train_test_split as tts
x = df_1[["user", "item"]].values
df_1["rating"] = df_1["rating"].values.astype(np.float32)
y=df_1["rating"].values
x_train, x_val, y_train, y_val = tts(x,y,test_size = 0.1)

In [61]:
x

array([[    0,     0],
       [    1,     0],
       [    2,     0],
       ...,
       [47923,  1373],
       [ 3538,  1373],
       [47924,  1373]], dtype=int64)

In [62]:
x_train.shape

(74479, 2)

In [63]:
y_train.shape

(74479,)

In [64]:
x_val

array([[32791,   746],
       [28160,   587],
       [27750,  1036],
       ...,
       [34100,   799],
       [24109,  1277],
       [15355,   169]], dtype=int64)

In [65]:
pip install pydot

Note: you may need to restart the kernel to use updated packages.


In [66]:
pip install graphviz 

Note: you may need to restart the kernel to use updated packages.


In [67]:
## defining the model
from tensorflow.keras.optimizers.legacy import Adam
from keras.regularizers import l2
embedding_size = 50
num_users = len(user2user_encoded)
num_items = len(item2item_encoded)

user_ips= layers.Input(shape=[1])
user_embedding = layers.Embedding(num_users,embedding_size,embeddings_initializer="he_normal",embeddings_regularizer=keras.regularizers.l2(1e-12))(user_ips)
user_vect= layers.Flatten()(user_embedding)

item_ips= layers.Input(shape=[1])
item_embedding = layers.Embedding(num_items, embedding_size, embeddings_initializer="he_normal",embeddings_regularizer=keras.regularizers.l2(1e-12))(item_ips)
item_vect= layers.Flatten()(item_embedding)

prod = layers.dot(inputs=[user_vect, item_vect],axes=1)

dense1= layers.Dense(150, activation='relu', kernel_initializer="he_normal",kernel_regularizer=l2( 1e-12 ))(prod)
dense2= layers.Dense(50, activation='relu',kernel_initializer="he_normal",kernel_regularizer=l2(1e-12))(dense1)
dense3= layers.Dense(1,activation='relu',kernel_regularizer=l2( 1e-12))(dense2)

model = Model([user_ips, item_ips], dense3)
#optimizer = keras.optimizers.Adam(lr= 0.9)
model.compile(optimizer="adam",loss='mse',metrics=["mae"])


#keras.utils.plot_model(model,show_shapes=True)

In [68]:

history = model.fit([x_train[:,0], x_train[:,1]], y_train, batch_size=128,epochs=5,verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [85]:
#user_id = df.user_id.sample(1).iloc[0]
#user_id

In [86]:
print("Items purchased by user user 770991")
print("===============================================")
df[df.user_id==770991][["item_id","user_id"]]

Items purchased by user user 770991


Unnamed: 0,item_id,user_id
27226,397005,770991
29299,402677,770991
32417,407134,770991
35277,412737,770991
76311,757731,770991


In [87]:
print("Items not purchased by user user 770991")
print("===============================================")
l=list(df[df.user_id!=770991]["item_id"])
l

Items not purchased by user user 770991


[123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 123373,
 124024,
 124024,
 124024,
 124024,
 124024,
 124024,
 124024,
 124024,
 124024,
 124024,
 124024,
 124024,
 124024,
 124024,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 124124,
 

In [114]:
items_not_purchased = list(
    set(l).intersection(set(item2item_encoded.keys()))
)
items_not_purchased

[204815,
 663571,
 696347,
 294941,
 401438,
 753696,
 294960,
 393265,
 729139,
 794675,
 778295,
 245815,
 507970,
 548931,
 647235,
 761925,
 188495,
 491604,
 180311,
 606302,
 245855,
 401503,
 573542,
 417902,
 303219,
 311425,
 598147,
 721034,
 549006,
 409758,
 245920,
 606369,
 589989,
 524455,
 442542,
 540857,
 557241,
 573631,
 196801,
 377027,
 237767,
 614600,
 262349,
 803028,
 483544,
 450779,
 434399,
 418018,
 401651,
 278772,
 286965,
 712948,
 147714,
 360707,
 459014,
 753935,
 770324,
 385302,
 254236,
 319776,
 287009,
 745765,
 336169,
 540970,
 213295,
 368949,
 704829,
 672063,
 155981,
 475478,
 475481,
 737625,
 680284,
 581984,
 639328,
 205161,
 344429,
 401773,
 360818,
 565629,
 647551,
 688521,
 303502,
 663957,
 467353,
 360875,
 631215,
 418224,
 573879,
 434622,
 778687,
 786908,
 803297,
 549348,
 123373,
 451055,
 172529,
 729587,
 664054,
 516599,
 426491,
 221693,
 360958,
 623103,
 442881,
 557588,
 320025,
 426521,
 606749,
 221728,
 369205,
 

In [88]:
df[df.user_id!=770991]["item_id"].unique()

array([123373, 124024, 124124, ..., 807252, 807384, 807722], dtype=int64)

In [115]:
items_not_purchased_index = [[item2item_encoded.get(x)] for x in items_not_purchased]

In [116]:
len(items_not_purchased_index)

1374

In [117]:
user_encoder = user2user_encoded.get(770991)
user_encoder

22132

In [92]:
df_1[df_1["user_id"]==770991]

Unnamed: 0,item_id,user_id,rating,user,item
27226,397005,770991,1.0,22132,549
29299,402677,770991,1.0,22132,563
32417,407134,770991,1.0,22132,572
35277,412737,770991,1.0,22132,581
76311,757731,770991,1.0,22132,1277


In [120]:
user_item_array = np.hstack(([[user_encoder]] * len(items_not_purchased), items_not_purchased_index))
#useruser_item_array_item_array=user_item_array.astype('float32') ## 1st column is user_index and 2nd col is the list of movie indexes not watched by the user
#user_item_array=user_item_array.astype('int64')

In [121]:
user_item_array=pd.DataFrame(user_item_array,columns=["user","item"])
user_item_array

Unnamed: 0,user,item
0,22132,157
1,22132,1080
2,22132,1147
3,22132,341
4,22132,559
...,...,...
1369,22132,526
1370,22132,1130
1371,22132,15
1372,22132,210


In [123]:
user_item_array.isnull().sum()

user    0
item    0
dtype: int64

In [124]:
m = user_item_array[["user", "item"]].values.astype('float32')

In [125]:
ratings = model.predict([m[:,0],m[:,1]]).flatten()
ratings



array([0.93648934, 0.72232753, 0.62228733, ..., 0.71272993, 0.97663623,
       0.9891717 ], dtype=float32)

In [127]:
top_ratings_indices = ratings.argsort()[-10:][::-1] # indices of highest 10 ratings

In [138]:
top_ratings_indices

array([ 847, 1148, 1338,   80,  718,  624, 1211,  137,  219, 1004],
      dtype=int64)

In [139]:
recommended_item_ids = [item_encoded2item.get(items_not_purchased_index[x][0]) for x in top_ratings_indices]

In [140]:
recommended_item_ids

[578438,
 580254,
 163603,
 647551,
 200824,
 413303,
 752615,
 393954,
 754891,
 194376]

In [150]:
print("----" * 12)
print("Top 10 item recommendations for user 770991 ")
print("----" * 12)
print(recommended_item_ids)

------------------------------------------------
Top 10 item recommendations for user 770991 
------------------------------------------------
[578438, 580254, 163603, 647551, 200824, 413303, 752615, 393954, 754891, 194376]
