# Recommendation System

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import random
from scipy import sparse
from scipy.sparse import csr_matrix
from tqdm import tqdm
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn.decomposition import TruncatedSVD

In [None]:
# Read the raw ratings file into the working frame and report its size.
ratings_csv = '/kaggle/input/amazon-ratings/ratings_Beauty.csv'
df = pd.read_csv(ratings_csv)
print('Shape: ', df.shape)
df.head()

## Data Analysis

### ->Identifying Duplicates in data

In [None]:
# A row counts as a duplicate when all four columns match an earlier row.
key_columns = ["UserId", "ProductId", "Rating", "Timestamp"]
duplicates = df.duplicated(subset=key_columns).sum()
print(' Duplicate records: ', duplicates)

### ->Unique features

In [None]:
# Distinct users / products and the total number of rating rows.
distinct_users = df["UserId"].unique()
distinct_products = df["ProductId"].unique()
print('unique users:', len(distinct_users))
print('unique products:', len(distinct_products))
print("total ratings: ", len(df))

### ->Checking for NaN values

In [None]:
# Per column: does it contain at least one missing value?
df.isna().any()

### ->Analysing the number of rated products per user:

In [None]:
# Number of ratings given by each user, most active users first.
products_user = (
    df.groupby("UserId")["Rating"]
    .count()
    .sort_values(ascending=False)
)
products_user.head()

### ->Analysing the number of ratings per product:

In [None]:
# Number of ratings received by each product, most rated products first.
product_rated = (
    df.groupby("ProductId")["Rating"]
    .count()
    .sort_values(ascending=False)
)
product_rated.head()



## Data Visualizing and Cleaning

In [None]:
# Products rated per user, sorted descending; later cells rely on this order.
rated_users = (
    df.groupby("UserId")["ProductId"]
    .count()
    .sort_values(ascending=False)
)
print(rated_users)

In [None]:
# How many users survive each candidate "minimum number of rated products" cut-off.
for min_rated in (5, 4, 3, 2):
    qualifying_users = rated_users[rated_users >= min_rated].count()
    print(f'Number of users rated atleast {min_rated} prodcuts:', qualifying_users)


In [None]:
# Keep only users with five or more ratings. `rated_users` is sorted in
# descending order, so selecting by mask yields the same ids, in the same
# order, as taking the first N entries.
active_mask = rated_users >= 5
users = rated_users[active_mask].index.tolist()
df = df[df['UserId'].isin(users)]


In [None]:
# Ratings received per product (after the user filter), most rated first.
rated_products = (
    df.groupby("ProductId")["UserId"]
    .count()
    .sort_values(ascending=False)
)

# Scatter the first 20 rows: rating against product id, then rating against user id.
for id_column, axis_label in (("ProductId", 'Product ID'), ("UserId", 'User ID')):
    plt.scatter(x=df["Rating"][:20], y=df[id_column][:20])
    plt.xlabel('Rating')
    plt.ylabel(axis_label)

    plt.show()

In [None]:
# Count the products that have strictly more than N ratings.
# The comparison is `> N`, so the label says "more than N": the previous
# "minimum of N" wording described `>= N`, which is not what was counted.
# The counts are unchanged and the N=5 line still matches the `> 5` product filter.
for threshold in (5, 4, 3, 2, 1):
    products_above = rated_products[rated_products > threshold].count()
    print(f'Number of products with more than {threshold} reviews/ratings:', products_above)


In [None]:
# Keep only products with more than five ratings. `rated_products` is sorted
# in descending order, so the mask selects the same ids, in the same order,
# as the previous positional slice.
popular_mask = rated_products > 5
products = rated_products[popular_mask].index.tolist()

# Take an explicit copy: later cells assign columns on this frame, and doing
# that on a filtered selection raises pandas' SettingWithCopyWarning.
df = df[df['ProductId'].isin(products)].copy()
products[:5]


In [None]:
# Replace each raw user id by its 1-based position in `users`.
# A dict lookup replaces `users.index(...)`, which scanned the whole list for
# every row. `users` comes from a groupby index, so its entries are unique and
# the mapping equals `users.index(uid) + 1`.
u_v = df["UserId"].values
user_code = {uid: code for code, uid in enumerate(users, start=1)}
n_v = [user_code[uid] for uid in u_v]

# `assign` builds a new frame instead of writing into a filtered selection.
df = df.assign(UserId=n_v)


In [None]:
# Sanity check: the number of distinct codes must equal the number of distinct users in the frame.
b_set = set(n_v)
number_of_unique_values = len(b_set)
print(number_of_unique_values)
distinct_user_codes = df['UserId'].unique()
print(len(distinct_user_codes))


In [None]:
# Replace each raw product id by its 1-based position in `products`.
# A dict lookup replaces `products.index(...)`, which scanned the whole list
# for every row. `products` comes from a groupby index, so its entries are
# unique and the mapping equals `products.index(pid) + 1`.
p_v = df["ProductId"].values
product_code = {pid: code for code, pid in enumerate(products, start=1)}
n_v = [product_code[pid] for pid in p_v]

# `assign` builds a new frame instead of writing into a filtered selection.
df = df.assign(ProductId=n_v)

## Model building:

### -> Truncated SVD

In [None]:
# User x product rating matrix. Rows are indexed directly by the UserId codes
# and columns by the ProductId codes, so row r of `t_svd` belongs to UserId r.
rating_values = df["Rating"].values
user_rows = df["UserId"].values
product_cols = df["ProductId"].values
matrix_user = csr_matrix((rating_values, (user_rows, product_cols)))

start = datetime.now()
u_svd = TruncatedSVD(n_components=50, algorithm='randomized', random_state=21)
t_svd = u_svd.fit_transform(matrix_user)
t_svd.shape

In [None]:
# Product x user rating matrix (the transpose layout of the user matrix).
# Rows are indexed directly by the ProductId codes, so row r of `trunc_prod`
# belongs to ProductId r.
product_rows = df["ProductId"].values
user_cols = df["UserId"].values
sparse_prod = csr_matrix((df["Rating"].values, (product_rows, user_cols)))

product_svd = TruncatedSVD(n_components=50, algorithm='randomized', random_state=21)
trunc_prod = product_svd.fit_transform(sparse_prod)
sparse_prod.shape

In [None]:
# Mean rating of the user who appears in the fifth row of the frame.
sample_user_id = df["UserId"].iloc[4]
user = df.loc[df["UserId"] == sample_user_id, "Rating"].mean()
user

### -> Creating features

In [None]:
# Per-row baseline features:
#   user - mean rating given by the row's user
#   prod - mean rating received by the row's product
#   gavg - global mean rating (same value on every row)
# Grouped transforms compute the per-user / per-product means in one pass;
# the previous loop filtered the whole frame twice for every row (quadratic).
user_mean = df.groupby("UserId")["Rating"].transform("mean")
product_mean = df.groupby("ProductId")["Rating"].transform("mean")

# Kept as plain lists under their original names in case other cells read them.
useravg = user_mean.tolist()
productavg = product_mean.tolist()

df = df.assign(user=user_mean, prod=product_mean, gavg=df["Rating"].mean())
df = df.sort_values(by=['Timestamp'])
df.head()
    

### ->sorting the data w.r.t TimeStamp

In [None]:
# Order the ratings chronologically so the split below is time based.
df = df.sort_values(by='Timestamp')
df.head()

### -> Splitting the data

In [None]:
# Chronological 80/20 split: the frame is already sorted by Timestamp, so the
# oldest 80 % of rows train the model and the newest 20 % test it.
# The cut-off is rows * 0.8; the previous `rows ** 0.8` raised the row count
# to a power and left only a small fraction of the data for training.
split_at = int(df.shape[0] * 0.8)
training = df[:split_at]
testing = df[split_at:]

### -> Train and test Data Generators

In [None]:
def data_generator(up_data, n_samples=10000, batch_size=32,
                   user_factors=None, product_factors=None):
    """Yield mini-batches of 13 hand-built features for randomly sampled rating rows.

    Features per sampled row (user u, product p), in order:
      * 5 ratings of p given by the users most similar to u (padded with 3),
      * 5 ratings given by u to the products most similar to p (padded with 3),
      * the row's ``user``, ``prod`` and ``gavg`` columns.

    Parameters
    ----------
    up_data : DataFrame with columns UserId, ProductId, Rating, user, prod, gavg.
    n_samples : number of rows drawn without replacement (clamped to the frame size).
    batch_size : rows per yielded batch; a trailing partial batch is dropped.
    user_factors, product_factors : 2-D embedding arrays whose row r belongs to
        id r. Default to the notebook-level ``t_svd`` / ``trunc_prod``.

    Yields
    ------
    (X, Y) with X of shape (batch_size, 1, 13) and Y of shape (batch_size,).

    NOTE(review): the neighbour lookups do not exclude the sampled row itself,
    so the target rating can appear among the features. Confirm this is intended.
    """
    if user_factors is None:
        user_factors = t_svd
    if product_factors is None:
        product_factors = trunc_prod

    def similarity_rank(query_id, factors):
        """Return rank, where rank[r] is the position of id r by descending dot product with query_id."""
        # The rating matrices are indexed by the ids themselves, so the query
        # vector is row `query_id` (the previous `query_id - 1` picked the
        # neighbouring id's vector). The old division by the matrix norm was a
        # positive constant that never affected the ordering, so it is dropped.
        scores = factors @ factors[query_id]
        order = np.argsort(scores)[::-1]
        rank = np.empty(order.size, dtype=np.int64)
        rank[order] = np.arange(order.size)
        return rank

    def nearest_ratings(rank, ids, ratings, k=5, fill=3):
        """Ratings belonging to the k best-ranked ids, padded with `fill` to length k."""
        ids = np.asarray(ids, dtype=np.int64)
        ratings = np.asarray(ratings)
        known = (ids >= 0) & (ids < rank.size)
        ids, ratings = ids[known], ratings[known]
        # Stable sort keeps frame order for repeated ids (a user rating a product twice).
        nearest = np.argsort(rank[ids], kind="stable")[:k]
        return [int(r) for r in ratings[nearest]] + [fill] * (k - nearest.size)

    user_ids = up_data["UserId"].to_numpy()
    product_ids = up_data["ProductId"].to_numpy()
    ratings = up_data["Rating"].to_numpy()
    baselines = up_data[["user", "prod", "gavg"]].to_numpy()

    n_rows = up_data.shape[0]
    # Clamp so a frame smaller than n_samples does not make random.sample raise.
    samp = random.sample(range(n_rows), min(n_samples, n_rows))

    # "+ 1" keeps the final full batch when len(samp) is a multiple of batch_size.
    for start in range(0, len(samp) - batch_size + 1, batch_size):
        X = []
        Y = []
        for i in samp[start:start + batch_size]:
            uid, pid = user_ids[i], product_ids[i]
            same_product = product_ids == pid
            same_user = user_ids == uid

            x = nearest_ratings(similarity_rank(uid, user_factors),
                                user_ids[same_product], ratings[same_product])
            x += nearest_ratings(similarity_rank(pid, product_factors),
                                 product_ids[same_user], ratings[same_user])
            x.extend(baselines[i])
            X.append(x)
            Y.append(ratings[i])

        yield np.array(X).reshape(batch_size, 1, 13), np.array(Y)
        

In [None]:
def test_data_generator(up_data, n_samples=5000, batch_size=32,
                        user_factors=None, product_factors=None):
    """Yield validation mini-batches of 13 hand-built features for randomly sampled rating rows.

    Features per sampled row (user u, product p), in order:
      * 5 ratings of p given by the users most similar to u (padded with 3),
      * 5 ratings given by u to the products most similar to p (padded with 3),
      * the row's ``user``, ``prod`` and ``gavg`` columns.

    Parameters
    ----------
    up_data : DataFrame with columns UserId, ProductId, Rating, user, prod, gavg.
    n_samples : number of rows drawn without replacement (clamped to the frame size).
    batch_size : rows per yielded batch; a trailing partial batch is dropped.
    user_factors, product_factors : 2-D embedding arrays whose row r belongs to
        id r. Default to the notebook-level ``t_svd`` / ``trunc_prod``.

    Yields
    ------
    (X, Y) with X of shape (batch_size, 1, 13) and Y of shape (batch_size,).

    NOTE(review): the neighbour lookups do not exclude the sampled row itself,
    so the target rating can appear among the features. Confirm this is intended.
    """
    if user_factors is None:
        user_factors = t_svd
    if product_factors is None:
        product_factors = trunc_prod

    def similarity_rank(query_id, factors):
        """Return rank, where rank[r] is the position of id r by descending dot product with query_id."""
        # The rating matrices are indexed by the ids themselves, so the query
        # vector is row `query_id` (the previous `query_id - 1` picked the
        # neighbouring id's vector). The old division by the matrix norm was a
        # positive constant that never affected the ordering, so it is dropped.
        scores = factors @ factors[query_id]
        order = np.argsort(scores)[::-1]
        rank = np.empty(order.size, dtype=np.int64)
        rank[order] = np.arange(order.size)
        return rank

    def nearest_ratings(rank, ids, ratings, k=5, fill=3):
        """Ratings belonging to the k best-ranked ids, padded with `fill` to length k."""
        ids = np.asarray(ids, dtype=np.int64)
        ratings = np.asarray(ratings)
        known = (ids >= 0) & (ids < rank.size)
        ids, ratings = ids[known], ratings[known]
        # Stable sort keeps frame order for repeated ids (a user rating a product twice).
        nearest = np.argsort(rank[ids], kind="stable")[:k]
        return [int(r) for r in ratings[nearest]] + [fill] * (k - nearest.size)

    user_ids = up_data["UserId"].to_numpy()
    product_ids = up_data["ProductId"].to_numpy()
    ratings = up_data["Rating"].to_numpy()
    baselines = up_data[["user", "prod", "gavg"]].to_numpy()

    n_rows = up_data.shape[0]
    # Clamp so a frame smaller than n_samples does not make random.sample raise.
    samp = random.sample(range(n_rows), min(n_samples, n_rows))

    # "+ 1" keeps the final full batch when len(samp) is a multiple of batch_size.
    for start in range(0, len(samp) - batch_size + 1, batch_size):
        X = []
        Y = []
        for i in samp[start:start + batch_size]:
            uid, pid = user_ids[i], product_ids[i]
            same_product = product_ids == pid
            same_user = user_ids == uid

            x = nearest_ratings(similarity_rank(uid, user_factors),
                                user_ids[same_product], ratings[same_product])
            x += nearest_ratings(similarity_rank(pid, product_factors),
                                 product_ids[same_user], ratings[same_user])
            x.extend(baselines[i])
            X.append(x)
            Y.append(ratings[i])

        yield np.array(X).reshape(batch_size, 1, 13), np.array(Y)
        

In [None]:
# Instantiate one training and one validation batch generator.
train_gen, test_gen = data_generator(training), test_data_generator(testing)

In [None]:
from tensorflow.keras import layers
import tensorflow as tf
tf.keras.backend.clear_session()

def create_func_model():
    """Build and compile the rating regressor.

    Architecture: input of shape (1, 13) -> LSTM(16) returning its sequence and
    final states -> LSTM(16) started from those states -> Dense(32, relu) ->
    Dense(1). Compiled with the Adam optimizer and mean-squared-error loss.

    Returns
    -------
    tf.keras.Model named 'model_func'.
    """
    inputs = tf.keras.Input(shape=(1, 13))

    # The first LSTM returns [sequence, hidden state, cell state]. The original
    # passed that whole list to the second LSTM, which the tf.keras RNN call
    # interprets as "inputs + initial_state". Spell that out so the wiring is
    # explicit and does not depend on implicit list handling.
    seq, state_h, state_c = layers.LSTM(16, return_sequences=True, return_state=True)(inputs)
    o3 = layers.LSTM(16)(seq, initial_state=[state_h, state_c])
    o4 = layers.Dense(32, activation='relu')(o3)
    outputs = layers.Dense(1)(o4)

    model_func = tf.keras.Model(inputs, outputs, name='model_func')

    model_func.compile(optimizer='adam',
                       loss='mean_squared_error')
    return model_func

In [None]:
# Build the network and print its layer summary.
model = create_func_model()
model.summary()

In [None]:
# Train for five epochs. The generators are finite (one pass over a fresh
# random sample), so new ones are created for every epoch.
# `Model.fit` accepts generators directly; `fit_generator` is deprecated and
# no longer available in recent TensorFlow/Keras releases.
for epoch in range(5):
    print('Epoch ------------> ',epoch+1)
    train_gen = data_generator(training)
    test_gen = test_data_generator(testing)
    model.fit(
        train_gen,
        steps_per_epoch=10000 // 32,
        epochs=1,
        validation_data=test_gen,
        validation_steps=5000 // 32,
    )

In [None]:
# Load stored weights into the model; this replaces whatever the training loop above produced.
weights_path = "../input/dataset1/reccom-model (1) (1).h5"
model.load_weights(weights_path)


In [None]:
def prediction(userid, n_candidates=100, top_k=10):
    """Print the raw ids of the products the model scores highest for a user.

    A random set of candidate products is drawn, the same 13 features used for
    training are built for every (user, candidate) pair, the model predicts a
    rating for each pair, and the best `top_k` candidates are printed.

    Parameters
    ----------
    userid : raw user id as found in the original data (must be in `users`).
    n_candidates : number of random candidate products to score (clamped to
        the number of known products).
    top_k : number of recommendations to print.

    Raises
    ------
    ValueError : if `userid` is not in `users`, or the user has no ratings
        left in `df` after filtering.
    """
    x1 = users.index(userid) + 1  # same 1-based code the re-indexing step assigned

    def similarity_rank(query_id, factors):
        """Return rank, where rank[r] is the position of id r by descending dot product with query_id."""
        # The rating matrices are indexed by the ids themselves, so the query
        # vector is row `query_id` (the previous `query_id - 1` picked the
        # neighbouring id's vector). The old division by the matrix norm was a
        # positive constant that never affected the ordering, so it is dropped.
        scores = factors @ factors[query_id]
        order = np.argsort(scores)[::-1]
        rank = np.empty(order.size, dtype=np.int64)
        rank[order] = np.arange(order.size)
        return rank

    def nearest_ratings(rank, ids, ratings, k=5, fill=3):
        """Ratings belonging to the k best-ranked ids, padded with `fill` to length k."""
        ids = np.asarray(ids, dtype=np.int64)
        ratings = np.asarray(ratings)
        known = (ids >= 0) & (ids < rank.size)
        ids, ratings = ids[known], ratings[known]
        # Stable sort keeps frame order for repeated ids (a user rating a product twice).
        nearest = np.argsort(rank[ids], kind="stable")[:k]
        return [int(r) for r in ratings[nearest]] + [fill] * (k - nearest.size)

    user_ids = df["UserId"].to_numpy()
    product_ids = df["ProductId"].to_numpy()
    ratings = df["Rating"].to_numpy()

    own_rows = user_ids == x1
    if not own_rows.any():
        raise ValueError(f"user {userid!r} has no ratings left after filtering")

    # Candidate codes come from the real product list instead of a hard-coded
    # 1..14999 range, so every candidate maps to an existing product.
    n_products = len(products)
    candidates = random.sample(range(1, n_products + 1), min(n_candidates, n_products))
    if not candidates:
        return

    # Everything that depends only on the user is computed once, not per candidate.
    user_rank = similarity_rank(x1, t_svd)
    own_products, own_ratings = product_ids[own_rows], ratings[own_rows]
    us = df.loc[own_rows, "Rating"].mean()
    gavg = df['gavg'].iloc[0]

    X = []
    for x2 in candidates:
        raters = product_ids == x2
        x = nearest_ratings(user_rank, user_ids[raters], ratings[raters])
        x += nearest_ratings(similarity_rank(x2, trunc_prod), own_products, own_ratings)
        ps = df.loc[raters, "Rating"].mean()
        x.extend([us, ps, gavg])
        X.append(x)

    y_pred = model.predict(np.array(X).reshape(len(X), 1, 13))
    best_first = np.argsort(y_pred.reshape(-1))[::-1]
    for pos in best_first[:top_k]:
        # Map the 1-based product code back to the raw product id.
        print(products[candidates[pos] - 1])


In [None]:
# Enter the UserID to get recommendations for.
target_user = 'A3KEZLJ59C1JVH'
prediction(target_user)