In [1]:
import heapq

import numpy as np
import pandas as pd

In [2]:
# Import the data set.
# The ID columns are pinned to str so pandas never infers a numeric dtype for
# them, which would strip the leading zeros of product IDs such as '0205616461'.
df = pd.read_csv('ratings_Beauty.csv', dtype={'UserId': str, 'ProductId': str})


In [3]:
# Drop the timestamp column. errors='ignore' keeps this cell re-runnable: once
# the column is gone, running the cell again is a no-op instead of a KeyError.
df = df.drop(columns='Timestamp', errors='ignore')

In [4]:
# Display the ratings frame (rendered as a truncated first/last-rows preview).
df

Unnamed: 0,UserId,ProductId,Rating
0,A39HTATAQ9V7YF,0205616461,5.0
1,A3JM6GV9MNOF9X,0558925278,3.0
2,A1Z513UWSAAO0F,0558925278,5.0
3,A1WMRR494NWEWV,0733001998,4.0
4,A3IAAVS479H7M7,0737104473,1.0
...,...,...,...
2023065,A3DEHKPFANB8VA,B00LORWRJA,5.0
2023066,A3DEHKPFANB8VA,B00LOS7MEE,5.0
2023067,AG9TJLJUN5OM3,B00LP2YB8E,5.0
2023068,AYBIB14QOI9PC,B00LPVG6V0,5.0


In [5]:
# Preview the first rows of the ratings frame.
df.head()

Unnamed: 0,UserId,ProductId,Rating
0,A39HTATAQ9V7YF,205616461,5.0
1,A3JM6GV9MNOF9X,558925278,3.0
2,A1Z513UWSAAO0F,558925278,5.0
3,A1WMRR494NWEWV,733001998,4.0
4,A3IAAVS479H7M7,737104473,1.0


In [6]:
# Total number of missing cells across all columns (0 means nothing is missing)
df.isna().sum().sum()

0

In [7]:
# Check whether there are any duplicated rows (0 means none).
df.duplicated().sum()

0

In [8]:
# Summarise the columns, their dtypes and the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023070 entries, 0 to 2023069
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   UserId     object 
 1   ProductId  object 
 2   Rating     float64
dtypes: float64(1), object(2)
memory usage: 46.3+ MB


In [9]:
# Wrap the ratings frame in a Surprise Dataset (ratings are on a 1-5 scale)
from surprise import Dataset, Reader

reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(
    df.loc[:, ['UserId', 'ProductId', 'Rating']],
    reader,
)

In [10]:
# Split the dataset: 80% for training, 20% held out for testing.
from surprise.model_selection import train_test_split

# A fixed random_state makes the shuffle/split identical on every run, so the
# metrics computed from it are reproducible after a kernel restart.
trainset, testset = train_test_split(surprise_data, test_size=0.2, random_state=42)


In [11]:
# Train the SVD matrix-factorisation model on the training split.
from surprise import SVD

# random_state seeds the model's random initialisation so repeated runs of this
# cell fit the same model; reg_all is the regularisation applied to all parameters.
model = SVD(reg_all=.001, random_state=42)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b064fb9610>

In [12]:
# Run the trained model over the held-out test set.
predictions = model.test(testset)

In [13]:
from surprise.accuracy import rmse

# Evaluate the model using RMSE.
# rmse() prints the score itself by default, which duplicated the line printed
# below; verbose=False leaves a single, full-precision report.
accuracy = rmse(predictions, verbose=False)
print(f"RMSE: {accuracy}")

RMSE: 1.2514
RMSE: 1.2513824407633767


In [14]:
# Save the trained model to disk.
from surprise.dump import dump
# File the fitted algorithm is written to.
filename = 'SVD_model.pkl'
dump(filename, algo=model)

In [15]:
# Estimated rating for the pair stored in the third test-set entry
# (element 0 is passed as the user, element 1 as the product).
model.predict(testset[2][0],testset[2][1]).est

3.6544046647146824

In [16]:
#Recommends products
def recommend_products(UserId, ProductId=None, n=5):
    """Return the ``n`` products with the highest predicted rating for a user.

    Every distinct product in the global ``df`` is scored with the global
    ``model`` and the best ``n`` are kept. Products the user has already rated
    are not filtered out.

    Parameters
    ----------
    UserId
        User identifier passed to ``model.predict``.
    ProductId
        Unused. Kept (and now optional) so existing two-argument calls keep
        working.
    n
        Number of recommendations to return. Defaults to 5, the value that was
        previously hard-coded.

    Returns
    -------
    list
        ``(product_id, predicted_rating)`` tuples ordered from the highest to
        the lowest predicted rating.
    """
    scored = (
        (pid, model.predict(UserId, pid).est)
        for pid in df['ProductId'].unique()
    )
    # nlargest only tracks the n best candidates instead of sorting every
    # scored product; it yields the same items, in the same order, as
    # sorted(scored, key=..., reverse=True)[:n].
    return heapq.nlargest(n, scored, key=lambda pair: pair[1])

In [17]:
# Print the best-scoring products returned for one test-set user
for product, rating in recommend_products(testset[2][0],testset[1000][1]):
    print("Recommended product:",product,", Predicted rate",rating)
    

Recommended product: B002YFN49I , Predicted rate 4.946628448977508
Recommended product: B003E1B6UU , Predicted rate 4.917637709387527
Recommended product: B009OWSHQE , Predicted rate 4.915196865660542
Recommended product: B008DWSJ1O , Predicted rate 4.908958838713593
Recommended product: B00466VJ04 , Predicted rate 4.907905715098418


In [18]:
# Eight products with the most ratings
df['ProductId'].value_counts().head(8)


B001MA0QY2    7533
B0009V1YR8    2869
B0043OYFKU    2477
B0000YUXI0    2143
B003V265QW    2088
B000ZMBSPE    2041
B003BQ6QXK    1918
B004OHQR1Q    1885
Name: ProductId, dtype: int64

In [19]:
# Eight users with the most ratings
df['UserId'].value_counts().head(8)


A3KEZLJ59C1JVH    389
A281NPSIMI1C2R    336
A3M174IC0VXOS2    326
A2V5R832QCSOMX    278
A3LJLRIZL38GG3    276
ALQGOMOY1F5X9     275
AKMEY1BSHSDG7     269
A3R9H6OKZHHRJD    259
Name: UserId, dtype: int64

In [20]:
# Distinct product IDs collected into a plain Python list
productid = list(df['ProductId'].unique())
productid[2]

'0733001998'

In [21]:
# Distinct user IDs collected into a plain Python list
userid = list(df['UserId'].unique())
userid[2]

'A1Z513UWSAAO0F'

In [23]:
# Output files for the two ID lists
file_name1 = 'user_id.csv'
file_name2 = 'product_id.csv'

# One single-column frame per ID list
df1 = pd.DataFrame({'UserId': userid})
df2 = pd.DataFrame({'ProductId': productid})

# Write both frames without the index column
df1.to_csv(file_name1, index=False)
df2.to_csv(file_name2, index=False)

In [24]:
# Read the exported user IDs back to check the file that was just written.
pd.read_csv("user_id.csv")

Unnamed: 0,UserId
0,A39HTATAQ9V7YF
1,A3JM6GV9MNOF9X
2,A1Z513UWSAAO0F
3,A1WMRR494NWEWV
4,A3IAAVS479H7M7
...,...
1210266,ADQ41IJPQW2TN
1210267,A1SJD7QDROVPCC
1210268,AFPRQT3V8C1U1
1210269,A1RYQPQ01T5D5R


In [25]:
# Read the exported product IDs back to check the file that was just written.
# dtype=str guarantees IDs such as '0205616461' keep their leading zeros
# instead of depending on pandas' type inference.
pd.read_csv("product_id.csv", dtype={'ProductId': str})

Unnamed: 0,ProductId
0,0205616461
1,0558925278
2,0733001998
3,0737104473
4,0762451459
...,...
249269,B00LORWRJA
249270,B00LOS7MEE
249271,B00LP2YB8E
249272,B00LPVG6V0
