In [34]:
# import required libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [35]:
# Load the ratings CSV into a DataFrame and show it.
ratings_path = '../ratings_Beauty.csv'
df = pd.read_csv(ratings_path)
df

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,0205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,0558925278,3.0,1355443200
2,A1Z513UWSAAO0F,0558925278,5.0,1404691200
3,A1WMRR494NWEWV,0733001998,4.0,1382572800
4,A3IAAVS479H7M7,0737104473,1.0,1274227200
...,...,...,...,...
2023065,A3DEHKPFANB8VA,B00LORWRJA,5.0,1405296000
2023066,A3DEHKPFANB8VA,B00LOS7MEE,5.0,1405296000
2023067,AG9TJLJUN5OM3,B00LP2YB8E,5.0,1405382400
2023068,AYBIB14QOI9PC,B00LPVG6V0,5.0,1405555200


### Checking data types of the columns

In [36]:
# Print the column names and dtypes of the raw frame to see which columns are non-numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023070 entries, 0 to 2023069
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   UserId     object 
 1   ProductId  object 
 2   Rating     float64
 3   Timestamp  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 61.7+ MB


The columns `UserId` and `ProductId` are of type `object` (strings). To use them when building a machine learning model, we need to convert them to a numeric type.

In [37]:
# Encode the string identifiers as integer codes so they can be used as model inputs.
# Each column gets its own encoder: refitting one shared encoder on ProductId would
# overwrite the UserId mapping, making it impossible to map user codes back to the
# original ids with inverse_transform.
user_encoder = preprocessing.LabelEncoder()
product_encoder = preprocessing.LabelEncoder()

# assign() returns a new frame, so the raw `df` is left untouched.
dataset = df.assign(
    user=user_encoder.fit_transform(df['UserId']),
    product=product_encoder.fit_transform(df['ProductId']),
)

# Backward-compatible alias: the original cell left `label_encoder` fitted on ProductId.
label_encoder = product_encoder
dataset.head()


Unnamed: 0,UserId,ProductId,Rating,Timestamp,user,product
0,A39HTATAQ9V7YF,205616461,5.0,1369699200,725046,0
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200,814606,1
2,A1Z513UWSAAO0F,558925278,5.0,1404691200,313101,1
3,A1WMRR494NWEWV,733001998,4.0,1382572800,291075,2
4,A3IAAVS479H7M7,737104473,1.0,1274227200,802842,3


In [38]:
# Check that the encoded `user` and `product` columns were added alongside the originals.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023070 entries, 0 to 2023069
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   UserId     object 
 1   ProductId  object 
 2   Rating     float64
 3   Timestamp  int64  
 4   user       int64  
 5   product    int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 92.6+ MB


We can now drop the original `UserId` and `ProductId` columns, which are of type `object`, because the encoded `user` and `product` columns replace them.

In [39]:
# Drop both original string identifier columns in a single call.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this cell
# re-runnable: running it a second time no longer raises KeyError for columns that
# were already removed.
dataset = dataset.drop(columns=['UserId', 'ProductId'], errors='ignore')

In [40]:
# Verify that only the numeric columns remain after dropping UserId and ProductId.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023070 entries, 0 to 2023069
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   Rating     float64
 1   Timestamp  int64  
 2   user       int64  
 3   product    int64  
dtypes: float64(1), int64(3)
memory usage: 61.7 MB


### Null Value Check

In [41]:
# Report, per column, whether at least one value is missing.
df.isna().any()

UserId       False
ProductId    False
Rating       False
Timestamp    False
dtype: bool

We can see that there are no null values in the dataset. If there were any null values, we would have to remove the corresponding records.

### Duplicate Record Check

In [42]:
# Count rows that repeat an earlier row on all four fields.
key_columns = ["UserId", "ProductId", "Rating", "Timestamp"]
duplicate_mask = df.duplicated(subset=key_columns)
duplicates = duplicate_mask.sum()
print(' Duplicate records: ', duplicates)

 Duplicate records:  0


There are no duplicate records. If there were any duplicate records, we would have to drop them.

# Normalizing the target variable

In [44]:
# Mean rating per encoded user, returned as a two-column frame (user, Rating).
per_user_ratings = dataset.groupby("user")["Rating"]
average_rating = per_user_ratings.mean().reset_index()
average_rating

Unnamed: 0,user,Rating
0,0,5.0
1,1,5.0
2,2,3.0
3,3,5.0
4,4,5.0
...,...,...
1210266,1210266,3.5
1210267,1210267,4.5
1210268,1210268,1.0
1210269,1210269,5.0


Merging the average_rating dataframe with the original dataframe to check the bias.

In [45]:
# Attach each user's mean rating to every one of that user's rows.
# Both frames carry a `Rating` column, so pandas suffixes them as Rating_x / Rating_y.
dataset = dataset.merge(average_rating, on="user")
dataset

Unnamed: 0,Rating_x,Timestamp,user,product,Rating_y
0,5.0,1369699200,725046,0,4.25
1,3.0,1369699200,725046,81854,4.25
2,5.0,1369699200,725046,89013,4.25
3,4.0,1369699200,725046,154092,4.25
4,3.0,1355443200,814606,1,3.50
...,...,...,...,...,...
2023065,5.0,1405728000,1012502,249268,5.00
2023066,5.0,1405296000,254698,249268,5.00
2023067,5.0,1405468800,1030275,249268,5.00
2023068,5.0,1406073600,249628,249268,5.00


Both dataframes had a column named `Rating`, so pandas' `merge` disambiguated them by appending its default suffixes, producing `Rating_x` (the individual rating) and `Rating_y` (the user's average rating). We rename them to make them easier to understand.

In [46]:
# Give the suffixed merge columns descriptive names.
column_labels = {
    "Rating_x": "real_rating",
    "Rating_y": "average_rating",
}
dataset = dataset.rename(columns=column_labels)
dataset

Unnamed: 0,real_rating,Timestamp,user,product,average_rating
0,5.0,1369699200,725046,0,4.25
1,3.0,1369699200,725046,81854,4.25
2,5.0,1369699200,725046,89013,4.25
3,4.0,1369699200,725046,154092,4.25
4,3.0,1355443200,814606,1,3.50
...,...,...,...,...,...
2023065,5.0,1405728000,1012502,249268,5.00
2023066,5.0,1405296000,254698,249268,5.00
2023067,5.0,1405468800,1030275,249268,5.00
2023068,5.0,1406073600,249628,249268,5.00


### Certain users tend to give higher ratings while others tend to give lower ratings. To negate this bias, we normalise the ratings given by each user by subtracting that user's average rating.

In [47]:
# Centre every rating on its user's mean so that per-user rating bias is removed.
dataset['normalized_rating'] = dataset['real_rating'].sub(dataset['average_rating'])
dataset

Unnamed: 0,real_rating,Timestamp,user,product,average_rating,normalized_rating
0,5.0,1369699200,725046,0,4.25,0.75
1,3.0,1369699200,725046,81854,4.25,-1.25
2,5.0,1369699200,725046,89013,4.25,0.75
3,4.0,1369699200,725046,154092,4.25,-0.25
4,3.0,1355443200,814606,1,3.50,-0.50
...,...,...,...,...,...,...
2023065,5.0,1405728000,1012502,249268,5.00,0.00
2023066,5.0,1405296000,254698,249268,5.00,0.00
2023067,5.0,1405468800,1030275,249268,5.00,0.00
2023068,5.0,1406073600,249628,249268,5.00,0.00


Note: this paragraph repeats the renaming step from earlier in the notebook. The merge produced two rating columns, `Rating_x` and `Rating_y`, which have already been renamed to `real_rating` and `average_rating`, so no further renaming is needed at this point.

In [49]:
# The Rating_x / Rating_y columns were already renamed in an earlier cell, so repeating
# the rename here would be a silent no-op. Check that the expected columns are present
# instead, then display the frame.
expected_columns = {"real_rating", "average_rating"}
missing_columns = expected_columns - set(dataset.columns)
assert not missing_columns, f"missing expected columns: {sorted(missing_columns)}"
dataset

Unnamed: 0,real_rating,Timestamp,user,product,average_rating,normalized_rating
0,5.0,1369699200,725046,0,4.25,0.75
1,3.0,1369699200,725046,81854,4.25,-1.25
2,5.0,1369699200,725046,89013,4.25,0.75
3,4.0,1369699200,725046,154092,4.25,-0.25
4,3.0,1355443200,814606,1,3.50,-0.50
...,...,...,...,...,...,...
2023065,5.0,1405728000,1012502,249268,5.00,0.00
2023066,5.0,1405296000,254698,249268,5.00,0.00
2023067,5.0,1405468800,1030275,249268,5.00,0.00
2023068,5.0,1406073600,249628,249268,5.00,0.00


### Certain users tend to give higher ratings while others tend to give lower ratings. To negate this bias, we normalise the ratings given by the users.

In [50]:
# `normalized_rating` was already computed in an earlier cell; recomputing it here is
# redundant. Validate the existing column instead of overwriting it.
expected_normalized = dataset['real_rating'] - dataset['average_rating']
assert np.allclose(dataset['normalized_rating'], expected_normalized), \
    "normalized_rating is out of sync with real_rating - average_rating"

# After centring, each user's normalized ratings should average to (numerically) zero.
per_user_mean = dataset.groupby('user')['normalized_rating'].mean()
assert np.allclose(per_user_mean, 0.0), "per-user normalized ratings are not centred on zero"
dataset

Unnamed: 0,real_rating,Timestamp,user,product,average_rating,normalized_rating
0,5.0,1369699200,725046,0,4.25,0.75
1,3.0,1369699200,725046,81854,4.25,-1.25
2,5.0,1369699200,725046,89013,4.25,0.75
3,4.0,1369699200,725046,154092,4.25,-0.25
4,3.0,1355443200,814606,1,3.50,-0.50
...,...,...,...,...,...,...
2023065,5.0,1405728000,1012502,249268,5.00,0.00
2023066,5.0,1405296000,254698,249268,5.00,0.00
2023067,5.0,1405468800,1030275,249268,5.00,0.00
2023068,5.0,1406073600,249628,249268,5.00,0.00
