In [29]:
# import required libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [30]:
# read the dataset
df = pd.read_csv('../ratings_Beauty.csv')
df

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,0205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,0558925278,3.0,1355443200
2,A1Z513UWSAAO0F,0558925278,5.0,1404691200
3,A1WMRR494NWEWV,0733001998,4.0,1382572800
4,A3IAAVS479H7M7,0737104473,1.0,1274227200
...,...,...,...,...
2023065,A3DEHKPFANB8VA,B00LORWRJA,5.0,1405296000
2023066,A3DEHKPFANB8VA,B00LOS7MEE,5.0,1405296000
2023067,AG9TJLJUN5OM3,B00LP2YB8E,5.0,1405382400
2023068,AYBIB14QOI9PC,B00LPVG6V0,5.0,1405555200


### Checking data types of the columns

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023070 entries, 0 to 2023069
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   UserId     object 
 1   ProductId  object 
 2   Rating     float64
 3   Timestamp  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 61.7+ MB


The columns UserId and ProductId are of type Object. To use them for building Machine Learning model, we need to convert them to type 'Numeric'

In [33]:
dataset = df.copy()
label_encoder = preprocessing.LabelEncoder()
dataset['user'] = label_encoder.fit_transform(df['UserId'])
dataset['product'] = label_encoder.fit_transform(df['ProductId'])
dataset.head()


Unnamed: 0,UserId,ProductId,Rating,Timestamp,user,product
0,A39HTATAQ9V7YF,205616461,5.0,1369699200,725046,0
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200,814606,1
2,A1Z513UWSAAO0F,558925278,5.0,1404691200,313101,1
3,A1WMRR494NWEWV,733001998,4.0,1382572800,291075,2
4,A3IAAVS479H7M7,737104473,1.0,1274227200,802842,3


In [16]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023070 entries, 0 to 2023069
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   UserId     object 
 1   ProductId  object 
 2   Rating     float64
 3   Timestamp  int64  
 4   user       int64  
 5   product    int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 92.6+ MB


We can remove the original UserId and ProductId which are of type Object

In [27]:
dataset.drop(['UserId'], axis=1, inplace=True)
dataset.drop(['ProductId'], axis=1, inplace=True)

Unnamed: 0,Rating,Timestamp,user,product
0,5.0,1369699200,725046,0
1,3.0,1355443200,814606,1
2,5.0,1404691200,313101,1
3,4.0,1382572800,291075,2
4,1.0,1274227200,802842,3


In [18]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023070 entries, 0 to 2023069
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   Rating     float64
 1   Timestamp  int64  
 2   user       int64  
 3   product    int64  
dtypes: float64(1), int64(3)
memory usage: 61.7 MB


### Null Value Check

In [19]:
df.isnull().any()

Rating       False
Timestamp    False
user         False
product      False
dtype: bool

We can see that there are no null values in the dataset. If there exists any null values, we have to remove the corresponding records.

### Duplicate Record Check

In [20]:
duplicates = df.duplicated(["UserId","ProductId", "Rating", "Timestamp"]).sum()
print(' Duplicate records: ',duplicates)

KeyError: Index(['ProductId', 'UserId'], dtype='object')

In [21]:
df

Unnamed: 0,Rating,Timestamp,user,product
0,5.0,1369699200,725046,0
1,3.0,1355443200,814606,1
2,5.0,1404691200,313101,1
3,4.0,1382572800,291075,2
4,1.0,1274227200,802842,3
...,...,...,...,...
2023065,5.0,1405296000,759765,249269
2023066,5.0,1405296000,759765,249270
2023067,5.0,1405382400,1035084,249271
2023068,5.0,1405555200,1195527,249272
