## Problem Statement: Provide a Product Recommendation System for Taobao based on User Behaviour data

In [1]:
import pandas as pd
import numpy as np

In [3]:
#Data Source:
#https://tianchi.aliyun.com/dataset/dataDetail?dataId=649

### Data Extraction and Cleaning

In [2]:
tb = pd.read_csv('./UserBehavior.csv')

In [5]:
tb.columns = ['user_id', 'item_id', 'category_id', 'behavior_type', 'timestamp']

In [6]:
tb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100150806 entries, 0 to 100150805
Data columns (total 5 columns):
user_id          int64
item_id          int64
category_id      int64
behavior_type    object
timestamp        int64
dtypes: int64(4), object(1)
memory usage: 3.7+ GB


In [7]:
tb.dtypes

user_id           int64
item_id           int64
category_id       int64
behavior_type    object
timestamp         int64
dtype: object

In [8]:
tb.describe()

Unnamed: 0,user_id,item_id,category_id,timestamp
count,100150800.0,100150800.0,100150800.0,100150800.0
mean,506943.1,2579775.0,2696380.0,1511951000.0
std,294060.5,1488056.0,1463155.0,5528006.0
min,1.0,1.0,80.0,-2134949000.0
25%,252429.0,1295225.0,1320293.0,1511762000.0
50%,504015.0,2580735.0,2671397.0,1511965000.0
75%,760949.0,3862042.0,4145813.0,1512179000.0
max,1018011.0,5163070.0,5162429.0,2122867000.0


In [9]:
tb.shape

(100150806, 5)

In [10]:
#Find out the number of users
len(tb.user_id.unique())

987994

In [11]:
#Find out the number of products
len(tb.item_id.unique())

4162024

In [12]:
#Find out the number of product categories
len(tb.category_id.unique())

9439

In [13]:
#Find out all the different types of user behaviour
# pv = page view i.e. item click
# cart = add item to cart
# buy = purchase item
# fav = favor an item (taobao has a feature to save an item for future reference)
tb.behavior_type.value_counts()

pv      89716263
cart     5530446
fav      2888258
buy      2015839
Name: behavior_type, dtype: int64

In [14]:
tb.head()

Unnamed: 0,user_id,item_id,category_id,behavior_type,timestamp
0,1,2333346,2520771,pv,1511561733
1,1,2576651,149192,pv,1511572885
2,1,3830808,4181361,pv,1511593493
3,1,4365585,2520377,pv,1511596146
4,1,4606018,2735466,pv,1511616481


In [15]:
#Drop duplicates
tb = tb.drop_duplicates()

In [16]:
#Dropped 49 duplicate rows
tb.shape

(100150757, 5)

In [17]:
#Convert datetime

tb['timestamp'] = pd.to_datetime(tb['timestamp'],unit='s')

In [18]:
tb.head()

Unnamed: 0,user_id,item_id,category_id,behavior_type,timestamp
0,1,2333346,2520771,pv,2017-11-24 22:15:33
1,1,2576651,149192,pv,2017-11-25 01:21:25
2,1,3830808,4181361,pv,2017-11-25 07:04:53
3,1,4365585,2520377,pv,2017-11-25 07:49:06
4,1,4606018,2735466,pv,2017-11-25 13:28:01


In [19]:
tb.tail()

Unnamed: 0,user_id,item_id,category_id,behavior_type,timestamp
100150801,999999,4797808,11120,pv,2017-12-03 09:30:03
100150802,999999,4613472,4602841,pv,2017-12-03 09:36:06
100150803,999999,3647364,2304296,pv,2017-12-03 09:36:32
100150804,999999,1903801,2304296,pv,2017-12-03 09:37:07
100150805,999999,3696094,4602841,pv,2017-12-03 09:38:11


In [20]:
#Finding null rows
tb.isnull().sum()

user_id          0
item_id          0
category_id      0
behavior_type    0
timestamp        0
dtype: int64

In [21]:
tb.to_csv('tb_new.csv', index=False)