In [113]:
# Echo the value of every top-level expression in a cell, not only the last one,
# so cells below can show several results without calling display()/print().
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import os
import datetime
# Never truncate wide frames: show every column when a DataFrame is rendered.
pd.options.display.max_columns = None

In [133]:
# Read the item catalogue and the review dump; both files are decoded as UTF-8.
ITEMS_CSV = "lazada reviews/20191002-items.csv"
REVIEWS_CSV = "lazada reviews/20191002-reviews.csv"

items = pd.read_csv(ITEMS_CSV, encoding='utf-8')
reviews = pd.read_csv(REVIEWS_CSV, encoding='utf-8')

In [134]:
# Peek at ten randomly chosen reviews, then report the (rows, columns) size of the table.
reviews.sample(n=10)
reviews.shape

Unnamed: 0,itemId,category,name,rating,originalRating,reviewTitle,reviewContent,likeCount,upVotes,downVotes,helpful,relevanceScore,boughtDate,clientType,retrievedDate
100864,377474273,beli-smart-tv,Lazada Customer,5,,,,0,0,0,True,29.83,14 Mei 2019,androidApp,2019-10-02
126657,160020846,jual-flash-drives,hari P.,5,,,mantappp bos,0,0,0,True,29.51,02 Feb 2019,androidApp,2019-10-02
32471,321194529,beli-harddisk-eksternal,Sulistya W.,1,,,"Lazada yg terhormat, Barang yang kami terima t...",0,0,0,True,30.26,09 Okt 2018,desktop,2019-10-02
78552,7940389,beli-harddisk-eksternal,Zuldian M.,5,,,,0,0,0,True,19.01,14 Jun 2019,androidApp,2019-10-02
134250,171506831,jual-flash-drives,Irdan T.,5,,,"Barang sudah sampai dengan selamat, mau dicoba...",0,0,0,True,24.51,23 Jan 2019,androidApp,2019-10-02
109916,419320884,beli-smart-tv,Widiastuti,5,,,,0,0,0,True,19.01,14 Mei 2019,androidApp,2019-10-02
140358,346370253,jual-flash-drives,Lazada Customer,5,,,,0,0,0,True,44.17,09 Sep 2019,androidApp,2019-10-02
58751,419970560,beli-harddisk-eksternal,johari,5,,,,0,0,0,True,22.01,23 Jan 2019,androidApp,2019-10-02
30425,2325989,beli-harddisk-eksternal,Rzkyarfsptro,4,,Barang bagus,Barang bagus tapi pengiriman lumayan lama,0,0,0,True,22.32,02 Nov 2016,mobile-app,2019-10-02
47293,377474277,beli-harddisk-eksternal,Fadjri R.,5,,,,0,0,0,True,17.01,11 Des 2018,androidApp,2019-10-02


(203787, 15)

# Cleaning data 
## Missing values

In [135]:
# Number of missing (NaN/None) entries in each column of the reviews table.
missing_values_count = reviews.isna().sum()
missing_values_count

itemId                 0
category               0
name                   0
rating                 0
originalRating    203779
reviewTitle       180383
reviewContent      96758
likeCount              0
upVotes                0
downVotes              0
helpful                0
relevanceScore         0
boughtDate          7107
clientType             0
retrievedDate          0
dtype: int64

In [136]:
# Tabulate the per-column missing counts next to their share of all rows (in percent).
df_missing = missing_values_count.to_frame(name='cnt')
df_missing['percentage'] = df_missing['cnt'].div(len(reviews)).mul(100)
df_missing

Unnamed: 0,cnt,percentage
itemId,0,0.0
category,0,0.0
name,0,0.0
rating,0,0.0
originalRating,203779,99.996074
reviewTitle,180383,88.51546
reviewContent,96758,47.479967
likeCount,0,0.0
upVotes,0,0.0
downVotes,0,0.0


In [137]:
# total missing values
total_cells = np.product(reviews.shape)
total_cells
total_missing = missing_values_count.sum()
total_missing
(total_missing/total_cells) * 100

3056805

488027

15.965264385526718

In [138]:
# Drop the two columns that are almost entirely empty (originalRating ~99.996%
# missing, reviewTitle ~88.5% missing according to the table above).
# DataFrame.drop is used instead of selecting the remaining columns with
# reviews[...]: with errors='ignore' the cell can be re-run without a KeyError
# once the columns are already gone, and the result is a regular new frame, so
# adding columns to it later does not trip pandas' SettingWithCopyWarning.
reviews = reviews.drop(columns=['originalRating', 'reviewTitle'], errors='ignore')
reviews.head()

Unnamed: 0,itemId,category,name,rating,reviewContent,likeCount,upVotes,downVotes,helpful,relevanceScore,boughtDate,clientType,retrievedDate
0,100002528,beli-harddisk-eksternal,Kamal U.,5,bagus mantap dah sesui pesanan,0,0,0,True,26.51,09 Apr 2019,androidApp,2019-10-02
1,100002528,beli-harddisk-eksternal,yofanca m.,4,"Bagus, sesuai foto",0,0,0,True,22.49,24 Sep 2017,androidApp,2019-10-02
2,100002528,beli-harddisk-eksternal,Lazada Customer,5,okkkkk mantaaaaaaapppp ... goood,0,0,0,True,21.5,04 Apr 2018,androidApp,2019-10-02
3,100002528,beli-harddisk-eksternal,Lazada Customer,4,bagus sesuai,0,0,0,True,20.51,22 Sep 2017,androidApp,2019-10-02
4,100002528,beli-harddisk-eksternal,Yosep M.,5,,0,0,0,True,16.01,17 Agu 2018,androidApp,2019-10-02


# Cleaning data 
## Clean date types

In [139]:
# Look at twenty random purchase dates to learn the raw string format.
# boughtDate is not essential for reading the reviews and only about 3% of its
# values are missing, so the column is kept rather than dropped.
reviews['boughtDate'].sample(n=20)

19367     27 Mar 2019
85100     03 Mei 2019
84957     16 Feb 2019
186614    13 Des 2017
201157    29 Mar 2019
115781    30 Agu 2019
18850     02 Des 2017
48967     16 Feb 2019
171107    01 Jul 2019
81289     01 Feb 2017
6897      28 Mar 2017
159567    05 Mar 2019
94539     22 Nov 2018
156682    06 Mar 2019
44467     02 Sep 2019
38733     27 Mar 2019
138338    14 Des 2017
140742    09 Sep 2019
52786     08 Sep 2018
83978     28 Apr 2019
Name: boughtDate, dtype: object

In [140]:
# Indonesian month abbreviations that differ from the English ones understood by
# the date parser. Abbreviations that are spelled the same in both languages
# (Jan, Feb, Mar, Apr, Jun, Jul, Sep, Nov) need no translation.
INDONESIAN_TO_ENGLISH_MONTHS = {
    'Mei': 'May',
    'Agu': 'Aug',
    'Okt': 'Oct',
    'Nop': 'Nov',
    'Des': 'Dec',
}


def parse_bought_date(bought_date):
    """Convert 'DD Mon YYYY' strings with Indonesian month names to datetimes.

    Parameters
    ----------
    bought_date : pandas.Series
        String dates such as '14 Mei 2019'; missing values are allowed.

    Returns
    -------
    pandas.Series
        datetime64 values aligned with the input index; missing inputs become NaT.

    Raises
    ------
    ValueError
        If a non-missing value does not match the 'DD Mon YYYY' layout after the
        month names have been translated.
    """
    normalized = bought_date
    for indonesian, english in INDONESIAN_TO_ENGLISH_MONTHS.items():
        # regex=False: plain substring replacement regardless of the pandas
        # version (the default of `regex` changed between releases).
        normalized = normalized.str.replace(indonesian, english, regex=False)
    # An explicit format avoids slow per-row format guessing and rejects
    # strings that are laid out differently instead of guessing their meaning.
    return pd.to_datetime(normalized, format='%d %b %Y')


# assign() returns a new frame, so this cell is safe to re-run and never writes
# into a slice of another DataFrame.
reviews = reviews.assign(date_parsed=parse_bought_date(reviews['boughtDate']))

In [143]:
# Spot-check twenty random rows to compare boughtDate with the parsed date_parsed column.
reviews.sample(n=20)

Unnamed: 0,itemId,category,name,rating,reviewContent,likeCount,upVotes,downVotes,helpful,relevanceScore,boughtDate,clientType,retrievedDate,date_parsed
99218,368162161,beli-smart-tv,Lazada Customer,5,Mantab barang sampai dalam 2 hari. Barang oke ...,0,0,0,True,33.01,19 Feb 2019,androidApp,2019-10-02,2019-02-19
112217,428575274,beli-smart-tv,Irawan I.,5,murah tapi tidak murahan 👍👍👍👍,0,0,0,True,49.17,01 Sep 2019,androidApp,2019-10-02,2019-09-01
11784,143669132,beli-harddisk-eksternal,Rahmat A.,4,Produknya cukup baik dgn harga terjangkau.,0,0,0,True,23.51,03 Okt 2018,mobile,2019-10-02,2018-10-03
92415,160022809,beli-smart-tv,Lazada Guest,3,"Good product, barang diterima 3 hari stlh orde...",0,0,0,True,17.0,,mobile-app,2019-10-02,NaT
196412,407406791,shop-televisi-digital,Ahmad S.,5,,0,0,0,True,19.01,20 Mei 2019,androidApp,2019-10-02,2019-05-20
84903,363803836,beli-laptop,Debora M.,5,,0,0,0,True,17.01,15 Jan 2019,androidApp,2019-10-02,2019-01-15
145941,368162165,jual-flash-drives,leli T.,5,,0,0,0,True,17.01,19 Okt 2018,androidApp,2019-10-02,2018-10-19
191880,363646891,shop-televisi-digital,Anna P.,5,good job,0,0,0,True,21.01,11 Nov 2018,androidApp,2019-10-02,2018-11-11
40490,361124046,beli-harddisk-eksternal,Lazada Customer,5,,0,0,0,True,19.01,07 Mei 2019,androidApp,2019-10-02,2019-05-07
171973,512604347,jual-flash-drives,Lazada Customer,5,"mantap, sesuai deskripsi produk",0,0,0,True,37.08,30 Jul 2019,androidApp,2019-10-02,2019-07-30


# Cleaning data 
## Check inconsistent data entry

In [144]:
# List the distinct labels of the two categorical columns to spot inconsistent
# entries (different casing, stray whitespace, misspellings).
reviews['category'].unique()
reviews['clientType'].unique()

# Both columns contain only a handful of distinct labels and no spelling or
# casing variants, so no clean-up is applied to them.
# NOTE(review): 'mobile' and 'mobile-app' are treated as different client types
# here - confirm that they are not two spellings of the same thing.

array(['beli-harddisk-eksternal', 'beli-laptop', 'beli-smart-tv',
       'jual-flash-drives', 'shop-televisi-digital'], dtype=object)

array(['androidApp', 'mobile', 'mobile-app', 'desktop', 'iosApp'],
      dtype=object)