## Loading Libraries and Data

In [1]:
import pandas as pd
import os
import dask.dataframe as dd
import numpy as np

### Downloading Data from Shopee API

In [2]:
url_list = []
with open("url.txt", "r") as f:
    url_list = f.read().splitlines()

url_list

['https://shopee.ph/Trendy-Fashionable-Cotton-Jogger-Pants-For-Men-5-Colors-Good-Quality-JF09-i.78369718.5541866137?sp_atk=df76d618-dc6f-4d5a-ab0e-ee28e5c4b295&xptdk=df76d618-dc6f-4d5a-ab0e-ee28e5c4b295',
 'https://shopee.ph/%E2%9C%A8In-Stock!%E2%9C%A8AIXINI-26-PCS-Alphabet-Plush-Toy-Alphabets-Alphabet-Lore-Stuffed-Doll-Preschool-Educational-English-ABC-Letter-Toy-for-Kids-Children-A-Z-i.190409750.18956258616?sp_atk=2a2ff182-d854-41a6-9f34-4282de4ebf5d&xptdk=2a2ff182-d854-41a6-9f34-4282de4ebf5d',
 'https://shopee.ph/Siv-60-130-Cm-Safety-Gate-Fence-Guard-For-Baby-Child-Stairs-Dogs-Pets-i.165194842.8925151854?sp_atk=971cc1ad-3304-4b03-8690-db351546bc34&xptdk=971cc1ad-3304-4b03-8690-db351546bc34',
 'https://shopee.ph/Soccer-Ball-Football-Ball-F5V3400-Vantaggio-Series-Football-Size-5-with-32-Panels-and-PU-Leather-i.683474321.21020871259?sp_atk=19828d67-06d5-48a5-8171-027d211d0e49&xptdk=19828d67-06d5-48a5-8171-027d211d0e49',
 'https://shopee.ph/A51-COD-Kitchen-Cabinet-Rack-Organizer-(retrac

In [3]:
shopee_data_dir = "data/shopee"
if os.path.isdir(shopee_data_dir):
    print("Directory already exists")
else:
    os.makedirs(shopee_data_dir)

Directory already exists


In [4]:
if len(os.listdir(shopee_data_dir)) == len(url_list):
    print("All files are already downloaded.")
else:
    from scraper import ShopeeAPI
    api = ShopeeAPI(url_list,shopee_data_dir)
    api.scrape()

All files are already downloaded.


### Loading Data

In [5]:
dataset = os.listdir(shopee_data_dir)
dataset

['0.csv',
 '1.csv',
 '10.csv',
 '11.csv',
 '12.csv',
 '13.csv',
 '14.csv',
 '15.csv',
 '16.csv',
 '17.csv',
 '18.csv',
 '19.csv',
 '2.csv',
 '20.csv',
 '21.csv',
 '22.csv',
 '23.csv',
 '24.csv',
 '25.csv',
 '26.csv',
 '27.csv',
 '28.csv',
 '29.csv',
 '3.csv',
 '30.csv',
 '31.csv',
 '32.csv',
 '33.csv',
 '34.csv',
 '35.csv',
 '36.csv',
 '37.csv',
 '38.csv',
 '39.csv',
 '4.csv',
 '40.csv',
 '5.csv',
 '6.csv',
 '7.csv',
 '8.csv',
 '9.csv']

In [6]:
dtypes = {"userid": "int", "username": "str", "item": "str", "item_type": "int","comment": "str",
          "rating": "int", "product_quality": "object", "seller_service": "object",
          "delivery_service": "object", "has_template_tag": "bool", "template_tags": "object",
          "tags": "object", "is_oversea": "bool", "origin_region": "str", "like_count": "object",
          "is_repeated_purchase": "bool", "exclude_scoring_due_low_logistic": "bool"}

consumer_dd = dd.read_csv(
    shopee_data_dir+"/*.csv",
    blocksize='25MB',
    dtype=dtypes,
)

In [7]:
df = consumer_dd.compute()

In [8]:
df['ctime'] = pd.to_datetime(df['ctime'], unit='s')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72263 entries, 0 to 3019
Data columns (total 18 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   userid                            72263 non-null  int32         
 1   username                          72015 non-null  object        
 2   item                              72263 non-null  object        
 3   item_type                         72263 non-null  int32         
 4   ctime                             72263 non-null  datetime64[ns]
 5   comment                           38678 non-null  object        
 6   rating                            72263 non-null  int32         
 7   product_quality                   72263 non-null  object        
 8   seller_service                    31454 non-null  object        
 9   delivery_service                  31450 non-null  object        
 10  has_template_tag                  72263 non-nul

Column Descriptions 
 0.   userid - customer id
 1.   username - customer username
 2.   item - item name
 3.   ctime - time of review created
 4.   comment - customer review
 5.   rating - customer rating
 6.   detailed rating - product_quality, seller_service, delivery_service
 7.   has_template_tag - to identify if the comment has template tags (e.g. [appearance] [colour] [material] [quality])
 8.   template_tags - template tags in the comment
 9.   tags - default tags that can be used (similar with grab) e.g., [excellent] [will buy again]
 10.  is_oversea - to identify if the item is oversea
 11.  origin_region - origin region of the item (country code)
 12.  like_count - like count of the review
 13.  is_repeated_purchase - to identify if the customer has purchased the item before
 14.  exclude_scoring_due_low_logistic - to identify if the review is excluded from overall scoring due to low logistic

In [9]:
df.head(2)


Unnamed: 0,userid,username,item,item_type,ctime,comment,rating,product_quality,seller_service,delivery_service,has_template_tag,template_tags,tags,is_oversea,origin_region,like_count,is_repeated_purchase,exclude_scoring_due_low_logistic
0,458113431,p*****a,Trendy Fashionable Cotton Jogger Pants For Men...,0,2023-04-02 03:59:38,Colour:Black\nMaterial Quality:Good Quality\n\...,5,5,5.0,5.0,True,"['Colour', 'Material Quality', 'Appearance']",,False,ph,1.0,False,False
1,377240164,alcorizamhelody6,Trendy Fashionable Cotton Jogger Pants For Men...,0,2023-03-11 08:00:54,Appearance:ok\nColour:good\nMaterial Quality:m...,3,3,5.0,5.0,True,"['Appearance', 'Colour', 'Material Quality']",,False,ph,4.0,False,False


In [10]:
df.columns

Index(['userid', 'username', 'item', 'item_type', 'ctime', 'comment', 'rating',
       'product_quality', 'seller_service', 'delivery_service',
       'has_template_tag', 'template_tags', 'tags', 'is_oversea',
       'origin_region', 'like_count', 'is_repeated_purchase',
       'exclude_scoring_due_low_logistic'],
      dtype='object')

In [11]:
new_df = df.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,12,13,14,15,16,17]].copy()
new_df.head(2)

Unnamed: 0,userid,username,item,item_type,ctime,comment,rating,product_quality,seller_service,delivery_service,has_template_tag,tags,is_oversea,origin_region,like_count,is_repeated_purchase,exclude_scoring_due_low_logistic
0,458113431,p*****a,Trendy Fashionable Cotton Jogger Pants For Men...,0,2023-04-02 03:59:38,Colour:Black\nMaterial Quality:Good Quality\n\...,5,5,5.0,5.0,True,,False,ph,1.0,False,False
1,377240164,alcorizamhelody6,Trendy Fashionable Cotton Jogger Pants For Men...,0,2023-03-11 08:00:54,Appearance:ok\nColour:good\nMaterial Quality:m...,3,3,5.0,5.0,True,,False,ph,4.0,False,False


### Exploratory Data Analysis

In [14]:
from dataprep.eda import *
report = create_report(df)
report.show_browser()

  0%|          | 0/1812 [00:00<?, ?it/s]

  return func(*(_execute_task(a, cache) for a in args))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))


## Sentiments Dataframe

In [13]:
sentiments_df = df[df['has_template_tag']==False].reset_index(drop=True)
sentiments_df = sentiments_df[['item','ctime','comment', 'rating', 'product_quality', 'seller_service', 'delivery_service']]
sentiments_df.sort_values(by="rating")

Unnamed: 0,item,ctime,comment,rating,product_quality,seller_service,delivery_service
7974,Yunos Adjustable Back Support Posture Correcto...,2022-04-14 03:08:36,Parang pang ipit lang ng tyan dnaman nakaka St...,1,1,1.0,5.0
882,JF09 Fashionable Jogger Pants Trendy Cotton Ne...,2021-11-04 07:56:50,Obviously wrong item received. I ordered black...,1,1,1.0,5.0
20062,SIV 60-130 CM Safety Gate Fence Guard Stairs F...,2021-11-09 03:10:17,Haist puro dents d malaman kung sa courier or ...,1,1,1.0,1.0
879,JF09 Fashionable Jogger Pants Trendy Cotton Ne...,2022-01-11 10:32:58,i ordered Black and Dark Grey.\nlook what i go...,1,5,1.0,3.0
878,JF09 Fashionable Jogger Pants Trendy Cotton Ne...,2021-10-18 09:23:52,Kapag black ang order DApat black.bakit pula?H...,1,1,1.0,3.0
...,...,...,...,...,...,...,...
21647,【On hand & 372 pages】It Ends with Us Books by ...,2022-07-28 05:52:00,,5,5,,
21648,【On hand & 372 pages】It Ends with Us Books by ...,2022-07-24 01:35:24,,5,5,,
21649,【Brand New】It Ends with Us Books by Colleen Ho...,2022-02-25 01:02:51,,5,5,,
21638,【Ready Stock】It Ends with Us Books by Colleen ...,2022-04-05 13:46:34,,5,5,,
