## Loading Libraries and Data

In [1]:
import pandas as pd
import os
import dask.dataframe as dd
import numpy as np

### Downloading Data from Shopee API

In [2]:
url_list = []
with open("url.txt", "r") as f:
    url_list = f.read().splitlines()

url_list

['https://shopee.ph/Trendy-Fashionable-Cotton-Jogger-Pants-For-Men-5-Colors-Good-Quality-JF09-i.78369718.5541866137?sp_atk=df76d618-dc6f-4d5a-ab0e-ee28e5c4b295&xptdk=df76d618-dc6f-4d5a-ab0e-ee28e5c4b295',
 'https://shopee.ph/%E2%9C%A8In-Stock!%E2%9C%A8AIXINI-26-PCS-Alphabet-Plush-Toy-Alphabets-Alphabet-Lore-Stuffed-Doll-Preschool-Educational-English-ABC-Letter-Toy-for-Kids-Children-A-Z-i.190409750.18956258616?sp_atk=2a2ff182-d854-41a6-9f34-4282de4ebf5d&xptdk=2a2ff182-d854-41a6-9f34-4282de4ebf5d',
 'https://shopee.ph/Siv-60-130-Cm-Safety-Gate-Fence-Guard-For-Baby-Child-Stairs-Dogs-Pets-i.165194842.8925151854?sp_atk=971cc1ad-3304-4b03-8690-db351546bc34&xptdk=971cc1ad-3304-4b03-8690-db351546bc34',
 'https://shopee.ph/Soccer-Ball-Football-Ball-F5V3400-Vantaggio-Series-Football-Size-5-with-32-Panels-and-PU-Leather-i.683474321.21020871259?sp_atk=19828d67-06d5-48a5-8171-027d211d0e49&xptdk=19828d67-06d5-48a5-8171-027d211d0e49',
 'https://shopee.ph/A51-COD-Kitchen-Cabinet-Rack-Organizer-(retrac

In [3]:
shopee_data_dir = "data/shopee"
if os.path.isdir(shopee_data_dir):
    print("Directory already exists")
else:
    os.makedirs(shopee_data_dir)

Directory already exists


In [4]:
if len(os.listdir(shopee_data_dir)) == len(url_list):
    print("All files are already downloaded.")
else:
    from scraper import ShopeeAPI
    api = ShopeeAPI(url_list,shopee_data_dir)
    api.scrape()

### Loading Data

In [5]:
dataset = os.listdir("data/")
dataset

['shopee']

In [6]:
dtypes = {"userid": "int", "username": "str", "item": "str", "item_type": "int","comment": "str",
          "rating": "int", "product_quality": "object", "seller_service": "object",
          "delivery_service": "object", "has_template_tag": "bool", "template_tags": "object",
          "tags": "object", "is_oversea": "bool", "origin_region": "str", "like_count": "object",
          "is_repeated_purchase": "bool", "exclude_scoring_due_low_logistic": "bool"}

consumer_dd = dd.read_csv(
    'data/*.csv',
    blocksize='25MB',
    dtype=dtypes,
)

OSError: An error occurred while calling the read_csv method registered to the pandas backend.
Original Message: data/*.csv resolved to no files

In [None]:
df = consumer_dd.compute()

In [None]:
df['ctime'] = pd.to_datetime(df['ctime'], unit='s')
df.info()

Column Descriptions 
 0.   userid - customer id
 1.   username - customer username
 2.   item - item name
 3.   ctime - time of review created
 4.   comment - customer review
 5.   rating - customer rating
 6.   detailed rating - product_quality, seller_service, delivery_service
 7.   has_template_tag - to identify if the comment has template tags (e.g. [appearance] [colour] [material] [quality])
 8.   template_tags - template tags in the comment
 9.   tags - default tags that can be used (similar with grab) e.g., [excellent] [will buy again]
 10.  is_oversea - to identify if the item is oversea
 11.  origin_region - origin region of the item (country code)
 12.  like_count - like count of the review
 13.  is_repeated_purchase - to identify if the customer has purchased the item before
 14.  exclude_scoring_due_low_logistic - to identify if the review is excluded from overall scoring due to low logistic

In [None]:
df.head(2)

### Exploratory Data Analysis

In [None]:
from dataprep.eda import *
report = create_report(df)
report.show_browser()

## Sentiments Dataframe

In [None]:
sentiments_df = df[df['has_template_tag']==False].reset_index(drop=True)
sentiments_df = sentiments_df[['item','ctime','comment', 'rating', 'product_quality', 'seller_service', 'delivery_service']]
sentiments_df.sort_values(by="rating")