# Loading Libraries and Data

## Loading Libraries

In [None]:
import dask.dataframe as dd
import pandas as pd

## Loading Data

In [None]:
# Explicit column dtypes handed to the CSV reader so they are not inferred
# separately for each partition.
dtypes = {
    "userid": "int",
    "username": "str",
    "item": "str",
    "item_type": "int",
    "comment": "str",
    "rating": "int",
    "product_quality": "object",
    "seller_service": "object",
    "delivery_service": "object",
    "has_template_tag": "bool",
    "template_tags": "object",
    "tags": "object",
    "is_oversea": "bool",
    "origin_region": "str",
    "like_count": "object",
    "is_repeated_purchase": "bool",
    "exclude_scoring_due_low_logistic": "bool",
}

# Every CSV one directory level below ../data is read, in 25MB blocks.
reviews_dd = dd.read_csv("../data/*/*.csv", blocksize='25MB', dtype=dtypes)

In [None]:
# Evaluate the Dask dataframe and keep the result as `df`; the rest of the
# notebook works on this in-memory object.
df = reviews_dd.compute()
# Preview the first three rows.
df.head(3)

# Exploratory Data Analysis - Preprocessing

## EDA

In [None]:
from dataprep.eda import *
# create_report(df, title="Shopee Reviews EDA Report").show()


In [None]:
# Remove exact duplicate rows, then any row that lacks a comment or
# template tags (both columns are required by the later tag extraction).
df.drop_duplicates(inplace=True)
df.dropna(subset=['comment', 'template_tags'], inplace=True)

# `ctime` is interpreted as a count of seconds since the Unix epoch.
df['ctime'] = pd.to_datetime(df['ctime'], unit='s')
df.info()

In [None]:
# Columns carried forward into the working dataset.
selected_columns = [
    "userid",
    "item_type",
    "comment",
    "template_tags",
    "rating",
    "product_quality",
    "origin_region",
]
dataset = df[selected_columns]
dataset.head(5)

In [None]:
# Number of rows per origin region.
dataset["origin_region"].value_counts()

In [None]:
# Distinct item types among rows whose origin region is not "ph".
dataset[dataset["origin_region"]!= "ph"]['item_type'].unique()

In [None]:
# First five comments from rows whose origin region is not "ph".
dataset[dataset["origin_region"]!= "ph"]['comment'].tolist()[:5]

In [None]:
# Non-null value count per column, before filtering by region.
dataset.count()

In [None]:
# Keep only rows whose origin region is "ph"; the column is constant after
# the filter, so it is dropped in the same step.
is_ph = dataset["origin_region"] == "ph"
dataset = dataset[is_ph].drop(columns=["origin_region"])
dataset.count()

In [None]:
# Preview the dataset after the region filter.
dataset.head(3)

## Pre-processing - Template Tags

In [None]:
import ast
# Gather every tag label that occurs in the `template_tags` column.
template_tags = []
for raw_tags in dataset['template_tags'].unique().tolist():
    # Each distinct cell value is parsed as a Python literal and its items
    # are appended (presumably the cell holds the text of a list of labels).
    template_tags.extend(ast.literal_eval(raw_tags))
template_tags

In [None]:
# Deduplicate the tag labels. Sorting makes the order (and therefore the order
# of the per-tag columns created later) reproducible between runs, which a
# bare set does not guarantee for strings.
# The label "Beauty Effect)" is excluded (presumably a malformed tag whose
# stray ")" breaks the tag regexes used later). Set difference is used so the
# cell does not raise when that label is absent from the data.
template_tags = sorted(set(template_tags) - {"Beauty Effect)"})

In [None]:
# Pull out the third comment (by position) as a sample to inspect.
string = dataset["comment"].iloc[2]
string

In [None]:
# Preview the dataset before the per-tag columns are added.
dataset.head(3)

## Data Transformation/Feature Extraction using template tags

In [None]:
import re
import numpy as np
def _extract_tag_value(comment, tag_line, tag_prefix):
    """Return the text following ``tag_prefix`` on the first tagged line.

    Args:
        comment: Full review text.
        tag_line: Compiled pattern matching ``"<tag>:"`` plus the rest of
            that line (the newline itself is not part of the match).
        tag_prefix: The literal ``"<tag>:"`` text removed from the match.

    Returns:
        The remainder of the line (leading whitespace is kept), or NaN when
        the comment has no line for this tag.
    """
    match = tag_line.search(comment)
    if match is None:
        return np.nan
    return match.group(0).replace(tag_prefix, "")


# Add one column per tag label holding the answer written after "<tag>:".
for template in template_tags:
    tag_prefix = f"{template}:"
    # re.escape keeps the label literal, so labels containing regex
    # metacharacters such as "(" or ")" cannot corrupt the pattern.
    # ".*" stops at the end of the line, so a tag on the last line of a
    # comment is captured even when no newline follows it.
    tag_line = re.compile(re.escape(tag_prefix) + r".*")
    dataset[template] = dataset['comment'].apply(
        _extract_tag_value, args=(tag_line, tag_prefix)
    )

dataset.head(3)

In [None]:
# Strip every tag line ("<tag>: ...") found at the very start of a comment,
# together with the newlines that follow it, leaving only the free text.
# The pattern is compiled once and repeats the tag-line group, so any number
# of consecutive leading tag lines is removed in a single pass.
# re.escape keeps each label literal inside the alternation.
leading_tag_lines = re.compile(
    r"^(?:(?:" + "|".join(map(re.escape, template_tags)) + r"):.*\n*)+"
)
dataset['comment_no_tags'] = dataset['comment'].apply(
    lambda comment: leading_tag_lines.sub("", comment)
)
dataset['comment_no_tags']

In [None]:
# The raw tag list is no longer needed. `userid` is overwritten with a
# running number 0..n-1 and then exposed under the name `id`.
dataset = (
    dataset.drop(columns=["template_tags"])
    .assign(userid=range(len(dataset)))
    .rename(columns={"userid": "id"})
)
dataset.head(3)

In [None]:
# Replace the index with a fresh 0..n-1 range; the old labels are discarded.
dataset = dataset.reset_index(drop=True)
dataset.head(3)

## Pre-processing Comments without Tags

In [None]:
# Work on an independent copy of the text columns: later cells add a column
# to `subset_comment` and drop rows from it, which on a plain slice of
# `dataset` triggers pandas' SettingWithCopyWarning.
subset_comment = dataset[["id","comment","comment_no_tags"]].copy()
subset_comment.head(3)

In [None]:
import emoji

def clean(comment):
    """Normalise a review comment into lowercase, single-spaced text.

    Steps, in order: convert to ``str`` and lowercase, collapse whitespace,
    remove emoji, replace literal backslash-n sequences with a space, remove
    web links, remove non-letter runs matched by the punctuation pattern,
    then collapse whitespace again.

    Args:
        comment: The raw comment; any value is accepted and passed through
            ``str()`` first.

    Returns:
        The cleaned comment with no leading/trailing whitespace and no runs
        of consecutive whitespace.
    """
    comment = str(comment).lower()
    comment = " ".join(comment.split())
    comment = emoji.replace_emoji(comment, '')
    # Matches a literal backslash followed by "n" (real newlines were already
    # collapsed by the split/join above).
    comment = re.sub(r"\\n", " ", comment)
    comment = re.sub(r"https?://[^\s]+?\.[a-z]{2,6}", "", comment)  # Web links
    # Runs of non-letter characters that neither start nor end on a word
    # boundary.
    comment = re.sub(r"\B[^a-zA-Z]+\B", "", comment)  # Punctuations
    # The removals above can leave gaps such as "the  mat"; collapse them so
    # equal comments compare equal. This also trims both ends.
    return " ".join(comment.split())

In [None]:
# Sample input for `clean`: the tag-stripped comment stored under index label 0.
tester = dataset['comment_no_tags'][0]
tester

In [None]:
# Manual check of `clean` on the sample comment and on a synthetic string
# containing emoji, a web link, and trailing punctuation.
clean(tester)
clean("The cat sw👈🎉eat on 🤩🫰🤩the 🤩 mathttps://facebook.com 🤩🤩.")

In [None]:
# Run `clean` over every tag-stripped comment and keep the result alongside
# the original text columns.
cleaned_comments = subset_comment["comment_no_tags"].apply(clean)
subset_comment = subset_comment.assign(comment_cleaned=cleaned_comments)
subset_comment.head(3)

In [None]:
# Inspect the row with index label 384.
subset_comment.loc[384]

In [None]:
# How many cleaned comments repeat an earlier one (True) versus not (False).
subset_comment.comment_cleaned.duplicated().value_counts()

In [None]:
# First 50 rows whose cleaned comment repeats an earlier row's.
subset_comment[subset_comment.comment_cleaned.duplicated()].head(50)

In [None]:
# Original comment text of the row with index label 287.
subset_comment.loc[287].comment

In [None]:
# Original comments whose text is empty once the leading tag lines are removed.
subset_comment[subset_comment.comment_no_tags == ""].comment.tolist()

In [None]:
# Keep only the first row for each distinct cleaned comment, then confirm
# that no repeats are left.
subset_comment = subset_comment.drop_duplicates(subset=["comment_cleaned"])
subset_comment.comment_cleaned.duplicated().value_counts()

In [None]:
# Remove every row whose cleaned comment is the empty string. Selecting the
# labels first (rather than taking `.index[0]`) means the cell does nothing
# when no such row exists instead of raising IndexError, and it removes all
# empty rows rather than only the first.
empty_rows = subset_comment.index[subset_comment["comment_cleaned"] == ""]
subset_comment = subset_comment.drop(index=empty_rows)

In [None]:
# Rows that still have an empty cleaned comment.
subset_comment[subset_comment.comment_cleaned == ""]

In [None]:
# Display the comment subset in its current state.
subset_comment

In [None]:
# All cleaned comments as a plain Python list.
subset_comment.comment_cleaned.tolist()

## Data Cleaning using Language Detection