# Loading Libraries and Data


## Loading Libraries


In [202]:
import dask.dataframe as dd
import pandas as pd
import re
import numpy as np


## Loading Data


In [203]:
dtypes = {
    "userid": "int",
    "username": "str",
    "item": "str",
    "item_type": "int",
    "comment": "str",
    "rating": "int",
    "product_quality": "object",
    "seller_service": "object",
    "delivery_service": "object",
    "has_template_tag": "bool",
    "template_tags": "object",
    "tags": "object",
    "is_oversea": "bool",
    "origin_region": "str",
    "like_count": "object",
    "is_repeated_purchase": "bool",
    "exclude_scoring_due_low_logistic": "bool",
}

reviews_dd = dd.read_csv(
    "../data/shopee/*.csv",
    blocksize="25MB",
    dtype=dtypes,
)


In [204]:
df = reviews_dd.compute()
df


Unnamed: 0,userid,username,item,item_type,ctime,comment,rating,product_quality,seller_service,delivery_service,has_template_tag,template_tags,tags,is_oversea,origin_region,like_count,is_repeated_purchase,exclude_scoring_due_low_logistic
0,458113431,p*****a,Trendy Fashionable Cotton Jogger Pants For Men...,0,1680407978,Colour:Black\nMaterial Quality:Good Quality\n\...,5,5,5.0,5.0,True,"['Colour', 'Material Quality', 'Appearance']",,False,ph,5.0,False,False
1,377240164,alcorizamhelody6,Trendy Fashionable Cotton Jogger Pants For Men...,0,1678521654,Appearance:ok\nColour:good\nMaterial Quality:m...,3,3,5.0,5.0,True,"['Appearance', 'Colour', 'Material Quality']",,False,ph,5.0,False,False
2,808115836,r*****e,Trendy Fashionable Cotton Jogger Pants For Men...,0,1673088416,Appearance:pangit\nColour:good\nMaterial Quali...,1,1,5.0,5.0,True,"['Appearance', 'Colour', 'Material Quality']",,False,ph,12.0,False,False
3,852929457,lhetwacguiyan,Trendy Fashionable Cotton Jogger Pants For Men...,0,1675585547,Material Quality:malambot ang tela\nAppearance...,5,5,5.0,5.0,True,"['Appearance', 'Colour', 'Material Quality']",,False,ph,25.0,False,False
4,55091753,n*****8,Trendy Fashionable Cotton Jogger Pants For Men...,0,1673000517,"Thank you seller ang bilis dumating, hindi tul...",5,5,5.0,5.0,False,[],,False,ph,9.0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1371,792424141,fionahmaemae,Fashion Detangling Hair Brush Scalp Massage Ha...,99,1656413111,,5,5,,,False,[],,False,ph,,False,False
1372,202479266,j*****o,EMS fashion good quality 5 pieces in 1 set Met...,99,1645336093,,5,5,,,False,[],,False,ph,,False,False
1373,382190246,7p1avmwtbw,EMS fashion good quality 5 pieces in 1 set Met...,99,1644716411,,5,5,,,False,[],,False,ph,,False,False
1374,164519547,sheannel,EMS fashion good quality 5 pieces in 1 set Met...,99,1643884726,,5,5,,,False,[],,False,ph,,False,False


# Exploratory Data Analysis - Preprocessing


## EDA


In [205]:
from dataprep.eda import *

# create_report(df, title="Shopee Reviews EDA Report").show()


In [206]:
df.drop_duplicates(inplace=True)
df.dropna(subset=["comment"], inplace=True)
df.dropna(subset=["template_tags"], inplace=True)
df["ctime"] = pd.to_datetime(df["ctime"], unit="s")
df = df.assign(id=range(len(df)))
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 327758 entries, 0 to 298
Data columns (total 19 columns):
 #   Column                            Non-Null Count   Dtype         
---  ------                            --------------   -----         
 0   userid                            327758 non-null  int32         
 1   username                          325472 non-null  object        
 2   item                              327758 non-null  object        
 3   item_type                         327758 non-null  int32         
 4   ctime                             327758 non-null  datetime64[ns]
 5   comment                           327758 non-null  object        
 6   rating                            327758 non-null  int32         
 7   product_quality                   327758 non-null  object        
 8   seller_service                    128679 non-null  object        
 9   delivery_service                  128678 non-null  object        
 10  has_template_tag                  3

In [207]:
dataset = df[
    [
        "id",
        "item_type",
        "comment",
        "template_tags",
        "rating",
        "product_quality",
        "origin_region",
    ]
]
dataset.head(5)


Unnamed: 0,id,item_type,comment,template_tags,rating,product_quality,origin_region
0,0,0,Colour:Black\nMaterial Quality:Good Quality\n\...,"['Colour', 'Material Quality', 'Appearance']",5,5,ph
1,1,0,Appearance:ok\nColour:good\nMaterial Quality:m...,"['Appearance', 'Colour', 'Material Quality']",3,3,ph
2,2,0,Appearance:pangit\nColour:good\nMaterial Quali...,"['Appearance', 'Colour', 'Material Quality']",1,1,ph
3,3,0,Material Quality:malambot ang tela\nAppearance...,"['Appearance', 'Colour', 'Material Quality']",5,5,ph
4,4,0,"Thank you seller ang bilis dumating, hindi tul...",[],5,5,ph


In [208]:
dataset["origin_region"].value_counts()


ph    321902
vn      2779
my      1438
br       765
id       517
th       216
sg       117
mx        19
cl         4
co         1
Name: origin_region, dtype: int64

In [209]:
dataset[dataset["origin_region"] != "ph"]["item_type"].unique()


array([  1, 118, 119, 143, 153, 154, 180, 212,  27, 273, 338, 354, 360,
        37,  41, 422, 427, 439, 450,  64,  70,  72,  87,  92])

In [210]:
dataset[dataset["origin_region"] != "ph"]["comment"].tolist()[:5]


['Material:very good, premium\n\nThis is very good product. Good materials. The plushies is good quality and sure will last long. I repeatedly by for so many times right now. Satisfied',
 'Material:Very good materials, premium and feels good.\nSafety:not to worry\n\nThis is very good product. Good materials. The plushies is good quality and sure will last long. I repeatedly by for so many times right now. Satisfied',
 'My son loved it. A little smaller than expected, but well made overall.',
 'Performance: very good\nQuality: GOOD PRICE FOR GOOD PRICE\nSuitable For: children\n\nThe seller is fast, the delivery is very fast, the goods are very good, the abal2 is according to the price, the material is smooth, the color is according to the shape according to the recommended seller 🙏💖 Thanks seller',
 'Value for money: great\nSimilar to ad:same\nSecurity: good\n\nI loved it very well worked and arrived quickly my son asked for Christmas kkk I really loved it']

In [211]:
dataset.count()


id                 327758
item_type          327758
comment            327758
template_tags      327758
rating             327758
product_quality    327758
origin_region      327758
dtype: int64

In [212]:
dataset = dataset[dataset["origin_region"] == "ph"]
dataset.drop(columns=["origin_region"], inplace=True)
dataset.count()


id                 321902
item_type          321902
comment            321902
template_tags      321902
rating             321902
product_quality    321902
dtype: int64

In [213]:
dataset.head(3)


Unnamed: 0,id,item_type,comment,template_tags,rating,product_quality
0,0,0,Colour:Black\nMaterial Quality:Good Quality\n\...,"['Colour', 'Material Quality', 'Appearance']",5,5
1,1,0,Appearance:ok\nColour:good\nMaterial Quality:m...,"['Appearance', 'Colour', 'Material Quality']",3,3
2,2,0,Appearance:pangit\nColour:good\nMaterial Quali...,"['Appearance', 'Colour', 'Material Quality']",1,1


## Pre-processing - Template Tags


In [214]:
import ast

template_tags = []
for i in [ast.literal_eval(i) for i in dataset["template_tags"].unique().tolist()]:
    template_tags.extend(i)
template_tags


['Colour',
 'Material Quality',
 'Appearance',
 'Appearance',
 'Colour',
 'Material Quality',
 'Material Quality',
 'Appearance',
 'Colour',
 'Colour',
 'Appearance',
 'Material Quality',
 'Appearance',
 'Material Quality',
 'Colour',
 'Material Quality',
 'Colour',
 'Appearance',
 'Performance',
 'Quality',
 'Suitability',
 'Quality',
 'Suitability',
 'Performance',
 'Quality',
 'Performance',
 'Suitability',
 'Suitability',
 'Performance',
 'Quality',
 'Suitability',
 'Quality',
 'Performance',
 'Performance',
 'Suitability',
 'Quality',
 'Packaging',
 'User Tips',
 'Usage Experience',
 'Effectiveness',
 'Material Quality',
 'User Tips',
 'Packaging',
 'Usage Experience',
 'User Tips',
 'Usage Experience',
 'Packaging',
 'Packaging',
 'Usage Experience',
 'User Tips',
 'Material Quality',
 'Effectiveness',
 'Usage Experience',
 'Packaging',
 'User Tips',
 'Appearance',
 'Quality',
 'Suitability',
 'Suitability',
 'Quality',
 'Appearance',
 'Appearance',
 'Suitability',
 'Quality',
 '

In [215]:
template_tags = list(set(template_tags))
template_tags.remove("Beauty Effect)")
template_tags


['Beauty Profile',
 'Fragrance',
 'User Tips',
 'Benefits',
 'Beauty Effect',
 'Quality',
 'Appearance',
 'Effectiveness',
 'Occasion',
 'Taste',
 'Packaging',
 'Texture',
 'Performance',
 'Material Quality',
 'Best Feature',
 'Suitability',
 'Usage Experience',
 'Colour',
 'Product Quality',
 'Value For Money']

In [216]:
string = dataset["comment"].iloc[2]
string


'Appearance:pangit\nColour:good\nMaterial Quality:manipis\n\nmanipis yung tela ngayon pangalawang order ko..mas makapal kasie yung unang order ko.. at mali pa yung pinadala yung sa short hindi manlang nag sabie na pinalitan pala yung order ko..🤦'

In [217]:
dataset = dataset[["id", "comment", "rating", "product_quality", "template_tags"]]
dataset


Unnamed: 0,id,comment,rating,product_quality,template_tags
0,0,Colour:Black\nMaterial Quality:Good Quality\n\...,5,5,"['Colour', 'Material Quality', 'Appearance']"
1,1,Appearance:ok\nColour:good\nMaterial Quality:m...,3,3,"['Appearance', 'Colour', 'Material Quality']"
2,2,Appearance:pangit\nColour:good\nMaterial Quali...,1,1,"['Appearance', 'Colour', 'Material Quality']"
3,3,Material Quality:malambot ang tela\nAppearance...,5,5,"['Appearance', 'Colour', 'Material Quality']"
4,4,"Thank you seller ang bilis dumating, hindi tul...",5,5,[]
...,...,...,...,...,...
294,327753,100%✓,5,5,[]
295,327754,Okay cia gamitin!I like it ❤️❤️,5,5,[]
296,327755,maganda siya malaki pa\nthank you seller🥰❤️,5,5,[]
297,327756,"walang damage yung product, mukha nakakasuklay...",5,5,[]


In [218]:
dataset["template_tags_extracted"] = dataset["comment"].apply(
    lambda comment: [i[0] for i in re.findall(r"(([A-Z][a-z]*? ?){1,3}):", comment)]
)
dataset


Unnamed: 0,id,comment,rating,product_quality,template_tags,template_tags_extracted
0,0,Colour:Black\nMaterial Quality:Good Quality\n\...,5,5,"['Colour', 'Material Quality', 'Appearance']","[Colour, Material Quality]"
1,1,Appearance:ok\nColour:good\nMaterial Quality:m...,3,3,"['Appearance', 'Colour', 'Material Quality']","[Appearance, Colour, Material Quality]"
2,2,Appearance:pangit\nColour:good\nMaterial Quali...,1,1,"['Appearance', 'Colour', 'Material Quality']","[Appearance, Colour, Material Quality]"
3,3,Material Quality:malambot ang tela\nAppearance...,5,5,"['Appearance', 'Colour', 'Material Quality']","[Material Quality, Appearance, Colour]"
4,4,"Thank you seller ang bilis dumating, hindi tul...",5,5,[],[]
...,...,...,...,...,...,...
294,327753,100%✓,5,5,[],[]
295,327754,Okay cia gamitin!I like it ❤️❤️,5,5,[],[]
296,327755,maganda siya malaki pa\nthank you seller🥰❤️,5,5,[],[]
297,327756,"walang damage yung product, mukha nakakasuklay...",5,5,[],[]


In [219]:
for i in dataset["template_tags_extracted"]:
    template_tags.extend(i)
template_tags = [" ".join(i.split()).strip() for i in template_tags]
template_tags


['Beauty Profile',
 'Fragrance',
 'User Tips',
 'Benefits',
 'Beauty Effect',
 'Quality',
 'Appearance',
 'Effectiveness',
 'Occasion',
 'Taste',
 'Packaging',
 'Texture',
 'Performance',
 'Material Quality',
 'Best Feature',
 'Suitability',
 'Usage Experience',
 'Colour',
 'Product Quality',
 'Value For Money',
 'Colour',
 'Material Quality',
 'Appearance',
 'Colour',
 'Material Quality',
 'Appearance',
 'Colour',
 'Material Quality',
 'Material Quality',
 'Appearance',
 'Colour',
 'Appearance',
 'Colour',
 'Material Quality',
 'Material Quality',
 'Appearance',
 'Colour',
 'Colour',
 'Material Quality',
 'Appearance',
 'Appearance',
 'Colour',
 'Material Quality',
 'Colour',
 'Appearance',
 'Colour',
 'Material Quality',
 'Material Quality',
 'Appearance',
 'Colour',
 'Material Quality',
 'Appearance',
 'Colour',
 'Appearance',
 'Colour',
 'Material Quality',
 'Appearance',
 'Colour',
 'Material Quality',
 'Appearance',
 'Colour',
 'Material Quality',
 'Appearance',
 'Colour',
 'Mate

In [220]:
from collections import Counter

temp_tags = Counter(template_tags)
temp_tags


Counter({'Effectiveness': 43697,
         'Fragrance': 38894,
         'Texture': 35739,
         'Appearance': 34883,
         'Material Quality': 24931,
         'Colour': 20971,
         'Quality': 19750,
         'Suitability': 17357,
         'Performance': 16186,
         'Product Quality': 10254,
         'Best Feature': 9831,
         'Packaging': 3134,
         'Usage Experience': 2614,
         'User Tips': 2002,
         'Benefits': 1777,
         'Beauty Profile': 1679,
         'Taste': 1525,
         'Beauty Effect': 1037,
         'Value For Money': 722,
         'Ordered': 423,
         'Received': 369,
         'Occasion': 336,
         'Shipped': 186,
         'Delivered': 112,
         'Edit': 87,
         'Update': 78,
         'Delivery': 65,
         'PS': 57,
         'Note': 48,
         'Ps': 47,
         'Product': 40,
         'Seller': 39,
         'Shipping': 34,
         'Cons': 33,
         'Item': 33,
         'Price': 32,
         'Order': 31,
         

In [221]:
template_tags = [key for key, value in temp_tags.items() if value > 300]
template_tags += ["Seller", "Delivery", "Beauty Effect"]
template_tags


['Beauty Profile',
 'Fragrance',
 'User Tips',
 'Benefits',
 'Beauty Effect',
 'Quality',
 'Appearance',
 'Effectiveness',
 'Occasion',
 'Taste',
 'Packaging',
 'Texture',
 'Performance',
 'Material Quality',
 'Best Feature',
 'Suitability',
 'Usage Experience',
 'Colour',
 'Product Quality',
 'Value For Money',
 'Received',
 'Ordered',
 'Seller',
 'Delivery',
 'Beauty Effect']

## Data Transformation/Feature Extraction using template tags


In [222]:
for template in template_tags:
    dataset[template] = dataset["comment"].apply(
        lambda comment: re.sub(
            f"\\n",
            "",
            re.sub(f"{template}\:", "", re.findall(f"{template}\:.*?\\n", comment)[0]),
        )
        if (
            comment.find(template) != -1
            and len(re.findall(f"{template}\:.*?\\n", comment)) > 0
        )
        else np.nan
    )

dataset.head(3)


Unnamed: 0,id,comment,rating,product_quality,template_tags,template_tags_extracted,Beauty Profile,Fragrance,User Tips,Benefits,...,Best Feature,Suitability,Usage Experience,Colour,Product Quality,Value For Money,Received,Ordered,Seller,Delivery
0,0,Colour:Black\nMaterial Quality:Good Quality\n\...,5,5,"['Colour', 'Material Quality', 'Appearance']","[Colour, Material Quality]",,,,,...,,,,Black,,,,,,
1,1,Appearance:ok\nColour:good\nMaterial Quality:m...,3,3,"['Appearance', 'Colour', 'Material Quality']","[Appearance, Colour, Material Quality]",,,,,...,,,,good,,,,,,
2,2,Appearance:pangit\nColour:good\nMaterial Quali...,1,1,"['Appearance', 'Colour', 'Material Quality']","[Appearance, Colour, Material Quality]",,,,,...,,,,good,,,,,,


In [223]:
dataset["comment_no_tags"] = dataset["comment"]
for _ in range(10):
    dataset["comment_no_tags"] = dataset["comment_no_tags"].apply(
        lambda comment: re.sub(f"^({'|'.join(template_tags)})\:.*(\\n)*", "", comment)
    )
dataset.drop(columns=["template_tags"], inplace=True)
dataset["comment_no_tags"]


0      True to size, and color. Maganda ang kulay, te...
1      May konting damage lang sya Ok nmn mejo makapa...
2      manipis yung tela ngayon pangalawang order ko....
3      The ietem is good it is soft and safe po dumat...
4      Thank you seller ang bilis dumating, hindi tul...
                             ...                        
294                                                100%✓
295                      Okay cia gamitin!I like it ❤️❤️
296          maganda siya malaki pa\nthank you seller🥰❤️
297    walang damage yung product, mukha nakakasuklay...
298                                                     
Name: comment_no_tags, Length: 321902, dtype: object

In [224]:
col_x = dataset.pop("comment_no_tags")
dataset.insert(2, "comment_no_tags", col_x)
dataset.drop(columns=["template_tags_extracted"], inplace=True)
dataset.reset_index(drop=True, inplace=True)
dataset.head(3)


Unnamed: 0,id,comment,comment_no_tags,rating,product_quality,Beauty Profile,Fragrance,User Tips,Benefits,Beauty Effect,...,Best Feature,Suitability,Usage Experience,Colour,Product Quality,Value For Money,Received,Ordered,Seller,Delivery
0,0,Colour:Black\nMaterial Quality:Good Quality\n\...,"True to size, and color. Maganda ang kulay, te...",5,5,,,,,,...,,,,Black,,,,,,
1,1,Appearance:ok\nColour:good\nMaterial Quality:m...,May konting damage lang sya Ok nmn mejo makapa...,3,3,,,,,,...,,,,good,,,,,,
2,2,Appearance:pangit\nColour:good\nMaterial Quali...,manipis yung tela ngayon pangalawang order ko....,1,1,,,,,,...,,,,good,,,,,,


## Pre-processing Comments without Tags


In [225]:
subset_comment = dataset[["id", "comment", "comment_no_tags"]]
subset_comment.head(3)


Unnamed: 0,id,comment,comment_no_tags
0,0,Colour:Black\nMaterial Quality:Good Quality\n\...,"True to size, and color. Maganda ang kulay, te..."
1,1,Appearance:ok\nColour:good\nMaterial Quality:m...,May konting damage lang sya Ok nmn mejo makapa...
2,2,Appearance:pangit\nColour:good\nMaterial Quali...,manipis yung tela ngayon pangalawang order ko....


In [226]:
import emoji
import regex


def clean_uncased(comment):
    comment = str(comment)
    comment = comment.lower()
    comment = " ".join(comment.split()).strip()
    comment = emoji.replace_emoji(comment, "")
    comment = re.sub(r"\\n", " ", comment)  # \n / newline
    comment = re.sub(r"https?://[^\s]+?\.[a-z]{2,6}", "", comment)  # Web Links
    comment = regex.sub(r"\B[\p{P}\p{S}]+? ", " ", comment)  # Punctuations
    comment = regex.sub(r"\B[\p{P}\p{S}]+", "", comment)  # Punctuations
    comment = " ".join(comment.split()).strip()
    comment = comment.strip()
    return comment


def clean_cased(comment):
    comment = str(comment)
    comment = " ".join(comment.split()).strip()
    comment = emoji.replace_emoji(comment, "")
    comment = re.sub(r"\\n", " ", comment)  # \n / newline
    comment = re.sub(r"https?://[^\s]+?\.[a-zA-Z]{2,6}", "", comment)  # Web Links
    comment = re.sub(r"\.0", "", comment)  # .0
    comment = regex.sub(r"\B[\p{P}\p{S}]+? ", " ", comment)  # Punctuations
    comment = regex.sub(r"\B[\p{P}\p{S}]+", "", comment)  # Punctuations
    comment = " ".join(comment.split()).strip()
    comment = comment.strip()
    return comment


In [227]:
from random import randint

tester = dataset["comment_no_tags"][randint(0, len(dataset["comment_no_tags"]))]
tester


'Thank you so much seller for the fast delivery and transaction very affordable price and excellent quality very accommodating seller 😍'

In [228]:
clean_cased(tester)


'Thank you so much seller for the fast delivery and transaction very affordable price and excellent quality very accommodating seller'

In [229]:
clean_cased("hel....lo hi..f")


'hel.lo hi.f'

In [230]:
clean_uncased(tester)


'thank you so much seller for the fast delivery and transaction very affordable price and excellent quality very accommodating seller'

In [231]:
subset_comment["comment_cleaned_uncased"] = subset_comment["comment_no_tags"].apply(
    clean_uncased
)
subset_comment.head(3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_comment["comment_cleaned_uncased"] = subset_comment["comment_no_tags"].apply(


Unnamed: 0,id,comment,comment_no_tags,comment_cleaned_uncased
0,0,Colour:Black\nMaterial Quality:Good Quality\n\...,"True to size, and color. Maganda ang kulay, te...","true to size, and color. maganda ang kulay, te..."
1,1,Appearance:ok\nColour:good\nMaterial Quality:m...,May konting damage lang sya Ok nmn mejo makapa...,may konting damage lang sya ok nmn mejo makapa...
2,2,Appearance:pangit\nColour:good\nMaterial Quali...,manipis yung tela ngayon pangalawang order ko....,manipis yung tela ngayon pangalawang order ko....


In [232]:
subset_comment["comment_cleaned_cased"] = subset_comment["comment_no_tags"].apply(
    clean_cased
)
subset_comment.head(3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_comment["comment_cleaned_cased"] = subset_comment["comment_no_tags"].apply(


Unnamed: 0,id,comment,comment_no_tags,comment_cleaned_uncased,comment_cleaned_cased
0,0,Colour:Black\nMaterial Quality:Good Quality\n\...,"True to size, and color. Maganda ang kulay, te...","true to size, and color. maganda ang kulay, te...","True to size, and color. Maganda ang kulay, te..."
1,1,Appearance:ok\nColour:good\nMaterial Quality:m...,May konting damage lang sya Ok nmn mejo makapa...,may konting damage lang sya ok nmn mejo makapa...,May konting damage lang sya Ok nmn mejo makapa...
2,2,Appearance:pangit\nColour:good\nMaterial Quali...,manipis yung tela ngayon pangalawang order ko....,manipis yung tela ngayon pangalawang order ko....,manipis yung tela ngayon pangalawang order ko....


In [233]:
subset_comment.loc[384]


id                                                                       384
comment                    Hindi ko na pinicturan, pero hindi ako satisfi...
comment_no_tags            Hindi ko na pinicturan, pero hindi ako satisfi...
comment_cleaned_uncased    hindi ko na pinicturan, pero hindi ako satisfi...
comment_cleaned_cased      Hindi ko na pinicturan, pero hindi ako satisfi...
Name: 384, dtype: object

In [234]:
subset_comment.comment_cleaned_uncased.duplicated().value_counts()


False    228952
True      92950
Name: comment_cleaned_uncased, dtype: int64

In [235]:
subset_comment[subset_comment.comment_cleaned_uncased.duplicated()].head(50)


Unnamed: 0,id,comment,comment_no_tags,comment_cleaned_uncased,comment_cleaned_cased
130,130,Bilib ako SA shop Nato Kasi kulang Yung item n...,Bilib ako SA shop Nato Kasi kulang Yung item n...,bilib ako sa shop nato kasi kulang yung item n...,Bilib ako SA shop Nato Kasi kulang Yung item n...
287,287,Material Quality:cotton\nAppearance:maganda sy...,,,
317,317,Appearance:nice fabric\nColour:ok\nMaterial Qu...,,,
528,528,Colour:gray good 10\nAppearance:10\nMaterial Q...,,,
529,529,Appearance:good\nColour:good\nMaterial Quality...,nice,nice,nice
543,543,Ang Ganda po🤗🤗,Ang Ganda po🤗🤗,ang ganda po,Ang Ganda po
544,544,👍👍👍👍👍,👍👍👍👍👍,,
575,575,Good,Good,good,Good
597,597,"Ang ganda ng item nila, nagustohan ko lahat ng...","Ang ganda ng item nila, nagustohan ko lahat ng...","ang ganda ng item nila, nagustohan ko lahat ng...","Ang ganda ng item nila, nagustohan ko lahat ng..."
605,605,🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰,🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰🥰,,


In [236]:
subset_comment[subset_comment.comment_no_tags == ""].comment.tolist()[:5]


['Material Quality:cotton\nAppearance:maganda sya  nakakanumbok ng pwet. . will order again .  . askfkfldhxneududosjshsbfndjudhdjdjjdhhdjd\nColour:black and gray',
 "Appearance:nice fabric\nColour:ok\nMaterial Quality:it's fine so affordable",
 'Colour:gray good 10\nAppearance:10\nMaterial Quality:10',
 'Appearance:maliit skanya kaya binigay nalang ulit',
 'Appearance:parang dark blue di sya black then ung isa ok nmn\nColour:dark blue and gray\nMaterial Quality:ok nmn']

In [237]:
subset_comment.drop_duplicates(subset=["comment_cleaned_uncased"], inplace=True)
subset_comment.comment_cleaned_uncased.duplicated().value_counts()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_comment.drop_duplicates(subset=["comment_cleaned_uncased"], inplace=True)


False    228952
Name: comment_cleaned_uncased, dtype: int64

In [238]:
subset_comment = subset_comment.drop(
    index=[subset_comment[subset_comment["comment_cleaned_uncased"] == ""].index[0]]
)
subset_comment[subset_comment.comment_cleaned_uncased == ""]


Unnamed: 0,id,comment,comment_no_tags,comment_cleaned_uncased,comment_cleaned_cased


In [239]:
subset_comment.head(3)


Unnamed: 0,id,comment,comment_no_tags,comment_cleaned_uncased,comment_cleaned_cased
0,0,Colour:Black\nMaterial Quality:Good Quality\n\...,"True to size, and color. Maganda ang kulay, te...","true to size, and color. maganda ang kulay, te...","True to size, and color. Maganda ang kulay, te..."
1,1,Appearance:ok\nColour:good\nMaterial Quality:m...,May konting damage lang sya Ok nmn mejo makapa...,may konting damage lang sya ok nmn mejo makapa...,May konting damage lang sya Ok nmn mejo makapa...
2,2,Appearance:pangit\nColour:good\nMaterial Quali...,manipis yung tela ngayon pangalawang order ko....,manipis yung tela ngayon pangalawang order ko....,manipis yung tela ngayon pangalawang order ko....


## Data Cleaning using Language Detection


In [240]:
from langdetect import detect, DetectorFactory

DetectorFactory.seed = 143


In [241]:
def language_detect(x):
    try:
        return detect(x)
    except Exception as e:
        return ""


In [242]:
subset_comment["lang"] = subset_comment["comment_cleaned_uncased"].apply(
    language_detect
)
subset_comment


Unnamed: 0,id,comment,comment_no_tags,comment_cleaned_uncased,comment_cleaned_cased,lang
0,0,Colour:Black\nMaterial Quality:Good Quality\n\...,"True to size, and color. Maganda ang kulay, te...","true to size, and color. maganda ang kulay, te...","True to size, and color. Maganda ang kulay, te...",tl
1,1,Appearance:ok\nColour:good\nMaterial Quality:m...,May konting damage lang sya Ok nmn mejo makapa...,may konting damage lang sya ok nmn mejo makapa...,May konting damage lang sya Ok nmn mejo makapa...,tl
2,2,Appearance:pangit\nColour:good\nMaterial Quali...,manipis yung tela ngayon pangalawang order ko....,manipis yung tela ngayon pangalawang order ko....,manipis yung tela ngayon pangalawang order ko....,tl
3,3,Material Quality:malambot ang tela\nAppearance...,The ietem is good it is soft and safe po dumat...,the ietem is good it is soft and safe po dumat...,The ietem is good it is soft and safe po dumat...,en
4,4,"Thank you seller ang bilis dumating, hindi tul...","Thank you seller ang bilis dumating, hindi tul...","thank you seller ang bilis dumating, hindi tul...","Thank you seller ang bilis dumating, hindi tul...",tl
...,...,...,...,...,...,...
321180,327036,Thank youuu! nakarating ng maaga order ko! ❤️❤️❤️,Thank youuu! nakarating ng maaga order ko! ❤️❤️❤️,thank youuu! nakarating ng maaga order ko!,Thank youuu! nakarating ng maaga order ko!,tl
321181,327037,Nice quality!! Will surely purchase again tha...,Nice quality!! Will surely purchase again tha...,nice quality! will surely purchase again thank...,Nice quality! Will surely purchase again thank...,en
321182,327038,Malaki ung flower at maganda,Malaki ung flower at maganda,malaki ung flower at maganda,Malaki ung flower at maganda,tl
321183,327039,"Super realistic, salamat shopeee 🥰 order po ak...","Super realistic, salamat shopeee 🥰 order po ak...","super realistic, salamat shopeee order po ako ...","Super realistic, salamat shopeee order po ako ...",en


In [243]:
subset_comment[subset_comment["lang"] == ""]["comment_cleaned_uncased"].tolist()[:5]


['𝙈𝙖𝙜𝙖𝙣𝙙𝙖 𝙮𝙪𝙣𝙜 𝙩𝙚𝙡𝙖, 𝙨𝙪𝙡𝙞𝙩 𝙛𝙤𝙧 𝙞𝙩𝙨 𝙥𝙧𝙞𝙘𝙚 𝙥𝙤. 𝙆𝙖𝙨𝙤 𝙬𝙧𝙤𝙣𝙜 𝙘𝙤𝙡𝙤𝙧 𝙥𝙞𝙣𝙖𝙙𝙖𝙡𝙖 𝙣𝙞𝙡𝙖. 𝙋𝙚𝙧𝙤 𝙩𝙝𝙖𝙣𝙠 𝙮𝙤𝙪 𝙥𝙖𝙙𝙚𝙣 𝙢𝙖𝙜𝙖𝙣𝙙𝙖 𝙥𝙖 𝙙𝙞𝙣 𝙣𝙖𝙢𝙖𝙣. 𝙎𝙖𝙣𝙖 𝙡𝙖𝙣𝙜 𝙣𝙖𝙜-𝙪𝙥𝙙𝙖𝙩𝙚 𝙨𝙞𝙡𝙖 𝙤𝙧 𝙧𝙚𝙢𝙞𝙣𝙙 𝙨𝙖 𝙘𝙤𝙡𝙤𝙧𝙨.',
 '𝚆𝚛𝚘𝚗𝚐 𝚌𝚘𝚕𝚘𝚛 𝚊𝚗𝚍 𝚜𝚒𝚣𝚎. 𝙸 𝚘𝚛𝚍𝚎𝚛𝚎𝚍 𝙻𝚒𝚐𝚑𝚝 𝚐𝚛𝚊𝚢 𝚇𝙻, 𝚊𝚗𝚐 𝚍𝚞𝚖𝚊𝚝𝚒𝚗𝚐 𝚈𝚎𝚕𝚕𝚘𝚠 𝙻𝚊𝚛𝚐𝚎.',
 'mᴀɢᴀɴᴅᴀ sʏᴀ ᴀᴛ ᴍᴀʟᴀᴍʙᴏᴛ ᴛᴇʟᴀ ɴɪʏᴀ ᴍᴀᴀʏᴜs ᴀɴɢ ᴘᴀɢᴋᴀᴋᴀᴅᴇʟɪᴠᴇʀ ɴɪ ᴋᴜʏᴀ',
 'ﾉ',
 't ⓨ ⓢ ⓜ']

In [244]:
subset_comment[subset_comment["lang"] == ""].head(50)


Unnamed: 0,id,comment,comment_no_tags,comment_cleaned_uncased,comment_cleaned_cased,lang
119,119,"𝙈𝙖𝙜𝙖𝙣𝙙𝙖 𝙮𝙪𝙣𝙜 𝙩𝙚𝙡𝙖, 𝙨𝙪𝙡𝙞𝙩 𝙛𝙤𝙧 𝙞𝙩𝙨 𝙥𝙧𝙞𝙘𝙚 𝙥𝙤. 𝙆𝙖𝙨...","𝙈𝙖𝙜𝙖𝙣𝙙𝙖 𝙮𝙪𝙣𝙜 𝙩𝙚𝙡𝙖, 𝙨𝙪𝙡𝙞𝙩 𝙛𝙤𝙧 𝙞𝙩𝙨 𝙥𝙧𝙞𝙘𝙚 𝙥𝙤. 𝙆𝙖𝙨...","𝙈𝙖𝙜𝙖𝙣𝙙𝙖 𝙮𝙪𝙣𝙜 𝙩𝙚𝙡𝙖, 𝙨𝙪𝙡𝙞𝙩 𝙛𝙤𝙧 𝙞𝙩𝙨 𝙥𝙧𝙞𝙘𝙚 𝙥𝙤. 𝙆𝙖𝙨...","𝙈𝙖𝙜𝙖𝙣𝙙𝙖 𝙮𝙪𝙣𝙜 𝙩𝙚𝙡𝙖, 𝙨𝙪𝙡𝙞𝙩 𝙛𝙤𝙧 𝙞𝙩𝙨 𝙥𝙧𝙞𝙘𝙚 𝙥𝙤. 𝙆𝙖𝙨...",
1592,1592,"𝚆𝚛𝚘𝚗𝚐 𝚌𝚘𝚕𝚘𝚛 𝚊𝚗𝚍 𝚜𝚒𝚣𝚎. 𝙸 𝚘𝚛𝚍𝚎𝚛𝚎𝚍 𝙻𝚒𝚐𝚑𝚝 𝚐𝚛𝚊𝚢 𝚇𝙻,...","𝚆𝚛𝚘𝚗𝚐 𝚌𝚘𝚕𝚘𝚛 𝚊𝚗𝚍 𝚜𝚒𝚣𝚎. 𝙸 𝚘𝚛𝚍𝚎𝚛𝚎𝚍 𝙻𝚒𝚐𝚑𝚝 𝚐𝚛𝚊𝚢 𝚇𝙻,...","𝚆𝚛𝚘𝚗𝚐 𝚌𝚘𝚕𝚘𝚛 𝚊𝚗𝚍 𝚜𝚒𝚣𝚎. 𝙸 𝚘𝚛𝚍𝚎𝚛𝚎𝚍 𝙻𝚒𝚐𝚑𝚝 𝚐𝚛𝚊𝚢 𝚇𝙻,...","𝚆𝚛𝚘𝚗𝚐 𝚌𝚘𝚕𝚘𝚛 𝚊𝚗𝚍 𝚜𝚒𝚣𝚎. 𝙸 𝚘𝚛𝚍𝚎𝚛𝚎𝚍 𝙻𝚒𝚐𝚑𝚝 𝚐𝚛𝚊𝚢 𝚇𝙻,...",
1886,1886,Mᴀɢᴀɴᴅᴀ sʏᴀ ᴀᴛ ᴍᴀʟᴀᴍʙᴏᴛ ᴛᴇʟᴀ ɴɪʏᴀ\nᴍᴀᴀʏᴜs ᴀɴɢ ...,Mᴀɢᴀɴᴅᴀ sʏᴀ ᴀᴛ ᴍᴀʟᴀᴍʙᴏᴛ ᴛᴇʟᴀ ɴɪʏᴀ\nᴍᴀᴀʏᴜs ᴀɴɢ ...,mᴀɢᴀɴᴅᴀ sʏᴀ ᴀᴛ ᴍᴀʟᴀᴍʙᴏᴛ ᴛᴇʟᴀ ɴɪʏᴀ ᴍᴀᴀʏᴜs ᴀɴɢ ᴘ...,Mᴀɢᴀɴᴅᴀ sʏᴀ ᴀᴛ ᴍᴀʟᴀᴍʙᴏᴛ ᴛᴇʟᴀ ɴɪʏᴀ ᴍᴀᴀʏᴜs ᴀɴɢ ᴘ...,
2713,2713,(●’◡’●)ﾉ,(●’◡’●)ﾉ,ﾉ,ﾉ,
4608,4764,Appearance:✅️\nQuality:✅️\nSuitability:✅️\n\nT...,TⒽⒶⓃⓀ ⓎⓄⓊ ⓈⓄ ⓂⓊⒸⒽ 🫰,t ⓨ ⓢ ⓜ,T Ⓨ Ⓢ Ⓤ,
5532,5688,1111111111111111111111111111111,1111111111111111111111111111111,1111111111111111111111111111111,1111111111111111111111111111111,
6223,6379,9,9,9,9,
9825,9981,5⭐⭐⭐⭐⭐,5⭐⭐⭐⭐⭐,5,5,
9924,10080,Performance:𝐬𝐮𝐩𝐞𝐫𝐛\nBest Feature:𝐬𝐮𝐩𝐞𝐫𝐛\nProdu...,𝐆𝐚𝐧𝐝𝐚 𝐧𝐠 𝐩𝐡𝐨𝐧𝐞 𝐠𝐫𝐚𝐛𝐞 𝐠𝐚𝐠𝐚𝐧𝐝𝐚 𝐧 𝐦𝐠𝐚 𝐤𝐮𝐡𝐚 𝐤𝐨 𝐬𝐚 ...,𝐆𝐚𝐧𝐝𝐚 𝐧𝐠 𝐩𝐡𝐨𝐧𝐞 𝐠𝐫𝐚𝐛𝐞 𝐠𝐚𝐠𝐚𝐧𝐝𝐚 𝐧 𝐦𝐠𝐚 𝐤𝐮𝐡𝐚 𝐤𝐨 𝐬𝐚 ...,𝐆𝐚𝐧𝐝𝐚 𝐧𝐠 𝐩𝐡𝐨𝐧𝐞 𝐠𝐫𝐚𝐛𝐞 𝐠𝐚𝐠𝐚𝐧𝐝𝐚 𝐧 𝐦𝐠𝐚 𝐤𝐮𝐡𝐚 𝐤𝐨 𝐬𝐚 ...,
10545,10701,<33,<33,33,33,


In [245]:
subset_comment["lang"].value_counts()


en       124455
tl        88577
id         3022
so         2270
da         1092
af         1048
no          846
it          684
ro          539
et          533
ca          530
hr          507
fi          485
sl          484
pl          430
cy          424
nl          414
sv          333
sq          330
fr          288
sw          239
sk          225
cs          170
tr          155
            145
es          136
ko          134
pt          126
de          114
hu           63
lv           51
vi           33
lt           32
ja           21
zh-cn         7
el            4
te            1
uk            1
bg            1
th            1
ml            1
Name: lang, dtype: int64

In [246]:
subset_comment[subset_comment["lang"] == "id"]["comment_cleaned_uncased"].tolist()[:5]


['hindi ako ooder sa shop na ito ang pangit tela parang okay okay tapos logi ako dahil makati ang tela hindi nya deserve ang 5 stars wag na kayung bumili dito kasi magsisi kaya sa bandang huli byeeeeeee gjjjjjjjjjjjjjjjjjjjjjjjjjjj udjckfkvkvkvkgkfkdkdkjddjjdjdjsjjjxjsjsjsjdjdjdjdjdjdjdjdjfjfjfjfjs',
 'maganda yung tela salamat seller tska tama yung kulay at size thanks seller',
 'maganda yung tela nya talaga, pangalawa kuna tong bili, at napaka bilis talaga superr salamat poo mwamwamwa hahahahahhha vdusijakanzhss vgsusjanahzs vhssuiakanzhdjs hsiainajzjzkkaia nsjajanjz',
 'salamat dumating narin ang order thanks seller',
 'mejo manipis pero goods na sa price. salamat seller salamat rider salamat shopee']

In [247]:
subset_comment[subset_comment["lang"] == "so"]["comment_cleaned_uncased"].tolist()[:5]


['good',
 'hsisjcpfpfufljdludiflflfljfljdludhkdkhzukskhdkhduksukdkuxkuxkudkydouxuldhldulxhmckjcjlxhkxhkxlhxlhxlhxlhdkhdhldkhxhkxkhskhskhhxhxhxjsvlsvooosgsgsovzgozgs9s9ys9gsvookvlvlngiuyoohihgkxah a jaljcjlaxhlwxwudpupdackalhqxjajlwxajjlcwiaciwupxww hlxapaxisipsxupusxups ouduo sjls ljsj ls jlsupsclcjslcspj psc',
 'magandaaaaahhhhj',
 'maganda lahat',
 'goods']

In [248]:
subset_comment[subset_comment["lang"] == "tl"]["comment_cleaned_uncased"].tolist()[:5]


['true to size, and color. maganda ang kulay, tela at malambot hindi rin mainit kapag suot, mabilis dumating ang item.',
 'may konting damage lang sya ok nmn mejo makapal lng pero pwd na sa price kala ko makapl kase base sa review makapal pero mejo lng pala pero ok na din kase ayos lng sa price',
 'manipis yung tela ngayon pangalawang order ko.mas makapal kasie yung unang order ko. at mali pa yung pinadala yung sa short hindi manlang nag sabie na pinalitan pala yung order ko.',
 'thank you seller ang bilis dumating, hindi tulad ng ibang order ko last year pa hanggang ngayon wala pa din. yung quality okay naman para sa price and nagustuhan din ni husband, sa uulitin.',
 "sobrang nipis niya natatakot akong isoot to baka mapunit yong sa may bandang pwetan sana pala xl or xxl yong kinuha ko, baka di ko nalang palagi susuotin to, ok naman siya fit na fit. btw im 5'6 and 75kg and l yong enorder ko i think bagay sakin xxl. pero all in all goods siya maraming tnx seller"]

In [249]:
subset_comment[subset_comment["lang"] == "en"]["comment_cleaned_uncased"].tolist()[:5]


['the ietem is good it is soft and safe po dumating. thanks to all riders and seller',
 'computers are those devices that play a very important role in our everyday life we are so accustomed to using them that we barely notice their presence howevertheir absence can spell trouble for a lot of us beginning with a school project to electricity payment to work from homecomputers have become part and parcel of our daily life.',
 "this happened to me twice from this shop, last was july 10. what's the point on color selection of your products if you will not follow your customers order? i'm not happy with the color they chose to send me! inconsiderate irresponsible seller! waste of time gas returning this",
 'all pants i order are strictly followed. right size for my waist. thank you seller thank you shopee. will order again. good job.',
 "this my 2nd time to order this store, maganda ang tela tapos makapal sya tama yung kulay at size mabilis din ang shipping, tapos mabait din nagdeliver. i'

In [250]:
subset_comment = subset_comment[
    (subset_comment["lang"] == "tl") | (subset_comment["lang"] == "en")
]
subset_comment.count()


id                         213032
comment                    213032
comment_no_tags            213032
comment_cleaned_uncased    213032
comment_cleaned_cased      213032
lang                       213032
dtype: int64

In [251]:
subset_comment.reset_index(drop=True, inplace=True)
subset_comment


Unnamed: 0,id,comment,comment_no_tags,comment_cleaned_uncased,comment_cleaned_cased,lang
0,0,Colour:Black\nMaterial Quality:Good Quality\n\...,"True to size, and color. Maganda ang kulay, te...","true to size, and color. maganda ang kulay, te...","True to size, and color. Maganda ang kulay, te...",tl
1,1,Appearance:ok\nColour:good\nMaterial Quality:m...,May konting damage lang sya Ok nmn mejo makapa...,may konting damage lang sya ok nmn mejo makapa...,May konting damage lang sya Ok nmn mejo makapa...,tl
2,2,Appearance:pangit\nColour:good\nMaterial Quali...,manipis yung tela ngayon pangalawang order ko....,manipis yung tela ngayon pangalawang order ko....,manipis yung tela ngayon pangalawang order ko....,tl
3,3,Material Quality:malambot ang tela\nAppearance...,The ietem is good it is soft and safe po dumat...,the ietem is good it is soft and safe po dumat...,The ietem is good it is soft and safe po dumat...,en
4,4,"Thank you seller ang bilis dumating, hindi tul...","Thank you seller ang bilis dumating, hindi tul...","thank you seller ang bilis dumating, hindi tul...","Thank you seller ang bilis dumating, hindi tul...",tl
...,...,...,...,...,...,...
213027,327036,Thank youuu! nakarating ng maaga order ko! ❤️❤️❤️,Thank youuu! nakarating ng maaga order ko! ❤️❤️❤️,thank youuu! nakarating ng maaga order ko!,Thank youuu! nakarating ng maaga order ko!,tl
213028,327037,Nice quality!! Will surely purchase again tha...,Nice quality!! Will surely purchase again tha...,nice quality! will surely purchase again thank...,Nice quality! Will surely purchase again thank...,en
213029,327038,Malaki ung flower at maganda,Malaki ung flower at maganda,malaki ung flower at maganda,Malaki ung flower at maganda,tl
213030,327039,"Super realistic, salamat shopeee 🥰 order po ak...","Super realistic, salamat shopeee 🥰 order po ak...","super realistic, salamat shopeee order po ako ...","Super realistic, salamat shopeee order po ako ...",en


In [252]:
from random import randrange

rand_num = randrange(subset_comment.comment.count())
print(subset_comment.loc[rand_num].id)
subset_comment.loc[rand_num].comment


92630


'Finally dumating din sya..though matagal talaga kasi pre order. Walang damage and secured ung item. Thank you seller, will order again.'

In [253]:
dataset.loc[subset_comment.loc[rand_num].id]


id                                                              95508
comment             Arrived well packed and in perfect condition. ...
comment_no_tags     Arrived well packed and in perfect condition. ...
rating                                                              5
product_quality                                                     5
Beauty Profile                                                    NaN
Fragrance                                                         NaN
User Tips                                                         NaN
Benefits                                                          NaN
Beauty Effect                                                     NaN
Quality                                                           NaN
Appearance                                                        NaN
Effectiveness                                                     NaN
Occasion                                                          NaN
Taste               

In [254]:
df[df.comment == subset_comment.loc[rand_num].comment]


Unnamed: 0,userid,username,item,item_type,ctime,comment,rating,product_quality,seller_service,delivery_service,has_template_tag,template_tags,tags,is_oversea,origin_region,like_count,is_repeated_purchase,exclude_scoring_due_low_logistic,id
736,90671062,a*****n,【Brand New】It Ends with Us Books by Colleen Ho...,20,2022-02-21 13:54:44,Finally dumating din sya..though matagal talag...,5,5,,,False,[],"[{'tag_id': 51, 'tag_description': 'Excellent ...",False,ph,,False,False,92630


In [255]:
subset_comment[subset_comment.comment.duplicated()].head(3)


Unnamed: 0,id,comment,comment_no_tags,comment_cleaned_uncased,comment_cleaned_cased,lang


In [256]:
from os import makedirs, path

data_dir = "../data/model-data/"

if path.isdir(data_dir):
    print("Directory already exists")
else:
    makedirs(data_dir)


Directory already exists


In [257]:
subset_comment.merge(dataset, on="id", how="left").drop(
    columns=["comment_y", "comment_no_tags_y"]
).rename(
    columns={"comment_x": "comment", "comment_no_tags_x": "comment_no_tags"}
).to_csv(
    data_dir + "comment-2-orig.csv", index=False
)

subset_comment[subset_comment["comment_cleaned_uncased"].str.len() > 50].merge(
    dataset, on="id", how="left"
).drop(columns=["comment_y", "comment_no_tags_y"]).rename(
    columns={"comment_x": "comment", "comment_no_tags_x": "comment_no_tags"}
).to_csv(
    data_dir + "comment-2-50.csv", index=False
)

subset_comment[subset_comment["comment_cleaned_uncased"].str.len() > 100].merge(
    dataset, on="id", how="left"
).drop(columns=["comment_y", "comment_no_tags_y"]).rename(
    columns={"comment_x": "comment", "comment_no_tags_x": "comment_no_tags"}
).to_csv(
    data_dir + "comment-2-100.csv", index=False
)
