# Loading Libraries and Data


## Loading Libraries


In [2]:
import pandas as pd
import regex, re
import nltk
from collections import Counter


## Loading Data


In [3]:
df = pd.read_csv("../data/model-data/comment-3-50.csv", low_memory=False)
df["rating"] = df["rating"].replace(1, 0)
df["rating"] = df["rating"].replace(2, 1)
df["rating"] = df["rating"].replace(3, 2)
df["rating"] = df["rating"].replace(4, 3)
df["rating"] = df["rating"].replace(5, 4)
df.head(3)


Unnamed: 0,id,comment,comment_no_tags,comment_cleaned_uncased,comment_cleaned_cased,lang,rating,product_quality,Suitability,Benefits,...,Material Quality,Best Feature,User Tips,Packaging,Texture,Quality,Received,Ordered,Seller,Delivery
0,0,Colour:Black\nMaterial Quality:Good Quality\n\...,"True to size, and color. Maganda ang kulay, te...","true to size, and color. maganda ang kulay, te...","True to size, and color. Maganda ang kulay, te...",tl,4,5,,,...,Good Quality,,,,,Good Quality,,,,
1,1,Appearance:ok\nColour:good\nMaterial Quality:m...,May konting damage lang sya Ok nmn mejo makapa...,may konting damage lang sya ok nmn mejo makapa...,May konting damage lang sya Ok nmn mejo makapa...,tl,2,3,,,...,mejo manipis pero pwd na dn sa price,,,,,mejo manipis pero pwd na dn sa price,,,,
2,2,Appearance:pangit\nColour:good\nMaterial Quali...,manipis yung tela ngayon pangalawang order ko....,manipis yung tela ngayon pangalawang order ko....,manipis yung tela ngayon pangalawang order ko....,tl,0,1,,,...,manipis,,,,,manipis,,,,


# Pre-processing Tasks


In [7]:
print(df[df.rating == 0].info())
print(df[df.rating == 1].info())
print(df[df.rating == 2].info())
print(df[df.rating == 3].info())
print(df[df.rating == 4].info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13730 entries, 2 to 300696
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   id                       13730 non-null  int64 
 1   comment                  13730 non-null  object
 2   comment_no_tags          13730 non-null  object
 3   comment_cleaned_uncased  13730 non-null  object
 4   comment_cleaned_cased    13730 non-null  object
 5   lang                     13730 non-null  object
 6   rating                   13730 non-null  int64 
 7   product_quality          13730 non-null  int64 
 8   Suitability              377 non-null    object
 9   Benefits                 8 non-null      object
 10  Beauty Profile           9 non-null      object
 11  Usage Experience         25 non-null     object
 12  Value For Money          504 non-null    object
 13  Effectiveness            207 non-null    object
 14  Performance              793 non-null

In [3]:
df.shape


(300705, 32)

In [4]:
temp = df[df["lang"] == "tl"]
data = temp[["rating", "comment_cleaned_uncased", "comment_cleaned_cased"]]

data = data.rename(
    columns={
        "comment_cleaned_uncased": "msg_uncased",
        "comment_cleaned_cased": "msg_cased",
        "rating": "label",
    }
)

data.reset_index(drop=True, inplace=True)
data


Unnamed: 0,label,msg_uncased,msg_cased
0,4,"true to size, and color. maganda ang kulay, te...","True to size, and color. Maganda ang kulay, te..."
1,2,may konting damage lang sya ok nmn mejo makapa...,May konting damage lang sya Ok nmn mejo makapa...
2,0,manipis yung tela ngayon pangalawang order ko....,manipis yung tela ngayon pangalawang order ko....
3,4,"thank you seller ang bilis dumating, hindi tul...","Thank you seller ang bilis dumating, hindi tul..."
4,4,sobrang nipis niya natatakot akong isoot to ba...,Sobrang nipis niya natatakot akong isoot to ba...
...,...,...,...
146583,3,medyo na yupi yung flowers next time sana sa m...,Medyo na yupi yung flowers next time sana sa m...
146584,4,super tagal. di mo na magagamit sa date na kai...,Super Tagal. Di mo na magagamit sa Date na Kai...
146585,2,"okay sya mabilis, pero yung sinend na picture ...","Okay sya mabilis, pero Yung sinend na picture ..."
146586,0,hindi nakakatuwa seller. pinatrouble mo ako sa...,Hindi nakakatuwa seller. Pinatrouble mo ako sa...


In [5]:
data.label.value_counts()


4    115613
3     10129
0      9740
2      7122
1      3984
Name: label, dtype: int64

In [6]:
def clean_text(text):
    text = str(text)
    text = regex.sub(r"\b([\p{P}\p{S}]+?)\b", r"\1 ", text)
    text = " ".join(text.lower().split()).strip()
    return text


def clean_text_cased(text):
    text = str(text)
    text = regex.sub(r"\b([\p{P}\p{S}]+?)\b", r"\1 ", text)
    text = " ".join(text.split()).strip()
    return text


# def clean_text(text):
#     text = str(text)
#     if regex.search(r"\b[\p{P}\p{S}]\b",text):
#         print(regex.split(r"\b[\p{P}\p{S}]\b",text))
#     return text


In [7]:
data.msg_uncased = data.msg_uncased.apply(clean_text)
data.msg_cased = data.msg_cased.apply(clean_text_cased)

data


Unnamed: 0,label,msg_uncased,msg_cased
0,4,"true to size, and color. maganda ang kulay, te...","True to size, and color. Maganda ang kulay, te..."
1,2,may konting damage lang sya ok nmn mejo makapa...,May konting damage lang sya Ok nmn mejo makapa...
2,0,manipis yung tela ngayon pangalawang order ko....,manipis yung tela ngayon pangalawang order ko....
3,4,"thank you seller ang bilis dumating, hindi tul...","Thank you seller ang bilis dumating, hindi tul..."
4,4,sobrang nipis niya natatakot akong isoot to ba...,Sobrang nipis niya natatakot akong isoot to ba...
...,...,...,...
146583,3,medyo na yupi yung flowers next time sana sa m...,Medyo na yupi yung flowers next time sana sa m...
146584,4,super tagal. di mo na magagamit sa date na kai...,Super Tagal. Di mo na magagamit sa Date na Kai...
146585,2,"okay sya mabilis, pero yung sinend na picture ...","Okay sya mabilis, pero Yung sinend na picture ..."
146586,0,hindi nakakatuwa seller. pinatrouble mo ako sa...,Hindi nakakatuwa seller. Pinatrouble mo ako sa...


## Data Reduction


In [8]:
dictionary = []
for i in range(5):
    for j in data[data.label == (i + 1)].msg_uncased.tolist():
        dictionary.extend(nltk.word_tokenize(j))

dictionary[:5]


['kelan', 'pa', 'naging', 'black', 'ang']

In [9]:
len(dictionary)


3881997

In [10]:
dictionary_without_dups = list(set(dictionary))
Counter([len(word) for word in dictionary_without_dups])


Counter({7: 11044,
         6: 10512,
         8: 9875,
         5: 8946,
         9: 8207,
         4: 6445,
         10: 5992,
         11: 3834,
         3: 3602,
         12: 2346,
         13: 1344,
         2: 1061,
         14: 757,
         15: 427,
         16: 305,
         17: 204,
         18: 148,
         1: 145,
         20: 96,
         19: 94,
         21: 74,
         22: 63,
         24: 53,
         23: 50,
         25: 41,
         26: 39,
         27: 33,
         28: 32,
         29: 31,
         33: 28,
         30: 25,
         34: 24,
         38: 19,
         31: 18,
         32: 16,
         36: 16,
         40: 16,
         39: 16,
         35: 16,
         41: 12,
         42: 12,
         51: 11,
         37: 11,
         48: 10,
         46: 8,
         43: 8,
         63: 7,
         57: 7,
         49: 7,
         53: 7,
         61: 6,
         55: 6,
         50: 6,
         44: 6,
         54: 6,
         64: 5,
         45: 5,
         56: 5,
     

## Normalize


In [11]:
def normalize_text(text):
    text = str(text)
    for token in nltk.word_tokenize(text):
        if len(token) > 17:
            text = text.replace(token, "")
            text = " ".join(text.split()).strip()
            text = regex.sub(r" ([\p{P}\p{S}]+?) ", r"\1 ", text)
    text = " ".join(text.split()).strip()
    return text


data.msg_uncased = data.msg_uncased.apply(normalize_text)
data.msg_cased = data.msg_cased.apply(normalize_text)
data.head(3)


Unnamed: 0,label,msg_uncased,msg_cased
0,4,"true to size, and color. maganda ang kulay, te...","True to size, and color. Maganda ang kulay, te..."
1,2,may konting damage lang sya ok nmn mejo makapa...,May konting damage lang sya Ok nmn mejo makapa...
2,0,manipis yung tela ngayon pangalawang order ko....,manipis yung tela ngayon pangalawang order ko....


In [12]:
data.shape


(146588, 3)

In [13]:
data.msg_uncased.isna().sum()


0

In [14]:
data.dropna(subset=["msg_cased"], inplace=True)
data.shape


(146588, 3)

In [15]:
data[data.label == 3].msg_cased.tolist()[:5]


['Okay naman yung dalawa, tama ang size at kulay, pero yung isa sablay. instead of color blue and xxl ang size, black at xl lang ang size. pero okay na din naman. pero sana nag iinform ang seller, hindi pa gaano nagrereply sa message. paayos po seller',
 '4 stars dahil hindi nasunod yung color na napili at wala sa choicessuggest ko po kay seller add choices ng colors, chat si buyer na hindi available color, maganda tela makapal naman. pang girl siya. sizing was true measure muna waistline para sure sa oordering size. di yung isisisi kay seller',
 'Tamang2 yung nga sizes nya. Ayos din yung item. Yun nga, di lang magkapareho ang mga tela. Thanks parin',
 'Salamat shopee at seller. Nakarating ng Maayos item ko. Sulit bayad banda ng tela. Kung ano sa picture un din madeliver sayo. Salamat deliver man. God Bless!',
 "Maayos naman ang quality nito kaso hindi siya kasya sa pinsan ko. Sana po ay naglagay kayo ng mas specific measurements pero kasalanan naman talaga to' ng pinsan ko kasi siya y

In [16]:
model_data = data[data.label == 0].sample(3000)
for i in range(1, 5):
    model_data = model_data.append(data[data.label == i].sample(3000))
model_data.reset_index(drop=True, inplace=True)
model_data.shape

model_data.shape


  model_data = model_data.append(data[data.label == i].sample(3000))
  model_data = model_data.append(data[data.label == i].sample(3000))
  model_data = model_data.append(data[data.label == i].sample(3000))
  model_data = model_data.append(data[data.label == i].sample(3000))


(15000, 3)

In [20]:
model_data


Unnamed: 0,label,msg_uncased,msg_cased
0,0,ginamit ko ang silicon sealant for my aquarium...,Ginamit ko ang silicon sealant for my aquarium...
1,0,super bagal n dumating yung item. hindi p guma...,Super bagal n dumating yung item. hindi p guma...
2,0,"bukod sa matagal na nga, sira pa yung item, bi...","Bukod sa matagal na nga, sira pa yung item, bi..."
3,0,ang panget ng items in person quality iba yung...,Ang panget ng items in person quality iba Yung...
4,0,d maayos pag pa pack. sa susunod paki ayos par...,D maayos pag pa pack. sa susunod paki ayos par...
...,...,...,...
14995,4,"haven' t try it yet pero sobrang ganda, and gu...","Haven' t try it yet pero sobrang ganda, and gu..."
14996,4,grabe sobrang ganda yung design modern na tala...,Grabe sobrang ganda yung design modern na tala...
14997,4,"ang ganda ng pagkakabalot niya, gusto ko na na...","Ang Ganda ng pagkakabalot niya, gusto ko na na..."
14998,4,gift q to sa hubby q sana magustohan nya kasi ...,Gift q to sa hubby q sana magustohan nya kasi ...


In [15]:
model_data.to_csv("../data/model-data/dataset2.csv", index=False)
