In [1]:
# import library
import pandas as pd
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# load data from csv
products_df = pd.read_csv('../data/products.csv')
profiles_rating_df = pd.read_csv('../data/profiles_rating.csv')

In [3]:
print(products_df.to_string())

      id  category_id  subcategory_id                                                                          name            brand  Skin Type                  type                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [4]:
print(profiles_rating_df.to_string())

     id  user_id                                  name  product_id  rating     gender    age skin_type_face hair_issue skin_type_body                                             allergy_history                                preferred_products                                                     avoided_products                                                                                                                   specific_needs
0     6        7                        Dini Sipahutar       215.0     5.0  Perempuan  18-25         normal  berminyak         normal                                        ["tidak_ada_alergi"]       ["cruelty_free","pewangi","minyak_mineral"]                                                    ["paraben","sls"]                                                                                ["meratakan_warna_kulit","perlindungan_matahari"]
1     6        7                        Dini Sipahutar        22.0     5.0  Perempuan  18-25         normal  berminy

In [5]:
# Case folding
products_df['name'] = products_df['name'].str.lower()
products_df['brand'] = products_df['brand'].str.lower()
products_df['type'] = products_df['type'].str.lower()
products_df['description'] = products_df['description'].str.lower()

profiles_rating_df['skin_type_face'] = profiles_rating_df['skin_type_face'].str.lower()
profiles_rating_df['hair_issue'] = profiles_rating_df['hair_issue'].str.lower()
profiles_rating_df['skin_type_body'] = profiles_rating_df['skin_type_body'].str.lower()
profiles_rating_df['allergy_history'] = profiles_rating_df['allergy_history'].str.lower()
profiles_rating_df['preferred_products'] = profiles_rating_df['preferred_products'].str.lower()
profiles_rating_df['avoided_products'] = profiles_rating_df['avoided_products'].str.lower()
profiles_rating_df['specific_needs'] = profiles_rating_df['specific_needs'].str.lower()

In [6]:
# Punctuational removal
products_df['name'] = products_df['name'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))
products_df['brand'] = products_df['brand'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))
products_df['type'] = products_df['type'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))
products_df['description'] = products_df['description'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))

profiles_rating_df['skin_type_face'] = profiles_rating_df['skin_type_face'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))
profiles_rating_df['hair_issue'] = profiles_rating_df['hair_issue'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))
profiles_rating_df['skin_type_body'] = profiles_rating_df['skin_type_body'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))
profiles_rating_df['allergy_history'] = profiles_rating_df['allergy_history'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))
profiles_rating_df['preferred_products'] = profiles_rating_df['preferred_products'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))
profiles_rating_df['avoided_products'] = profiles_rating_df['avoided_products'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))
profiles_rating_df['specific_needs'] = profiles_rating_df['specific_needs'].astype(str).apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [7]:
# Tokenizing
stop_words = set(stopwords.words('indonesian'))

def tokenize_text(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

products_df['name'] = products_df['name'].apply(tokenize_text)
products_df['brand'] = products_df['brand'].apply(tokenize_text)
products_df['type'] = products_df['type'].apply(tokenize_text)
products_df['description'] = products_df['description'].apply(tokenize_text)
        
profiles_rating_df['skin_type_face'] = profiles_rating_df['skin_type_face'].apply(tokenize_text)
profiles_rating_df['hair_issue'] = profiles_rating_df['hair_issue'].apply(tokenize_text)
profiles_rating_df['skin_type_body'] = profiles_rating_df['skin_type_body'].apply(tokenize_text)
profiles_rating_df['allergy_history'] = profiles_rating_df['allergy_history'].apply(tokenize_text)
profiles_rating_df['preferred_products'] = profiles_rating_df['preferred_products'].apply(tokenize_text)
profiles_rating_df['avoided_products'] = profiles_rating_df['avoided_products'].apply(tokenize_text)
profiles_rating_df['specific_needs'] = profiles_rating_df['specific_needs'].apply(tokenize_text)


In [8]:
# Stemming
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_text(tokens):
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

products_df['name'] = products_df['name'].apply(stem_text)
products_df['brand'] = products_df['brand'].apply(stem_text)
products_df['type'] = products_df['type'].apply(stem_text)
products_df['description'] = products_df['description'].apply(stem_text)

profiles_rating_df['skin_type_face'] = profiles_rating_df['skin_type_face'].apply(stem_text)
profiles_rating_df['hair_issue'] = profiles_rating_df['hair_issue'].apply(stem_text)
profiles_rating_df['skin_type_body'] = profiles_rating_df['skin_type_body'].apply(stem_text)
profiles_rating_df['allergy_history'] = profiles_rating_df['allergy_history'].apply(stem_text)
profiles_rating_df['preferred_products'] = profiles_rating_df['preferred_products'].apply(stem_text)
profiles_rating_df['avoided_products'] = profiles_rating_df['avoided_products'].apply(stem_text)
profiles_rating_df['specific_needs'] = profiles_rating_df['specific_needs'].apply(stem_text)

In [None]:
# Save preprocessed data back to CSV
products_df.to_csv('../data/preprocessed_products.csv', index=False)
profiles_rating_df.to_csv('../data/preprocessed_profiles_rating.csv', index=False)