In [2]:
# import library
import re
from pathlib import Path

import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [3]:
# Read the two input tables (products and profile ratings) from their CSV files
products_df = pd.read_csv(filepath_or_buffer='../data/csv/products.csv')
profiles_rating_df = pd.read_csv(filepath_or_buffer='../data/csv/profiles_rating.csv')

In [None]:
# Case folding: lowercase the text columns of both tables
columns_to_fold = [
    'name',
    'brand',
    'skin_type',
    'type',
    'description',
]
columns_to_fold_profiles = [
    'skin_type_face',
    'hair_issue',
    'skin_type_body',
    'allergy_history',
    'preferred_products',
    'avoided_products',
    'specific_needs',
]

# Products first, then profiles; each listed column is overwritten with its lowercase form.
fold_plan = (
    (products_df, columns_to_fold),
    (profiles_rating_df, columns_to_fold_profiles),
)
for frame, text_columns in fold_plan:
    for column in text_columns:
        frame[column] = frame[column].str.lower()

In [None]:
# Punctuation removal
def remove_punctuation(text):
    """Return `text` with every punctuation character deleted.

    Anything that is neither a word character nor whitespace is removed.
    The underscore is removed explicitly as well: the regex class ``\\w``
    counts ``_`` as a word character, so ``[^\\w\\s]`` alone would leave
    underscores in the output.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Characters are deleted, not replaced by spaces,
        so words joined only by punctuation are fused together.
    """
    return re.sub(r'[^\w\s]|_', '', text)

# Columns whose text gets its punctuation stripped
columns_to_clean = ['name', 'brand', 'skin_type', 'type', 'description']
columns_to_clean_profiles = ['skin_type_face', 'hair_issue', 'skin_type_body',
                             'allergy_history', 'preferred_products',
                             'avoided_products', 'specific_needs']

# Missing values are replaced with '' *before* the string cast: astype(str)
# turns NaN into the literal text 'nan', which would otherwise flow through
# the rest of the pipeline as if it were a real word.
for column in columns_to_clean:
    products_df[column] = (
        products_df[column].fillna('').astype(str).apply(remove_punctuation)
    )

for column in columns_to_clean_profiles:
    profiles_rating_df[column] = (
        profiles_rating_df[column].fillna('').astype(str).apply(remove_punctuation)
    )

In [None]:
# Tokenizing and stop words removal
# Indonesian stop-word list from the NLTK corpus, held as a set for membership tests.
stop_words = {*stopwords.words('indonesian')}

def tokenize_and_remove_stopwords(text):
    """Tokenize `text` and drop every token found in `stop_words`.

    Args:
        text: The string to split; it is passed to `word_tokenize` as-is.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Columns to turn from plain strings into stop-word-free token lists
columns_to_tokenize_products = [
    'name',
    'brand',
    'skin_type',
    'type',
    'description',
]
columns_to_tokenize_profiles = [
    'skin_type_face',
    'hair_issue',
    'skin_type_body',
    'allergy_history',
    'preferred_products',
    'avoided_products',
    'specific_needs',
]

# Products are processed first, then profiles; each column is replaced by its token lists.
for frame, token_columns in zip(
    (products_df, profiles_rating_df),
    (columns_to_tokenize_products, columns_to_tokenize_profiles),
):
    for column in token_columns:
        raw_text = frame[column]
        frame[column] = raw_text.apply(tokenize_and_remove_stopwords)

In [None]:
# Stemming
# Build one Sastrawi stemmer through its factory; the same `stemmer` object is
# reused for every token by stem_tokens below.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_tokens(tokens):
    """Stem each token with the notebook-level `stemmer`.

    Args:
        tokens: An iterable of token strings.

    Returns:
        A new list holding `stemmer.stem(token)` for every token, in order.
    """
    return list(map(stemmer.stem, tokens))

# Columns whose token lists are stemmed
columns_to_stem_products = [
    'name',
    'brand',
    'skin_type',
    'type',
    'description',
]
columns_to_stem_profiles = [
    'skin_type_face',
    'hair_issue',
    'skin_type_body',
    'allergy_history',
    'preferred_products',
    'avoided_products',
    'specific_needs',
]

# Products are handled before profiles; every listed column is overwritten with its stemmed tokens.
stemming_plan = [
    (products_df, columns_to_stem_products),
    (profiles_rating_df, columns_to_stem_profiles),
]
for frame, stem_columns in stemming_plan:
    for column in stem_columns:
        token_lists = frame[column]
        frame[column] = token_lists.apply(stem_tokens)

In [None]:
# Save preprocessed data back to CSV.
# The output directory is created first so that a run on a fresh checkout,
# where the folder may not exist yet, does not fail inside to_csv.
products_output_path = Path('../data/preprosessing_data/preprocessed_products.csv')
profiles_output_path = Path('../data/preprosessing_data/preprocessed_profiles_rating.csv')

for output_path in (products_output_path, profiles_output_path):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the processed columns hold Python lists of tokens; CSV stores
# them as their string representation, so a reader has to parse them back.
products_df.to_csv(products_output_path, index=False)
profiles_rating_df.to_csv(profiles_output_path, index=False)