# Text Preprocessing Bahasa Indonesia

This notebook performs comprehensive text preprocessing for Bahasa Indonesia including:
1. Tokenization
2. Text Filtering
3. Text Normalization
4. Stopword Removal
5. Stemming using Sastrawi

Each preprocessing step result will be saved to separate CSV columns.

In [1]:
# Install required packages
!pip install sastrawi pandas nltk regex




[notice] A new release of pip is available: 25.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

True

In [3]:
# Load the dataset
df = pd.read_csv('google_play_reviews_DigitalBank_20250703_205650.csv')
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst 5 rows of content column:")
print(df['content'].head())

Dataset shape: (10000, 15)
Columns: ['reviewId', 'userName', 'userImage', 'content', 'score', 'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt', 'appVersion', 'app_id', 'scraped_at', 'review_length', 'word_count']

First 5 rows of content column:
0    sangat mudah digunakan untuk melakukan sesuatu...
1                                     mantap jiwa‚ù§Ô∏èüëçüëçüëç
2                                         membantu bgt
3                 gangguan terus transfer sy g masuk ¬≤
4                        loading isi pulsa lama sekali
Name: content, dtype: object


In [4]:
# Initialize Sastrawi components
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()
stopwords = stopword_factory.get_stop_words()

print(f"Number of stopwords: {len(stopwords)}")
print(f"Sample stopwords: {list(stopwords)[:10]}")

Number of stopwords: 126
Sample stopwords: ['yang', 'untuk', 'pada', 'ke', 'para', 'namun', 'menurut', 'antara', 'dia', 'dua']


In [5]:
# Define preprocessing functions

def tokenize_text(text):
    """Tokenize text into words"""
    if pd.isna(text):
        return []
    try:
        tokens = word_tokenize(str(text).lower())
        return tokens
    except:
        return str(text).lower().split()

def filter_text(tokens):
    """Filter tokens: remove punctuation, numbers, and very short words"""
    if not tokens:
        return []
    
    filtered_tokens = []
    for token in tokens:
        # Remove punctuation and numbers
        if (token.isalpha() and 
            len(token) > 2 and 
            token not in string.punctuation):
            filtered_tokens.append(token)
    
    return filtered_tokens

def normalize_text(tokens):
    """Normalize text: handle common Indonesian text variations"""
    if not tokens:
        return []
    
    # Common Indonesian text normalizations
    normalization_dict = {
        'gak': 'tidak',
        'ga': 'tidak',
        'nggak': 'tidak',
        'ngga': 'tidak',
        'gk': 'tidak',
        'tdk': 'tidak',
        'bgt': 'banget',
        'bgd': 'banget',
        'bgt': 'banget',
        'yg': 'yang',
        'dgn': 'dengan',
        'utk': 'untuk',
        'krn': 'karena',
        'krna': 'karena',
        'tp': 'tapi',
        'tpi': 'tapi',
        'jd': 'jadi',
        'jdi': 'jadi',
        'sy': 'saya',
        'aq': 'aku',
        'ak': 'aku',
        'klo': 'kalau',
        'kalo': 'kalau',
        'udh': 'sudah',
        'udah': 'sudah',
        'dah': 'sudah',
        'blm': 'belum',
        'blom': 'belum',
        'hrs': 'harus',
        'hrus': 'harus',
        'bs': 'bisa',
        'bsa': 'bisa',
        'gmn': 'gimana',
        'gmna': 'gimana',
        'knp': 'kenapa',
        'knpa': 'kenapa',
        'emg': 'memang',
        'emang': 'memang',
        'org': 'orang',
        'orng': 'orang'
    }
    
    normalized_tokens = []
    for token in tokens:
        # Remove repeated characters (e.g., 'bagussss' -> 'bagus')
        token = re.sub(r'(.)\1{2,}', r'\1', token)
        
        # Apply normalization dictionary
        if token in normalization_dict:
            token = normalization_dict[token]
        
        normalized_tokens.append(token)
    
    return normalized_tokens

def remove_stopwords(tokens):
    """Remove Indonesian stopwords"""
    if not tokens:
        return []
    
    return [token for token in tokens if token not in stopwords]

def stem_tokens(tokens):
    """Apply stemming using Sastrawi"""
    if not tokens:
        return []
    
    stemmed_tokens = []
    for token in tokens:
        stemmed = stemmer.stem(token)
        stemmed_tokens.append(stemmed)
    
    return stemmed_tokens

In [6]:
# Apply preprocessing steps
print("Starting text preprocessing...")

# Step 1: Tokenization
print("1. Tokenizing text...")
df['tokens'] = df['content'].apply(tokenize_text)
df['tokenized_text'] = df['tokens'].apply(lambda x: ' '.join(x) if x else '')

# Step 2: Text Filtering
print("2. Filtering text...")
df['filtered_tokens'] = df['tokens'].apply(filter_text)
df['filtered_text'] = df['filtered_tokens'].apply(lambda x: ' '.join(x) if x else '')

# Step 3: Text Normalization
print("3. Normalizing text...")
df['normalized_tokens'] = df['filtered_tokens'].apply(normalize_text)
df['normalized_text'] = df['normalized_tokens'].apply(lambda x: ' '.join(x) if x else '')

# Step 4: Stopword Removal
print("4. Removing stopwords...")
df['no_stopwords_tokens'] = df['normalized_tokens'].apply(remove_stopwords)
df['no_stopwords_text'] = df['no_stopwords_tokens'].apply(lambda x: ' '.join(x) if x else '')

# Step 5: Stemming
print("5. Applying stemming...")
df['stemmed_tokens'] = df['no_stopwords_tokens'].apply(stem_tokens)
df['stemmed_text'] = df['stemmed_tokens'].apply(lambda x: ' '.join(x) if x else '')

print("Text preprocessing completed!")

Starting text preprocessing...
1. Tokenizing text...
2. Filtering text...
3. Normalizing text...
4. Removing stopwords...
5. Applying stemming...
Text preprocessing completed!


In [7]:
# Display preprocessing results for sample texts
print("Sample preprocessing results:")
print("=" * 80)

for i in range(min(3, len(df))):
    print(f"\nSample {i+1}:")
    print(f"Original: {df.iloc[i]['content']}")
    print(f"Tokenized: {df.iloc[i]['tokenized_text']}")
    print(f"Filtered: {df.iloc[i]['filtered_text']}")
    print(f"Normalized: {df.iloc[i]['normalized_text']}")
    print(f"No Stopwords: {df.iloc[i]['no_stopwords_text']}")
    print(f"Stemmed: {df.iloc[i]['stemmed_text']}")
    print("-" * 80)

Sample preprocessing results:

Sample 1:
Original: sangat mudah digunakan untuk melakukan sesuatu tanpa harus menunggu lama
Tokenized: sangat mudah digunakan untuk melakukan sesuatu tanpa harus menunggu lama
Filtered: sangat mudah digunakan untuk melakukan sesuatu tanpa harus menunggu lama
Normalized: sangat mudah digunakan untuk melakukan sesuatu tanpa harus menunggu lama
No Stopwords: sangat mudah digunakan melakukan menunggu lama
Stemmed: sangat mudah guna laku tunggu lama
--------------------------------------------------------------------------------

Sample 2:
Original: mantap jiwa‚ù§Ô∏èüëçüëçüëç
Tokenized: mantap jiwa‚ù§Ô∏èüëçüëçüëç
Filtered: mantap
Normalized: mantap
No Stopwords: mantap
Stemmed: mantap
--------------------------------------------------------------------------------

Sample 3:
Original: membantu bgt
Tokenized: membantu bgt
Filtered: membantu bgt
Normalized: membantu banget
No Stopwords: membantu banget
Stemmed: bantu banget
-------------------------------

In [8]:
# Create final dataset with preprocessing results
# Remove intermediate token columns to keep only text columns
final_columns = [
    'reviewId', 'userName', 'userImage', 'content', 'score', 'thumbsUpCount',
    'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt', 'appVersion',
    'app_id', 'scraped_at', 'review_length', 'word_count',
    'tokenized_text', 'filtered_text', 'normalized_text', 
    'no_stopwords_text', 'stemmed_text'
]

df_final = df[final_columns].copy()

print(f"Final dataset shape: {df_final.shape}")
print(f"New preprocessing columns added: {['tokenized_text', 'filtered_text', 'normalized_text', 'no_stopwords_text', 'stemmed_text']}")

Final dataset shape: (10000, 20)
New preprocessing columns added: ['tokenized_text', 'filtered_text', 'normalized_text', 'no_stopwords_text', 'stemmed_text']


In [9]:
# Save preprocessed data to new CSV file
output_filename = 'google_play_reviews_DigitalBank_preprocessed.csv'
df_final.to_csv(output_filename, index=False)

print(f"Preprocessed data saved to: {output_filename}")
print(f"Total rows: {len(df_final)}")
print(f"Total columns: {len(df_final.columns)}")

Preprocessed data saved to: google_play_reviews_DigitalBank_preprocessed.csv
Total rows: 10000
Total columns: 20


In [10]:
# Display statistics about preprocessing results
print("\nPreprocessing Statistics:")
print("=" * 50)

# Calculate average word counts for each preprocessing step
stats = {
    'Original': df_final['content'].apply(lambda x: len(str(x).split()) if pd.notna(x) else 0).mean(),
    'Tokenized': df_final['tokenized_text'].apply(lambda x: len(x.split()) if x else 0).mean(),
    'Filtered': df_final['filtered_text'].apply(lambda x: len(x.split()) if x else 0).mean(),
    'Normalized': df_final['normalized_text'].apply(lambda x: len(x.split()) if x else 0).mean(),
    'No Stopwords': df_final['no_stopwords_text'].apply(lambda x: len(x.split()) if x else 0).mean(),
    'Stemmed': df_final['stemmed_text'].apply(lambda x: len(x.split()) if x else 0).mean()
}

for step, avg_words in stats.items():
    print(f"{step}: {avg_words:.2f} average words per review")

# Show reduction percentages
original_avg = stats['Original']
print("\nReduction from original:")
for step, avg_words in stats.items():
    if step != 'Original':
        reduction = ((original_avg - avg_words) / original_avg) * 100
        print(f"{step}: {reduction:.1f}% reduction")


Preprocessing Statistics:
Original: 12.95 average words per review
Tokenized: 14.37 average words per review
Filtered: 11.58 average words per review
Normalized: 11.58 average words per review
No Stopwords: 9.00 average words per review
Stemmed: 8.99 average words per review

Reduction from original:
Tokenized: -11.0% reduction
Filtered: 10.5% reduction
Normalized: 10.5% reduction
No Stopwords: 30.5% reduction
Stemmed: 30.6% reduction


In [11]:
# Display final dataset info
print("\nFinal Dataset Information:")
print("=" * 50)
print(df_final.info())

print("\nFirst 3 rows of preprocessing columns:")
preprocessing_cols = ['content', 'tokenized_text', 'filtered_text', 'normalized_text', 'no_stopwords_text', 'stemmed_text']
print(df_final[preprocessing_cols].head(3).to_string())


Final Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              10000 non-null  object
 1   userName              10000 non-null  object
 2   userImage             10000 non-null  object
 3   content               10000 non-null  object
 4   score                 10000 non-null  int64 
 5   thumbsUpCount         10000 non-null  int64 
 6   reviewCreatedVersion  7652 non-null   object
 7   at                    10000 non-null  object
 8   replyContent          5691 non-null   object
 9   repliedAt             5691 non-null   object
 10  appVersion            7652 non-null   object
 11  app_id                10000 non-null  object
 12  scraped_at            10000 non-null  object
 13  review_length         10000 non-null  int64 
 14  word_count            10000 non-null  int64 
 15  tokenized

In [12]:
df_final

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,app_id,scraped_at,review_length,word_count,tokenized_text,filtered_text,normalized_text,no_stopwords_text,stemmed_text
0,1be1a93d-9f51-4611-83c9-04b82b0ddec8,Jaja yanti05,https://play-lh.googleusercontent.com/a-/ALV-U...,sangat mudah digunakan untuk melakukan sesuatu...,5,0,8.65.3,2025-07-02 20:35:08,,,8.65.3,com.jago.digitalBanking,2025-07-03T20:55:15.795381,72,10,sangat mudah digunakan untuk melakukan sesuatu...,sangat mudah digunakan untuk melakukan sesuatu...,sangat mudah digunakan untuk melakukan sesuatu...,sangat mudah digunakan melakukan menunggu lama,sangat mudah guna laku tunggu lama
1,b64ad41a-32c6-4541-b0f9-bf5569d966c7,Kama Daeng87,https://play-lh.googleusercontent.com/a/ACg8oc...,mantap jiwa‚ù§Ô∏èüëçüëçüëç,5,0,,2025-07-02 20:26:19,,,,com.jago.digitalBanking,2025-07-03T20:55:15.795381,16,2,mantap jiwa‚ù§Ô∏èüëçüëçüëç,mantap,mantap,mantap,mantap
2,79870da4-736a-4231-a111-11c9fca52c05,Yusep,https://play-lh.googleusercontent.com/a/ACg8oc...,membantu bgt,5,0,,2025-07-02 20:09:51,,,,com.jago.digitalBanking,2025-07-03T20:55:15.795381,12,2,membantu bgt,membantu bgt,membantu banget,membantu banget,bantu banget
3,df833340-c8b6-456c-8050-ef291fec92ec,Ii Irmayanti,https://play-lh.googleusercontent.com/a/ACg8oc...,gangguan terus transfer sy g masuk ¬≤,1,0,8.67.0,2025-07-02 19:57:12,"Halo, Jagoan. Mohon maaf atas kendala yang ter...",2025-07-02 21:42:48,8.67.0,com.jago.digitalBanking,2025-07-03T20:55:15.795381,36,7,gangguan terus transfer sy g masuk ¬≤,gangguan terus transfer masuk,gangguan terus transfer masuk,gangguan terus transfer masuk,ganggu terus transfer masuk
4,56e7f5fb-ec7a-46d7-a7f2-4c6a837e49ae,Aderio Ramadhan,https://play-lh.googleusercontent.com/a-/ALV-U...,loading isi pulsa lama sekali,3,0,8.67.0,2025-07-02 19:19:56,"Halo, Jagoan. Mohon maaf atas kendala yang ter...",2025-07-02 22:35:17,8.67.0,com.jago.digitalBanking,2025-07-03T20:55:15.795381,29,5,loading isi pulsa lama sekali,loading isi pulsa lama sekali,loading isi pulsa lama sekali,loading isi pulsa lama sekali,loading isi pulsa lama sekali
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,880d6bf7-f70a-4bbb-949e-9d088ffbcd5a,Sastra Sakti,https://play-lh.googleusercontent.com/a/ACg8oc...,Aplikasi ga jelas mau daftar tapi verivikasi w...,1,0,,2025-02-28 01:19:06,"Hai Kak, mohon maaf atas kendala yang terjadi....",2025-02-28 08:13:57,,com.bnc.finance,2025-07-03T20:56:46.885400,63,10,aplikasi ga jelas mau daftar tapi verivikasi w...,aplikasi jelas mau daftar tapi verivikasi waja...,aplikasi jelas mau daftar tapi verivikasi waja...,aplikasi jelas mau daftar verivikasi wajah gag...,aplikasi jelas mau daftar verivikasi wajah gag...
9996,80270aa8-1500-470f-8428-6e8b0b7d1a31,Jayla Watkins,https://play-lh.googleusercontent.com/a/ACg8oc...,DANA TIDAK KUNJUNG DIKEMBALIKAN TRANSFER VIA V...,1,0,,2025-02-28 01:06:17,"Hai Kak, mohon maaf atas kendala yang terjadi....",2025-02-28 08:13:21,,com.bnc.finance,2025-07-03T20:56:46.885400,55,8,dana tidak kunjung dikembalikan transfer via v...,dana tidak kunjung dikembalikan transfer via g...,dana tidak kunjung dikembalikan transfer via g...,dana kunjung dikembalikan transfer via gabisa,dana kunjung kembali transfer via gabisa
9997,d5b0b35c-320b-467d-b2ec-7358ac588e2d,Angga Weldy,https://play-lh.googleusercontent.com/a/ACg8oc...,ok,5,0,3.4.40,2025-02-27 23:27:03,"Hai kak, terima kasih sudah mencoba Neo Experi...",2025-02-28 08:02:55,3.4.40,com.bnc.finance,2025-07-03T20:56:46.885400,2,1,ok,,,,
9998,ddfdaf2c-8f05-4f77-b153-cd7b9e04a1d2,Bambang Indradjaja,https://play-lh.googleusercontent.com/a-/ALV-U...,Sdh selesai kendalanya di nohp lama sy ganti k...,5,0,3.4.40,2025-02-27 22:38:26,"Hai Kak, apabila masih memiliki kendala maka u...",2025-04-01 09:54:23,3.4.40,com.bnc.finance,2025-07-03T20:56:46.885400,57,11,sdh selesai kendalanya di nohp lama sy ganti k...,sdh selesai kendalanya nohp lama ganti nohp baru,sdh selesai kendalanya nohp lama ganti nohp baru,sdh selesai kendalanya nohp lama ganti nohp baru,sdh selesai kendala nohp lama ganti nohp baru
