# Data Cleaning and Preprocessing for Google Maps Reviews

This notebook contains the data cleaning and preprocessing pipeline for Google Maps reviews. It includes the following steps:

1. **Setup and Dependencies** - Importing required libraries and setting up the environment
2. **Loading Data** - Loading the raw review data
3. **Text Preprocessing** - Cleaning and normalizing the text data
4. **Translation** - Translating English words to Indonesian
5. **Tokenization** - Breaking down text into tokens
6. **Stopword Removal** - Removing common words
7. **Final Processing** - Preparing the final cleaned text

---

In [11]:
# 1. Setup and Dependencies
# Import required libraries
import os
import json
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from transformers import BertTokenizer

# Download required NLTK data
nltk.download('stopwords')

# Initialize BERT tokenizer for Indonesian language
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-large-p2")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elang\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 2. Loading Data

In this section, we load the raw review data from the CSV file and take an initial look at the dataset structure.

```python
# Load the dataset from CSV file
file_path = "data_clean/all_reviews_merged.csv"
df = pd.read_csv(file_path)

# Display basic information about the dataset
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows of the dataset:")
df.head()

# 2. Loading Data

In this section, we load the raw review data from the CSV file and take an initial look at the dataset structure.

In [12]:
# Load the dataset from CSV file
file_path = "data_clean/all_reviews_merged.csv"
df = pd.read_csv(file_path)

# Display basic information about the dataset
print(f"Dataset shape: {df.shape}")
print("\nSample of the dataset:")
df.sample(5)

Dataset shape: (9610, 2)

Sample of the dataset:


Unnamed: 0,nama_tempat,review
5797,Kopi Studio 24 Panglima Sudirman,tempatnya bagus harganya juga ramah dikantong
3462,Kopi Studio 24,makanan dan minuman ada di harga yg sangat ter...
8022,Mie Gacoan Suhat,"cakep tempatnya bersih layanannya cepat, makan..."
6954,Mie Gacoan Sawojajar Kota Malang,"Sangat ramai, tapi tidak menunggu lama karena ..."
3925,Kopi Studio 24,Minumannya enak banget sama murah


## 3. Text Preprocessing Functions

This section contains all the necessary functions for text cleaning and preprocessing. These functions will be applied to the review text in the next section.

In [13]:
# Translation Functions (HF Transformers: en -> id)
from wordfreq import zipf_frequency
from transformers import pipeline as hf_pipeline

# Initialize HF translator once (English -> Indonesian)
# Force PyTorch backend to avoid TensorFlow/Keras dependency
try:
    translator_en_id = hf_pipeline("translation", model="Helsinki-NLP/opus-mt-en-id", framework="pt")
except Exception as e:
    raise RuntimeError(
        "Failed to initialize translation pipeline with PyTorch. "
        "Please ensure PyTorch and sentencepiece are installed.\n"
        "Try: pip install -U torch sentencepiece transformers\n"
        f"Original error: {e}"
    )

# Higher threshold to reduce false positives for Indonesian words
ZIPF_THRESHOLD = 3.0

# Simple cache for span translations to avoid repeated calls
_translation_cache = {}

def is_probably_english_token(token):
    """
    Determine if a token is likely English based on word frequency.
    Uses wordfreq.zipf_frequency(token, 'en').
    If frequency for 'en' is above threshold, consider it English.
    """
    # Skip numbers, emoticons, or very short non-letter tokens
    if not re.search(r'[A-Za-z]', token):
        return False
    # Remove non-letters from the start/end
    cleaned = re.sub(r'^[^A-Za-z]+|[^A-Za-z]+$', '', token)
    if len(cleaned) < 2:
        return False
    # Quick guard: common Indonesian short words
    common_id = {"ya","yg","ygnya","yang","itu","ini","dan","di","ke","kok","lah","sih","mau","tapi"}
    if cleaned.lower() in common_id:
        return False
    freq = zipf_frequency(cleaned.lower(), 'en')
    return freq >= ZIPF_THRESHOLD

def split_preserve_delimiters(text):
    """
    Split text into tokens while preserving whitespace and punctuation.
    Example: ['Hello', ', ', 'apa', ' ', 'kabar', '?']
    """
    parts = re.findall(r"[A-Za-z']+|\d+|\s+|[^\w\s]", text, flags=re.UNICODE)
    return parts

def merge_english_spans(parts):
    """
    Merge consecutive English tokens into spans for translation.
    Returns a list of dicts with 'type' ('en' or 'other') and 'text'.
    """
    out = []
    i = 0
    while i < len(parts):
        p = parts[i]
        if is_probably_english_token(p):
            span = [p]
            i += 1
            while i < len(parts) and (is_probably_english_token(parts[i]) or 
                                       re.match(r"[-']", parts[i]) or 
                                       parts[i].isdigit()):
                span.append(parts[i])
                i += 1
            out.append({'type': 'en', 'text': ''.join(span)})
        else:
            out.append({'type': 'other', 'text': p})
            i += 1
    return out

def translate_spans(items, max_length=256, batch_size=32):
    """
    Translate only the 'en' spans using HF pipeline (en->id), batched.
    Returns the combined text with Indonesian kept as-is.
    """
    # Collect indices and texts for English spans
    en_indices = []
    en_texts = []
    for idx, it in enumerate(items):
        if it['type'] == 'en':
            en_indices.append(idx)
            en_texts.append(it['text'])

    # Use cache and figure out which need translation
    to_translate = []
    map_idx_to_key = {}
    for i, text in zip(en_indices, en_texts):
        key = text.strip()
        map_idx_to_key[i] = key
        if key not in _translation_cache:
            to_translate.append(key)

    # Batch translate missing keys
    if to_translate:
        results = []
        for start in range(0, len(to_translate), batch_size):
            batch = to_translate[start:start+batch_size]
            outs = translator_en_id(batch, max_length=max_length)
            results.extend([o.get('translation_text', '') for o in outs])
        for key, translated in zip(to_translate, results):
            _translation_cache[key] = translated

    # Reconstruct output
    out_parts = []
    for idx, it in enumerate(items):
        if it['type'] == 'en':
            key = map_idx_to_key[idx]
            out_parts.append(_translation_cache.get(key, it['text']))
        else:
            out_parts.append(it['text'])
    return ''.join(out_parts)

Device set to use cuda:0


### 3.2 Slang Word Normalization

This function replaces informal/slang words with their standard equivalents using a predefined dictionary.

In [14]:
### 3.1 Text Cleaning Functions

def cleaningText(text):
    """
    Clean the input text by removing mentions, hashtags, URLs, numbers, and special characters.
    """
    text = re.sub(r'@[A-Za-z0-9]+', ' ', text)  # Remove mentions
    text = re.sub(r'#[A-Za-z0-9]+', ' ', text)   # Remove hashtags
    text = re.sub(r"http\S+", '', text)        # Remove URLs
    text = re.sub(r'[0-9]+', '', text)          # Remove numbers
    text = re.sub(r'[^\w\s]', ' ', text)        # Remove special characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    text = text.replace('\n', ' ')              # Remove newlines
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.strip(' ')                      # Remove leading/trailing spaces
    return text

def casefoldingText(text):
    """Convert text to lowercase."""
    return (text or '').lower()

def normalize_repeated_chars(text):
    """
    Normalize repeated characters (more than 2) to exactly 2.
    Example: "enakkkk" -> "enakk"
    """
    return re.sub(r'(.)\1{2,}', r'\1\1', (text or ''))

def tokenizingText(text):
    """Tokenize text using IndoBERT tokenizer."""
    text = (text or '').strip()
    if not text:
        return []
    return tokenizer.tokenize(text)

def filteringText(tokens):
    """
    Remove stopwords from the token list.
    Uses Indonesian and English stopwords with additional custom stopwords.
    """
    listStopwords = set(stopwords.words('indonesian'))
    listStopwords.update(set(stopwords.words('english')))
    additional_stopwords = ['iya','yaa','gk','gak','g','dr','nya','na','sih','ku','di','ga','ya',
                          'gaa','loh','kah','woi','woii','woy','pas','c','deh','eh']
    listStopwords.update(additional_stopwords)
    return [word for word in tokens if word not in listStopwords]

def toSentence(list_words):
    """Convert a list of words back to a sentence."""
    return ' '.join(word for word in list_words)

In [15]:
# Load slang words dictionary
with open('data_clean/slangwords.json', 'r', encoding='utf-8') as f:
    slangwords = json.load(f)

def fix_slangwords(text):
    """
    Replace slang words with their standard equivalents.
    Uses a predefined dictionary of slang words and their standard forms.
    """
    if not isinstance(text, str):
        return ""
        
    words = text.split()
    fixed_words = []

    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slangwords[word.lower()])
        else:
            fixed_words.append(word)

    return ' '.join(fixed_words)

In [16]:
# 4.1 Text Cleaning and Normalization

# Clean the text by removing special characters, URLs, etc.
df['text_clean'] = df['review'].apply(cleaningText)

# Convert all text to lowercase
df['text_casefoldingText'] = df['text_clean'].apply(casefoldingText)

# Normalize repeated characters (e.g., 'sooo' -> 'soo')
df['text_normrepeat'] = df['text_casefoldingText'].apply(normalize_repeated_chars)

# Replace slang words with their standard forms
df['text_slangwords'] = df['text_normrepeat'].apply(fix_slangwords)

# 4.2 Translation of English Words (HF en->id)
# Translate any English words to Indonesian while preserving the rest of the text
def apply_translation(text):
    parts = split_preserve_delimiters(text)
    items = merge_english_spans(parts)
    # HF translate_spans no longer uses dest, it is fixed to en->id
    return translate_spans(items)

df['text_translated'] = df['text_slangwords'].apply(apply_translation)

# 4.3 Tokenization and Stopword Removal
# Tokenize the text
df['text_tokenizingText'] = df['text_translated'].apply(tokenizingText)

# Remove stopwords
df['text_stopword'] = df['text_tokenizingText'].apply(filteringText)

# 4.4 Final Text Processing
# Convert tokens back to sentences
df['text_akhir'] = df['text_stopword'].apply(toSentence)

# Store the final cleaned text
df['text_clean'] = df['text_translated']
df['sentence'] = df['text_akhir']

In [17]:
# 4.5 De-duplicate and remove original review column
# Drop duplicate rows based on the cleaned text to avoid redundancy
if 'text_clean' in df.columns:
    df.drop_duplicates(subset=['text_clean'], inplace=True, ignore_index=True)

# Remove the original 'review' column to avoid redundancy after cleaning
if 'review' in df.columns:
    df.drop(columns=['review'], inplace=True)

In [18]:
# 4.6 Remove single-word reviews
# Filter out rows where the cleaned text has only one word
if 'text_clean' in df.columns:
    word_counts = df['text_clean'].fillna('').str.findall(r'\b\w+\b').str.len()
    df = df[word_counts > 1].reset_index(drop=True)

In [19]:
def fix_slangwords(text):
    words = text.split()
    fixed_words = []

    for word in words:
        if word.lower() in slangwords:
            fixed_words.append(slangwords[word.lower()])
        else:
            fixed_words.append(word)

    fixed_text = ' '.join(fixed_words)
    return fixed_text

## 4. Text Processing Pipeline

This section applies all the preprocessing steps to the review data in a sequential manner.

## 5. English Translation (Optional)

This section translates the cleaned Indonesian text to English for further analysis. This is optional and can be skipped if not needed.

```python
# Note: This section requires the Helsinki-NLP/opus-mt-id-en model
# Uncomment and run this cell if you need English translations
"""
from transformers import pipeline as hf_pipeline

# Initialize the translation pipeline
translator = hf_pipeline("translation", model="Helsinki-NLP/opus-mt-id-en")

def translate_texts(texts, batch_size=32, max_length=256):
    """
    Translate a batch of texts from Indonesian to English.
    Handles batching to avoid memory issues with large datasets.
    """
    results = []
    n = len(texts)
    for i in range(0, n, batch_size):
        batch = [t if isinstance(t, str) else "" for t in texts[i:i+batch_size]]
        outs = translator(batch, max_length=max_length)
        results.extend([o.get("translation_text", "") for o in outs])
    return results

# Translate the cleaned text to English
print("Translating to English (this may take a while)...")
texts_to_translate = df['text_clean'].fillna("").tolist()
df['text_clean_en'] = translate_texts(texts_to_translate, batch_size=32)
"""

In [20]:
## 6. Save Processed Data

# Display a sample of the processed data
print("Sample of processed data:")
cols_to_show = [c for c in ['text_clean', 'sentence'] if c in df.columns]
df[cols_to_show].sample(5)

# Save the cleaned data to a CSV file
output_dir = "data_clean"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

output_csv = os.path.join(output_dir, "all_reviews_cleaned.csv")
output_cols = ['text_clean']

# Include English translation if available
if 'text_clean_en' in df.columns:
    output_cols.append('text_clean_en')

# Guard in case some columns are missing
output_cols = [c for c in output_cols if c in df.columns]

df_out = df[output_cols].copy()
df_out.to_csv(output_csv, index=False, encoding='utf-8-sig')
print(f"\nProcessed data saved to: {output_csv}")
print(f"Total unique reviews processed (after de-dup): {len(df)}")

Sample of processed data:

Processed data saved to: data_clean\all_reviews_cleaned.csv
Total unique reviews processed (after de-dup): 9183


In [23]:
# 7. Generate Vocabulary / Keywords and save to TXT (tokens only, no counts)
from sklearn.feature_extraction.text import CountVectorizer

vocab_input_csv = os.path.join("data_clean", "all_reviews_cleaned.csv")
vocab_output_txt = os.path.join("data_clean", "vocab.txt")

# Load cleaned data
_vdf = pd.read_csv(vocab_input_csv, encoding="utf-8-sig")
if "text_clean" not in _vdf.columns:
    raise ValueError("'text_clean' column not found in data_clean/all_reviews_cleaned.csv")

_texts = _vdf["text_clean"].astype(str).fillna("").tolist()

# Build vocabulary (tokens only)
_vec = CountVectorizer(
    analyzer="word",
    ngram_range=(1, 1),  # set to (1, 2) if you want bigrams too
    min_df=1,
    max_df=1.0,
    lowercase=True,
)
_vec.fit(_texts)
_vocab = _vec.get_feature_names_out()

# Sort tokens alphabetically for determinism
_tokens = sorted(_vocab.tolist())

# Write tokens only (one token per line)
os.makedirs(os.path.dirname(vocab_output_txt), exist_ok=True)
with open(vocab_output_txt, "w", encoding="utf-8") as f:
    for tok in _tokens:
        f.write(f"{tok}\n")

print(f"[Vocab] Wrote {len(_tokens)} tokens to {vocab_output_txt}")

[Vocab] Wrote 10100 tokens to data_clean\vocab.txt
