# Preprocessing

In [2]:
import pandas as pd
import nltk
import re
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the NLTK tokenizer data (a no-op once the packages are up to date).
# Newer NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so both are requested to keep sent_tokenize/word_tokenize
# working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

# --- Step 1: Define cleaning functions ---

def clean_text(text):
    """Strip CSV/Excel quoting artefacts from one text value and normalise whitespace.

    Steps, in order: tabs become spaces; doubled straight quotes collapse to
    one; adjacent quote/comma pairs are deleted; curly quotes are mapped to
    straight ones; any remaining quote/comma pair (with optional whitespace
    between them) becomes a single space; whitespace runs collapse to one
    space; finally one pair of wrapping double quotes is removed.

    Args:
        text: The raw text. Non-string values (for example the float NaN that
            pandas yields for an empty CSV cell, or None) are treated as
            missing.

    Returns:
        The cleaned string without leading or trailing whitespace, or an
        empty string when ``text`` is not a string.
    """
    # Missing cells have no .replace(); return an empty text instead of crashing.
    if not isinstance(text, str):
        return ''

    # Convert tabs to spaces and undo Excel-style quoting
    text = text.replace('\t', ' ')
    text = text.replace('""', '"')  # Excel-style double quotes
    text = text.replace('",', '')
    text = text.replace(',"', '')
    text = text.replace('","', '')
    text = text.replace('“', '"').replace('”', '"').replace('’', "'")

    # Remove comma right before or after a quote (whitespace in between allowed)
    text = re.sub(r'"\s*,\s*', ' ', text)
    text = re.sub(r'\s*,\s*"', ' ', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove wrapping quotes.
    # NOTE(review): this also fires when the text merely starts and ends with
    # two separate quoted passages — confirm that is acceptable for the data.
    if text.startswith('"') and text.endswith('"'):
        # Strip again: the quotes may have been shielding padding inside them.
        text = text[1:-1].strip()

    return text


def standarize_quotation(input_df):
    """Rewrite quote-like sequences in the ``text`` column as a straight double quote.

    The ``text`` column is overwritten on ``input_df`` itself, and that same
    frame object is returned.
    """
    # Applied in this order: the doubled "’’" is handled before the single "’",
    # so a doubled mark becomes one quote rather than two.
    quote_variants = ('“', '”', "''", "``", "’’", "’", '"""')

    text_column = input_df['text']
    for variant in quote_variants:
        text_column = text_column.str.replace(variant, '"', regex=False)

    input_df['text'] = text_column
    return input_df

# --- Step 2: Load the CSV ---

df = pd.read_csv("Data Cerita Rakyat.csv")

# Only keep needed columns. Rows without any text are dropped up front because
# they cannot be cleaned or tokenized; rename() runs last so the later column
# assignment happens on a fresh frame.
df = (
    df[['no', 'judul', 'text']]
    .dropna(subset=['text'])
    .rename(columns={'no': 'story_id'})
)

# Standardize quotes
df = standarize_quotation(df)

# Clean each text field
df['text'] = df['text'].apply(clean_text)

# --- Step 3: Tokenization ---
# One output row per token, tagged with its story and the index of the
# sentence within that story. No language argument is passed, so NLTK's
# default Punkt model is used.
# NOTE(review): the stories look Indonesian — confirm the default model
# splits them acceptably.

rows = []

# Iterate the three columns directly; this avoids building a Series per row.
for story_id, title, text in zip(df['story_id'], df['judul'], df['text']):
    for sent_id, sentence in enumerate(sent_tokenize(text)):
        for word in word_tokenize(sentence):
            rows.append({
                'story_id': story_id,
                'judul': title,
                'sentence_id': sent_id,
                'word': word
            })

# --- Step 4: Final clean-up & save ---

# Explicit columns keep the frame well-formed even when no tokens were produced.
result_df = pd.DataFrame(rows, columns=['story_id', 'judul', 'sentence_id', 'word'])

# Remove leftover junk quote tokens. word_tokenize emits double quotes as the
# Treebank-style tokens `` (opening) and '' (closing), so those must be listed
# alongside the raw characters for the filter to have any effect.
junk_tokens = ['"', '`', '``', "''"]
result_df = result_df[~result_df['word'].isin(junk_tokens)]

# Save to file
result_df.to_csv("cerita_rakyat_tokenized_clean.csv", index=False)

# Optional: Show sample
print(result_df.head(15))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rayssa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


    story_id               judul  sentence_id      word
0          1  Legenda Danau Toba            0        Di
1          1  Legenda Danau Toba            0    sebuah
2          1  Legenda Danau Toba            0      desa
3          1  Legenda Danau Toba            0        di
4          1  Legenda Danau Toba            0   wilayah
5          1  Legenda Danau Toba            0  Sumatera
6          1  Legenda Danau Toba            0         ,
7          1  Legenda Danau Toba            0  hiduplah
8          1  Legenda Danau Toba            0   seorang
9          1  Legenda Danau Toba            0    petani
10         1  Legenda Danau Toba            0      yang
11         1  Legenda Danau Toba            0     rajin
12         1  Legenda Danau Toba            0   bekerja
13         1  Legenda Danau Toba            0  meskipun
14         1  Legenda Danau Toba            0     lahan


In [3]:
# Sentence total, derived from the token table built above

# A sentence is identified by its (story_id, sentence_id) pair; keep one row per pair and count them
n_sentences = len(result_df.drop_duplicates(subset=['story_id', 'sentence_id']))

print(f"Total number of sentences: {n_sentences}")


Total number of sentences: 14067


In [4]:
# --- Sentences per story and their average ---

# Keep one row per (story_id, sentence_id) pair, then tally the pairs of each story
sentences_per_story = (
    result_df
    .drop_duplicates(subset=['story_id', 'sentence_id'])
    .groupby('story_id')
    .size()
)

# Average of the per-story tallies
mean_sentences_per_story = sentences_per_story.mean()

print(f"Mean number of sentences per story: {mean_sentences_per_story:.2f}")


Mean number of sentences per story: 63.36
