# Text Preprocessing

This notebook demonstrates essential preprocessing steps for Amharic text data, focusing on normalization and cleaning while preserving linguistic information crucial for transformer-based models. It avoids steps that can harm transformer performance, such as stop-word removal, stemming, or lowercasing. It then splits the cleaned data into train, dev, and test sets and inspects the most frequent n-grams as a sanity check.

In [8]:
# Import required libraries
import pandas as pd
import re

# Load the combined dataset from a path relative to this notebook (update the path as needed).
# Later cells overwrite the `text` column in place and preview a `label` column,
# so the CSV is expected to provide both.
df = pd.read_csv("../dataset/combined_dataset.csv")

## Amharic Character Normalization

In [9]:
# Character normalization table: every key is rewritten to its value.
# Only the code points listed here are affected; extend the table as your data requires.
# NOTE(review): other vowel orders of these letters are not listed, so they pass through unchanged — confirm this is intended.
amharic_normalization_map = {
    **dict.fromkeys(("ሀ", "ሐ", "ኀ"), "ሃ"),  # three variants collapse to 'ሃ'
    "ሰ": "ሠ",  # example mapping: 'ሰ' becomes 'ሠ' (customize as needed)
}


def normalize_amharic(text):
    """Return ``text`` with each key of ``amharic_normalization_map`` replaced by its value.

    The table is read at call time and its entries are applied one after another
    in insertion order, each as a plain substring replacement over the whole string.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; text containing none of the keys comes back unchanged.
    """
    normalized = text
    for variant in amharic_normalization_map:
        normalized = normalized.replace(variant, amharic_normalization_map[variant])
    return normalized

# Normalize every row of the `text` column in place.
# Missing values are replaced with "" *before* the string cast: on pandas versions
# where `astype(str)` casts to object dtype, a missing value would otherwise become
# the literal string "nan" and end up in the corpus as a spurious token.
df["text"] = df["text"].fillna("").astype(str).apply(normalize_amharic)

## Removal of URLs and User Mentions

In [10]:
def remove_urls_mentions(text):
    """Delete URLs and @mentions from ``text``.

    Three substitutions run one after another, in this order:
    ``http://`` / ``https://`` URLs, then ``www.`` URLs, then ``@`` followed by
    word characters. Each match is replaced with the empty string, so the
    whitespace around a removed token is left as it was.

    Note: ``@\\w+`` matches anywhere in the string, including the part of an
    e-mail address that follows the ``@``.

    Args:
        text: The string to clean.

    Returns:
        The string with all matches removed.
    """
    noise_patterns = (
        r"https?://\S+",  # URLs with an explicit scheme
        r"www\.\S+",      # scheme-less www URLs
        r"@\w+",          # @mentions
    )
    # Order matters: an earlier removal can change what a later pattern sees.
    for pattern in noise_patterns:
        text = re.sub(pattern, "", text)
    return text

# Strip URLs and @mentions from every row; the `text` column is overwritten in place.
df["text"] = df["text"].apply(remove_urls_mentions)

## Whitespace Standardization

In [11]:
def standardize_whitespace(text):
    """Trim ``text`` and collapse each run of whitespace into a single space.

    Args:
        text: The string to clean.

    Returns:
        The string without leading or trailing whitespace, and with every
        remaining whitespace run (spaces, tabs, newlines, ...) replaced by one space.
    """
    trimmed = text.strip()
    return re.sub(r"\s+", " ", trimmed)

# Trim and collapse whitespace in every row; the `text` column is overwritten in place.
df["text"] = df["text"].apply(standardize_whitespace)

## Punctuation Standardization

In [12]:
def standardize_punctuation(text):
    """Collapse runs of the same punctuation mark down to a single mark.

    Covers ASCII and full-width exclamation/question marks plus the Ethiopic
    punctuation listed in the character class. Only consecutive repeats of the
    *same* character are collapsed; mixed sequences such as ``!?`` are left alone.

    Args:
        text: The string to clean.

    Returns:
        The string with each repeated-mark run reduced to one occurrence.
    """
    repeated_mark = r"([!！፣።፤፥፦፧፨?？])\1+"
    # Keep only the captured mark and drop its repeats.
    return re.sub(repeated_mark, lambda run: run.group(1), text)

# Collapse repeated punctuation marks in every row; the `text` column is overwritten in place.
df["text"] = df["text"].apply(standardize_punctuation)

## Train-Test-Dev Split

In [13]:
from sklearn.model_selection import train_test_split

# Fixed seed so both shuffles, and therefore the membership of each split, are repeatable
RANDOM_STATE = 42

# NOTE(review): neither call passes `stratify`, so label proportions are not
# guaranteed to match across the three partitions — confirm this is intended.

# Stage 1: hold out 10% of all rows as the test set
df_temp, df_test = train_test_split(
    df,
    test_size=0.10,
    shuffle=True,
    random_state=RANDOM_STATE,
)
# Stage 2: carve the dev set out of the remaining ~90%.
# 0.1111 of that remainder is ~10% of the full dataset, leaving ~80% for training.
df_train, df_dev = train_test_split(
    df_temp,
    test_size=0.1111,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Tag each partition with its name; assign() returns a new frame,
# so the objects returned by train_test_split are not modified.
df_train = df_train.assign(split='train')
df_dev = df_dev.assign(split='dev')
df_test = df_test.assign(split='test')

# Stack train, dev, test (in that order) under a fresh 0..n-1 index and write the result to disk
df_split = pd.concat([df_train, df_dev, df_test], ignore_index=True)
df_split.to_csv("../dataset/preprocessed_dataset.csv", index=False)

# Show how many rows landed in each split
df_split['split'].value_counts()

split
train    58442
test      7306
dev       7305
Name: count, dtype: int64

## Preview the Preprocessed Data

In [14]:
# Peek at the first five rows of the final split dataset,
# limited to the text, label, and split columns
df_split.head()[['text', 'label', 'split']]

Unnamed: 0,text,label,split
0,እንደጫት መቃም መሠለ የሚኒሊክ ቁራጭ አርፈሽ ተቀመጪ,hate,train
1,ሁሉም ተቃዋሚዎች ችግር ያቀርበሉ ማለት ግን ራሱ ችግር ነው,hate,train
2,አህያ ወንድሙ አህያ ነው ወራሪና ተሠፋፊ ወንድም አናውቅም። ትግራይ ትሠእር,hate,train
3,ወንድማ ቢሆን አቃጣሪ አይሆንም ነበር ይሄ እንዳልከው ወንዳገረድ ነው ሽን...,hate,train
4,ይሸጣል ባህዳር ቀበሌ ባታ ሳይት ካሬ ሜትር ባለበት መዞር የሚችል ዋጋ በ...,normal,train


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Sanity check on the preprocessed text: count unigrams and bigrams,
# keeping only the 20 most frequent terms across the whole corpus.
vectorizer = CountVectorizer(max_features=20, ngram_range=(1, 2))
X = vectorizer.fit_transform(df_split['text'])

# Column j of X corresponds to features[j]; summing over rows gives corpus-wide totals.
features = vectorizer.get_feature_names_out()
counts = X.sum(axis=0).A1  # .A1 flattens the summed row to 1-D (assumes the sum is a numpy matrix)

# One row per n-gram, most frequent first
ngram_freq = (
    pd.DataFrame({'ngram': features, 'count': counts})
    .sort_values('count', ascending=False)
)
ngram_freq

Unnamed: 0,ngram,count
8,ነው,30666
2,ላይ,8060
13,እና,7405
19,ግን,6045
1,ህዝብ,5140
11,አማራ,4632
4,ምን,4430
5,ሠው,4312
9,ነገር,4000
12,አንተ,3865
