In [13]:
import pandas as pd
import glob
import os

In [14]:
# Input folder: the raw scraped CSVs are read from here.
# The path is relative, so it resolves against the kernel's working directory.
RAW_DATA_DIR = "../data/raw"
# Output folder: cleaned CSVs are written here (also a relative path).
CLEAN_DATA_DIR = "../data/clean"
# Create the output folder up front; exist_ok=True makes re-running this cell safe.
os.makedirs(CLEAN_DATA_DIR, exist_ok=True)

In [15]:
def preprocess_file(file_path, output_dir=None):
    """Clean one raw review CSV and write the result as a new CSV.

    Steps, in order:
      1. Drop rows missing ``review_text``, ``rating`` or ``date``.
      2. Normalize ``date`` to ``YYYY-MM-DD`` and drop rows whose date cannot
         be parsed.
      3. Drop duplicate rows, keyed on ``review_text`` + ``rating``.
      4. Rename ``review_text`` -> ``review`` and ``bank_name`` -> ``bank`` and
         keep only ``review, rating, date, bank, source``.

    Dates are validated *before* de-duplication on purpose: otherwise a
    duplicate whose date is unparseable can be the copy that survives
    de-duplication and is then discarded, losing the review entirely.

    Parameters
    ----------
    file_path : str
        Path of the raw CSV to read.
    output_dir : str, optional
        Folder the cleaned CSV is written to. Defaults to ``CLEAN_DATA_DIR``.
        The folder is created if it does not exist.

    Raises
    ------
    KeyError
        If the raw file lacks any of the required input columns.
    """
    print(f"Processing {file_path} ...")
    reviews_raw = pd.read_csv(file_path)

    # Fail early with a readable message instead of an opaque KeyError
    # raised from somewhere in the middle of the pipeline.
    required_columns = ['review_text', 'rating', 'date', 'bank_name', 'source']
    missing_columns = [col for col in required_columns if col not in reviews_raw.columns]
    if missing_columns:
        raise KeyError(
            f"{file_path} is missing required column(s): {missing_columns}"
        )

    reviews_clean = (
        reviews_raw
        .dropna(subset=['review_text', 'rating', 'date'])
        # Unparseable dates become NaT, which strftime renders as a missing
        # value, so the dropna below removes them.
        .assign(
            date=lambda frame: pd.to_datetime(
                frame['date'], errors='coerce'
            ).dt.strftime('%Y-%m-%d')
        )
        .dropna(subset=['date'])
        # De-duplicate only among rows that already passed date validation.
        .drop_duplicates(subset=['review_text', 'rating'])
        .rename(columns={'review_text': 'review', 'bank_name': 'bank'})
    )
    reviews_clean = reviews_clean[['review', 'rating', 'date', 'bank', 'source']]

    target_dir = CLEAN_DATA_DIR if output_dir is None else output_dir
    os.makedirs(target_dir, exist_ok=True)

    # NOTE(review): this replaces every 'raw' substring in the file name, not
    # just a 'raw' prefix/suffix token; kept as-is to preserve output names.
    filename = os.path.basename(file_path).replace('raw', 'clean')
    save_path = os.path.join(target_dir, filename)
    reviews_clean.to_csv(save_path, index=False)
    print(f"Saved cleaned file to {save_path}")


In [16]:
def preprocess_all_files():
    """Run ``preprocess_file`` on every ``*.csv`` directly inside ``RAW_DATA_DIR``.

    Files are processed in sorted path order so the processing sequence and
    the printed log are the same on every run; ``glob.glob`` on its own gives
    no ordering guarantee. If no CSV files match, a notice is printed and
    nothing else happens.
    """
    raw_files = sorted(glob.glob(os.path.join(RAW_DATA_DIR, '*.csv')))

    # Make an empty or mistyped input folder visible instead of silently
    # doing nothing.
    if not raw_files:
        print(f"No CSV files found in {RAW_DATA_DIR}")
        return

    for raw_file in raw_files:
        preprocess_file(raw_file)

In [17]:
# Run the cleaning step over every raw CSV; progress is printed per file.
preprocess_all_files()


Processing ../data/raw\bank_of_abyssinia_reviews_20250608_090559.csv ...
Saved cleaned file to ../data/clean\bank_of_abyssinia_reviews_20250608_090559.csv
Processing ../data/raw\commercial_bank_of_ethiopia_reviews_20250608_090558.csv ...
Saved cleaned file to ../data/clean\commercial_bank_of_ethiopia_reviews_20250608_090558.csv
Processing ../data/raw\dashen_bank_reviews_20250608_090601.csv ...
Saved cleaned file to ../data/clean\dashen_bank_reviews_20250608_090601.csv
