In [1]:
import sys
import os
import pandas as pd
from IPython.display import display


# Make '../scripts' (relative to the notebook's working directory) importable so the
# local `Preprocessing` module imported in the next cell can be resolved.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if '../scripts' not in sys.path:
    sys.path.append('../scripts')

In [2]:
from Preprocessing import (
    load_data, add_metadata, show_duplicates, remove_duplicates,
    show_missing, handle_missing, normalize_dates, save_cleaned_reviews
)

In [3]:
# Raw review CSV for each bank, keyed by the bank label that later cells pass
# along as `bank_name`. Paths are relative to the notebook's working directory.
bank_files = {
    'CBE': '../data/CBE_reviews_20250608_100550.csv',
    'BOA': '../data/BOA_reviews_20250608_100554.csv',
    'Dashen': '../data/Dashen_reviews_20250608_100557.csv'
}

In [4]:
# Read each bank's raw CSV, pass it through add_metadata with the bank label, and
# collect the resulting frames in a dict keyed by bank. A bank whose file is not
# on disk is reported with a warning and left out of the dict.
bank_dfs = dict()
for bank, filepath in bank_files.items():
    if os.path.exists(filepath):
        df = add_metadata(load_data(filepath), bank_name=bank)
        bank_dfs[bank] = df
    else:
        print(f"Warning: File not found at {filepath} for {bank}. Please update the path.")

In [5]:
# Step 1: Preview the first rows of every loaded dataset
print("\n--- Raw data heads ---")
for bank, frame in bank_dfs.items():
    print(f"\n{bank} data head:")
    preview = frame.head()
    display(preview)


--- Raw data heads ---

CBE data head:


Unnamed: 0,review_text,rating,date,bank_name,source,bank
0,"""Why don’t your ATMs support account-to-accoun...",4,2025-06-06,CBE,Telegram,CBE
1,what is this app problem???,1,2025-06-05,CBE,Telegram,CBE
2,the app is proactive and a good connections.,5,2025-06-05,CBE,Telegram,CBE
3,I cannot send to cbebirr app. through this app.,3,2025-06-05,CBE,Telegram,CBE
4,good,4,2025-06-05,CBE,Telegram,CBE



BOA data head:


Unnamed: 0,review_text,rating,date,bank_name,source,bank
0,it's not working,3,2025-06-05,BOA,Telegram,BOA
1,"Hello, I’m facing a problem with the BOA Mobil...",1,2025-06-03,BOA,Telegram,BOA
2,exceptional,5,2025-06-03,BOA,Telegram,BOA
3,BoA Mobile good bank,5,2025-06-02,BOA,Telegram,BOA
4,this is worest app 24/7 loading,1,2025-06-01,BOA,Telegram,BOA



Dashen data head:


Unnamed: 0,review_text,rating,date,bank_name,source,bank
0,love,3,2025-06-06,Dashen,Telegram,Dashen
1,መቸሸጠ,5,2025-06-03,Dashen,Telegram,Dashen
2,wow,5,2025-06-03,Dashen,Telegram,Dashen
3,gadaa,5,2025-06-01,Dashen,Telegram,Dashen
4,Massive upgrade from the Amole app.,5,2025-05-31,Dashen,Telegram,Dashen


In [6]:
# Step 2: List the rows that show_duplicates returns for each bank
print("\n--- Duplicates in each dataset ---")
for bank, frame in bank_dfs.items():
    duplicates = show_duplicates(frame)
    print(f"\nDuplicates in {bank}:")
    if not duplicates.empty:
        # Something was flagged: render the flagged rows and move on to the next bank.
        display(duplicates)
        continue
    print("No duplicates found.")


--- Duplicates in each dataset ---

Duplicates in CBE:


Unnamed: 0,review_text,rating,date,bank_name,source,bank
4,good,4,2025-06-05,CBE,Telegram,CBE
5,not functional,1,2025-06-05,CBE,Telegram,CBE
8,best,5,2025-06-04,CBE,Telegram,CBE
11,good,5,2025-06-04,CBE,Telegram,CBE
13,Good,5,2025-06-04,CBE,Telegram,CBE
...,...,...,...,...,...,...
4880,Ethiopia,5,2023-01-04,CBE,Telegram,CBE
4897,Amazing app,5,2022-12-29,CBE,Telegram,CBE
4898,Easy to use,1,2022-12-27,CBE,Telegram,CBE
4902,Ok,5,2022-12-27,CBE,Telegram,CBE



Duplicates in BOA:


Unnamed: 0,review_text,rating,date,bank_name,source,bank
10,good,5,2025-05-22,BOA,Telegram,BOA
12,good,5,2025-05-21,BOA,Telegram,BOA
14,BoA,2,2025-05-20,BOA,Telegram,BOA
15,good,5,2025-05-20,BOA,Telegram,BOA
19,excellent app,4,2025-05-16,BOA,Telegram,BOA
...,...,...,...,...,...,...
1026,👏👏👏,5,2024-01-16,BOA,Telegram,BOA
1027,Wow,5,2024-01-16,BOA,Telegram,BOA
1028,Nice,5,2024-01-15,BOA,Telegram,BOA
1035,👍,3,2024-01-11,BOA,Telegram,BOA



Duplicates in Dashen:


Unnamed: 0,review_text,rating,date,bank_name,source,bank
2,wow,5,2025-06-03,Dashen,Telegram,Dashen
5,good,4,2025-05-31,Dashen,Telegram,Dashen
9,Wow,5,2025-05-30,Dashen,Telegram,Dashen
10,good,5,2025-05-30,Dashen,Telegram,Dashen
16,good,5,2025-05-28,Dashen,Telegram,Dashen
17,nice application,5,2025-05-27,Dashen,Telegram,Dashen
18,good,5,2025-05-27,Dashen,Telegram,Dashen
19,wow,5,2025-05-27,Dashen,Telegram,Dashen
27,its best,5,2025-05-23,Dashen,Telegram,Dashen
30,wow,5,2025-05-20,Dashen,Telegram,Dashen


In [7]:
# Step 3: Replace each bank's frame with the result of remove_duplicates and report its shape
print("\n--- Removing duplicates ---")
for bank in list(bank_dfs):  # iterate over a snapshot of the keys; entries are reassigned below
    bank_dfs[bank] = remove_duplicates(bank_dfs[bank])
    print(f"{bank} shape after removing duplicates: {bank_dfs[bank].shape}")


--- Removing duplicates ---
CBE shape after removing duplicates: (3648, 6)
BOA shape after removing duplicates: (889, 6)
Dashen shape after removing duplicates: (407, 6)


In [8]:
# Step 4: Report what show_missing returns for each bank
print("\n--- Missing values in each dataset ---")
for bank, frame in bank_dfs.items():
    missing = show_missing(frame)
    print(f"\nMissing values in {bank}:")
    if not missing.empty:
        # There is something to report: render it and move on to the next bank.
        display(missing)
        continue
    print("No missing values found.")


--- Missing values in each dataset ---

Missing values in CBE:
No missing values found.

Missing values in BOA:
No missing values found.

Missing values in Dashen:
No missing values found.


In [9]:
# Step 6: Normalize dates for all banks
# NOTE(review): step numbering jumps from 4 to 6, and `handle_missing` is imported
# but not called in the visible cells. Presumably the missing-value handling step
# was dropped because the recorded run reported no missing values; confirm.
print("\n--- Normalizing dates ---")
for bank in list(bank_dfs):  # iterate over a snapshot of the keys; entries are reassigned below
    bank_dfs[bank] = normalize_dates(bank_dfs[bank])
    print(f"{bank} shape after normalizing dates: {bank_dfs[bank].shape}")


--- Normalizing dates ---
CBE shape after normalizing dates: (3648, 6)
BOA shape after normalizing dates: (889, 6)
Dashen shape after normalizing dates: (407, 6)


In [10]:
# Step 7: Write each bank's cleaned frame to "<bank>_cleaned.csv" inside the data folder
output_dir = '../data'
os.makedirs(output_dir, exist_ok=True)  # no error if the folder is already there

for bank in bank_dfs:
    output_path = os.path.join(output_dir, f"{bank}_cleaned.csv")
    # index=False keeps the DataFrame's row index out of the written file.
    bank_dfs[bank].to_csv(output_path, index=False)
    print(f"Saved cleaned data for {bank} to {output_path}")


Saved cleaned data for CBE to ../data\CBE_cleaned.csv
Saved cleaned data for BOA to ../data\BOA_cleaned.csv
Saved cleaned data for Dashen to ../data\Dashen_cleaned.csv
