In [None]:

import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project's src/ directory importable. The membership check keeps
# sys.path from accumulating the same entry each time this cell is re-run.
if '../src' not in sys.path:
    sys.path.append('../src')

from utils.data_loader import load_data

# Load the raw reviews CSV of each bank into its own DataFrame
df_cbe = load_data('../data/raw/Commercial Bank of Ethiopia_raw_reviews.csv')
df_boa = load_data('../data/raw/Bank of Abyssinia_raw_reviews.csv')
df_dsn = load_data('../data/raw/Dashen Bank_raw_reviews.csv')



Data loaded successfully from ../data/raw/Commercial Bank of Ethiopia_raw_reviews.csv
Data loaded successfully from ../data/raw/Bank of Abyssinia_raw_reviews.csv
Data loaded successfully from ../data/raw/Dashen Bank_raw_reviews.csv


In [2]:
# Summary of the Commercial Bank of Ethiopia frame: row count, columns, dtypes, non-null counts
df_cbe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_text  5000 non-null   object
 1   rating       5000 non-null   int64 
 2   date         5000 non-null   object
 3   bank_name    5000 non-null   object
 4   source       5000 non-null   object
dtypes: int64(1), object(4)
memory usage: 195.4+ KB


In [3]:
# Summary of the Bank of Abyssinia frame: row count, columns, dtypes, non-null counts
df_boa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1045 entries, 0 to 1044
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_text  1045 non-null   object
 1   rating       1045 non-null   int64 
 2   date         1045 non-null   object
 3   bank_name    1045 non-null   object
 4   source       1045 non-null   object
dtypes: int64(1), object(4)
memory usage: 40.9+ KB


In [4]:
# Summary of the Dashen Bank frame: row count, columns, dtypes, non-null counts
df_dsn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 448 entries, 0 to 447
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_text  448 non-null    object
 1   rating       448 non-null    int64 
 2   date         448 non-null    object
 3   bank_name    448 non-null    object
 4   source       448 non-null    object
dtypes: int64(1), object(4)
memory usage: 17.6+ KB


Data Preprocessing

1. Missing Values check and removal

In [8]:
# Per-column count of missing values, printed in the order CBE, BOA, Dashen
for bank_reviews in (df_cbe, df_boa, df_dsn):
    print(bank_reviews.isna().sum())

review_text    0
rating         0
date           0
bank_name      0
source         0
dtype: int64
review_text    0
rating         0
date           0
bank_name      0
source         0
dtype: int64
review_text    0
rating         0
date           0
bank_name      0
source         0
dtype: int64


None of the three datasets contains missing values, so no rows need to be removed.

2. Duplicate Values

In [14]:
# Number of fully duplicated rows (all columns equal), in the order CBE, BOA, Dashen
for bank_reviews in (df_cbe, df_boa, df_dsn):
    print(bank_reviews.duplicated().sum())

0
5
1


In [11]:
# Show every row that is part of a duplicate group; keep=False marks the
# first occurrence as well as its repeats.
print(df_cbe.loc[df_cbe.duplicated(keep=False)])

     review_text  rating        date                    bank_name       source
8           good       5  2025-06-04  Commercial Bank of Ethiopia  Google Play
11          good       5  2025-06-04  Commercial Bank of Ethiopia  Google Play
67          good       5  2025-05-23  Commercial Bank of Ethiopia  Google Play
70          good       5  2025-05-23  Commercial Bank of Ethiopia  Google Play
72          good       5  2025-05-23  Commercial Bank of Ethiopia  Google Play
...          ...     ...         ...                          ...          ...
3640        Good       5  2024-01-05  Commercial Bank of Ethiopia  Google Play
3680        Good       5  2023-12-30  Commercial Bank of Ethiopia  Google Play
3681        Good       5  2023-12-30  Commercial Bank of Ethiopia  Google Play
4005        Nice       5  2023-11-04  Commercial Bank of Ethiopia  Google Play
4006        Nice       5  2023-11-04  Commercial Bank of Ethiopia  Google Play

[150 rows x 5 columns]


In [15]:
# Show every Bank of Abyssinia row that is part of a duplicate group
# (keep=False marks the first occurrence as well as its repeats).
print(df_boa.loc[df_boa.duplicated(keep=False)])

    review_text  rating        date          bank_name       source
31         good       4  2025-04-30  Bank of Abyssinia  Google Play
32         good       4  2025-04-30  Bank of Abyssinia  Google Play
643        best       5  2024-05-02  Bank of Abyssinia  Google Play
645        best       5  2024-05-02  Bank of Abyssinia  Google Play
743        Best       5  2024-04-24  Bank of Abyssinia  Google Play
748        Best       5  2024-04-24  Bank of Abyssinia  Google Play
764        Good       5  2024-04-22  Bank of Abyssinia  Google Play
765        Good       5  2024-04-22  Bank of Abyssinia  Google Play
930    Best app       5  2024-02-16  Bank of Abyssinia  Google Play
934    Best app       5  2024-02-16  Bank of Abyssinia  Google Play


In [16]:
# Show every Dashen Bank row that is part of a duplicate group
# (keep=False marks the first occurrence as well as its repeats).
print(df_dsn.loc[df_dsn.duplicated(keep=False)])

                         review_text  rating        date    bank_name  \
211  best mobile banking application       5  2025-04-21  Dashen Bank   
213  best mobile banking application       5  2025-04-21  Dashen Bank   

          source  
211  Google Play  
213  Google Play  


In [18]:
# Drop fully duplicated rows from each bank's reviews; drop_duplicates()
# keeps the first occurrence of each group by default.
df_cbe, df_boa, df_dsn = [
    bank_reviews.drop_duplicates() for bank_reviews in (df_cbe, df_boa, df_dsn)
]

# Verify: each count printed below should now be 0
for bank_reviews in (df_cbe, df_boa, df_dsn):
    print(bank_reviews.duplicated().sum())

0
0
0


3. Normalize dates 

In [24]:
# Normalize every review date to a 'YYYY-MM-DD' string. The values are parsed
# with pd.to_datetime and then formatted back with .dt.strftime, so the
# resulting column holds strings (object dtype), not datetime64[ns].
for bank_reviews in (df_cbe, df_boa, df_dsn):
    bank_reviews["date"] = pd.to_datetime(bank_reviews["date"]).dt.strftime('%Y-%m-%d')
