In [1]:
# preprocess_reviews.py
import pandas as pd
import os
from datetime import datetime

# Directory that is scanned for the raw review CSV files
input_dir = 'raw_data'
# Path of the CSV file the cleaned, combined reviews are written to
output_file = 'cleaned_reviews.csv'

# Combine all raw CSV files
def combine_reviews(input_dir):
    """Load every raw review CSV in *input_dir* into one DataFrame.

    Only files named ``raw_reviews_*.csv`` are read. Files are processed in
    sorted name order so the row order of the result is reproducible.

    Args:
        input_dir: Directory to scan for raw review CSV files.

    Returns:
        A single DataFrame with the rows of all matching files and a fresh
        0..n-1 index.

    Raises:
        ValueError: If *input_dir* contains no matching CSV file.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # combined row order deterministic, which matters for any later
    # de-duplication that keeps the first occurrence.
    file_names = sorted(
        name
        for name in os.listdir(input_dir)
        if name.startswith('raw_reviews_') and name.endswith('.csv')
    )
    if not file_names:
        # Report the directory that was actually searched.
        raise ValueError(f"No raw review CSV files found in {input_dir}/")

    frames = [pd.read_csv(os.path.join(input_dir, name)) for name in file_names]
    return pd.concat(frames, ignore_index=True)

# Preprocess the combined data
def preprocess_reviews(df):
    """Clean a combined reviews DataFrame and return the cleaned copy.

    Steps, in order:
      1. Drop rows whose ``review_id`` repeats, keeping the first one.
      2. Drop rows with a missing ``review`` or ``rating``.
      3. Rewrite ``date`` as a ``YYYY-MM-DD`` string.
      4. Convert ``rating`` to int and keep only values from 1 to 5; ratings
         that cannot be parsed as a number are discarded like any other
         invalid rating.
      5. Strip surrounding whitespace from ``review`` and drop reviews that
         are empty afterwards.

    Args:
        df: DataFrame with at least the columns ``review_id``, ``review``,
            ``rating`` and ``date``. It is not modified.

    Returns:
        A new DataFrame containing only the cleaned rows; the surviving rows
        keep their original index labels.
    """
    # Step 1: Remove duplicates based on review_id
    df = df.drop_duplicates(subset='review_id', keep='first')

    # Step 2: Handle missing data. The explicit copy lets the column
    # assignments below write to an independent frame instead of a
    # filtered view (avoids SettingWithCopyWarning).
    df = df.dropna(subset=['review', 'rating']).copy()

    # Step 3: Normalize dates to YYYY-MM-DD
    df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')

    # Step 4: Ensure rating is an integer (1-5). Unparseable values become
    # NaN and are dropped rather than aborting the whole run.
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    df = df[df['rating'].notna()].copy()
    df['rating'] = df['rating'].astype(int)
    df = df[df['rating'].between(1, 5)].copy()

    # Step 5: Clean review text (remove surrounding whitespace). A review
    # that is only whitespace carries no content and would read back from
    # CSV as missing, so it is removed here.
    df['review'] = df['review'].astype(str).str.strip()
    return df[df['review'] != ''].copy()

# Main function
def main():
    """Run the pipeline end to end: combine, clean, save, then report.

    Reads the raw files from ``input_dir``, writes the cleaned result to
    ``output_file`` (without the index column) and prints a short summary.
    """
    print("Combining raw review files...")
    raw_reviews = combine_reviews(input_dir)

    print("Preprocessing data...")
    reviews = preprocess_reviews(raw_reviews)

    # Persist the cleaned data before reporting on it
    reviews.to_csv(output_file, index=False)

    # Summary: row count, remaining nulls per column, rows per bank
    total_rows = len(reviews)
    print(f"Total reviews after preprocessing: {total_rows}")

    nulls_per_column = reviews.isnull().sum()
    print(f"Missing data:\n{nulls_per_column}")

    rows_per_bank = reviews['bank'].value_counts()
    print(f"Reviews per bank:\n{rows_per_bank}")

# Run the pipeline only when this module is executed directly, not when it
# is imported by other code.
if __name__ == "__main__":
    main()

Combining raw review files...
Preprocessing data...
Total reviews after preprocessing: 1200
Missing data:
review_id    0
review       0
rating       0
date         0
bank         0
source       0
dtype: int64
Reviews per bank:
bank
Bank of Abyssinia              400
Commercial Bank of Ethiopia    400
Dashen Bank                    400
Name: count, dtype: int64
