In [None]:
# scrape_reviews.py
from google_play_scraper import Sort, reviews
import pandas as pd
import time
import os

# Define App IDs for the three banks (replace with actual IDs if different)
# Maps each bank's display name to the Google Play app ID of its mobile
# banking app. Insertion order is the order in which the banks are scraped.
app_ids = {
    "Commercial Bank of Ethiopia": "com.combanketh.mobilebanking",
    "Bank of Abyssinia": "com.boa.boaMobileBanking",
    "Dashen Bank": "com.dashen.dashensuperapp",
}

# Function to scrape reviews for a single app
def scrape_app_reviews(app_id, app_name, min_reviews=400, output_dir='raw_data',
                       *, sort_options=None, delay=2, fetch=None):
    """Scrape Google Play reviews for one app and save them to a CSV file.

    Reviews are requested in batches of 100 (English, country ``et``) for each
    sort order in turn until at least ``min_reviews`` unique reviews have been
    collected or every sort order has been exhausted.

    Args:
        app_id: Google Play app ID passed to the fetch function.
        app_name: Display name of the bank; written to the ``bank`` column and
            used (lower-cased, spaces replaced by ``_``) in the CSV file name.
        min_reviews: Number of unique reviews to aim for. Scraping stops as
            soon as this many have been collected.
        output_dir: Directory the CSV is written to; created if missing.
        sort_options: Sort orders to try, in order. Defaults to
            ``[Sort.MOST_RELEVANT, Sort.NEWEST, Sort.RATING]``.
        delay: Seconds to wait before moving on to the next sort order.
        fetch: Callable with the same call signature and
            ``(batch, continuation_token)`` return value as
            ``google_play_scraper.reviews``. Defaults to that function.

    Returns:
        A DataFrame with the columns ``review_id``, ``review``, ``rating``,
        ``date``, ``bank`` and ``source``, or an empty DataFrame when nothing
        could be scraped (in which case no file is written).
    """
    # Resolve defaults lazily so the module-level scraper names are only
    # needed when the caller did not supply replacements.
    if sort_options is None:
        sort_options = [Sort.MOST_RELEVANT, Sort.NEWEST, Sort.RATING]
    if fetch is None:
        fetch = reviews

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    all_reviews = []
    seen_ids = set()  # reviewIds already collected, across all sort orders

    for index, sort_option in enumerate(sort_options):
        if len(all_reviews) >= min_reviews:
            # Target reached: no need to wait for or query further sort orders.
            break
        if index:
            time.sleep(delay)  # Delay between sort options

        continuation_token = None
        while len(all_reviews) < min_reviews:
            try:
                result, continuation_token = fetch(
                    app_id,
                    lang='en',  # Language (English)
                    country='et',  # Country (Ethiopia)
                    sort=sort_option,
                    count=100,  # Smaller batch size to avoid rate-limiting
                    continuation_token=continuation_token
                )
            except Exception as e:
                # Best effort: report the failure and fall through to the
                # next sort order instead of aborting the whole scrape.
                print(f"Error scraping {app_name} with sort {sort_option}: {e}")
                break

            if not result:
                # An empty batch means this sort order has nothing more to
                # give. Without this check the loop would spin forever when
                # the returned token stays truthy after the last page.
                break

            # Different sort orders return overlapping reviews; keep each
            # reviewId only once. Entries without an id cannot be compared
            # and are kept as they are.
            for review in result:
                review_id = review.get('reviewId')
                if review_id is not None:
                    if review_id in seen_ids:
                        continue
                    seen_ids.add(review_id)
                all_reviews.append(review)
            print(f"Scraped {len(all_reviews)} reviews for {app_name} (sort: {sort_option})")

            if not continuation_token:
                break

    if not all_reviews:
        print(f"No reviews scraped for {app_name}")
        return pd.DataFrame()

    # Convert to DataFrame, keeping only the fields used downstream
    reviews_df = pd.DataFrame(all_reviews)
    reviews_df = reviews_df[['reviewId', 'content', 'score', 'at']]
    reviews_df = reviews_df.rename(columns={
        'reviewId': 'review_id',
        'content': 'review',
        'score': 'rating',
        'at': 'date'
    })
    reviews_df['bank'] = app_name
    reviews_df['source'] = 'Google Play'

    # Save to CSV
    output_file = os.path.join(output_dir, f'raw_reviews_{app_name.lower().replace(" ", "_")}.csv')
    reviews_df.to_csv(output_file, index=False)
    print(f"Saved {len(reviews_df)} reviews to {output_file}")

    return reviews_df

# Scrape reviews for each bank individually
def main():
    """Scrape every bank listed in ``app_ids``, one after another.

    Progress is printed before and after each bank, and the loop waits five
    seconds after each bank before continuing.
    """
    for bank, package in app_ids.items():
        print(f"Starting scrape for {bank}...")
        scrape_app_reviews(app_id=package, app_name=bank, min_reviews=400)
        print(f"Finished scraping for {bank}\n")
        # Delay between banks
        time.sleep(5)

# Entry point: start the scrape only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Starting scrape for Commercial Bank of Ethiopia...
Scraped 100 reviews for Commercial Bank of Ethiopia (sort: 1)
Scraped 200 reviews for Commercial Bank of Ethiopia (sort: 1)
Scraped 300 reviews for Commercial Bank of Ethiopia (sort: 1)
Scraped 400 reviews for Commercial Bank of Ethiopia (sort: 1)
Saved 400 reviews to raw_data\raw_reviews_commercial_bank_of_ethiopia.csv
Finished scraping for Commercial Bank of Ethiopia

Starting scrape for Bank of Abyssinia...
Scraped 100 reviews for Bank of Abyssinia (sort: 1)
Scraped 200 reviews for Bank of Abyssinia (sort: 1)
Scraped 300 reviews for Bank of Abyssinia (sort: 1)
Scraped 400 reviews for Bank of Abyssinia (sort: 1)
Saved 400 reviews to raw_data\raw_reviews_bank_of_abyssinia.csv
Finished scraping for Bank of Abyssinia

Starting scrape for Dashen Bank...
Scraped 100 reviews for Dashen Bank (sort: 1)
