In [6]:
# Import libraries
from google_play_scraper import reviews, Sort
import pandas as pd
from datetime import datetime
import time

# Bank label -> Google Play package id of that bank's mobile app.
# Iteration order (CBE, BOA, Dashen) is the order the apps are scraped in.
apps = dict(
    CBE='com.combanketh.mobilebanking',
    BOA='com.boa.boaMobileBanking',
    Dashen='com.dashen.dashensuperapp',
)

def scrape_app_reviews(app_id, bank_name, target_count=400):
    """Scrape the newest reviews of one app, paging until a target is reached.

    Reviews are requested in English for the Ethiopian store (``lang='en'``,
    ``country='et'``), newest first, in pages of up to 100.

    Args:
        app_id: Google Play package id of the app to scrape.
        bank_name: Label written to the ``bank`` column of every row.
        target_count: Maximum number of reviews to return.

    Returns:
        A DataFrame with the columns ``review``, ``rating``, ``date`` and
        ``bank``. It holds at most ``target_count`` rows and is empty (but
        still has those four columns) when nothing could be collected.
    """
    output_columns = ['review', 'rating', 'date', 'bank']
    all_reviews = []
    continuation_token = None
    batch_size = 100  # Max per request
    max_retries = 3
    retry_count = 0

    while len(all_reviews) < target_count and retry_count < max_retries:
        try:
            # Get reviews in batches
            result, continuation_token = reviews(
                app_id,
                lang='en',
                country='et',
                sort=Sort.NEWEST,
                count=batch_size,
                continuation_token=continuation_token
            )
        except Exception as e:
            # Best-effort scraping: report, back off, and retry the same page
            # (continuation_token is only rebound on a successful call).
            print(f"Error: {str(e)}")
            retry_count += 1
            time.sleep(5)
            continue

        all_reviews.extend(result)
        print(f"Collected {len(all_reviews)}/{target_count} reviews for {bank_name}")

        # Stop when the listing is exhausted. An empty page is treated as
        # exhaustion too: relying on the token alone would loop forever if the
        # scraper hands back a truthy token object after the last page.
        # NOTE(review): token truthiness depends on the google_play_scraper
        # version - confirm against the installed release.
        if not result or not continuation_token:
            break

        # Be polite with delay between requests (no need after the last one)
        if len(all_reviews) < target_count:
            time.sleep(2)

    # Nothing collected: the raw 'at'/'content'/'score' keys do not exist, so
    # return an empty frame with the expected schema instead of raising.
    if not all_reviews:
        return pd.DataFrame(columns=output_columns)

    # Process the collected reviews
    df = pd.DataFrame(all_reviews[:target_count])  # Trim to exact target
    df['bank'] = bank_name
    df['date'] = pd.to_datetime(df['at']).dt.date

    return df[['content', 'score', 'date', 'bank']].rename(columns={
        'content': 'review',
        'score': 'rating'
    })

# Scrape reviews for all banks (400 each)
# One frame per bank is collected and concatenated once at the end. This
# avoids re-copying the accumulated rows on every iteration and avoids
# concatenating onto an empty placeholder frame.
bank_frames = []

for bank_name, app_id in apps.items():
    print(f"\n=== Scraping {bank_name} ===")
    try:
        bank_reviews = scrape_app_reviews(app_id, bank_name, 400)
    except Exception as e:
        # Keep going: one failing bank should not lose the others' data.
        print(f"❌ Failed to scrape {bank_name}: {str(e)}")
        continue

    if bank_reviews.empty:
        # Do not report an empty result as a success.
        print(f"❌ Failed to scrape {bank_name}: no reviews returned")
        continue

    bank_frames.append(bank_reviews)
    print(f"✅ Success: Collected {len(bank_reviews)} reviews")

if bank_frames:
    all_reviews = pd.concat(bank_frames, ignore_index=True)
else:
    all_reviews = pd.DataFrame()

# Save and verify results
if not all_reviews.empty:
    review_counts = all_reviews['bank'].value_counts()
    print("\n=== Collection Summary ===")
    print(review_counts)

    all_reviews.to_csv('bank_reviews.csv', index=False)
    print("\nSample reviews:")
    # sample(n) raises when n exceeds the row count, so cap it.
    display(all_reviews.sample(min(5, len(all_reviews))))
else:
    print("\nNo reviews collected - please check configuration")


=== Scraping CBE ===
Collected 100/400 reviews for CBE
Collected 200/400 reviews for CBE
Collected 300/400 reviews for CBE
Collected 400/400 reviews for CBE
✅ Success: Collected 400 reviews

=== Scraping BOA ===
Collected 100/400 reviews for BOA
Collected 200/400 reviews for BOA
Collected 300/400 reviews for BOA
Collected 400/400 reviews for BOA
✅ Success: Collected 400 reviews

=== Scraping Dashen ===
Collected 100/400 reviews for Dashen
Collected 200/400 reviews for Dashen
Collected 300/400 reviews for Dashen
Collected 400/400 reviews for Dashen
✅ Success: Collected 400 reviews

=== Collection Summary ===
bank
CBE       400
BOA       400
Dashen    400
Name: count, dtype: int64

Sample reviews:


Unnamed: 0,review,rating,date,bank
397,very nice,5,2025-03-31,CBE
360,safe easy & fast,5,2025-04-01,CBE
1196,Dashen yichalal. Ewnetem one step a head,5,2025-01-17,Dashen
211,goid,5,2025-04-28,CBE
243,always CBE is the leading Commercial Bank💪💪💪,5,2025-04-18,CBE


In [10]:
import pandas as pd
import re
import string
import nltk
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

class TextPreprocessor:
    """Turn raw review text into a string of lower-cased, lemmatised tokens.

    On construction the required NLTK data is located (and downloaded when
    missing), and a tokenizer is chosen: ``word_tokenize`` when it works,
    otherwise a plain whitespace tokenizer.
    """

    # (lookup path for nltk.data.find, package name for nltk.download)
    _NLTK_RESOURCES = (
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet'),
    )
    # Compiled once; clean_text runs once per review.
    _URL_RE = re.compile(r'https?://\S+|www\.\S+')
    _PUNCT_RE = re.compile(r'[^\w\s]')

    def __init__(self):
        self._setup_nltk()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.tokenizer = self._get_tokenizer()

    def _setup_nltk(self):
        """Ensure all required NLTK data is available.

        Each resource is checked on its own so that only the missing ones
        are downloaded.
        """
        for resource_path, package in self._NLTK_RESOURCES:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(package)

    def _get_tokenizer(self):
        """Get the best available tokenizer with fallback.

        Returns:
            ``word_tokenize`` if a trial call succeeds, otherwise the
            ``tokenize`` method of a ``WhitespaceTokenizer``.
        """
        try:
            # Test if advanced tokenization works
            word_tokenize("test")
        except Exception:
            # Deliberate best-effort fallback to simple whitespace splitting.
            # ``Exception`` (not a bare except) so Ctrl-C and SystemExit
            # still propagate.
            return WhitespaceTokenizer().tokenize
        return word_tokenize

    def clean_text(self, text):
        """Robust text cleaning with fallback tokenization.

        Steps: lower-case, strip URLs, replace punctuation with spaces,
        tokenize, drop stopwords and tokens of two characters or fewer,
        lemmatise, and join with single spaces.

        Args:
            text: The raw review text. Any non-string value (for example
                NaN from pandas) yields an empty string.

        Returns:
            The cleaned tokens joined by single spaces; possibly ``""``.
        """
        if not isinstance(text, str):
            return ""

        text = text.lower()
        text = self._URL_RE.sub('', text)  # Remove URLs
        # Replace punctuation with a space rather than deleting it, so words
        # separated only by punctuation ("safe,easy") are not fused together.
        text = self._PUNCT_RE.sub(' ', text)

        # Tokenize with fallback
        tokens = self.tokenizer(text)

        # Lemmatize and remove stopwords
        tokens = [self.lemmatizer.lemmatize(t) for t in tokens
                  if t not in self.stop_words and len(t) > 2]

        return ' '.join(tokens)

def preprocess_reviews(input_path, output_path):
    """Clean a CSV of raw reviews and write the processed rows to a new CSV.

    Rows with a missing ``review`` are dropped, as are rows whose ``review``
    text repeats an earlier row. A ``clean_review`` column (from
    ``TextPreprocessor.clean_text``) and a constant ``source`` column are
    added before saving.

    Args:
        input_path: Path of the raw reviews CSV to read.
        output_path: Path the processed CSV is written to.

    Returns:
        The first rows (``head()``) of the saved frame, or ``None`` if any
        step raised; the error is printed rather than propagated.
    """
    export_columns = ['review', 'rating', 'date', 'bank', 'source', 'clean_review']

    try:
        # Build the cleaner first so NLTK setup happens before any file I/O
        cleaner = TextPreprocessor()

        raw = pd.read_csv(input_path)
        print(f"Loaded {len(raw)} raw reviews")

        # Drop missing reviews, then rows repeating an earlier review text
        deduped = raw.dropna(subset=['review'])
        deduped = deduped.drop_duplicates('review')

        df = deduped.assign(
            clean_review=deduped['review'].apply(cleaner.clean_text),
            source='Google Play',
        )

        # Persist only the final column selection, in this order
        processed = df[export_columns]
        processed.to_csv(output_path, index=False)

        print(f"Successfully saved {len(df)} processed reviews to {output_path}")
        return processed.head()

    except Exception as e:
        print(f"Error during preprocessing: {str(e)}")
        return None

# Example usage: clean the scraped reviews and write them to a second CSV
preprocess_reviews(
    input_path='bank_reviews.csv',
    output_path='cleaned_reviews.csv',
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loaded 1200 raw reviews
Successfully saved 987 processed reviews to cleaned_reviews.csv


Unnamed: 0,review,rating,date,bank,source,clean_review
0,20 years,5,2025-06-08,CBE,Google Play,year
1,A great app. It's like carrying a bank in your...,4,2025-06-07,CBE,Google Play,great app like carrying bank pocket
2,More than garrantty bank EBC.,4,2025-06-07,CBE,Google Play,garrantty bank ebc
3,really am happy to this app it is Siple to use...,5,2025-06-07,CBE,Google Play,really happy app siple use everything
4,I liked this app. But the User interface is ve...,2,2025-06-07,CBE,Google Play,liked app user interface basic attractive
