In [1]:
!pip install google-play-scraper
!pip install keybert
!pip install rapidfuzz
!pip install transformers
!pip install torch
!pip install pandas numpy scipy

Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7
Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Downloading keybert-0.9.0-py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keybert
Successfully installed keybert-0.9.0
Collecting rapidfuzz
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.1-cp

In [2]:
import pandas as pd
import numpy as np
from scipy.special import softmax
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from keybert import KeyBERT
from rapidfuzz import process, fuzz
from google_play_scraper import reviews, Sort
from datetime import datetime
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [3]:
# Configuration
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
review_limit = 1000
fetched_reviews = 0

# Preloaded dictionary of words
preloaded_words = {
    "apple", "banana", "grape", "orange", "watermelon",
    'cancel today', 'fee cancel', 'payment issue', 'driver problem',
    'service quality', 'app crash', 'wait time', 'customer support',
    'ride experience', 'booking difficulty', 'refund request'
}

# Initialize global variables
processed_reviews_df = pd.DataFrame(columns=[
    'Keyword', 'Content', 'sentiment_score', 'Date_of_posting',
    'Reply_status', 'reply_delay', 'Key_relevance', 'app_name'
])
new_rows = []

In [4]:
def initialize_models():
    """Initialize sentiment analysis and keyword extraction models"""
    try:
        logger.info("Initializing sentiment analysis model...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL)
        logger.info("Sentiment model initialized successfully")

        logger.info("Initializing keyword extraction model...")
        kw_model = KeyBERT()
        logger.info("Keyword model initialized successfully")

        return tokenizer, model, kw_model
    except Exception as e:
        logger.error(f"Error initializing models: {e}")
        raise

# Initialize models
try:
    tokenizer, model, kw_model = initialize_models()
except Exception as e:
    logger.error(f"Failed to initialize models: {e}")
    raise

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
def sentimentscore(example):
    """Calculate sentiment score for given text"""
    try:
        if not example or not isinstance(example, str) or len(example.strip()) == 0:
            logger.warning("Empty or invalid text provided for sentiment analysis")
            return 0

        encoded_text = tokenizer(example, return_tensors='pt', truncation=True, max_length=512)
        output = model(**encoded_text)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        final_score = (scores[0]*(-1)) + scores[2]

        if final_score >= 0.5:
            return 1
        elif final_score < -0.5:
            return -1
        else:
            return 0

    except Exception as e:
        logger.error(f"Error in sentiment analysis for text '{example[:50]}...': {e}")
        return 0

In [6]:
def extract_keywords(text):
    """Extract keywords from text with relevance scores"""
    try:
        if not text or not isinstance(text, str) or len(text.strip()) == 0:
            logger.warning("Empty or invalid text provided for keyword extraction")
            return []

        keywords = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 2),
            stop_words='english',
            use_mmr=True,
            diversity=0.7
        )
        logger.debug(f"Extracted {len(keywords)} keywords from text")
        return keywords

    except Exception as e:
        logger.error(f"Error in keyword extraction for text '{text[:50]}...': {e}")
        return []

In [7]:
def find_best_match(input_word, threshold=90):
    """Find the best matching word from preloaded dictionary using fuzzy matching"""
    try:
        if not input_word or not isinstance(input_word, str):
            logger.warning("Invalid input word for fuzzy matching")
            return input_word

        best_match = process.extractOne(input_word, preloaded_words, scorer=fuzz.partial_ratio)

        if best_match is not None:
            match, score, _ = best_match
            if score >= threshold:
                logger.debug(f"Matched '{input_word}' to '{match}' with score {score}")
                return match

        # Add new word to dictionary if no good match found
        preloaded_words.add(input_word)
        logger.info(f"Added new word to dictionary: '{input_word}'")
        return input_word

    except Exception as e:
        logger.error(f"Error in fuzzy matching for word '{input_word}': {e}")
        return input_word

In [8]:
def calculate_reply_delay_in_minutes(sent_time, reply_time):
    """Calculate delay in minutes between review and reply"""
    try:
        if sent_time is None or reply_time is None:
            logger.debug("Missing time data for delay calculation")
            return None

        if not isinstance(sent_time, datetime) or not isinstance(reply_time, datetime):
            logger.warning("Invalid datetime objects provided for delay calculation")
            return None

        # Calculate the delay
        delay = reply_time - sent_time

        # Convert the delay to minutes and ensure non-negative
        delay_in_minutes = max(0, delay.total_seconds() / 60)

        logger.debug(f"Calculated reply delay: {delay_in_minutes:.2f} minutes")
        return delay_in_minutes

    except Exception as e:
        logger.error(f"Error calculating reply delay: {e}")
        return None

In [9]:
def truncate_text(text, max_length=512):
    """Truncate text to the maximum length supported by the model"""
    try:
        if text is None:
            return ""

        if not isinstance(text, str):
            logger.warning(f"Non-string text provided: {type(text)}")
            return str(text)[:max_length] if text else ""

        if len(text) > max_length:
            logger.debug(f"Truncating text from {len(text)} to {max_length} characters")
            return text[:max_length]

        return text

    except Exception as e:
        logger.error(f"Error truncating text: {e}")
        return text[:max_length] if text else ""

In [10]:
def process_review(review, app_name):
    """Process each review and extract relevant information"""
    try:
        reviewId = review.get('reviewId', 'unknown')
        content = truncate_text(review.get('content', ''))
        at = review.get('at')
        replyContent = review.get('replyContent')
        repliedAt = review.get('repliedAt')

        logger.debug(f"Processing review {reviewId} for {app_name}")

        # Sentiment analysis
        sentiment = sentimentscore(content)

        # Keyword extraction
        keywords = extract_keywords(content)

        # Prepare date and reply information
        at_str = at.strftime('%Y-%m-%d %H:%M:%S') if at else None

        if replyContent is None or repliedAt is None:
            reply_status = 0
            reply_delay = None
        else:
            reply_status = 1
            reply_delay = calculate_reply_delay_in_minutes(at, repliedAt)
            if reply_delay is not None and reply_delay < 0:
                reply_delay = 0
                reply_status = 0

        # Process keywords
        processed_keywords = []
        for keyword, relevance_score in keywords:
            normalized_keyword = find_best_match(keyword)
            if normalized_keyword not in processed_keywords:
                processed_keywords.append(normalized_keyword)

                new_row = {
                    'Keyword': normalized_keyword,
                    'Content': content,
                    'sentiment_score': sentiment,
                    'Date_of_posting': at_str,
                    'Reply_status': reply_status,
                    'reply_delay': reply_delay,
                    'Key_relevance': relevance_score,
                    'app_name': app_name
                }
                new_rows.append(new_row)

        logger.info(f"Successfully processed review {reviewId} with {len(keywords)} keywords")
        return True

    except Exception as e:
        logger.error(f"Error processing review {review.get('reviewId', 'unknown')}: {e}")
        return False

In [11]:
def save_to_csv(rows, app_name):
    """Save processed data to CSV files"""
    try:
        if not rows:
            logger.warning("No data to save to CSV")
            return False

        df = pd.DataFrame(rows)

        # Save individual app files
        app_df = df[df['app_name'] == app_name]
        if not app_df.empty:
            filename = f"processed_{app_name.lower()}_reviews.csv"
            app_df.to_csv(filename, index=False)
            logger.info(f"Saved {len(app_df)} rows to {filename}")

        # Save combined file
        df.to_csv('all_processed_reviews.csv', index=False)
        logger.info(f"Saved {len(df)} rows to all_processed_reviews.csv")

        return True

    except Exception as e:
        logger.error(f"Error saving to CSV: {e}")
        return False

def save_data():
    """Save all processed data to CSV files"""
    global new_rows

    if not new_rows:
        logger.warning("No data to save")
        return False

    try:
        # Group by app for individual saving
        apps_data = {}
        for row in new_rows:
            app_name = row['app_name']
            if app_name not in apps_data:
                apps_data[app_name] = []
            apps_data[app_name].append(row)

        # Save individual app files
        for app_name, app_rows in apps_data.items():
            save_to_csv(app_rows, app_name)

        # Save combined file
        all_success = save_to_csv(new_rows, 'all')

        if all_success:
            logger.info(f"Successfully saved all {len(new_rows)} records to CSV files")
            return True
        else:
            logger.error("Failed to save some CSV files")
            return False

    except Exception as e:
        logger.error(f"Error in save_data: {e}")
        return False

In [12]:
def fetch_reviews(app_package_name, app_name, limit=100):
    """Fetch reviews from Google Play Store"""
    global fetched_reviews, new_rows

    try:
        logger.info(f"Starting to fetch reviews for {app_name} (package: {app_package_name})")

        result, continuation_token = reviews(
            app_package_name,
            lang='en',
            country='IN',
            count=min(100, limit),
            sort=Sort.NEWEST
        )

        successful_processing = 0
        for review in result:
            if process_review(review, app_name):
                successful_processing += 1

        fetched_reviews += len(result)
        logger.info(f"Processed {successful_processing}/{len(result)} reviews for {app_name}")

        # Continue with pagination
        while continuation_token and fetched_reviews < review_limit:
            logger.debug(f"Fetching more reviews for {app_name}...")

            additional_reviews, continuation_token = reviews(
                app_package_name,
                lang='en',
                country='IN',
                count=min(100, review_limit - fetched_reviews),
                sort=Sort.NEWEST,
                continuation_token=continuation_token
            )

            additional_successful = 0
            for review in additional_reviews:
                if process_review(review, app_name):
                    additional_successful += 1

            fetched_reviews += len(additional_reviews)
            logger.info(f"Processed {additional_successful}/{len(additional_reviews)} additional reviews for {app_name}")

        return True

    except Exception as e:
        logger.error(f"Error fetching reviews for {app_name}: {e}")
        return False

In [13]:
def main():
    """Main execution function"""
    global new_rows

    try:
        logger.info("Starting review processing pipeline...")

        # Define apps to process
        apps = [
            {'package': 'com.ubercab', 'name': 'Uber'},
            {'package': 'com.olacabs.customer', 'name': 'Ola'}
        ]

        # Process each app with individual error handling
        for app in apps:
            try:
                logger.info(f"Processing {app['name']}...")
                success = fetch_reviews(app['package'], app['name'], review_limit // len(apps))
                if not success:
                    logger.warning(f"Partial failure processing {app['name']}")
            except Exception as e:
                logger.error(f"Failed to process {app['name']}: {e}")
                continue

        # Save results
        if new_rows:
            logger.info(f"Processing completed. Saving {len(new_rows)} records...")

            # Save to CSV files
            save_success = save_data()

            if save_success:
                logger.info("Data saved successfully to CSV files")

                # Display data preview
                try:
                    preview_df = pd.DataFrame(new_rows)
                    print("\n=== DATA PREVIEW ===")
                    print(preview_df[['Keyword', 'sentiment_score', 'app_name', 'Key_relevance']].head(10))
                except Exception as e:
                    logger.error(f"Error displaying preview: {e}")
            else:
                logger.error("Failed to save data to CSV")

        else:
            logger.warning("No reviews were processed. Possible issues:")
            logger.warning("1. No internet connection")
            logger.warning("2. App package names incorrect")
            logger.warning("3. Google Play scraping restrictions")
            logger.warning("4. Review limit too low")

        logger.info("Review processing completed!")

    except Exception as e:
        logger.error(f"Fatal error in main execution: {e}")
        # Try to save any processed data before crashing
        try:
            if new_rows:
                emergency_df = pd.DataFrame(new_rows)
                emergency_df.to_csv('emergency_backup_reviews.csv', index=False)
                logger.info("Emergency backup saved to emergency_backup_reviews.csv")
        except:
            logger.error("Emergency backup also failed")
        raise

# Run the main function with top-level error handling
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        logger.info("Process interrupted by user")
        # Try to save progress
        if new_rows:
            try:
                interrupted_df = pd.DataFrame(new_rows)
                interrupted_df.to_csv('interrupted_backup_reviews.csv', index=False)
                logger.info("Progress saved to interrupted_backup_reviews.csv before interruption")
            except:
                logger.error("Could not save progress before interruption")
    except Exception as e:
        logger.critical(f"Unhandled exception: {e}")


=== DATA PREVIEW ===
            Keyword  sentiment_score app_name  Key_relevance
0       good driver                1     Uber         1.0000
1        fraud uber               -1     Uber         0.6926
2           ola far               -1     Uber         0.3181
3  property blocked               -1     Uber         0.2105
4      rupees 16000               -1     Uber         0.1870
5  serve government               -1     Uber         0.1587
6       good driver                1     Uber         1.0000
7           bad app               -1     Uber         0.8342
8        quiet trip                1     Uber         0.8641
9       great quiet                1     Uber         0.7008


In [14]:
def generate_report():
    """Generate analysis report of processed data"""
    try:
        # Try to load data from CSV if available
        try:
            df = pd.read_csv('all_processed_reviews.csv')
        except FileNotFoundError:
            # Use in-memory data if CSV doesn't exist yet
            if not new_rows:
                logger.warning("No data available for reporting")
                return
            df = pd.DataFrame(new_rows)
        except Exception as e:
            logger.error(f"Error loading data for reporting: {e}")
            return

        print("\n" + "="*50)
        print("ANALYSIS REPORT")
        print("="*50)
        print(f"Total reviews processed: {len(df)}")
        print(f"Unique keywords found: {df['Keyword'].nunique()}")

        print("\nSentiment Distribution:")
        sentiment_counts = df['sentiment_score'].value_counts()
        for score, count in sentiment_counts.items():
            sentiment = "Positive" if score == 1 else "Negative" if score == -1 else "Neutral"
            percentage = (count / len(df)) * 100
            print(f"  {sentiment}: {count} reviews ({percentage:.1f}%)")

        reply_rate = df['Reply_status'].mean() * 100
        print(f"\nReply Rate: {reply_rate:.1f}%")

        if df['reply_delay'].notna().any():
            avg_delay = df[df['reply_delay'].notna()]['reply_delay'].mean()
            print(f"Average Reply Delay: {avg_delay:.2f} minutes")

        print("\nTop 10 Keywords:")
        top_keywords = df['Keyword'].value_counts().head(10)
        for keyword, count in top_keywords.items():
            print(f"  {keyword}: {count} occurrences")

        print("\nBy Application:")
        app_counts = df['app_name'].value_counts()
        for app, count in app_counts.items():
            percentage = (count / len(df)) * 100
            print(f"  {app}: {count} reviews ({percentage:.1f}%)")

        print("\nAverage Keyword Relevance Score:")
        avg_relevance = df['Key_relevance'].mean()
        print(f"  {avg_relevance:.3f}")

        print("="*50)

    except Exception as e:
        logger.error(f"Error generating report: {e}")

# Generate report after processing
generate_report()


ANALYSIS REPORT
Total reviews processed: 3426
Unique keywords found: 1619

Sentiment Distribution:
  Positive: 1550 reviews (45.2%)
  Negative: 1213 reviews (35.4%)
  Neutral: 663 reviews (19.4%)

Reply Rate: 9.0%
Average Reply Delay: 8.64 minutes

Top 10 Keywords:
  wise good: 157 occurrences
  good customer: 71 occurrences
  price app: 58 occurrences
  bad experience: 51 occurrences
  loved service: 47 occurrences
  excellent: 42 occurrences
  booking drivers: 30 occurrences
  nice drivers: 30 occurrences
  drivers great: 29 occurrences
  nice person: 29 occurrences

By Application:
  Uber: 3114 reviews (90.9%)
  Ola: 312 reviews (9.1%)

Average Keyword Relevance Score:
  0.505
