# playstore_scraper.ipynb

### IMPORTS

In [1]:
from google_play_scraper import app, reviews, Sort, search
import pandas as pd
from tqdm import tqdm
import time

### CONFIGURATIONS

In [2]:
# Maps a category label to the free-text Play Store search query used to
# discover apps for it. The scraping loop iterates over these items and stores
# the label and the query on every scraped row.
CATEGORY_QUERIES = {
    "GAME": "top free games",
    "FINANCE": "finance apps",
    "TOOLS": "android tools",
    "EDUCATION": "education learning apps",
    "PRODUCTIVITY": "productivity apps",
    "SOCIAL": "social networking apps",
    "HEALTH_AND_FITNESS": "health and fitness apps",
    "COMMUNICATION": "communication apps",
    "TRAVEL": "travel apps",
    "ENTERTAINMENT": "entertainment apps",
    "MUSIC_AND_AUDIO": "music and audio apps",
    "SHOPPING": "shopping apps",
}

# Upper bound passed as `n_hits` to each category search.
# NOTE(review): the recorded run of this notebook returned only 25-34 apps per
# query, so the search appears to yield far fewer hits than this limit.
APPS_PER_CATEGORY = 250
# NOTE(review): not referenced by the visible cells; the save cell writes to a
# hard-coded "../dataset/playstore_dataset.csv" instead. Confirm which is intended.
OUTPUT_FILE = "playstore_dataset_small.csv"

### HELPER FUNCTIONS

In [None]:
# fetch_app_metadata: fetch the metadata (and highlighted reviews) of a single app while scraping

def _review_fields(review, prefix):
    """Flatten one review dict into the three `<prefix>*` columns of a row.

    Returns a dict with the keys `prefix`, `prefix + "_score"` and
    `prefix + "_date"`. All three are None when `review` is empty or is not
    a dict, so every row ends up with the same set of columns.
    """
    if not (review and isinstance(review, dict)):
        return {prefix: None, f"{prefix}_score": None, f"{prefix}_date": None}

    posted_at = review.get("at")
    return {
        prefix: review.get("content"),
        f"{prefix}_score": review.get("score"),
        # Store the timestamp as an ISO-8601 string; None when it is missing.
        f"{prefix}_date": posted_at.isoformat() if posted_at else None,
    }


def fetch_app_metadata(app_id, lang="en", country="us"):
    """Fetch the store details of one app together with its highlighted reviews.

    Args:
        app_id: Play Store package id of the app.
        lang: Language code forwarded to the details and reviews requests.
        country: Country code forwarded to the details and reviews requests.

    Returns:
        A flat dict of metadata fields plus the `most_liked_review*` and
        `most_disliked_review*` columns, or None when the details lookup
        fails or returns nothing.
    """
    try:
        details = app(app_id, lang=lang, country=country)
    except Exception as e:
        # Best-effort, like fetch_reviews: one missing or unreachable app must
        # not abort the whole scrape, so report it and let the caller skip it.
        print(f"Error fetching metadata for {app_id}: {e}")
        return None
    if not details:
        return None

    # Add your new fields here
    metadata = {
        "appId": app_id,
        "title": details.get("title"),
        "developer": details.get("developer"),
        "developerId": details.get("developerId"),
        "released": details.get("released"),
        "score": details.get("score"),
        "ratings": details.get("ratings"),
        "installs": details.get("installs"),
        "free": details.get("free"),
        "price": details.get("price"),
        "currency": details.get("currency"),
        "offersInAppPurchases": details.get("offersIAP", False),
        "inAppProductPrice": details.get("inAppProductPrice", None),
        "containsAds": details.get("containsAds"),
        "updated": details.get("updated"),
        "version": details.get("version"),
        "genre": details.get("genre"),
        "genreId": details.get("genreId"),
        "contentRating": details.get("contentRating"),
        "contentRatingDescription": details.get("contentRatingDescription"),
        "summary": details.get("summary"),
        "url": details.get("url"),
    }

    # Reviews (text, score and date). Forward lang/country so the reviews come
    # from the same locale as the details fetched above.
    liked, disliked = fetch_reviews(app_id, count=5, lang=lang, country=country)
    metadata.update(_review_fields(liked, "most_liked_review"))
    metadata.update(_review_fields(disliked, "most_disliked_review"))

    return metadata


# fetch_reviews: fetch reviews for one app and pick its most-liked and most-disliked review

def fetch_reviews(app_id, count=250, lang="en", country="us"):
    """Fetch reviews for one app and pick the two that get highlighted.

    Args:
        app_id: Play Store package id of the app.
        count: Number of reviews to request.
        lang: Language code of the reviews to request.
        country: Country code of the store to query.

    Returns:
        A `(most_liked, most_disliked)` tuple of review dicts. `most_liked`
        is the review with the highest `thumbsUpCount`; `most_disliked` is the
        first review, in the order returned, that has text and a score of 2 or
        lower. Either element may be None, and both are None when no reviews
        are available or the request fails.
    """
    # Deliberately broad: this is the best-effort boundary around the network
    # call, so any failure is reported and turned into "no reviews".
    try:
        rvws, _ = reviews(
            app_id,
            lang=lang,
            country=country,
            sort=Sort.MOST_RELEVANT,
            count=count,
        )

        # Keep only well-formed entries; everything below relies on dict access.
        rvws = [r for r in rvws or [] if isinstance(r, dict)]
        if not rvws:
            return None, None

        # Most liked review = highest thumbsUpCount. `or 0` also covers a key
        # that is present but None, which would otherwise make max() raise and
        # throw away every review for this app.
        most_liked = max(rvws, key=lambda r: r.get("thumbsUpCount") or 0)

        # Most disliked = first low-score review with content. A review with no
        # numeric score is skipped rather than being treated as a score of 0.
        most_disliked = next(
            (
                r
                for r in rvws
                if isinstance(r.get("score"), (int, float))
                and r["score"] <= 2
                and r.get("content")
            ),
            None,
        )

        return most_liked, most_disliked

    except Exception as e:
        print(f"Error fetching reviews for {app_id}: {e}")
        return None, None
    



## Web Scraping Loop

In [4]:
# Collects one metadata dict per scraped app across all categories.
all_data = []
print(f"🚀 Starting scraping... up to {APPS_PER_CATEGORY} apps/category\n")

for category, query in CATEGORY_QUERIES.items():
    print(f"📂 Category: {category}  |  Query: '{query}'")
    try:
        # Discover candidate apps for this category via text search.
        # n_hits is only an upper bound: the recorded run of this notebook
        # returned 25-34 apps per query.
        results = search(query, lang="en", country="us", n_hits=APPS_PER_CATEGORY)
    except Exception as e:
        print(f"Could not search for category {category}: {e}")
        continue

    for r in tqdm(results, desc=f"Apps in {category}"):
        app_id = r.get("appId")
        if not app_id:
            continue

        # Metadata, including the most liked / most disliked review columns.
        # Guarded so that one failing app is skipped instead of ending the run.
        try:
            meta = fetch_app_metadata(app_id, lang="en", country="us")
        except Exception as e:
            print(f"Could not fetch metadata for {app_id}: {e}")
            continue
        if not meta:
            continue

        # No second fetch_reviews call here: fetch_app_metadata already filled
        # the review text, score and date from a single request. Re-fetching
        # and overwriting only the text left the score/date columns describing
        # a possibly different review and doubled the requests per app.

        # Track originating bucket
        meta["discovered_from_category"] = category
        meta["discovered_query"] = query

        all_data.append(meta)

    # Gentle pacing between categories
    time.sleep(1)

🚀 Starting scraping... small test run (5 apps/category)

📂 Category: GAME  |  Query: 'top free games'


Apps in GAME: 100%|██████████| 34/34 [01:37<00:00,  2.87s/it]


📂 Category: FINANCE  |  Query: 'finance apps'


Apps in FINANCE: 100%|██████████| 29/29 [01:16<00:00,  2.64s/it]


📂 Category: TOOLS  |  Query: 'android tools'


Apps in TOOLS: 100%|██████████| 30/30 [01:11<00:00,  2.38s/it]


📂 Category: EDUCATION  |  Query: 'education learning apps'


Apps in EDUCATION: 100%|██████████| 30/30 [01:13<00:00,  2.46s/it]


📂 Category: PRODUCTIVITY  |  Query: 'productivity apps'


Apps in PRODUCTIVITY: 100%|██████████| 30/30 [01:19<00:00,  2.65s/it]


📂 Category: SOCIAL  |  Query: 'social networking apps'


Apps in SOCIAL: 100%|██████████| 29/29 [01:20<00:00,  2.77s/it]


📂 Category: HEALTH_AND_FITNESS  |  Query: 'health and fitness apps'


Apps in HEALTH_AND_FITNESS: 100%|██████████| 30/30 [01:23<00:00,  2.77s/it]


📂 Category: COMMUNICATION  |  Query: 'communication apps'


Apps in COMMUNICATION: 100%|██████████| 30/30 [01:24<00:00,  2.80s/it]


📂 Category: TRAVEL  |  Query: 'travel apps'


Apps in TRAVEL: 100%|██████████| 30/30 [01:23<00:00,  2.77s/it]


📂 Category: ENTERTAINMENT  |  Query: 'entertainment apps'


Apps in ENTERTAINMENT: 100%|██████████| 30/30 [01:26<00:00,  2.88s/it]


📂 Category: MUSIC_AND_AUDIO  |  Query: 'music and audio apps'


Apps in MUSIC_AND_AUDIO: 100%|██████████| 30/30 [01:48<00:00,  3.63s/it]


📂 Category: SHOPPING  |  Query: 'shopping apps'


Apps in SHOPPING: 100%|██████████| 25/25 [01:27<00:00,  3.50s/it]


### Save in CSV

In [5]:
import os

# Destination of the exported dataset.
OUTPUT_PATH = "../dataset/playstore_dataset.csv"

# Create the parent folder of the destination if it is not there yet.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# One row per scraped app, written without the DataFrame index column.
df = pd.DataFrame(all_data)
df.to_csv(OUTPUT_PATH, index=False)

print(f"\n✅ Done! Saved {len(df)} apps to {OUTPUT_PATH}")



✅ Done! Saved 357 apps to ../dataset/playstore_dataset.csv
