# playstore_scraper.ipynb

### IMPORTS

In [11]:
from google_play_scraper import app, reviews, Sort, search
import pandas as pd
from tqdm import tqdm
import time

### CONFIGURATIONS

In [12]:
# Search phrase used to discover apps for each category bucket.
# Insertion order is preserved, so categories are visited top to bottom.
CATEGORY_QUERIES = dict(
    GAME="top free games",
    FINANCE="finance apps",
    TOOLS="android tools",
    EDUCATION="education learning apps",
    PRODUCTIVITY="productivity apps",
    SOCIAL="social networking apps",
    HEALTH_AND_FITNESS="health and fitness apps",
    COMMUNICATION="communication apps",
    TRAVEL="travel apps",
    ENTERTAINMENT="entertainment apps",
    MUSIC_AND_AUDIO="music and audio apps",
    SHOPPING="shopping apps",
)

# How many apps to take per category; deliberately small for test runs.
APPS_PER_CATEGORY = 5

# Destination CSV for the scraped rows.
OUTPUT_FILE = "playstore_dataset_small.csv"

### HELPER FUNCTIONS

In [15]:
# fetch_app_metadata function to fetch app metadatas while scraping

def fetch_app_metadata(app_id, lang="en", country="us"):
    """Fetch one app's Play Store details and flatten them into a row dict.

    Args:
        app_id: Package identifier passed to ``app``.
        lang: Language code forwarded to the lookup.
        country: Country code forwarded to the lookup.

    Returns:
        A dict holding ``appId`` plus the selected detail fields (a field the
        lookup does not provide is stored as ``None``), or ``None`` when the
        lookup or the field extraction raises.
    """
    try:
        details = app(app_id, lang=lang, country=country)

        # The Python google-play-scraper documents the precise install count
        # as ``realInstalls``; ``maxInstalls`` is not among its documented
        # detail fields. Prefer ``maxInstalls`` if present, otherwise fall
        # back so the column is not always empty.
        max_installs = details.get("maxInstalls")
        if max_installs is None:
            max_installs = details.get("realInstalls")

        return {
            "appId": app_id,
            "title": details.get("title"),
            "developer": details.get("developer"),
            "score": details.get("score"),
            # NOTE(review): this column is filled from ``ratings``; the
            # library also reports a separate ``reviews`` count. Confirm
            # which one the column is meant to hold.
            "reviews_count": details.get("ratings"),
            "installs": details.get("installs"),
            "free": details.get("free"),
            "price": details.get("price"),
            "currency": details.get("currency"),
            "updated": details.get("updated"),
            "genre": details.get("genre"),
            "contentRating": details.get("contentRating"),
            "minInstalls": details.get("minInstalls"),
            "maxInstalls": max_installs,
            "summary": details.get("summary"),
        }
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip
        # this app instead of aborting the whole run.
        print(f"Error fetching {app_id}: {e}")
        return None


# fetch_reviews to fetch the reviews for each apps

def fetch_reviews(app_id, count=200, lang="en", country="us"):
    """Pick one representative liked and one disliked review for an app.

    Requests ``count`` reviews with ``Sort.MOST_RELEVANT`` ordering and
    returns a ``(most_liked, most_disliked)`` tuple:

    * ``most_liked`` is the review dict with the largest ``thumbsUpCount``
      (a missing count is treated as 0).
    * ``most_disliked`` is the first review, in returned order, whose
      ``score`` is 2 or lower (a missing score is treated as 0) and whose
      ``content`` is non-empty.

    Either element is ``None`` when no review qualifies. ``(None, None)`` is
    returned when nothing comes back or when any error occurs.
    """
    nothing = (None, None)

    try:
        fetched, _unused = reviews(
            app_id,
            lang=lang,
            country=country,
            sort=Sort.MOST_RELEVANT,
            count=count,
        )
        if not fetched:
            return nothing

        # Only dict entries can be inspected field by field.
        candidates = [item for item in fetched if isinstance(item, dict)]

        def thumbs_up(review):
            return review.get("thumbsUpCount", 0)

        # max() keeps the first entry among ties; default covers "no dicts".
        most_liked = max(candidates, key=thumbs_up, default=None)

        # First low-score review that actually has text.
        most_disliked = None
        for review in candidates:
            is_low_score = review.get("score", 0) <= 2
            if is_low_score and review.get("content"):
                most_disliked = review
                break

        return most_liked, most_disliked

    except Exception as e:
        print(f"Error fetching reviews for {app_id}: {e}")
        return nothing
    



## Web Scraping Loop

In [17]:
# Rows collected across all categories: one dict per successfully scraped app.
all_data = []
print(f"🚀 Starting scraping... small test run ({APPS_PER_CATEGORY} apps/category)\n")

for category, query in CATEGORY_QUERIES.items():
    print(f"📂 Category: {category}  |  Query: '{query}'")
    try:
        # Discover candidate apps for this category via text search.
        # n_hits default max shown in docs is 30; we cap it to
        # APPS_PER_CATEGORY for the test run.
        results = search(query, lang="en", country="us", n_hits=APPS_PER_CATEGORY)
    except Exception as e:
        # A failed search only costs this category; move on to the next one.
        print(f"Could not search for category {category}: {e}")
        continue

    for r in tqdm(results, desc=f"Apps in {category}"):
        app_id = r.get("appId")
        if not app_id:
            continue

        # Metadata (None means the lookup failed and was already reported)
        meta = fetch_app_metadata(app_id, lang="en", country="us")
        if not meta:
            continue

        # Reviews: keep only the text of the selected reviews
        liked, disliked = fetch_reviews(app_id, count=10, lang="en", country="us")
        meta["most_liked_review"] = liked.get("content") if isinstance(liked, dict) else None
        meta["most_disliked_review"] = disliked.get("content") if isinstance(disliked, dict) else None

        # Track originating bucket
        meta["discovered_from_category"] = category
        meta["discovered_query"] = query

        all_data.append(meta)

    # Gentle pacing between categories
    time.sleep(1)

ðŸš€ Starting scraping... small test run (5 apps/category)

ðŸ“‚ Category: GAME  |  Query: 'top free games'


Apps in GAME: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:15<00:00,  3.15s/it]


ðŸ“‚ Category: FINANCE  |  Query: 'finance apps'


Apps in FINANCE: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:17<00:00,  3.50s/it]


ðŸ“‚ Category: TOOLS  |  Query: 'android tools'


Apps in TOOLS: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:11<00:00,  2.39s/it]


ðŸ“‚ Category: EDUCATION  |  Query: 'education learning apps'


Apps in EDUCATION: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:13<00:00,  2.66s/it]


ðŸ“‚ Category: PRODUCTIVITY  |  Query: 'productivity apps'


Apps in PRODUCTIVITY: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:10<00:00,  2.03s/it]


ðŸ“‚ Category: SOCIAL  |  Query: 'social networking apps'


Apps in SOCIAL: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:13<00:00,  2.68s/it]


ðŸ“‚ Category: HEALTH_AND_FITNESS  |  Query: 'health and fitness apps'


Apps in HEALTH_AND_FITNESS: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:16<00:00,  3.23s/it]


ðŸ“‚ Category: COMMUNICATION  |  Query: 'communication apps'


Apps in COMMUNICATION: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:14<00:00,  2.81s/it]


ðŸ“‚ Category: TRAVEL  |  Query: 'travel apps'


Apps in TRAVEL: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:17<00:00,  3.42s/it]


ðŸ“‚ Category: ENTERTAINMENT  |  Query: 'entertainment apps'


Apps in ENTERTAINMENT: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:17<00:00,  3.49s/it]


ðŸ“‚ Category: MUSIC_AND_AUDIO  |  Query: 'music and audio apps'


Apps in MUSIC_AND_AUDIO: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:23<00:00,  4.63s/it]


ðŸ“‚ Category: SHOPPING  |  Query: 'shopping apps'


Apps in SHOPPING: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 5/5 [00:21<00:00,  4.38s/it]


### Save in CSV

In [18]:
# Turn the collected row dicts into a table and write it out without the
# DataFrame index column.
df = pd.DataFrame(all_data)
df.to_csv(OUTPUT_FILE, index=False)

print(f"\n✅ Done! Saved {len(df)} apps to {OUTPUT_FILE}")


âœ… Done! Saved 60 apps to playstore_dataset_small.csv
