<a href="https://colab.research.google.com/github/remastered702/IT362/blob/main/Copy_of_Untitled6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import os
import random
import time
from datetime import datetime

import requests

# SETTINGS
# The Google Maps Platform key is read from the environment so that it is never
# stored in the notebook (notebooks are routinely shared and committed).
# Set it before running, e.g.  os.environ["GOOGLE_MAPS_API_KEY"] = "..."  in a
# private cell, or export it in the shell that starts Jupyter.
# NOTE(review): any key that was previously committed in this notebook should be
# treated as compromised and rotated in the Google Cloud console.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "")
if not API_KEY:
    print("WARNING: GOOGLE_MAPS_API_KEY is not set; Places API requests will be rejected.")

# Places API (legacy) endpoints used below.
NEARBY_URL  = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
DETAILS_URL = "https://maps.googleapis.com/maps/api/place/details/json"

TARGET_PLACES = 700                     # stop searching once this many unique places are found
RADIUS = 4000                           # Nearby Search radius around each city centre
PLACE_TYPES = ["beauty_salon", "spa"]   # one Nearby Search is issued per (city, type) pair

# (latitude, longitude) search centres.
UK_CITIES = [
    (51.5074, -0.1278),   # London
    (53.4808, -2.2426),   # Manchester
    (52.4862, -1.8904),   # Birmingham
    (55.8642, -4.2518),   # Glasgow
    (53.8008, -1.5491),   # Leeds
    (51.4545, -2.5879),   # Bristol
    (50.8225, -0.1372),   # Brighton
    (52.9548, -1.1581),   # Nottingham
    (51.4816, -3.1791),   # Cardiff
    (54.9783, -1.6178)    # Newcastle
]

# OUTPUT FILES
RAW_SEARCH_FILE   = "raw_search_responses.jsonl"   # one JSON line per Nearby Search response
UNSTRUCTURED_FILE = "unstructured_reviews.jsonl"   # one JSON line per review

# HELPERS
def get_json(url, params, retries=5):
    """GET ``url`` with ``params`` and return the decoded JSON body.

    The request is retried with exponential backoff plus random jitter when
    the API answers with status ``OVER_QUERY_LIMIT``, when the request itself
    fails (timeout, connection error, ...) or when the body is not valid JSON.

    Args:
        url: Endpoint to query.
        params: Query-string parameters passed to ``requests.get``.
        retries: Maximum number of attempts (default 5).

    Returns:
        The decoded JSON of the first response that is not rate-limited.
        If every attempt is rate-limited, the last rate-limited response is
        returned. If the last attempt failed outright, a synthetic
        ``{"status": "REQUEST_FAILED", "error_message": <exception class>}``
        dict is returned so callers can keep checking ``status`` instead of
        crashing mid-crawl.
    """
    data = {"status": "REQUEST_FAILED", "error_message": "no request attempted"}
    for attempt in range(retries):
        try:
            r = requests.get(url, params=params, timeout=30)
            data = r.json()
        except (requests.RequestException, ValueError) as exc:
            # Only the exception class is kept: the exception text can embed
            # the full request URL, including the API key.
            data = {"status": "REQUEST_FAILED", "error_message": type(exc).__name__}
        else:
            if data.get("status") != "OVER_QUERY_LIMIT":
                return data
        # Back off before the next attempt; sleeping after the last one
        # would only delay the caller.
        if attempt < retries - 1:
            time.sleep(2**attempt + random.random())
    return data


# COLLECT PLACE IDS
seen = set()       # place_ids already recorded, used for de-duplication
place_ids = []     # unique place_ids in the order they were discovered


def _collect_place_ids(raw_f, max_pages=3):
    """Run one Nearby Search per (city, place type) and fill ``place_ids``.

    Every response page (not only the first) is appended to ``raw_f`` as one
    JSON line, so the raw file contains everything ``place_ids`` was derived
    from. Returns as soon as ``TARGET_PLACES`` unique ids have been collected.

    Args:
        raw_f: Writable text file receiving the raw responses.
        max_pages: Maximum number of result pages followed per query.
    """
    for lat, lng in UK_CITIES:
        for t in PLACE_TYPES:
            data = get_json(NEARBY_URL, {
                "location": f"{lat},{lng}",
                "radius": RADIUS,
                "type": t,
                "key": API_KEY
            })

            page = 1
            while True:
                # Save the RAW response of this page, including failed ones.
                raw_f.write(json.dumps({
                    "type": t,
                    "lat": lat,
                    "lng": lng,
                    "page": page,
                    "response": data,
                    "collected_at": datetime.now().isoformat()
                }, ensure_ascii=False) + "\n")

                # Any other status means this query cannot be used; move on
                # to the next (city, type) pair.
                if data.get("status") not in ("OK", "ZERO_RESULTS"):
                    break

                for p in data.get("results", []):
                    pid = p.get("place_id")
                    if pid and pid not in seen:
                        seen.add(pid)
                        place_ids.append(pid)
                        if len(place_ids) >= TARGET_PLACES:
                            return

                token = data.get("next_page_token")
                if not token or page >= max_pages:
                    break

                # Pause before requesting the next page with the token.
                time.sleep(2)
                data = get_json(NEARBY_URL, {
                    "pagetoken": token,
                    "key": API_KEY
                })
                page += 1


with open(RAW_SEARCH_FILE, "w", encoding="utf-8") as raw_f:
    _collect_place_ids(raw_f)

print("Places collected:", len(place_ids))

#  FETCH REVIEWS (UNSTRUCTURED ONLY)
# For every collected place, request only the "reviews" field from Place
# Details and write each non-empty review as one JSON line.

reviews_count = 0
failed_place_ids = []   # (place_id, status) for every Details request that was not OK

# Progress marker; this line appears in the cell's recorded output.
print("Starting review collection...")

with open(UNSTRUCTURED_FILE, "w", encoding="utf-8") as out_f:
    for pid in place_ids:

        d = get_json(DETAILS_URL, {
            "place_id": pid,
            "fields": "reviews",
            "key": API_KEY
        })

        status = d.get("status")
        if status == "OK":
            # "result" or "reviews" may be missing or null for a place.
            revs = (d.get("result") or {}).get("reviews") or []
            for rev in revs:
                txt = rev.get("text")
                if not txt:
                    # Reviews without text carry nothing to analyse.
                    continue

                out_f.write(json.dumps({
                    "place_id": pid,
                    "review_text": txt,
                    "rating": rev.get("rating"),
                    "time": rev.get("time"),
                    "relative_time_description": rev.get("relative_time_description"),
                    "language": rev.get("language")
                }, ensure_ascii=False) + "\n")

                reviews_count += 1
        else:
            # Keep a record instead of dropping the place silently, so an
            # incomplete crawl is visible in the summary below.
            failed_place_ids.append((pid, status))

        # Short pause between Details requests.
        time.sleep(0.2)


print("Total places:", len(place_ids))
print("Total reviews collected:", reviews_count)
if failed_place_ids:
    print("Places skipped (Details status not OK):", len(failed_place_ids))


Places collected: 700
Starting review collection...
Total places: 700
Total reviews collected: 2894


In [None]:
import json
import pandas as pd

# Convert the JSONL review dump into a flat CSV.
# The tuple fixes both which keys are kept and the CSV column order; a record
# lacking one of the keys gets None in that column.
_review_columns = (
    "place_id",
    "review_text",
    "rating",
    "time",
    "relative_time_description",
    "language",
)

with open("unstructured_reviews.jsonl", "r", encoding="utf-8") as src:
    # Each line of the file is one JSON object.
    rows = [
        {col: record.get(col) for col in _review_columns}
        for record in map(json.loads, src)
    ]

df = pd.DataFrame(rows)

# "utf-8-sig" prepends a byte-order mark to the file.
df.to_csv("unstructured_reviews_readable.csv", index=False, encoding="utf-8-sig")



Saved: unstructured_reviews_readable.csv
