## Maps Scraping for Baly

In [1]:
%%capture
pip install beautifulsoup4 requests pandas lxml

In [2]:
import requests
from bs4 import BeautifulSoup
import lxml.html
import re

You will need username and password, ask me for it.

In [4]:
from getpass import getpass
username = getpass("username")
password = getpass("password")

username ········
password ········


This is where we search for a particular PoI - in this case parking lots in Najaf.

In [47]:
payload = {
    'source': 'google_maps',
    'query': 'كراج',
    'user_agent_type': 'desktop',
    'domain': 'iq',
    'geo_location': 'Najaf, Iraq',
#   'geo_location': 'lat: 32.0106646, lng: 44.3265272, rad: 15000',
#   'geo_location': '2368',
    'start_page': '1',
    'google_suggest': 'IQ',
    'limit':'5',
    'pages': '3'
}
response = requests.request(
	'POST',
	'https://realtime.oxylabs.io/v1/queries',
	auth=(username, password),
	json=payload,
	timeout=180
)
print(response.status_code)
if response.status_code != 200:
    print(str(response.content))

200


In [50]:
results = response.json()['results']
print(len(results))
html_files = [result['content'] for result in results]

3


In [51]:
name_selector = '[role="heading"]'
rating_selector = 'span[aria-hidden="true"]'
rating_count_selector = '[class*="RDApEe"]'
details_selector = '.rllt__details div:nth-of-type(5)'
lat_selector = '[data-lat]'
lng_selector = '[data-lng]'
type_selector = '//div[@class="rllt__details"]/div[2]/text()'
address_selector = '.rllt__details div:nth-of-type(3)'


In [52]:
data = []
for html in html_files:
    soup = BeautifulSoup(html, 'html.parser')
    lxml_obj = lxml.html.fromstring(str(soup))
    index = -1

    for listing in soup.select('[class="VkpGBb"]'):
        index += 1
        place = listing.parent
        name_el = place.select_one(name_selector)
        name = name_el.text.strip() if name_el else ''
        
        rating_el = place.select_one(rating_selector)
        rating = rating_el.text.strip() if rating_el else ''
        
        rating_count_el = place.select_one(rating_count_selector)
        rating_count = ''
        if rating_count_el:
            count_match = re.search(r'\((.+)\)', rating_count_el.text)
            rating_count = count_match.group(1) if count_match else ''
        
        
        details_el = place.select_one(details_selector)
        details = details_el.text.strip() if details_el else ''
        
        lat_el = soup.select_one(lat_selector)
        lat = lat_el.get('data-lat') if lat_el else ''
        
        lng_el = soup.select_one(lng_selector)
        lng = lng_el.get('data-lng') if lng_el else ''
        
        type_el = lxml_obj.xpath(type_selector)
        place_types = []
        for item in type_el:
            parts = item.strip().split('·')
            non_empty_parts = [part.strip() for part in parts if part.strip()]
            if non_empty_parts:
                place_types.append(non_empty_parts[-1])
        
        address_el = place.select_one(address_selector)
        address = address_el.text.strip() if address_el else ''
        
        place = {
            'name': name,
            'place_type': place_types[index],
            'address': address,
            'rating': rating,
            'rating_count': rating_count,
            'latitude': lat,
            'longitude': lng,
            'details': details,
        }
        data.append(place)

In [53]:
import pandas as np
place_names = [ x['name'] for x in data ]
types = [ x['place_type'] for x in data ]
lats = [ x['latitude'] for x in data ]
longs = [ x['longitude'] for x in data ]
address = [x['address'] for x in data ]
df = np.DataFrame({'Name': place_names, 'Type': types, 'Latitutde': lats, 'Longitude': longs, 'Address': address})
df.head(15)

Unnamed: 0,Name,Type,Latitutde,Longitude,Address
0,كراج,مكان عبادة,32.0106646,44.3265272,286G+7J6
1,كراج مغتسل الحيدري,موقف سيارات,32.0106646,44.3265272,286G+7J6
2,كراج الطوابق,ورشة إصلاح سيارات,32.0106646,44.3265272,X8X6+4P7
3,كراج المشراق,موقف سيارات,32.0106646,44.3265272,286G+7J6
4,كراج عقيل الخفاجي,ورشة إصلاح سيارات,32.0106646,44.3265272,Kufa، العراق
5,كراج صباغة محمد تركي,ورشة إصلاح سيارات,32.0106646,44.3265272,أكثر من 40 عامًا في المجال · Kufa، العراق · +9...
6,كراج ديلوكس الحديث,ورشة إصلاح سيارات,32.0106646,44.3265272,Kufa، العراق · +964 780 392 0284
7,كراج علاء الموسوي لصيانة كير السيارات,ورشة إصلاح سيارات,32.0106646,44.3265272,X9XG+P37، النجف،
8,كراج رضوان الحمامي,ورشة إصلاح سيارات,32.0106646,44.3265272,أكثر من 3 أعوام في المجال · Kufa، العراق · +96...
9,كراج عقيل الجلابي,سوق قطع غيار السيارات,32.0106646,44.3265272,"298G+WX4 قرب غرفة الحراس, طريق الحي الصناعي"


## Comparison with Smapp


In [19]:
base_url = 'http://gosecapo-staging.apps.private.apps.private.okd4.teh-2.snappcloud.io/maps/api/place/autocomplete/json'
origin = '32.022698498355794,44.322398007378446'
query = 'كراج'

params = {
    'origin': origin,
    'input': query,
    'language': 'ar'
}

response = requests.request(
	'GET',
	base_url,
    params = params,
	timeout=180
)
print(response.status_code)
if response.status_code != 200:
    print(response.content)

200


In [21]:
import json
decoded_str = response.content.decode('utf-8')
smappdata = json.loads(decoded_str)
predicts = smappdata['predictions']
print(len(predicts))
place_names = [ x['description'] for x in predicts ]
types = [ x['types'][0] for x in predicts ]
lats = [ x['latitude'] for x in predicts]
longs = [ x['longitude'] for x in predicts]

df = np.DataFrame({'Name': place_names, 'Type': types, 'Latitutde': lats, 'Longitude': longs})
df.head(28)

28


Unnamed: 0,Name,Type,Latitutde,Longitude
0,جسر كراج بغداد,highway:trunk,32.032554,44.323044
1,كراج النجف الجنوبي,building:garage,31.981235,44.351372
2,كراج مسجد الكوفة,amenity:bus_station,32.025798,44.399097
3,كراج,amenity:car_wash,31.992382,44.320963
4,كراج ميسان,amenity:bus_station,32.02073,44.380629
5,كراج الرابطة,tourism:caravan_site,31.993346,44.320444
6,كراج سيارتي,shop:ticket,31.989473,44.31813
7,كراج الطوابق,amenity:car_rental,31.99855,44.311104
8,كراج الصلاحية,amenity:bus_station,31.963016,44.601279
9,كراج النجف الداخلي,amenity:bus_station,32.002836,44.329221
