## Web Scraping : Mubawab.ma


In [11]:
# install chromium, its driver, and selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium
# set options to be headless, ..
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

In [5]:
import time

import numpy as np
import pandas as pd
from selenium.webdriver.common.by import By

In [3]:
# Start a Chrome session with the options configured earlier and load the
# Casablanca-Settat promotion listing page.
# The driver binary is resolved from PATH: current Selenium releases no longer
# accept the driver path as a positional argument, and on older releases
# 'chromedriver' was already the default value.
wd = webdriver.Chrome(options=options)
wd.get("https://www.mubawab.ma/fr/plr/casablanca-settat/listing-promotion")

In [4]:
# URL the driver is currently on, i.e. the listing page requested above,
# as reported by the browser.
url_ard = wd.current_url

### Get the number of listings on the page to iterate over

In [6]:
# Count the listing cards: every <li> directly under the element with id "listingUl".
# find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which newer
# Selenium releases no longer provide.
n_elems = len(wd.find_elements(By.XPATH, '//*[@id="listingUl"]/li'))
n_elems

### Get prices, quartiers, standings, IDs and dates

In [4]:
# Per-listing fields, collected column-wise; later cells turn them into a DataFrame.
prices = []
quartiers = []
standings = []
ids = []
dates = []

# Locate every listing card once, then read each field relative to its own card.
# This avoids re-running an absolute XPath for every field and uses the
# find_element(By.XPATH, ...) API, which works on both old and current Selenium.
for card in wd.find_elements(By.XPATH, '//*[@id="listingUl"]/li'):
    # price label
    prices.append(card.find_element(By.XPATH, './div[2]/span').text)

    # title link of the listing (stored under the "quartiers" column)
    quartiers.append(card.find_element(By.XPATH, './div[2]/h2/a').text)

    # standing line
    standings.append(card.find_element(By.XPATH, './div[2]/div[1]/h4').text)

    # the promotion id is an attribute of the <li> itself
    ids.append(card.get_attribute("promotion-id"))

    # date text
    dates.append(card.find_element(By.XPATH, './div[2]/div[2]/span/span').text)

### Show results in a DataFrame

In [7]:
# Assemble the scraped columns into one table.
# Building from a dict names each column up front, still yields the five
# columns when nothing was scraped, and raises if the lists have different
# lengths instead of silently padding misaligned rows with NaN.
df = pd.DataFrame({
    'prices': prices,
    'quartiers': quartiers,
    'standings': standings,
    'ids': ids,
    'dates': dates,
})
df

Unnamed: 0,prices,quartiers,standings,ids,dates
0,À partir de 773 000 DH,Central Park,"Haut standing, Finalisé",2621,Publié il y a plus de 6 mois
1,À partir de 2 800 000 DH,Atlantic Eden,"Haut standing, Finalisé",2854,Publié il y a plus de 6 mois
2,À partir de 1 233 000 DH,QUEENS,"Haut standing, Finalisé",2925,Publié il y a plus de 6 mois
3,Prix à consulter,Ahl Loghlam- Lots de terrain,"Haut standing, Finalisé",2734,Publié il y a plus de 6 mois
4,À partir de 1 600 000 DH,Sea View,"Haut standing, Finalisé",3294,Publié il y a plus de 6 mois
5,À partir de 726 000 DH,Résidence Assafoi - Terrains et loca...,"Moyen standing, Finalisé",3221,Publié il y a plus de 6 mois
6,À partir de 2 900 000 DH,Duplex 3 chambres Front Mer,"Haut standing, Finalisé",3357,Publié il y a plus de 6 mois
7,À partir de 720 000 DH,Rosa Parc Magasins,"Haut standing, En cours de construction",2928,Publié il y a plus de 6 mois
8,À partir de 540 000 DH,Shems Al Madina - Lots de terrain,"Haut standing, Finalisé",1729,Publié il y a plus de 6 mois
9,À partir de 519 000 DH,Ocean Park,"Haut standing, En cours de construction",3262,Publié il y a plus de 6 mois


### Get latitude and longitude, then combine all columns

In [7]:
lats = []
longs = []

t = 5  # seconds to wait after each navigation so the page can finish loading

for j in range(n_elems):
    # Open the detail page of the j-th listing. The card is looked up afresh on
    # every pass because element handles do not survive a page navigation.
    wd.find_element(By.XPATH, f'//*[@id="listingUl"]/li[{j+1}]').click()
    time.sleep(t)

    # The coordinates are read from the "lat" / "lon" attributes of #mapOpen.
    map_open = wd.find_element(By.ID, 'mapOpen')
    lats.append(map_open.get_attribute("lat"))
    longs.append(map_open.get_attribute("lon"))

    # Go back to the listing page before handling the next card.
    wd.back()
    time.sleep(t)

# Rebuild the table with the coordinates added. Constructing it from a dict
# names the columns directly and raises if any column has a different length.
df = pd.DataFrame({
    'prices': prices,
    'quartiers': quartiers,
    'standings': standings,
    'ids': ids,
    'dates': dates,
    'lats': lats,
    'longs': longs,
})

### Show all results in a DataFrame

In [9]:
# Display the combined table, now including the lats and longs columns.
df

Unnamed: 0,prices,quartiers,standings,ids,dates,lats,longs
0,À partir de 773 000 DH,Central Park,"Haut standing, Finalisé",2621,Publié il y a plus de 6 mois,33.70259958074599,-7.399420738220243
1,À partir de 2 800 000 DH,Atlantic Eden,"Haut standing, Finalisé",2854,Publié il y a plus de 6 mois,33.51788561426908,-7.795518636703463
2,À partir de 1 233 000 DH,QUEENS,"Haut standing, Finalisé",2925,Publié il y a plus de 6 mois,33.57905668978428,-7.647800104393838
3,Prix à consulter,Ahl Loghlam- Lots de terrain,"Haut standing, Finalisé",2734,Publié il y a plus de 6 mois,33.57340232956212,-7.478427886963118
4,À partir de 1 600 000 DH,Sea View,"Haut standing, Finalisé",3294,Publié il y a plus de 6 mois,33.53037726442764,-7.8294328124397055
5,À partir de 726 000 DH,Résidence Assafoi - Terrains et loca...,"Moyen standing, Finalisé",3221,Publié il y a plus de 6 mois,33.54574410154525,-7.739739418029757
6,À partir de 2 900 000 DH,Duplex 3 chambres Front Mer,"Haut standing, Finalisé",3357,Publié il y a plus de 6 mois,33.52996746536189,-7.829604148864917
7,À partir de 720 000 DH,Rosa Parc Magasins,"Haut standing, En cours de construction",2928,Publié il y a plus de 6 mois,33.703081563375555,-7.391266822814885
8,À partir de 540 000 DH,Shems Al Madina - Lots de terrain,"Haut standing, Finalisé",1729,Publié il y a plus de 6 mois,33.613046760187586,-7.128281593322384
9,À partir de 519 000 DH,Ocean Park,"Haut standing, En cours de construction",3262,Publié il y a plus de 6 mois,33.65439484793648,-7.477188643157888


### Save the data to an .xlsx file

In [10]:
# Export the final table to an Excel workbook, leaving out the index column.
df.to_excel(excel_writer='mubawab_data.xlsx', index=False)