In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from requests import get
from datetime import datetime, date

In [2]:
# Record the wall-clock start time so the total run time can be reported at the end.
start = datetime.now()

In [3]:
# Empty frame whose only job is to declare the columns of the final table.
column_names = [
    'City', 'Municipality', 'Part_of_the_city',
    'Size', 'Rooms', 'Floor', 'Total floors',
    'Price', 'Price_per_m2',
    'Type', 'Advertiser', 'Date',
]
df = pd.DataFrame({name: [] for name in column_names})

# Scraped listings are accumulated here as plain dicts (one per listing).
l = []

In [4]:
# Base URL of the listing search (the scraper was last verified working on 02.25.2021).
# The page number is supplied through `params` when the request is made, so the
# base URL must not carry its own `?page=` suffix; otherwise the final URL
# contains the `page` parameter twice (`?page=&page=1`).
url = 'https://www.halooglasi.com/nekretnine/prodaja-stanova/beograd'

In [5]:
# Scrape result pages 1-50 and store one dict per listing in the list `l`.
# The scraper is fairly slow: on the author's machine it loaded roughly 2 pages per second.
# Each page is expected to contain about 20 listings.

# A listing is published as Premium, Standard or Top. In the HTML only that word
# in the wrapping div's class attribute changes; the rest of the markup is the same.
# Defined once here because the list does not change between pages.
types = ['Premium', 'Standard', 'Top']


def _parse_listing(entry, typ):
    """Extract the fields of a single listing.

    Parameters
    ----------
    entry : bs4 Tag
        The div that wraps one listing.
    typ : str
        Listing class ('Premium', 'Standard' or 'Top'); stored unchanged under 'Type'.

    Returns
    -------
    dict
        Keys: City, Municipality, Part_of_the_city, Size, Rooms, Floor, Total floors,
        Price, Price_per_m2, Type, Advertiser, Date. A field that cannot be located
        or converted is stored as None, so a malformed listing never aborts the scrape.
        Only ordinary errors are treated that way (`except Exception`), which lets
        a kernel interrupt (KeyboardInterrupt) still stop the run.
    """
    d = {}

    # The address is an unordered list (UL); the data appears to be laid out as:
    #   item 1 - city name
    #   item 2 - municipality
    #   item 3 - part of the city or a nearby landmark (more specific than the municipality)
    #   item 4 - street and sometimes house number (deliberately not collected)
    # A listing without the address list yields None for all three address fields
    # instead of raising.
    places = entry.find('ul', {'class': 'subtitle-places'})
    adress_list = places.findChildren() if places is not None else []

    try:
        d['City'] = adress_list[0].text.replace('\xa0', ' ')
    except Exception:
        d['City'] = None

    try:
        d['Municipality'] = adress_list[1].text
    except Exception:
        d['Municipality'] = None

    try:
        d['Part_of_the_city'] = adress_list[2].text.replace('\xa0', ' ')
    except Exception:
        d['Part_of_the_city'] = None

    # Size, room count and floor information are read, in that order, from the
    # first three 'value-wrapper' divs; look them up once.
    value_wrappers = entry.find_all('div', {'class': 'value-wrapper'})

    # Size
    # The original author notes that the site uses Serbian number notation; the
    # comma is replaced with a dot so that float() can parse the value.
    try:
        d['Size'] = float(value_wrappers[0].text.replace('\xa0m2Kvadratura', '').replace(',', '.'))
    except Exception:
        d['Size'] = None

    # Rooms
    # Keep only characters with code points 46-57, i.e. '.', '/' and the digits 0-9.
    try:
        d['Rooms'] = float(''.join(x for x in value_wrappers[1].text if 45 < ord(x) < 58))
    except Exception:
        d['Rooms'] = None

    # Floors
    # Given as e.g. "I/4": the part before the slash is the actual floor (a roman
    # numeral or a code such as VPR for "Visoko prizemlje"), the arabic number after
    # it is the total number of floors. Further clean-up is left to preprocessing.
    # If there is no slash, both fields are set to None.
    try:
        floors = value_wrappers[2].text.split('/')
        d['Floor'] = floors[0]
        d['Total floors'] = ''.join(x for x in floors[1] if x.isnumeric())
    except Exception:
        d['Floor'] = None
        d['Total floors'] = None

    # Price
    # Decimal values are not handled for price or price per m2: every numeric
    # character is simply concatenated.
    try:
        d['Price'] = int(''.join(x for x in entry.find('div', {'class': 'central-feature'}).text if x.isnumeric()))
    except Exception:
        d['Price'] = None

    # Price per m2
    # [:-1] drops the trailing "2" that comes from the "m2" unit.
    try:
        d['Price_per_m2'] = int(''.join(x for x in entry.find('div', {'class': 'price-by-surface'}).text if x.isnumeric())[:-1])
    except Exception:
        d['Price_per_m2'] = None

    # Type
    d['Type'] = typ

    # Advertiser (only alphabetic characters are kept, so spaces are removed too)
    try:
        d['Advertiser'] = ''.join(x for x in entry.find_all('span', {'data-field-name': 'oglasivac_nekretnine_s'})[0].text if x.isalpha())
    except Exception:
        d['Advertiser'] = None

    # Date
    try:
        d['Date'] = entry.find_all('span', {'class': 'publish-date'})[0].text
    except Exception:
        d['Date'] = None

    return d


for i in range(1, 51):  # pages start at 1; page 0 does not exist
    # The timeout keeps a stalled connection from hanging the whole run; a timed-out
    # request raises and stops the loop, with everything gathered so far still in `l`.
    r = get(url, params={'page': i}, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')

    for typ in types:
        entries = soup.find_all('div', {'class': f'product-item product-list-item {typ} real-estates my-product-placeholder'})

        for entry in entries:
            # Collect into the list rather than the DataFrame directly, which should be faster.
            l.append(_parse_listing(entry, typ))

In [6]:
# Build the table from the collected listing dicts in a single step.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, so the rows
# are turned into a frame directly (using df's columns to fix the column order)
# and only concatenated when df already holds rows.
scraped = pd.DataFrame(l, columns=df.columns)
df = scraped if df.empty else pd.concat([df, scraped], ignore_index=True)

In [7]:
# Write the table to an Excel file named after today's date and the value of `i`,
# which is left over from the scraping loop (the last page number requested).
df.to_excel(f'StanoviBGD-{date.today()}-{i}pages.xlsx')

In [8]:
# Report the wall-clock time elapsed since `start` was recorded.
print(datetime.now() - start)

0:00:24.693818
