In [1]:
# Requests for handling HTTP get and other requests
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import re

In [2]:
baseurl = "https://www.ss.com/lv/real-estate/flats/riga/"

In [3]:
# function to get list of all first page urls for all regions
def getUrlList(url, prefix='https://www.ss.com', postfix='sell/', tag='a', class_='a_category'):
    req = requests.get(url)
    if req.status_code != 200:
        print(f'Unexpected status code {req.status_code}. Stopping parse')
        return [] #return early and often principle
    soup = BeautifulSoup(req.text, 'lxml') # could skip soup variable as well but keeping for readability
    return [ prefix + el['href'] + postfix for el in soup.find_all(tag, class_) ]
    # What else could we pass as argument? How could our return fail?

In [4]:
def getRowData(row):
    """Gets information from  each row"""
    return [el.text for el in row.find_all('td')[2:]] + [baseurl + row.find('a')['href']]

In [5]:
def getDFfromUrl(url, region = None):
    print(f'Going to gather data from URL:{url}')
    req = requests.get(url)
    if req.status_code != 200:
        print(f'Unexpected status code {req.status_code}. Stopping parse')
        return [] #return early and often principle
    soup = BeautifulSoup(req.text, 'lxml') # could skip soup variable as well but keeping for readability
    

    
    headline = soup.find('tr', id = "head_line")
    cindex = [el.text for el in headline.find_all('td')]
    cindex[0] = cindex[0].split()[0]
    cindex += ['URL'] #TODO add argument for this
    cindex += ['Region']
    
    # TODO move it somewhere else
    if len([el for el in soup.find_all('option') if 'Izīrē' in el.text]) == 0:
        print("Oops nothing for rent")
        return pd.DataFrame({}, columns=cindex)
    
    rows = soup.find_all('tr',id = re.compile(r'tr_*'))
    rowsdata = [getRowData(el) for el in rows[:-1]]
    # finally we add the region if we did not have one
    if region == None:
        region = url.split("/")[-3]
    rowsdata = [el + [region] for el in rowsdata]
    return pd.DataFrame(rowsdata, columns=cindex)
    

In [6]:
# with this recipe we can append a big list of dataframes into one
def getDFfromUrlList(urlist):
    dflist = []
    for ur in urlist:
        dflist.append(getDFfromUrl(ur))
        time.sleep(0.5)
    return pd.concat(dflist)

In [7]:
def getRegionUrls(url, optionName = "Izīrē"):
    regionurls = []
    print(f'Going to check Region Url:{url}')
    req = requests.get(url)
    if req.status_code != 200:
        print(f'Unexpected status code {req.status_code}. Stopping parse')
        return [] #return early and often principle
    soup = BeautifulSoup(req.text, 'lxml') # could skip soup variable as well but keeping for readability
    # first we check if the optionName exists at all
    hand_options = [el for el in soup.find_all('option') if optionName in el.text]
    if len(hand_options) == 0:
        return []
    
    allanchors = soup.find_all('a', {'rel': 'prev'})
    if len(allanchors) == 0:
        return [url]
    
    lasturl = allanchors[0]['href']
    searchresult = re.search(r'page(\d+)\.html', lasturl)
    if searchresult:
        lastpageNum = int(searchresult.group(1))
    else:
        print("hmm no last page!!")
        return [url]
    # we add first page which is just the default url without page num
    # following pages have pagenum.html at the end
    regionurls =  [url] + [url + "page" + str(num) + ".html" for num in range(2, lastpageNum+1)]
    return regionurls

In [8]:
def getAllUrls(urlist):
    biglist = []
    for url in urlist:
        biglist += getRegionUrls(url)
        time.sleep(0.5)
    return biglist

In [14]:
regionUrls = getUrlList(baseurl, postfix="hand_over")
len(regionUrls)

53

In [15]:
regionUrls[-3:]


['https://www.ss.com/lv/real-estate/flats/riga/vef/hand_over',
 'https://www.ss.com/lv/real-estate/flats/riga/other/hand_over',
 'https://www.ss.com/lv/real-estate/flats/riga/all/hand_over']

In [16]:
allUrls = getAllUrls(regionUrls[-1])
allUrls

Going to check Region Url:h


MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?

In [17]:
regionUrls[-1]

'https://www.ss.com/lv/real-estate/flats/riga/all/hand_over'

In [20]:
allriga = getRegionUrls('https://www.ss.com/lv/real-estate/flats/riga/all/hand_over/')

Going to check Region Url:https://www.ss.com/lv/real-estate/flats/riga/all/hand_over/


In [21]:
len(allriga)

45

In [23]:
rigadf = getDFfromUrlList(allriga)
rigadf.shape

Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/all/hand_over/
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/all/hand_over/page2.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/all/hand_over/page3.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/all/hand_over/page4.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/all/hand_over/page5.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/all/hand_over/page6.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/all/hand_over/page7.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/all/hand_over/page8.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/all/hand_over/page9.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/all/hand_over/page10.html
Going to 

(1331, 9)

In [24]:
rigadf.to_excel('RigaApartments.xlsx')