In [1]:
# Requests for handling HTTP get and other requests
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
url = "https://www.ss.com/lv/real-estate/flats/riga/"

In [3]:
# we request the page (similar to how a browser would request one)
req = requests.get(url)
req.status_code

200

In [4]:
req.text[:200]

'<!DOCTYPE html>\r\n<HTML><HEAD>\r\n<title>SS.COM Dzīvokļi - Rīga - Sludinājumi</title>\r\n<meta http-equiv="Content-Type" CONTENT="text/html; charset=UTF-8">\r\n<meta name="viewport" content="user-scalable=1,'

In [5]:
'centrs' in req.text

True

In [None]:
# If the HTML were badly formed, we could fall back to plain text search to extract information

In [6]:
# we create a structure from our html
# note: lxml is an improved parser over standard built-in parser
soup = BeautifulSoup(req.text, 'lxml')
soup.title

<title>SS.COM Dzīvokļi - Rīga - Sludinājumi</title>

In [16]:
len(list(soup.children))

2

In [18]:
len(list(soup.body.children))

10

['\n',
 <div id="left_banner" style="position:absolute;height:0px;width:0px;top:15px;left:10px;z-index:0;" z-index="0"></div>,
 '\n',
 <div align="center">
 <div class="page_header page_bg" id="main_table">
 <div id="sslogin" z-index="20"></div>
 <span class="page_header_head"><a href="/" title="Sludinājumi"><img alt="Sludinājumi" border="0" class="page_header_logo" src="https://i.ss.com/img/p.gif"/></a><h1>SLUDINĀJUMI</h1></span>
 <span class="page_header_menu"><b class="menu_main"><a class="a_menu" href="/lv/real-estate/flats/new/" title="Iesniegt Sludinājumu">Iesniegt Sludinājumu</a></b>
 <b class="menu_main"><a class="a_menu" href="/lv/login/" title="Mani Sludinājumi">Mani Sludinājumi</a></b>
 <b class="menu_main"><a class="a_menu" href="/lv/real-estate/flats/riga/search/" title="Meklēt sludinājumus">Meklēšana</a></b>
 <span style="display:inline-block;text-align:left;"><b class="menu_main" style="padding-right:0px;"><a class="a_menu" href="/lv/favorites/" title="Memo">Memo</a></b>

In [23]:
h4list = soup.find_all('h4', {'class': 'category'})
len(h4list)

53

In [26]:
h4list[0].text

'Centrs'

In [27]:
h4list[0].findChildren()

[<a class="a_category" href="/lv/real-estate/flats/riga/centre/" id="ahc_1106" title="Centrs, Sludinājumi">Centrs</a>]

In [7]:
# we want all anchor (tag "a") which have class 'a_category'
anchorlist = soup.find_all('a', {'class': 'a_category'})
len(anchorlist)

53

In [8]:
anchorlist[-1]

<a class="a_category" href="/lv/real-estate/flats/riga/all/" id="ahc_14195">Visi sludinājumi</a>

In [9]:
anchorlist[:5]

[<a class="a_category" href="/lv/real-estate/flats/riga/centre/" id="ahc_1106" title="Centrs, Sludinājumi">Centrs</a>,
 <a class="a_category" href="/lv/real-estate/flats/riga/agenskalns/" id="ahc_1088" title="Āgenskalns, Sludinājumi">Āgenskalns</a>,
 <a class="a_category" href="/lv/real-estate/flats/riga/aplokciems/" id="ahc_1114" title="Aplokciems, Sludinājumi">Aplokciems</a>,
 <a class="a_category" href="/lv/real-estate/flats/riga/beberbeki/" id="ahc_5061" title="Beberbeķi, Sludinājumi">Beberbeķi</a>,
 <a class="a_category" href="/lv/real-estate/flats/riga/bergi/" id="ahc_1173" title="Berģi, Sludinājumi">Berģi</a>]

In [10]:
anchorlist[1].text

'Āgenskalns'

In [13]:
# when we know what specific items we do not want
flist = anchorlist[:-1]
len(flist)

52

In [12]:
# more specific filter
filteredlist = [el for el in anchorlist if "visi sludinājumi" not in el.text.lower()]
len(filteredlist)

52

In [28]:
filteredlist[0]['href']

'/lv/real-estate/flats/riga/centre/'

In [29]:
# we should get all urls here
urlist = [el['href'] for el in filteredlist]
len(urlist)

52

In [30]:
urlist[0]

'/lv/real-estate/flats/riga/centre/'

In [31]:
baseurl = "https://www.ss.com"

In [32]:
# create a new list appending base url to each element
furlist = [baseurl + el for el in urlist]
furlist[:2]

['https://www.ss.com/lv/real-estate/flats/riga/centre/',
 'https://www.ss.com/lv/real-estate/flats/riga/agenskalns/']

In [33]:
postfix = 'hand_over/'

In [34]:
sellist = [el + postfix for el in furlist]
sellist[:2]

['https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/',
 'https://www.ss.com/lv/real-estate/flats/riga/agenskalns/hand_over/']

In [37]:
# we create a list of tuples containing the area and the full url for the area
handoverlist = [(el.text, baseurl + el['href'] + postfix ) for el in filteredlist ]
handoverlist[:5]

[('Centrs', 'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/'),
 ('Āgenskalns',
  'https://www.ss.com/lv/real-estate/flats/riga/agenskalns/hand_over/'),
 ('Aplokciems',
  'https://www.ss.com/lv/real-estate/flats/riga/aplokciems/hand_over/'),
 ('Beberbeķi',
  'https://www.ss.com/lv/real-estate/flats/riga/beberbeki/hand_over/'),
 ('Berģi', 'https://www.ss.com/lv/real-estate/flats/riga/bergi/hand_over/')]

In [39]:
# build an area-name -> url lookup directly from the (name, url) tuples
handdict = dict(handoverlist)


In [40]:
handdict['Centrs']

'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/'

In [None]:
# Pandas has built in web scraper for tables

In [35]:
dftables = pd.read_html(sellist[0])
len(dftables)

7

In [36]:
centrs = dftables[4]
centrs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Sludinājumi \tdatums,Sludinājumi \tdatums,Sludinājumi \tdatums,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena
1,,,Bez komisijas maksas. Izīrē studijas tipa dzīv...,Valdemāra 71,5,110,6/6,P. kara,7.18 €,790 €/mēn.
2,,,"Īstermiņā no 2st. tiek izīrēts saulains, mājīg...",Matīsa 100,1,30,4/5,Renov.,0.833 €,25 €/dienā
3,,,Izīrēju ilgtermiņā plašu trīs istabu dzīvokli ...,Hospitāļu 15,3,77,3/5,P. kara,6.75 €,520 €/mēn.
4,,,"Для краткосрочной аренды сдаётся уютная, двухк...",Lāčplēša 221,2,41,3/4,P. kara,0.854 €,35 €/dienā


In [None]:
# `riga` is never defined in this notebook; the table selected in the previous cell is `centrs`
type(centrs)

In [None]:
rreq = requests.get(sellist[0])
rreq.status_code

In [None]:
rsoup = BeautifulSoup(rreq.text, 'lxml')
rsoup.title

In [None]:
ranchors = rsoup.find_all('a', class_ = 'am')
len(ranchors)

In [None]:
ranchors[0]

In [None]:
rurls= [baseurl+el['href'] for el in ranchors]
rurls[0]

In [None]:
sellist[0]

In [41]:
def getUrlList(url, prefix='https://www.ss.com', postfix='sell/', tag='a', class_='a_category', timeout=10):
    """Collect links from a listing page and turn them into full URLs.

    Args:
        url: Page to download and parse.
        prefix: String prepended to every href that is found.
        postfix: String appended to every href that is found.
        tag: Name of the HTML tag to search for.
        class_: CSS class the tag must carry.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        A list of ``prefix + href + postfix`` strings, one per matching tag
        that has an href attribute. An empty list is returned when the
        response status is not 200.

    Exceptions raised by ``requests.get`` (connection errors, timeouts) are
    not caught here and propagate to the caller.
    """
    req = requests.get(url, timeout=timeout)
    if req.status_code != 200:
        print(f'Unexpected status code {req.status_code}. Stopping parse')
        return [] #return early and often principle
    soup = BeautifulSoup(req.text, 'lxml')
    # href=True keeps only tags that really have an href, so el['href'] cannot raise KeyError
    matches = soup.find_all(tag, class_=class_, href=True)
    return [prefix + el['href'] + postfix for el in matches]

In [None]:
woodlist = getUrlList("https://www.ss.com/lv/real-estate/wood/")
len(woodlist)

In [42]:
handoverlist[0]

('Centrs', 'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/')

In [43]:
curl=handoverlist[0][1]
curl

'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/'

In [44]:
rreq = requests.get(curl)
rreq.status_code

200

In [45]:
rsoup = BeautifulSoup(rreq.text, 'lxml')
rsoup.title

<title>SS.COM Dzīvokļi - Rīga - Centrs, Cenas, Izīrē - Sludinājumi</title>

In [46]:
rtables = rsoup.find_all('table')
len(rtables)

7

In [48]:
# we will use regular expression to find only rows which start with tr_
import re

In [54]:
# the part after r' we can find using https://regex101.com/ or other regular expression testers
# so here we want all tr elements with id starting with tr_anydigit
rrows = rsoup.find_all('tr',id = re.compile(r'tr_[0-9]+')) #basic wildcard search could use tr_ in lambda function
len(rrows)

30

In [56]:
# without regular expressions
allrows = rsoup.find_all('tr')
len(allrows)

40

In [66]:
# we need to check if element has attributes before we use that attribute
# so we need to check el.attrs (which is a dictionary)
fallrows = [el for el in allrows if 'id' in el.attrs and "tr_" in el['id']]
len(fallrows)

31

In [67]:
fallrows = fallrows[:-1]


In [65]:
'id' in allrows[2]

False

In [55]:
type(re.compile(r'tr_[0-9]+'))

re.Pattern

In [68]:
headline = rsoup.find('tr', id = "head_line")
print(headline)

<tr id="head_line">
<td class="msg_column" colspan="3" width="70%">
<span style="float:left;"> Sludinājumi
</span>
<span align="right" class="msg_column" style="float:right;text-align:right;padding-right:3px;">
<noindex>
<a class="a19" href="/lv/real-estate/flats/riga/centre/hand_over/fDgSeF4S.html" rel="nofollow">datums</a></noindex></span>
</td>
<td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/centre/hand_over/fDgSeF4SFDwT.html" rel="nofollow" title="">Iela</a></noindex></td><td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/centre/hand_over/fDgSeF4SelM=.html" rel="nofollow" title="">Ist.</a></noindex></td><td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/centre/hand_over/fDgSeF4QelM=.html" rel="nofollow" title="">m2</a></noindex></td><td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/centre/hand_over/fDgSeF4XelM=.

In [69]:
cindex = [el.text for el in headline.find_all('td')]
cindex

['\n\xa0Sludinājumi\r\n\n\n\ndatums\n',
 'Iela',
 'Ist.',
 'm2',
 'Stāvs',
 'Sērija',
 'Cena, m2',
 'Cena']

In [70]:
cindex[0].split()[0]

'Sludinājumi'

In [71]:
cindex[0] = cindex[0].split()[0]
cindex

['Sludinājumi', 'Iela', 'Ist.', 'm2', 'Stāvs', 'Sērija', 'Cena, m2', 'Cena']

In [72]:
# this will be the column index that we pass to Pandas at the end
# the guard keeps the cell safe to re-run: a bare `cindex += ['URL']` would add another 'URL' every time
if 'URL' not in cindex:
    cindex.append('URL')  # same effect as cindex += ['URL']
cindex

['Sludinājumi',
 'Iela',
 'Ist.',
 'm2',
 'Stāvs',
 'Sērija',
 'Cena, m2',
 'Cena',
 'URL']

In [74]:
firstrow = rrows[0]
firstrow

<tr id="tr_46628436"><td class="msga2 pp0"><input id="c46628436" name="mid[]" type="checkbox" value="46628436_1106_0"/></td><td class="msga2"><a href="/msg/lv/real-estate/flats/riga/centre/ekljx.html" id="im46628436"><img alt="" class="isfoto foto_list" src="https://i.ss.com/gallery/3/440/109810/21961971.th2.jpg"/></a></td><td class="msg2"><div class="d1"><a class="am" data="aHolQTUlOEMlOUIlRDQlOUYlOTdpJTdEJTlFJThCJTk5JThFJUE0JTlCcCU3RSVBMiU5MCU5RSU4RQ==|7EmZgXnb" href="/msg/lv/real-estate/flats/riga/centre/ekljx.html" id="dm_46628436">Bez komisijas maksas. Izīrē studijas tipa dzīvokli, kas izbūvēts</a></div></td><td c="1" class="msga2-o pp6" nowrap="">Valdemāra 71</td><td c="1" class="msga2-o pp6" nowrap="">5</td><td c="1" class="msga2-o pp6" nowrap="">110</td><td c="1" class="msga2-o pp6" nowrap="">6/6</td><td c="1" class="msga2-o pp6" nowrap="">P. kara</td><td c="1" class="msga2-o pp6" nowrap="">7.18 €</td><td c="1" class="msga2-o pp6" nowrap="">790  €/mēn.</td></tr>

In [75]:
alltds = firstrow.find_all('td')
alltds

[<td class="msga2 pp0"><input id="c46628436" name="mid[]" type="checkbox" value="46628436_1106_0"/></td>,
 <td class="msga2"><a href="/msg/lv/real-estate/flats/riga/centre/ekljx.html" id="im46628436"><img alt="" class="isfoto foto_list" src="https://i.ss.com/gallery/3/440/109810/21961971.th2.jpg"/></a></td>,
 <td class="msg2"><div class="d1"><a class="am" data="aHolQTUlOEMlOUIlRDQlOUYlOTdpJTdEJTlFJThCJTk5JThFJUE0JTlCcCU3RSVBMiU5MCU5RSU4RQ==|7EmZgXnb" href="/msg/lv/real-estate/flats/riga/centre/ekljx.html" id="dm_46628436">Bez komisijas maksas. Izīrē studijas tipa dzīvokli, kas izbūvēts</a></div></td>,
 <td c="1" class="msga2-o pp6" nowrap="">Valdemāra 71</td>,
 <td c="1" class="msga2-o pp6" nowrap="">5</td>,
 <td c="1" class="msga2-o pp6" nowrap="">110</td>,
 <td c="1" class="msga2-o pp6" nowrap="">6/6</td>,
 <td c="1" class="msga2-o pp6" nowrap="">P. kara</td>,
 <td c="1" class="msga2-o pp6" nowrap="">7.18 €</td>,
 <td c="1" class="msga2-o pp6" nowrap="">790  €/mēn.</td>]

In [76]:
def getRowData(row):
    """Turn one listing row into a flat list of values.

    The first two cells of the row are skipped; the text of every remaining
    cell is kept, and the full URL of the first link found in the row
    (the global ``baseurl`` plus the link's href) is added as the last item.
    """
    cells = row.find_all('td')
    values = [cell.text for cell in cells[2:]]
    values.append(baseurl + row.find('a')['href'])
    return values

In [78]:
len(rrows)

30

In [77]:
rowdata = getRowData(rrows[0])
rowdata

['Bez komisijas maksas. Izīrē studijas tipa dzīvokli, kas izbūvēts',
 'Valdemāra 71',
 '5',
 '110',
 '6/6',
 'P. kara',
 '7.18 €',
 '790  €/mēn.',
 'https://www.ss.com/msg/lv/real-estate/flats/riga/centre/ekljx.html']

In [79]:
rowsdata = [getRowData(el) for el in rrows] # works for every row in rrows (built with the tr_<digits> regex); NOTE(review): the extra trailing row seems to appear only in the substring-filtered fallrows - confirm
rowsdata

[['Bez komisijas maksas. Izīrē studijas tipa dzīvokli, kas izbūvēts',
  'Valdemāra 71',
  '5',
  '110',
  '6/6',
  'P. kara',
  '7.18 €',
  '790  €/mēn.',
  'https://www.ss.com/msg/lv/real-estate/flats/riga/centre/ekljx.html'],
 ['Īstermiņā no 2st. tiek izīrēts saulains, mājīgs jauns vienistaba',
  'Matīsa 100',
  '1',
  '30',
  '4/5',
  'Renov.',
  '0.833 €',
  '25  €/dienā',
  'https://www.ss.com/msg/lv/real-estate/flats/riga/centre/ejbhd.html'],
 ['Izīrēju ilgtermiņā plašu trīs istabu dzīvokli (76, 7m2) Rīgas ce',
  'Hospitāļu 15',
  '3',
  '77',
  '3/5',
  'P. kara',
  '6.75 €',
  '520  €/mēn.',
  'https://www.ss.com/msg/lv/real-estate/flats/riga/centre/gijki.html'],
 ['Для краткосрочной аренды сдаётся уютная, двухкомнатная квартира.',
  'Lāčplēša 221',
  '2',
  '41',
  '3/4',
  'P. kara',
  '0.854 €',
  '35  €/dienā',
  'https://www.ss.com/msg/lv/real-estate/flats/riga/centre/acbkb.html'],
 ['4-5 мест. 1 комнатная квартира в центре Риги, перекресток Садовн',
  'Sadovņikova 47',
  

In [None]:
rowsdata = [getRowData(el) for el in rrows[:-1]] # skips the last element; NOTE(review): rrows is already limited to ids matching tr_<digits>, so this may drop a real listing - confirm
rowsdata

In [80]:
df = pd.DataFrame(rowsdata, columns=cindex)
df

Unnamed: 0,Sludinājumi,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena,URL
0,Bez komisijas maksas. Izīrē studijas tipa dzīv...,Valdemāra 71,5,110,6/6,P. kara,7.18 €,790 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...
1,"Īstermiņā no 2st. tiek izīrēts saulains, mājīg...",Matīsa 100,1,30,4/5,Renov.,0.833 €,25 €/dienā,https://www.ss.com/msg/lv/real-estate/flats/ri...
2,Izīrēju ilgtermiņā plašu trīs istabu dzīvokli ...,Hospitāļu 15,3,77,3/5,P. kara,6.75 €,520 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...
3,"Для краткосрочной аренды сдаётся уютная, двухк...",Lāčplēša 221,2,41,3/4,P. kara,0.854 €,35 €/dienā,https://www.ss.com/msg/lv/real-estate/flats/ri...
4,"4-5 мест. 1 комнатная квартира в центре Риги, ...",Sadovņikova 47,1,40,2/5,Renov.,7.50 €,300 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...
5,"3х местная, 1 комнатная квартира в центре Риги...",Sadovņikova 47,1,30,3/5,Renov.,8.33 €,250 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...
6,Тишина в самом центре Риги. \r\n\r\nСдается на...,Stabu 19,2,46,5/6,P. kara,9.89 €,455 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...
7,Тишина в самом центре Риги. Уютная мансарда с ...,Brīvības 59,1,39,6/6,Jaun.,11.28 €,440 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...
8,Īpašnieks izīrē modernu mansarda 2 līmeņu dzīv...,Matīsa 45,2,60,6/6,P. kara,9.17 €,550 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...
9,Сдается без посредников на длительный срок без...,Ģertrūdes 98,2,45,5/5,P. kara,6.67 €,300 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...


In [None]:
bigdf = pd.concat([df,df])
bigdf

In [121]:
def getDFfromUrl(url, region = None, timeout = 10):
    """Download one listing page and return its ads as a DataFrame.

    Args:
        url: Address of the listing page.
        region: Value stored in the 'Region' column. When None, the third
            path segment from the end of ``url`` is used.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        A DataFrame with one row per ad. Columns are the page's header
        cells followed by 'URL' and 'Region'. An empty DataFrame is
        returned when the status code is not 200, when the page has no
        ``head_line`` row, or when no option containing 'Izīrē' is present.

    Exceptions raised by ``requests.get`` are not caught here.
    """
    print(f'Going to gather data from URL:{url}')
    req = requests.get(url, timeout=timeout)
    if req.status_code != 200:
        print(f'Unexpected status code {req.status_code}. Stopping parse')
        # an empty DataFrame (not a list) so callers can still use .head() or pd.concat
        return pd.DataFrame()
    soup = BeautifulSoup(req.text, 'lxml') # could skip soup variable as well but keeping for readability

    headline = soup.find('tr', id = "head_line")
    if headline is None:
        # without a header row there are no column names to build the table from
        print("No head_line row found. Stopping parse")
        return pd.DataFrame()
    cindex = [el.text for el in headline.find_all('td')]
    # the first header cell holds several words separated by whitespace; keep only the first
    cindex[0] = cindex[0].split()[0]
    cindex += ['URL'] #TODO add argument for this
    cindex += ['Region']

    # TODO move it somewhere else
    if not any('Izīrē' in el.text for el in soup.find_all('option')):
        print("Oops nothing for rent")
        return pd.DataFrame({}, columns=cindex)

    # anchored pattern: only ids of the form tr_<digits>, so no non-ad row
    # has to be sliced off afterwards
    rows = soup.find_all('tr', id = re.compile(r'^tr_[0-9]+$'))
    # finally we add the region if we did not have one
    if region is None:
        region = url.split("/")[-3]
    rowsdata = [getRowData(el) + [region] for el in rows]
    return pd.DataFrame(rowsdata, columns=cindex)
    
    

In [96]:
rdf = getDFfromUrl("https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/")
rdf.head()

Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/


Unnamed: 0,Sludinājumi,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena,URL,Region
0,Bez komisijas maksas. Izīrē studijas tipa dzīv...,Valdemāra 71,5,110,6/6,P. kara,7.18 €,790 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre
1,"Īstermiņā no 2st. tiek izīrēts saulains, mājīg...",Matīsa 100,1,30,4/5,Renov.,0.833 €,25 €/dienā,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre
2,Izīrēju ilgtermiņā plašu trīs istabu dzīvokli ...,Hospitāļu 15,3,77,3/5,P. kara,6.75 €,520 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre
3,"Для краткосрочной аренды сдаётся уютная, двухк...",Lāčplēša 221,2,41,3/4,P. kara,0.854 €,35 €/dienā,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre
4,"4-5 мест. 1 комнатная квартира в центре Риги, ...",Sadovņikova 47,1,40,2/5,Renov.,7.50 €,300 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre


In [98]:
yugla = getDFfromUrl("https://www.ss.com/lv/real-estate/flats/riga/yugla/hand_over/")
yugla.head()

Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/yugla/hand_over/


Unnamed: 0,Sludinājumi,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena,URL,Region
0,Mēbelēts dzīvoklis uz ilgu laika periodu. Izīr...,Malienas 74,1,31,1/5,LT proj.,8.06 €,250 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,yugla
1,"Сдается однокомнатная квартира на Югле, ул. Ti...",Tirzas 3/4,1,28,4/5,Hrušč.,7.86 €,220 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,yugla
2,Īpašnieks uz ilgāku laiku izīrē kārtīgiem cilv...,Vangažu 30,2,47,11/12,Čehu pr.,6.17 €,290 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,yugla
3,"Сдаётся 2 комнатная квартира, после частичнoго...",Juglas 1,2,43,2/5,Hrušč.,5.12 €,220 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,yugla
4,Tiek izīrēts 1 istabas dzīvoklis Juglā. \r\nDz...,Silciema 15k2,1,30,4/5,Hrušč.,6.67 €,200 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,yugla


In [83]:
mysplit = "https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/".split("/")
mysplit

['https:',
 '',
 'www.ss.com',
 'lv',
 'real-estate',
 'flats',
 'riga',
 'centre',
 'hand_over',
 '']

In [85]:
mysplit[-3]

'centre'

In [82]:
rdf = getDFfromUrl("https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/")
rdf.head()

Going gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/


Unnamed: 0,Sludinājumi,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena,URL
0,Bez komisijas maksas. Izīrē studijas tipa dzīv...,Valdemāra 71,5,110,6/6,P. kara,7.18 €,790 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...
1,"Īstermiņā no 2st. tiek izīrēts saulains, mājīg...",Matīsa 100,1,30,4/5,Renov.,0.833 €,25 €/dienā,https://www.ss.com/msg/lv/real-estate/flats/ri...
2,Izīrēju ilgtermiņā plašu trīs istabu dzīvokli ...,Hospitāļu 15,3,77,3/5,P. kara,6.75 €,520 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...
3,"Для краткосрочной аренды сдаётся уютная, двухк...",Lāčplēša 221,2,41,3/4,P. kara,0.854 €,35 €/dienā,https://www.ss.com/msg/lv/real-estate/flats/ri...
4,"4-5 мест. 1 комнатная квартира в центре Риги, ...",Sadovņikova 47,1,40,2/5,Renov.,7.50 €,300 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...


In [None]:
df2 = getDFfromUrl(woodlist[12])
df2

In [100]:
# Good idea to add time delay to a function which processes many scrapes
# Be a good citizen
# https://www.pythoncentral.io/pythons-time-sleep-pause-wait-sleep-stop-your-code/
import time

In [102]:
len(handoverlist)

52

In [104]:
handurllist = [el[1] for el in handoverlist]
handurllist[:5]

['https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/',
 'https://www.ss.com/lv/real-estate/flats/riga/agenskalns/hand_over/',
 'https://www.ss.com/lv/real-estate/flats/riga/aplokciems/hand_over/',
 'https://www.ss.com/lv/real-estate/flats/riga/beberbeki/hand_over/',
 'https://www.ss.com/lv/real-estate/flats/riga/bergi/hand_over/']

In [105]:
# with this recipe we can append a big list of dataframes into one
def getDFfromUrlList(urlist, delay=0.5):
    """Scrape every URL in ``urlist`` and combine the results into one DataFrame.

    Args:
        urlist: Iterable of listing-page URLs, each passed to getDFfromUrl.
        delay: Seconds to pause between two consecutive requests.

    Returns:
        The concatenation of all per-page DataFrames, or an empty DataFrame
        when there is nothing to concatenate.
    """
    dflist = []
    for position, ur in enumerate(urlist):
        if position > 0:
            time.sleep(delay)  # pause only between requests, not after the last one
        result = getDFfromUrl(ur)
        # anything that is not a DataFrame (e.g. a list from an error path) would make pd.concat fail
        if isinstance(result, pd.DataFrame):
            dflist.append(result)
    if not dflist:
        # pd.concat raises ValueError on an empty list
        return pd.DataFrame()
    return pd.concat(dflist)

In [117]:
handurllist[:3]

['https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/',
 'https://www.ss.com/lv/real-estate/flats/riga/agenskalns/hand_over/',
 'https://www.ss.com/lv/real-estate/flats/riga/aplokciems/hand_over/']

In [122]:
bigdf = getDFfromUrlList(handurllist[:10])
bigdf.head()

Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/agenskalns/hand_over/
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/aplokciems/hand_over/
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/beberbeki/hand_over/
Oops nothing for rent
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/bergi/hand_over/
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/bierini/hand_over/
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/bolderaya/hand_over/
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/breksi/hand_over/
Oops nothing for rent
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/bukulti/hand_over/
Oops nothing for rent
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/bulli/h

Unnamed: 0,Sludinājumi,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena,URL,Region
0,Bez komisijas maksas. Izīrē studijas tipa dzīv...,Valdemāra 71,5,110,6/6,P. kara,7.18 €,790 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre
1,"Īstermiņā no 2st. tiek izīrēts saulains, mājīg...",Matīsa 100,1,30,4/5,Renov.,0.833 €,25 €/dienā,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre
2,Izīrēju ilgtermiņā plašu trīs istabu dzīvokli ...,Hospitāļu 15,3,77,3/5,P. kara,6.75 €,520 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre
3,"Для краткосрочной аренды сдаётся уютная, двухк...",Lāčplēša 221,2,41,3/4,P. kara,0.854 €,35 €/dienā,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre
4,"4-5 мест. 1 комнатная квартира в центре Риги, ...",Sadovņikova 47,1,40,2/5,Renov.,7.50 €,300 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre


In [109]:
bigdf.groupby(['Region']).count()

Unnamed: 0_level_0,Sludinājumi,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena,URL
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
agenskalns,30,30,30,30,30,30,30,30,30
aplokciems,2,2,2,2,2,2,2,2,2
beberbeki,1,1,1,1,1,1,1,1,1
bierini,5,5,5,5,5,5,5,5,5
bolderaya,6,6,6,6,6,6,6,6,6
breksi,1,1,1,1,1,1,1,1,1
bukulti,4,4,4,4,4,4,4,4,4
bulli,1,1,1,1,1,1,1,1,1
centre,30,30,30,30,30,30,30,30,30
chiekurkalns,17,17,17,17,17,17,17,17,17


In [110]:

# https://stackoverflow.com/questions/10607688/how-to-create-a-file-name-with-the-current-date-time-in-python
import time
timestr = time.strftime("%Y%m%d-%H%M%S")

#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html
# Let's use with context manager so we do not forget to close writer!
with pd.ExcelWriter(f'sellers_{timestr}.xlsx') as writer:
    bigdf.to_excel(writer, sheet_name='Sheet_name_1')

In [111]:
rsoup.title

<title>SS.COM Dzīvokļi - Rīga - Centrs, Cenas, Izīrē - Sludinājumi</title>

In [112]:
options = rsoup.find_all('option')
options

[<option selected="" value="/lv/real-estate/flats/riga/centre/hand_over/">Visi sludinājumi</option>,
 <option value="/lv/real-estate/flats/riga/centre/today/hand_over/">Šodien - 6</option>,
 <option value="/lv/real-estate/flats/riga/centre/today-2/hand_over/">Par 2 dienām - 78</option>,
 <option value="/lv/real-estate/flats/riga/centre/today-5/hand_over/">Par 5 dienām - 178</option>,
 <option value=""></option>,
 <option value="1">1</option>,
 <option value="2">2</option>,
 <option value="3">3</option>,
 <option value="4">4</option>,
 <option value="5">5</option>,
 <option value="6">6</option>,
 <option value="Citi">Citi</option>,
 <option value=""></option>,
 <option value="1">1</option>,
 <option value="2">2</option>,
 <option value="3">3</option>,
 <option value="4">4</option>,
 <option value="5">5</option>,
 <option value="6">6</option>,
 <option value="Citi">Citi</option>,
 <option value=""></option>,
 <option value="67">103.</option>,
 <option value="70">467.</option>,
 <option v

In [115]:
hand_over_options = [el for el in options if 'Izīrē' in el.text]
len(hand_over_options)

1

In [116]:
# this way we can filter for ads which do not contain for rent (hand_over) option
hand_options = [el for el in rsoup.find_all('option') if 'Izīrē' in el.text]
hand_options

[<option selected="" value="/lv/real-estate/flats/riga/centre/hand_over/">Izīrē</option>]

In [123]:
rsoup.title

<title>SS.COM Dzīvokļi - Rīga - Centrs, Cenas, Izīrē - Sludinājumi</title>

In [124]:
allanchors = rsoup.find_all('a', {'rel': 'prev'})
allanchors

[<a class="navi" href="/lv/real-estate/flats/riga/centre/hand_over/page19.html" name="nav_id" rel="prev"><img border="0" height="5" src="https://i.ss.com/img/s_left.png" style="padding-bottom:2px;" width="9"/> Iepriekšējie</a>]

In [125]:
prevnav = allanchors[0]

In [126]:
prevnav['href']

'/lv/real-estate/flats/riga/centre/hand_over/page19.html'

In [127]:
lurl = prevnav['href']
lurl

'/lv/real-estate/flats/riga/centre/hand_over/page19.html'

In [136]:
# so we look for at least one digit between page and .html
searchresult = re.search(r'page(\d+)\.html', lurl)
if searchresult:
    print(searchresult.group(1))
else:
    print("Didn't find anything")

19


In [128]:
lurl.split('/page')[1].split('.')[0]

'19'

In [148]:
def getRegionUrls(url, optionName = "Izīrē", timeout = 10):
    """List the URLs of all result pages for one region.

    Args:
        url: First listing page of the region.
        optionName: Text that must occur in at least one <option> element
            of the page; otherwise the region is skipped.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        ``[]`` when the status code is not 200 or no option contains
        ``optionName``. ``[url]`` when no rel="prev" link or no page number
        is found. Otherwise ``url`` followed by ``page2.html`` up to
        ``page<N>.html``, where N is the number read from the rel="prev" link.

    Exceptions raised by ``requests.get`` are not caught here.
    """
    print(f'Going to check Region Url:{url}')
    req = requests.get(url, timeout=timeout)
    if req.status_code != 200:
        print(f'Unexpected status code {req.status_code}. Stopping parse')
        return [] #return early and often principle
    soup = BeautifulSoup(req.text, 'lxml') # could skip soup variable as well but keeping for readability
    # first we check if the optionName exists at all
    if not any(optionName in el.text for el in soup.find_all('option')):
        return []

    # NOTE(review): on the first page the rel="prev" link appears to point at the last page - confirm
    prevanchor = soup.find('a', {'rel': 'prev'})
    if prevanchor is None:
        return [url]

    # .get avoids a KeyError when the anchor has no href
    searchresult = re.search(r'page(\d+)\.html', prevanchor.get('href', ''))
    if not searchresult:
        print("hmm no last page!!")
        return [url]
    lastpageNum = int(searchresult.group(1))
    # we add first page which is just the default url without page num
    # following pages have pagenum.html at the end
    pagebase = url if url.endswith('/') else url + '/'  # keeps page URLs valid without a trailing slash
    return [url] + [pagebase + "page" + str(num) + ".html" for num in range(2, lastpageNum + 1)]
    
    
    
    

In [146]:
centreUrls = getRegionUrls('https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/')
centreUrls

Going to check Region Url:https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/


['https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page2.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page3.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page4.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page5.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page6.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page7.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page8.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page9.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page10.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page11.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page12.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page13.html',
 'htt

In [149]:
t = getRegionUrls('https://www.ss.com/lv/real-estate/flats/riga/vef/')
t

Going to check Region Url:https://www.ss.com/lv/real-estate/flats/riga/vef/


['https://www.ss.com/lv/real-estate/flats/riga/vef/']

In [155]:
a = [1,2,3]
# a.append([])
a+= []
a

[1, 2, 3]

In [157]:
def getAllUrls(urlist):
    """Gather the page URLs of every region in ``urlist`` into one flat list.

    Each entry is handed to getRegionUrls, and a half-second pause follows
    every call.
    """
    collected = []
    for regionurl in urlist:
        pages = getRegionUrls(regionurl)
        collected.extend(pages)
        time.sleep(0.5)
    return collected

In [158]:
justurls = [el[1] for el in handoverlist]
justurls[:3]

['https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/',
 'https://www.ss.com/lv/real-estate/flats/riga/agenskalns/hand_over/',
 'https://www.ss.com/lv/real-estate/flats/riga/aplokciems/hand_over/']

In [159]:
mybiglist = getAllUrls(justurls)
len(mybiglist)

Going to check Region Url:https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/
Going to check Region Url:https://www.ss.com/lv/real-estate/flats/riga/agenskalns/hand_over/
Going to check Region Url:https://www.ss.com/lv/real-estate/flats/riga/aplokciems/hand_over/
Going to check Region Url:https://www.ss.com/lv/real-estate/flats/riga/beberbeki/hand_over/
Going to check Region Url:https://www.ss.com/lv/real-estate/flats/riga/bergi/hand_over/
Going to check Region Url:https://www.ss.com/lv/real-estate/flats/riga/bierini/hand_over/
Going to check Region Url:https://www.ss.com/lv/real-estate/flats/riga/bolderaya/hand_over/
Going to check Region Url:https://www.ss.com/lv/real-estate/flats/riga/breksi/hand_over/
Going to check Region Url:https://www.ss.com/lv/real-estate/flats/riga/bukulti/hand_over/
Going to check Region Url:https://www.ss.com/lv/real-estate/flats/riga/bulli/hand_over/
Going to check Region Url:https://www.ss.com/lv/real-estate/flats/riga/chiekurkalns/hand_over/
G

70

In [160]:
allrigadf = getDFfromUrlList(mybiglist)
allrigadf.shape

Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page2.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page3.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page4.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page5.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page6.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page7.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page8.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/centre/hand_over/page9.html
Going to gather data from URL:https://www.ss.com/lv/real-estate/flats/riga/centre/h

(1328, 10)

In [161]:
allrigadf.head()


Unnamed: 0,Sludinājumi,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena,URL,Region
0,Сдаётся светлая уютная 2-комнатная квартира в ...,Stabu 15,2,56,4/6,P. kara,7.50 €,420 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre
1,Saimniece ilgtermiņā izīrē 92 m2 lielu dzīvokl...,Dzirnavu 3,2,92,3/5,Specpr.,4.89 €,450 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre
2,Vienistabas dzīvoklis Avotu ielā koka mājā 23 ...,Avotu 65,1,23,1/2,P. kara,8.70 €,200 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre
3,Bez komisijas maksas. Izīrē studijas tipa dzīv...,Valdemāra 71,5,110,6/6,P. kara,7.18 €,790 €/mēn.,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre
4,"Īstermiņā no 2st. tiek izīrēts saulains, mājīg...",Matīsa 100,1,30,4/5,Renov.,0.833 €,25 €/dienā,https://www.ss.com/msg/lv/real-estate/flats/ri...,centre


In [162]:
allrigadf.to_excel("Riga_26sep.xlsx")

In [163]:
allrigadf.groupby(['Region']).count()

Unnamed: 0_level_0,Sludinājumi,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena,URL
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
agenskalns,67,67,67,67,67,67,67,67,67
aplokciems,2,2,2,2,2,2,2,2,2
bierini,5,5,5,5,5,5,5,5,5
bolderaya,6,6,6,6,6,6,6,6,6
centre,558,558,558,558,558,558,558,558,558
chiekurkalns,17,17,17,17,17,17,17,17,17
darzciems,6,6,6,6,6,6,6,6,6
daugavgriva,4,4,4,4,4,4,4,4,4
dzeguzhkalns,11,11,11,11,11,11,11,11,11
grizinkalns,12,12,12,12,12,12,12,12,12


In [165]:
allrigadf.dtypes

Sludinājumi    object
Iela           object
Ist.           object
m2             object
Stāvs          object
Sērija         object
Cena, m2       object
Cena           object
URL            object
Region         object
dtype: object