In [1]:
import pandas as pd
import re
import datetime
import requests
import bs4

In [115]:
# Let pandas show up to 500 rows of a displayed frame before truncating it.
pd.options.display.max_rows = 500

### Web scraping Craigslist MVP

In [81]:
# Search-results page for housing listings on the Berlin Craigslist site.
url = "https://berlin.craigslist.org/d/housing/search/hhh"

In [82]:
# Fetch the search page. The timeout keeps this cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces an HTTP error right here
# rather than as a confusing parsing failure in a later cell.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [83]:
# Parse the downloaded response body using the "html.parser" backend.
soup = bs4.BeautifulSoup(r.content, "html.parser")

In [84]:
# Take the fifth listing (index 4) from the <ul class="rows"> container as a
# sample to explore. Raises AttributeError if that <ul> is not on the page and
# IndexError if the page holds fewer than five "result-row" items.
row = soup.find("ul", class_="rows").find_all("li", class_="result-row")[4]

In [85]:
# Show the sample listing's markup so the fields to parse can be located.
row

<li class="result-row" data-pid="7051853627">
<a class="result-image gallery empty" href="https://berlin.craigslist.org/sbw/d/2-week-sublet-jan-20th-feb-3rd-in-brand/7051853627.html"></a>
<p class="result-info">
<span class="icon icon-star" role="button">
<span class="screen-reader-text">post zu favoriten</span>
</span>
<time class="result-date" datetime="2020-01-11 14:36" title="Sa 11 Jan 14:36:36">Jan 11</time>
<a class="result-title hdrlnk" data-id="7051853627" href="https://berlin.craigslist.org/sbw/d/2-week-sublet-jan-20th-feb-3rd-in-brand/7051853627.html">2 week sublet (Jan 20th - Feb 3rd) in brand new flat in central Berlin</a>
<span class="result-meta">
<span class="result-price">€0</span>
<span class="housing">
                    100m<sup>2</sup> -
                </span>
<span class="result-hood"> (Schöneberg)</span>
<span class="result-tags">
</span>
<span class="banish icon icon-trash" role="button">
<span class="screen-reader-text">dieses posting verbergen</span>
</span>


### Things to parse:

* date x
* description
* price
* hood
* image x
* housing
* id
* url 

In [60]:
# Posting timestamp from the <time> tag's "datetime" attribute
# (a string such as "2020-01-11 14:36", not yet a datetime object).
date = row.time["datetime"]

In [42]:
# Listing title: the first child of the "result-title hdrlnk" link.
description = row.p.find("a", class_="result-title hdrlnk").contents[0]

In [43]:
# Price text exactly as shown on the page, currency symbol included (e.g. "€0").
price = row.p.find("span", class_="result-price").contents[0]

In [44]:
# Neighbourhood text; still raw here, i.e. with a leading space and wrapped in
# parentheses (e.g. " (Schöneberg)").
hood = row.p.find("span", class_="result-hood").contents[0]

In [53]:
# Only the first child of the "housing" span: the text in front of the <sup>
# tag (whitespace plus e.g. "100m"). The superscript "2" and the trailing dash
# are separate children and are not captured.
housing = row.p.find("span", class_="housing").contents[0]

In [74]:
# Posting id from the "data-id" attribute of the first link inside the
# result-info paragraph. Named post_id so the builtin id() is not shadowed
# for the rest of the kernel session.
post_id = row.p.a["data-id"]

In [73]:
# Link to the individual posting (href of the row's first <a>). Stored under
# its own name so the search-page `url` is not overwritten: the scraper call
# further down passes `url`, and a top-to-bottom run would otherwise hand it
# a single posting's address instead of the search page.
row_url = row.a["href"]

In [98]:
def extract_value(value):
    """Return the first child of a parsed tag, or "" when there is none.

    Parameters
    ----------
    value : object or None
        Usually the result of a ``find(...)`` call: an object exposing a
        ``contents`` sequence, or ``None`` when nothing matched.

    Returns
    -------
    object
        ``value.contents[0]`` when available, otherwise the empty string.
    """
    try:
        return value.contents[0]
    except (AttributeError, IndexError, TypeError):
        # AttributeError: value is None (tag not found) or has no .contents
        # IndexError:     the tag exists but is empty
        # TypeError:      .contents is not subscriptable
        # Anything else (KeyboardInterrupt included) is allowed to propagate
        # instead of being silently turned into an empty field.
        return ""
        

In [118]:
def extract_row(row):
    """Convert one ``result-row`` list item into a flat dictionary.

    Parameters
    ----------
    row : parsed ``<li class="result-row">`` element
        Must contain a ``<time>`` tag with a ``datetime`` attribute, a leading
        ``<a>`` with an ``href``, and a ``<p>`` whose first ``<a>`` carries
        ``data-id``. Missing ones raise ``TypeError``/``KeyError``/
        ``AttributeError`` from the attribute lookups below.

    Returns
    -------
    dict
        Keys ``id``, ``url``, ``date``, ``description``, ``price``, ``hood``
        and ``housing``. The four optional text fields fall back to ``""``
        when their tag is missing or empty.
    """
    info = row.p

    date = row.time["datetime"]
    description = extract_value(info.find("a", class_="result-title hdrlnk"))
    price = extract_value(info.find("span", class_="result-price"))

    # Raw text looks like " (Schöneberg)": remove the parentheses, then strip
    # the surrounding whitespace so no leading space ends up in the data.
    hood = (
        extract_value(info.find("span", class_="result-hood"))
        .replace("(", "")
        .replace(")", "")
        .strip()
    )

    # Only the text ahead of the <sup> child is captured (e.g. "100m"); the
    # superscript itself is a separate child and is not included.
    housing = (
        extract_value(info.find("span", class_="housing"))
        .replace("\n", "")
        .strip()
    )

    # Local is named post_id so the builtin id() is not shadowed in here.
    post_id = info.a["data-id"]
    url = row.a["href"]

    return {
        "id": post_id,
        "url": url,
        "date": date,
        "description": description,
        "price": price,
        "hood": hood,
        "housing": housing,
    }

In [119]:
def scraper(url, timeout=30):
    """Download a Craigslist search page and return its listings as a table.

    Parameters
    ----------
    url : str
        Address of a search-results page containing a ``<ul class="rows">``
        list of ``<li class="result-row">`` items.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot block the
        notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        One row per listing, built from the dictionaries that
        ``extract_row`` produces.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    AttributeError
        If the page does not contain the ``<ul class="rows">`` container.
    """
    response = requests.get(url, timeout=timeout)
    # Fail at the source on an error page rather than while parsing it.
    response.raise_for_status()

    soup = bs4.BeautifulSoup(response.content, "html.parser")
    rows = soup.find("ul", class_="rows").find_all("li", class_="result-row")

    return pd.DataFrame([extract_row(row) for row in rows])



In [120]:
# Scrape every listing on the page; expects `url` to be the search-results
# page address, not the link of an individual posting.
craigslist = scraper(url)

In [121]:
# Preview only the first rows: with display.max_rows raised to 500, showing
# the bare frame would print every scraped listing into the notebook.
craigslist.head(10)

Unnamed: 0,id,url,date,description,price,hood,housing
0,7040584671,https://berlin.craigslist.org/roo/d/beautiful-...,2020-01-11 18:16,BEAUtifUL rOOM fOR rENT iN fOUR-bEDROOM aPARTM...,€850,"Kaiserstraße 4c, 12209 Berlin, Germany",
1,7038380041,https://berlin.craigslist.org/apa/d/stylish-pr...,2020-01-11 18:06,sTYLISH pRIVATE sTUDIO fOR rENT iN cOLIVING sP...,€590,Kossätenweg,
2,7054012218,https://berlin.craigslist.org/roo/d/room-to-re...,2020-01-11 16:50,ROOM TO RENT IN THREE-BEDROOM APARTMENT IN WIL...,€1045,Fechnerstraße,
3,7054010475,https://berlin.craigslist.org/apa/d/2-queer-os...,2020-01-11 16:02,2 Queer-o's auf die Suche nach Zuhause,€750,Berlin,
4,7051853627,https://berlin.craigslist.org/sbw/d/2-week-sub...,2020-01-11 14:36,2 week sublet (Jan 20th - Feb 3rd) in brand ne...,€0,Schöneberg,100m
5,7053956828,https://berlin.craigslist.org/sub/d/furnished-...,2020-01-11 13:41,"furnished room in friendly flatshare for 3,",€550,Neukölln,1br - 18m
6,7053946498,https://berlin.craigslist.org/apa/d/mblierte-2...,2020-01-11 13:18,Möblierte 2-Zimmer- Wohnung in Berlin,€580,Chausseestr. 37 C 1,2br -
7,7040565122,https://berlin.craigslist.org/apa/d/bright-fur...,2020-01-11 12:05,"BRIGHT , FURNISHED . LARGE 2 BEDROOM",€650,CITY,2br -
8,7040565269,https://berlin.craigslist.org/roo/d/one-bedroo...,2020-01-11 12:05,ONE BEDROOM FOR RENT !!,€400,CITY,
9,7053753066,https://berlin.craigslist.org/apa/d/bright-hig...,2020-01-11 02:05,Bright high-quality 2-rooms apartment,€680,Berlin,2br - 52m


In [122]:
# Persist the scraped table as JSON in the current working directory.
# NOTE(review): the file name is spelled "craiglist" (without the "s") —
# confirm whether downstream code expects this exact name before renaming.
craigslist.to_json("craiglist.json")