In [1]:
from time import sleep
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def _fetch_soup(target, timeout, delay=0.0):
    '''Download ``target`` and parse it; return None (after logging) on any HTTP/network failure.'''
    if delay > 0:
        sleep(delay)
    try:
        response = requests.get(target, timeout=timeout)
        # Error pages would otherwise be parsed into rows made only of NaN.
        response.raise_for_status()
    except requests.RequestException as err:
        print(f'Skipping {target}: {err}')
        return None
    return BeautifulSoup(response.text, 'lxml')


def _node_text(soup, tag, css_class):
    '''Return the text of the first ``tag`` whose class is ``css_class``, or None if absent.'''
    node = soup.find(tag, css_class)
    return None if node is None else node.text


def _pick(tokens, index):
    '''Return ``tokens[index]``, or NaN when that position does not exist.'''
    try:
        return tokens[index]
    except IndexError:
        return np.nan


def _parse_listing(soup):
    '''Extract the six house attributes from one parsed listing page.'''
    price_text = _node_text(soup, 'span', 'Overview-main FirstPrice')
    if price_text is None:
        price = np.nan
    else:
        # Strip the currency prefix and the thousands separators.
        price = price_text.replace('Rp ', '').replace('.', '')

    specs_text = _node_text(soup, 'div', 'Overview-attribute-container')
    # An empty token list makes every positional lookup below fall back to NaN.
    specs = [] if specs_text is None else specs_text.split()

    address_text = _node_text(soup, 'span', 'Header-title-address-text')
    address = [] if address_text is None else address_text.replace(',', '').split()
    if len(address) > 2:
        location = address[-2]
    elif address:
        location = address[0]
    else:
        location = np.nan

    description = _node_text(
        soup, 'div', 'columns medium-12 small-12 ViewMore-text-description'
    )
    if description is None:
        facility = np.nan
    else:
        # Spaces are removed first, so only other whitespace separates the counted items.
        facility = len(description.replace(' ', '').split())

    return {
        'price': price,
        'bedroom': _pick(specs, 0),
        'hsize': _pick(specs, 3),
        'lsize': _pick(specs, -3),
        'location': location,
        'facility': facility,
    }


def scraper(url, pagenum, delay=0.0, timeout=30):
    '''
    Web scraper for Lamudi.co.id house listings.
    -------------------------------
    Parameter:

    url(str): listing-search URL prefix; the page number is appended to it
    pagenum(int): number of pages to scrape + 1 (pages 1 .. pagenum-1 are visited)
    delay(float): seconds to wait before every HTTP request (default 0: no wait)
    timeout(float): per-request timeout in seconds passed to requests.get
    -------------------------------
    Return:

    house_list(list[dict]): one dict per listing page that could be downloaded,
    with keys 'price', 'bedroom', 'hsize', 'lsize', 'location' and 'facility'.
    A field that cannot be found on the page is np.nan; found values are kept
    as scraped (strings, except 'facility' which is an int count).
    -------------------------------
    Notes:

    A request that fails (network error, timeout, HTTP error status) is
    reported with print() and skipped, so one bad page does not discard the
    rows already collected.
    '''
    house_list = []
    total_pages = pagenum - 1
    for page in range(1, pagenum):
        print(f'Scraping page: {page}/{total_pages}')
        page_url = url + str(page)
        soup = _fetch_soup(page_url, timeout, delay)
        if soup is None:
            continue

        hrefs = (anchor.get('href') for anchor in soup.find_all('a', 'js-listing-link'))
        # dict.fromkeys de-duplicates while keeping page order (a set would not);
        # urljoin leaves absolute links untouched and resolves relative ones.
        links = dict.fromkeys(urljoin(page_url, href) for href in hrefs if href)

        for link in links:
            soup_link = _fetch_soup(link, timeout, delay)
            if soup_link is None:
                continue
            house_list.append(_parse_listing(soup_link))
    return house_list

In [3]:
# Listing-search URL prefix (path: east-java/malang/house/buy); it ends with
# '?page=' because scraper() appends the page number directly to this string.
url = 'https://www.lamudi.co.id/east-java/malang/house/buy/?page='

In [4]:
# scraper() iterates range(1, pagenum), so 101 means result pages 1..100.
# Every listing linked from those pages is fetched too, so this cell performs
# many HTTP requests.
house_list = scraper(url, 101)

Scraping page: 1/100
Scraping page: 2/100
Scraping page: 3/100
Scraping page: 4/100
Scraping page: 5/100
Scraping page: 6/100
Scraping page: 7/100
Scraping page: 8/100
Scraping page: 9/100
Scraping page: 10/100
Scraping page: 11/100
Scraping page: 12/100
Scraping page: 13/100
Scraping page: 14/100
Scraping page: 15/100
Scraping page: 16/100
Scraping page: 17/100
Scraping page: 18/100
Scraping page: 19/100
Scraping page: 20/100
Scraping page: 21/100
Scraping page: 22/100
Scraping page: 23/100
Scraping page: 24/100
Scraping page: 25/100
Scraping page: 26/100
Scraping page: 27/100
Scraping page: 28/100
Scraping page: 29/100
Scraping page: 30/100
Scraping page: 31/100
Scraping page: 32/100
Scraping page: 33/100
Scraping page: 34/100
Scraping page: 35/100
Scraping page: 36/100
Scraping page: 37/100
Scraping page: 38/100
Scraping page: 39/100
Scraping page: 40/100
Scraping page: 41/100
Scraping page: 42/100
Scraping page: 43/100
Scraping page: 44/100
Scraping page: 45/100
Scraping page: 46/100

In [5]:
# Build a table with one row per scraped listing; the dict keys produced by
# scraper() (price, bedroom, hsize, lsize, location, facility) become columns.
df = pd.DataFrame(house_list)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,price,bedroom,hsize,lsize,location,facility
0,320000000,2,36,72,Kedungkandang,2.0
1,600000000,3,70,64,Dau,4.0
2,459000000,3,65,60,Pakis,6.0
3,227000000,2,62,43,Pakis,6.0
4,250000000,2,50,84,Singosari,


In [7]:
# Save the scraped table to 'malang.csv' (relative path, so it lands in the
# current working directory); index=False leaves the row index out of the file.
df.to_csv('malang.csv', index=False)