#### Finca Raiz data scrapping

In [1]:
# Explanation of this code and more fun tutorials at www.squaid.com

# Import libraries we are going to use
import pandas as pd
import itertools
import requests
import json

In [2]:
# Just a couple of functions
def clean_element(element: dict) -> dict:
    """ This function is created to clean data and take just relevant keys"""
    relevant_keys = ["area","rooms","garages", "baths","stratum","is_new","price","locations"]

    element_cleaned = {key: element["_source"]["listing"][key] for key in relevant_keys}

    keys_name = ["rooms","baths","stratum","garages"]

    for key in keys_name:
        element_cleaned[key] = element_cleaned[key]["name"]

    return element_cleaned

def get_frdata(business: str, property_type: str, city: list, limit: int) -> list:
    """This functions is created to ger data from Finca Raiz a bit improved in order to improve amount of data scrapped"""
    url = 'https://api.fincaraiz.com.co/document/api/1.0/listing/search'
    request_json = {
        "filter": {
            "offer": {
                "slug": [
                    business
                ]
            },
            "property_type": {
                "slug": [
                    property_type
                ]
            },
            "locations": {
                "cities": {
                    "slug": city
                }
            }
        },
        "fields": {
            "exclude": [],
            "facets": [],
            "include": [
                "area",
                "baths.id",
                "baths.name",
                "baths.slug",
                "client.client_type",
                "client.company_name",
                "client.first_name",
                "client.fr_client_id",
                "client.last_name",
                "client.logo.full_size",
                "garages.name",
                "is_new",
                "locations.cities.fr_place_id",
                "locations.cities.name",
                "locations.cities.slug",
                "locations.countries.fr_place_id",
                "locations.countries.name",
                "locations.countries.slug",
                "locations.groups.name",
                "locations.groups.slug",
                "locations.groups.subgroups.name",
                "locations.groups.subgroups.slug",
                "locations.neighbourhoods.fr_place_id",
                "locations.neighbourhoods.name",
                "locations.neighbourhoods.slug",
                "locations.states.fr_place_id",
                "locations.states.name",
                "locations.states.slug",
                "locations.location_point",
                "max_area",
                "max_price",
                "media.photos.list.image.full_size",
                "media.photos.list.is_main",
                "media.videos.list.is_main",
                "media.videos.list.video",
                "media.logo.full_size",
                "min_area",
                "min_price",
                "offer.name",
                "price",
                "products.configuration.tag_id",
                "products.configuration.tag_name",
                "products.label",
                "products.name",
                "products.slug",
                "property_id",
                "property_type.name",
                "fr_property_id",
                "fr_parent_property_id",
                "rooms.id",
                "rooms.name",
                "rooms.slug",
                "stratum.name",
                "title"
            ],
            "limit": limit,
            "offset": 0,
            "ordering": [
                {
                    "field": "price",
                    "type": "asc"
                }
            ],
            "platform": 41,
            "with_algorithm": False
        }
    }

    data = []
    response = requests.post(url,json=request_json)
    response_body =  json.loads(response.text)

    total_assets = response_body['hits']['total']['value']
    data_response = response_body['hits']['hits']

    data.append(data_response)

    if total_assets > len(data_response):

        if total_assets < 10000:
            while len(data_response) > 0:
                request_json['fields']['offset']+=limit
                response = requests.post(url,json=request_json)
                response_body =  json.loads(response.text)
                data_response = response_body['hits']['hits']
                data.append(data_response)
        else:
            
            for stratum_id in ["100","1","2","3","4","5","6"]:

                request_json['filter']['stratum'] = {"id": [stratum_id]}
                request_json['fields']['offset']=0
                response = requests.post(url,json=request_json)
                response_body =  json.loads(response.text)
                total_assets = response_body['hits']['total']['value']
                data_response = response_body['hits']['hits']
                data.append(data_response)

                while len(data_response) > 0:
                    request_json['fields']['offset']+=limit
                    if request_json['fields']['offset'] >=10000: break
                    response = requests.post(url,json=request_json)
                    response_body =  json.loads(response.text)
                    data_response = response_body['hits']['hits']

                    data.append(data_response)

    data = list(itertools.chain(*data))
    return data

In [3]:

# Let´s get data from apartments in Bogota
business = "sell"
property_type = "apartment"
city = ["city-colombia-11-001","colombia-cundinamarca-3630001-bogotá"]
limit = 100

data = get_frdata(business, property_type, city, limit) 

In [4]:

# Data cleansing
generator_clean = (clean_element(element) for element in data)
data_cleaned = pd.DataFrame(list(generator_clean))
data_cleaned

Unnamed: 0,area,rooms,garages,baths,stratum,is_new,price,locations
0,230.0,3,3,4,Estrato 6,False,1350000.0,"{'neighbourhoods': [{'fr_place_id': 3632021, '..."
1,256.0,3,2,5,Estrato 6,False,1950000.0,"{'neighbourhoods': [{'fr_place_id': 3632074, '..."
2,410.0,4,4,6,Estrato 5,False,2650000.0,"{'neighbourhoods': [{'fr_place_id': 0, 'name':..."
3,140.0,3,1,3,Estrato 5,False,3500000.0,"{'neighbourhoods': [{'fr_place_id': 0, 'name':..."
4,60.0,1,1,2,Estrato 6,False,8000000.0,"{'neighbourhoods': [{'fr_place_id': 3632063, '..."
...,...,...,...,...,...,...,...,...
40117,290.0,3,3,4,Estrato 6,False,1700000000.0,"{'neighbourhoods': [{'fr_place_id': 3632013, '..."
40118,258.0,5,3,6,Estrato 6,False,1700000000.0,"{'neighbourhoods': [{'fr_place_id': 3632113, '..."
40119,187.0,3,2,4,Estrato 6,False,1700000000.0,"{'neighbourhoods': [{'fr_place_id': 3632070, '..."
40120,220.0,3,3,5,Estrato 6,False,1700000000.0,"{'neighbourhoods': [{'fr_place_id': 3632077, '..."
