#### Finca Raiz Web Data Scraping

This example was created to show how to do web scraping using the API. For more details and tutorials, go to: www.squaid.com

In [None]:
# Author www.squaid.com
# Import libraries we are going to use
import pandas as pd
from datetime import date
import itertools
import requests
import json

In [None]:
# Just a couple of functions
def clean_element(element: dict) -> dict:
    """Flatten one raw search hit into a dict holding only the relevant fields.

    Args:
        element: A search hit; the listing is read from
            ``element["_source"]["listing"]``. The input is not modified.

    Returns:
        A new dict with ``area``, ``rooms``, ``garages``, ``baths``,
        ``stratum``, ``is_new`` and ``price`` (the four "name"-wrapped fields
        reduced to their ``"name"`` value), plus ``neighbourhoods`` and
        ``cities`` (sorted, de-duplicated, lower-cased names) and
        ``location_gps`` (the listing's ``location_point``).

    Raises:
        KeyError: If a relevant key, a ``"name"`` sub-key, ``cities`` or
            ``location_point`` is missing from the listing.
    """
    relevant_keys = ["area", "rooms", "garages", "baths", "stratum", "is_new", "price", "locations"]
    listing = element["_source"]["listing"]
    element_cleaned = {key: listing[key] for key in relevant_keys}

    # These fields arrive wrapped in a dict; only the "name" value is kept.
    for key in ("rooms", "baths", "stratum", "garages"):
        element_cleaned[key] = element_cleaned[key]["name"]

    # "locations" is only an intermediate; it is replaced by the flat fields below.
    locations = element_cleaned.pop("locations")

    # Neighbourhoods are optional: a missing or malformed entry yields an empty
    # list. Only lookup/shape errors are tolerated, so interrupts and
    # unrelated bugs are no longer silently swallowed.
    try:
        element_cleaned["neighbourhoods"] = sorted(
            {x["name"].lower() for x in locations["neighbourhoods"]}
        )
    except (KeyError, TypeError, AttributeError):
        element_cleaned["neighbourhoods"] = []

    # Sorted so the output is deterministic instead of following set order.
    element_cleaned["cities"] = sorted({x["name"].lower() for x in locations["cities"]})

    element_cleaned["location_gps"] = locations["location_point"]

    return element_cleaned

# Offsets at or beyond this value are never requested, and a query reporting
# at least this many results is split by stratum instead.
# NOTE(review): presumably the search backend's maximum result window - confirm.
_FR_MAX_RESULT_WINDOW = 10000

# Stratum ids used to split a query that is too large to page through directly.
_FR_STRATUM_IDS = ("100", "1", "2", "3", "4", "5", "6")


def _fr_post_search(url: str, request_json: dict, timeout: float) -> dict:
    """POST one search request and return the decoded JSON body."""
    response = requests.post(url, json=request_json, timeout=timeout)
    # Fail loudly on HTTP errors instead of a confusing KeyError further down.
    response.raise_for_status()
    return response.json()


def _fr_fetch_following_pages(url: str, request_json: dict, limit: int,
                              timeout: float, first_page: list) -> list:
    """Return ``first_page`` plus every later page, stopping at the first empty one."""
    hits = list(first_page)
    page = first_page
    while page:
        request_json['fields']['offset'] += limit
        if request_json['fields']['offset'] >= _FR_MAX_RESULT_WINDOW:
            break
        page = _fr_post_search(url, request_json, timeout)['hits']['hits']
        hits.extend(page)
    return hits


def get_frdata(business: str, property_type: str, city: list, limit: int,
               timeout: float = 30.0) -> list:
    """Fetch listings from the Finca Raiz search API, following pagination.

    If the first response already holds every reported result it is returned
    as is. If the reported total is below 10000, the remaining pages are
    fetched until an empty page comes back. Otherwise the query is repeated
    once per stratum id, each paged up to (but not including) offset 10000.

    Args:
        business: Offer slug to filter on (the caller passes "sell" or "rent").
        property_type: Property-type slug to filter on.
        city: List of city slugs to filter on.
        limit: Page size sent with every request.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A flat list of the raw hit dicts taken from ``hits.hits``.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
        KeyError: If the response body lacks the expected ``hits`` structure.
    """
    url = 'https://api.fincaraiz.com.co/document/api/1.0/listing/search'
    request_json = {
        "filter": {
            "offer": {
                "slug": [
                    business
                ]
            },
            "property_type": {
                "slug": [
                    property_type
                ]
            },
            "locations": {
                "cities": {
                    "slug": city
                }
            }
        },
        "fields": {
            "exclude": [],
            "facets": [],
            "include": [
                "area",
                "baths.id",
                "baths.name",
                "baths.slug",
                "client.client_type",
                "client.company_name",
                "client.first_name",
                "client.fr_client_id",
                "client.last_name",
                "client.logo.full_size",
                "garages.name",
                "is_new",
                "locations.cities.fr_place_id",
                "locations.cities.name",
                "locations.cities.slug",
                "locations.countries.fr_place_id",
                "locations.countries.name",
                "locations.countries.slug",
                "locations.groups.name",
                "locations.groups.slug",
                "locations.groups.subgroups.name",
                "locations.groups.subgroups.slug",
                "locations.neighbourhoods.fr_place_id",
                "locations.neighbourhoods.name",
                "locations.neighbourhoods.slug",
                "locations.states.fr_place_id",
                "locations.states.name",
                "locations.states.slug",
                "locations.location_point",
                "max_area",
                "max_price",
                "media.photos.list.image.full_size",
                "media.photos.list.is_main",
                "media.videos.list.is_main",
                "media.videos.list.video",
                "media.logo.full_size",
                "min_area",
                "min_price",
                "offer.name",
                "price",
                "products.configuration.tag_id",
                "products.configuration.tag_name",
                "products.label",
                "products.name",
                "products.slug",
                "property_id",
                "property_type.name",
                "fr_property_id",
                "fr_parent_property_id",
                "rooms.id",
                "rooms.name",
                "rooms.slug",
                "stratum.name",
                "title"
            ],
            "limit": limit,
            "offset": 0,
            "ordering": [
                {
                    "field": "price",
                    "type": "asc"
                }
            ],
            "platform": 41,
            "with_algorithm": False
        }
    }

    response_body = _fr_post_search(url, request_json, timeout)
    total_assets = response_body['hits']['total']['value']
    first_page = response_body['hits']['hits']

    # Everything fit in the first page: nothing more to fetch.
    if total_assets <= len(first_page):
        return list(first_page)

    # Small enough to walk through with plain offset pagination.
    if total_assets < _FR_MAX_RESULT_WINDOW:
        return _fr_fetch_following_pages(url, request_json, limit, timeout, first_page)

    # Too many results for one query: split it by stratum. The unfiltered
    # first page is deliberately dropped, because the per-stratum queries
    # fetch those listings again and keeping it would duplicate them.
    # NOTE(review): assumes every listing matches one of _FR_STRATUM_IDS - confirm.
    data = []
    for stratum_id in _FR_STRATUM_IDS:
        request_json['filter']['stratum'] = {"id": [stratum_id]}
        request_json['fields']['offset'] = 0
        stratum_page = _fr_post_search(url, request_json, timeout)['hits']['hits']
        data.extend(
            _fr_fetch_following_pages(url, request_json, limit, timeout, stratum_page)
        )
    return data

In [None]:
# Fetch Colombian real-estate listings for every city / business / property type.
business_types = ["sell","rent"]
property_types = ["studio","apartment","house","country-house","house-lot","farm","lot"]
# Each city maps to the list of city slugs passed to get_frdata for it.
cities = {
    'cali': ["city-colombia-76-001","colombia-valle-del-cauca-8200006-cali"],
    'jamundi': ["city-colombia-76-364","colombia-valle-del-cauca-8200003-jamundí"],
    'bogota': ["city-colombia-11-001","colombia-cundinamarca-3630001-bogotá"],
    'medellin': ["city-colombia-05-001","colombia-antioquia-5500006-medellín"],
    'cota': ["city-colombia-25-214","colombia-cundinamarca-6700015-cota"],
    'chia': ["city-colombia-25-175","colombia-cundinamarca-6700003-chía"],
    'girardot': ["city-colombia-25-307","colombia-cundinamarca-6700001-girardot"],
    'santa_marta': ["city-colombia-47-001","colombia-magdalena-7200003-santa-marta"],
    'cartagena': ["city-colombia-13-001","colombia-bolívar-5800003-cartagena"]
}
limit = 100

# Keep `assets` if this cell already ran in the current session; create it
# otherwise. Only the "name is not defined yet" case is handled.
try:
    print(isinstance(assets, dict))
except NameError:
    assets = {}

for city, city_slugs in cities.items():
    city_assets = assets.setdefault(city, {})
    for business in business_types:
        business_assets = city_assets.setdefault(business, {})
        # `property_type` rather than `type`, to avoid shadowing the builtin.
        for property_type in property_types:
            print(f"City: {city} Type:{property_type} ...")
            data = get_frdata(business, property_type, city_slugs, limit)
            print(f"... Data recolected:{len(data)}")
            business_assets[property_type] = data

In [None]:
# Write the raw scraped data to a JSON file named after today's date (DDMMYYYY).
today = date.today()
d1 = f"{today:%d%m%Y}"
raw_json_path = f"./data/fr_{d1}.json"
with open(raw_json_path, "w") as raw_file:
    json.dump(assets, raw_file)

In [None]:
# Build a cleaned copy of `assets` with the same city -> business -> type
# nesting, where every raw element has been passed through clean_element.
assets_cleaned = {}
for city_key, by_business in assets.items():
    cleaned_city = assets_cleaned.setdefault(city_key, {})
    for business_key, by_type in by_business.items():
        cleaned_business = cleaned_city.setdefault(business_key, {})
        for type_key, raw_elements in by_type.items():
            cleaned_business[type_key] = [
                clean_element(element) for element in raw_elements
            ]

In [None]:
# Write the cleaned data to a JSON file named after today's date (DDMMYYYY).
today = date.today()
d1 = f"{today:%d%m%Y}"
cleaned_json_path = f"./data/fr_cleaned_{d1}.json"
with open(cleaned_json_path, "w") as cleaned_file:
    json.dump(assets_cleaned, cleaned_file)

In [None]:
# Turn each (city, business, type) group into a DataFrame tagged with its
# keys, then stack all the groups into one frame.
data_full_df = []
for city_key, by_business in assets_cleaned.items():
    for business_key, by_type in by_business.items():
        for type_key, cleaned_rows in by_type.items():
            data_df = pd.DataFrame(cleaned_rows)
            for column, value in (("city", city_key), ("business", business_key), ("type", type_key)):
                data_df[column] = value
            data_full_df.append(data_df)

datafull = pd.concat(data_full_df)

In [None]:

# Save the cleaned data as CSV, in a file named after today's date (DDMMYYYY).
today = date.today()
d1 = today.strftime("%d%m%Y")
# Pass only the path: a second positional argument to to_csv is the field
# separator, so the former "w" produced a file delimited by the letter "w"
# rather than selecting write mode (which is already the default).
datafull.to_csv(f"./data/fr_cleaned_{d1}.csv")

In [None]:
# Bare expression: in a notebook cell this displays the combined DataFrame.
datafull