## Data Preparation

In [1]:
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
from pymongo import MongoClient
import json, re
from re import sub
from decimal import Decimal
import dask.dataframe as dd
from dask.distributed import Client, progress
import dask.dataframe as dd
from dask.delayed import delayed

### Dask Client

In [2]:
client = Client(n_workers=4, threads_per_worker=1)
client

0,1
Client  Scheduler: tcp://127.0.0.1:33455  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 8.26 GB


### Database Connection

In [3]:
DB_USER = 'anutibara'
DB_PASS = 'anutibara'
DB_HOST = 'scraping-cluster-7dtgt.gcp.mongodb.net'
DB_NAME = 'scraping_db'

try:
    client = MongoClient(f'mongodb+srv://{DB_USER}:{DB_PASS}@{DB_HOST}/{DB_NAME}')
    print("Database connected successfully")
except Exception as e:
    print("Error to connect to database: ", e)
db = client.get_database(DB_NAME)
properties = db.properties

Database connected successfully


### Database Queries

In [4]:
properties.count_documents({})

14209

### Convert Mongo Collection to DataFrame

#### New Properties

In [39]:
new_properties_json = list(properties.find({ 'use': 'Nuevo' }))

In [40]:
df_general_info = json_normalize(new_properties_json, record_path='offersType', 
                                meta=['_id', 
                                    'urlProperty',
                                    'scrapingDate', 
                                    'scrapingHour', 
                                    'modifyDate',
                                    'modifyHour', 
                                    'code', 
                                    'status', 
                                    'type', 
                                    'use', 
                                    'nameProject', 
                                    'description'])
ddf_general_info = dd.from_pandas(df_general_info, npartitions=10)

In [41]:
df_location = json_normalize(new_properties_json, record_path='location', meta='urlProperty')
ddf_location = dd.from_pandas(df_location, npartitions=10)

In [42]:
df_owner_property = json_normalize(new_properties_json, record_path='ownerProperty', meta='urlProperty')
ddf_owner_property = dd.from_pandas(df_owner_property, npartitions=10)

In [43]:
df_features = json_normalize(new_properties_json, record_path='features', meta='urlProperty')
ddf_features = dd.from_pandas(df_features, npartitions=10)
new_columns = ['admon_price', 'antiquity', 'general_bathrooms', 'condition', 'construction_area',
               'floor', 'garages', 'includes_administration', 'interior_floors', 'range_prices', 
               'range_private_area', 'general_rooms', 'square_meters', 'square_meters_price', 'stratum', 
               'weather', 'urlProperty']
ddf_features = ddf_features.rename(columns=dict(zip(ddf_features.columns, new_columns)))

In [44]:
df_more_features = json_normalize(new_properties_json, record_path='moreFeatures', meta='urlProperty')
ddf_more_features = dd.from_pandas(df_more_features, npartitions=10)

In [45]:
ddf_list = [ddf_general_info, ddf_location, ddf_owner_property, ddf_features, ddf_more_features]
for ddf in ddf_list:
    ddf.set_index('urlProperty')

ddf = dd.concat(ddf_list, axis=1)
ddf.reset_index()

Unnamed: 0_level_0,index,area,bathrooms,offerType,price,privateArea,property,rooms,_id,urlProperty,scrapingDate,scrapingHour,modifyDate,modifyHour,code,status,type,use,nameProject,description,address,city,country,department,latitude,longitude,neighborhood,sector,urlProperty,contractType,financing,id,name,schedule,urlProperty,admon_price,antiquity,general_bathrooms,condition,construction_area,floor,garages,includes_administration,interior_floors,range_prices,range_private_area,general_rooms,square_meters,square_meters_price,stratum,weather,urlProperty,exteriorFeatures,interiorFeatures,sectorFeatures,urlProperty
npartitions=21,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1
,int64,object,object,object,object,object,object,object,object,object,object,object,object,object,int64,object,object,object,object,object,object,object,object,object,float64,float64,object,object,object,object,object,int64,object,object,object,float64,object,int64,object,object,object,object,object,int64,object,object,int64,object,float64,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [46]:
ddf_new_properties = ddf.loc[:,~ddf.columns.duplicated()]

#### Old Properties

In [47]:
old_properties_json = list(properties.find({ 'use': 'Usado' }))

In [48]:
df_general_info = pd.DataFrame(old_properties_json, 
                                    columns=['_id', 
                                    'urlProperty', 
                                    'scrapingDate', 
                                    'scrapingHour', 
                                    'modifyDate',
                                    'modifyHour', 
                                    'code', 
                                    'status', 
                                    'type', 
                                    'use', 
                                    'nameProject', 
                                    'description'])
ddf_general_info = dd.from_pandas(df_general_info, npartitions=10)

In [49]:
df_location = json_normalize(old_properties_json, 'location')
ddf_location = dd.from_pandas(df_location, npartitions=10)

In [50]:
df_owner_property = json_normalize(old_properties_json, 'ownerProperty')
ddf_owner_property = dd.from_pandas(df_owner_property, npartitions=10)

In [51]:
df_features = json_normalize(old_properties_json, 'features')
ddf_features = dd.from_pandas(df_features, npartitions=10)

In [52]:
df_more_features = json_normalize(old_properties_json, 'moreFeatures')
ddf_more_features = dd.from_pandas(df_more_features, npartitions=10)

In [53]:
ddf_list = [ddf_general_info, ddf_location, ddf_owner_property, ddf_features, ddf_more_features]
ddf = dd.concat(ddf_list, axis=1)
ddf.reset_index()

Unnamed: 0_level_0,index,_id,urlProperty,scrapingDate,scrapingHour,modifyDate,modifyHour,code,status,type,use,nameProject,description,address,city,country,department,latitude,longitude,neighborhood,sector,contractType,financing,id,name,schedule,admonPrice,antiquity,bathrooms,condition,constructionArea,floor,garages,includesAdministration,interiorFloors,price,privateArea,rooms,squareMeters,squareMetersPrice,stratum,weather,exteriorFeatures,interiorFeatures,sectorFeatures
npartitions=20,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
,int64,object,object,object,object,object,object,int64,object,object,object,object,object,object,object,object,object,float64,float64,object,object,object,object,int64,object,object,float64,object,int64,object,object,object,object,object,int64,object,object,int64,object,float64,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [54]:
ddf_old_properties = ddf.loc[:,~ddf.columns.duplicated()]

### Rename DataFrame Columns

#### New Properties

In [55]:
new_columns = ['area', 'bathrooms', 'offer_type', 'price', 'private_area',
               'property', 'rooms', 'id_mongoose', 'id_property', 'scraping_date', 
               'scraping_hour', 'modify_hour', 'modify_date', 'code', 'active', 'type',
               'new_property', 'name_project', 'description', 'address', 'city', 'country', 
               'department', 'latitude', 'longitude', 'neighborhood', 'sector', 'contract_type',
               'financing', 'id', 'name', 'schedule', 'admon_price', 'antiquity', 'general_bathrooms',
               'condition', 'construction_area', 'floor', 'garages', 'includes_administration', 
               'interior_floors', 'range_prices', 'range_private_area', 'general_rooms', 'square_meters',
               'square_meters_price', 'stratum', 'weather', 'exterior_features', 'interior_features', 
               'sector_features']
ddf_new_properties = ddf_new_properties.rename(columns=dict(zip(ddf_new_properties.columns, new_columns)))

#### Old Properties

In [56]:
new_columns = ['id_mongoose', 'id_property', 'scraping_date', 
               'scraping_hour', 'modify_hour', 'modify_date', 'code', 'active', 'type',
               'new_property', 'name_project', 'description', 'address', 'city', 'country', 
               'department', 'latitude', 'longitude', 'neighborhood', 'sector', 'contract_type',
               'financing', 'id', 'name', 'schedule', 'admon_price', 'antiquity', 'bathrooms',
               'condition', 'construction_area', 'floor', 'garages', 'includes_administration', 
               'interior_floors', 'price', 'private_area', 'rooms', 'square_meters',
               'square_meters_price', 'stratum', 'weather', 'exterior_features', 'interior_features', 
               'sector_features']
ddf_old_properties = ddf_old_properties.rename(columns=dict(zip(ddf_old_properties.columns, new_columns)))

### Data Cleaning

#### New Properties

In [57]:
ddf_new_properties['active'] = ddf_new_properties['active'].apply(lambda status: True if (status == 'Active') else False, meta=('active', 'bool'))
ddf_new_properties['new_property'] = ddf_new_properties['new_property'].apply(lambda status: True if (status == 'Nuevo') else False, meta=('new_property', 'bool'))
ddf_new_properties['includes_administration'] = ddf_new_properties['includes_administration'].apply(lambda status: True if (status == 'Nuevo') else False, meta=('includes_administration', 'bool'))
ddf_new_properties['garages'] = ddf_new_properties['garages'].replace('', value = 0, regex = True).astype(int)
ddf_new_properties['stratum'] = ddf_new_properties['stratum'].replace('', value = 0, regex = True).astype(int)
ddf_new_properties['floor'] = ddf_new_properties['floor'].astype(int)
ddf_new_properties['area'] = ddf_new_properties['area'].astype(float)
ddf_new_properties['private_area'] = ddf_new_properties['private_area'].replace('', value = 0, regex = True)
ddf_new_properties['private_area'] = ddf_new_properties['private_area'].astype(float)
ddf_new_properties['rooms'] = ddf_new_properties['rooms'].astype(int)
ddf_new_properties['bathrooms'] = ddf_new_properties['bathrooms'].astype(int)
ddf_new_properties['price'] = ddf_new_properties['price'].apply(lambda price: Decimal(sub(r'[^\d,]', '', price)), meta=('price', 'int'))
ddf_new_properties['price'] = ddf_new_properties['price'].astype(int)

#### Old Properties

In [58]:
ddf_old_properties['active'] = ddf_old_properties['active'].apply(lambda status: True if (status == 'Active') else False, meta=('active', 'bool'))
ddf_old_properties['new_property'] = ddf_old_properties['new_property'].apply(lambda status: True if (status == 'Nuevo') else False, meta=('new_property', 'bool'))
ddf_old_properties['includes_administration'] = ddf_old_properties['includes_administration'].apply(lambda status: True if (status == 'Nuevo') else False, meta=('includes_administration', 'bool'))
ddf_old_properties['garages'] = ddf_old_properties['garages'].replace('', value = 0, regex = True)
ddf_old_properties['garages'] = ddf_old_properties['garages'].replace('Más de 10', value = 0, regex = True)
ddf_old_properties['garages'] = ddf_old_properties['garages'].astype(int)
ddf_old_properties['stratum'] = ddf_old_properties['stratum'].replace('', value = 0, regex = True)
ddf_old_properties['stratum'] = ddf_old_properties['stratum'].replace('Campestre', value = 0, regex = True)
ddf_old_properties['stratum'] = ddf_old_properties['stratum'].astype(int)
ddf_old_properties['price'] = ddf_old_properties['price'].apply(lambda price: Decimal(sub(r'[^\d,]', '', price)), meta=('price', 'int'))
ddf_old_properties['price'] = ddf_old_properties['price'].astype(int)
ddf_old_properties['square_meters'] = ddf_old_properties['square_meters'].str[0:-3]
ddf_old_properties['square_meters'] = ddf_old_properties['square_meters'].apply(lambda meters: Decimal(sub(r'[^\d,]', '', meters)), meta=('square_meters', 'float'))
ddf_old_properties['square_meters'] = ddf_old_properties['square_meters'].astype(float)
ddf_old_properties['private_area'] = ddf_old_properties['private_area'].str[0:-2]
ddf_old_properties['private_area'] = ddf_old_properties['private_area'].replace('', value = '0', regex = True)
ddf_old_properties['private_area'] = ddf_old_properties['private_area'].apply(lambda area: Decimal(sub(r'[^\d.]', '', area)), meta=('private_area', 'float'))
ddf_old_properties['private_area'] = ddf_old_properties['private_area'].astype(float)
ddf_old_properties['construction_area'] = ddf_old_properties['construction_area'].str[0:-3]
ddf_old_properties['construction_area'] = ddf_old_properties['construction_area'].apply(lambda area: Decimal(sub(r'[^\d,]', '', area)), meta=('construction_area', 'float'))
ddf_old_properties['construction_area'] = ddf_old_properties['construction_area'].astype(float)
ddf_old_properties['floor'] = ddf_old_properties['floor'].astype(int)

### Testing

#### New Properties

In [59]:
ddf_new_properties.head(5)

Unnamed: 0,area,bathrooms,offer_type,price,private_area,property,rooms,id_mongoose,id_property,scraping_date,...,range_prices,range_private_area,general_rooms,square_meters,square_meters_price,stratum,weather,exterior_features,interior_features,sector_features
0,46.3,2,Venta,206200000,0.0,Apartamento,2,5dbaf08d8f40fc05710379ee,https://www.fincaraiz.com.co/reserva-serrat-se...,31/10/2019,...,Desde $ 206.200.000 Hasta $ 238.400.000,,2,"46,30 a 57,90",4453563.71,4,,"[Piscina, Salón Comunal, Zona de Camping, Zona...","[Ascensores Comunales, Balcón, Closet]","[Cómodas vias de acceso, Seguridad, Supermerca..."
1,57.9,2,Venta,238400000,0.0,Apartamento,3,5dbaf08d8f40fc05710379ee,https://www.fincaraiz.com.co/reserva-serrat-se...,31/10/2019,...,Desde $ 395.632.500 Hasta $ 411.632.500,8290,3,8750,4521514.29,4,,"[Ascensor, Cancha de Squash, Garaje / Parquead...","[Balcón, Estudio, Garaje Cubierto]",[Zona Residencial]
2,87.5,3,Venta,395632500,82.9,Apartamento,3,5dbaf08db187b10d230379ee,https://www.fincaraiz.com.co/citte/medellin/pr...,31/10/2019,...,Desde $ 314.610.582 Hasta $ 775.329.030,"38,05 a 105,05",1,"41,75 a 115,02",7535582.8,6,,"[Ascensor, En conjunto cerrado, Garaje / Parqu...","[Balcón, Cocina Integral, Sauna / Turco / Jacu...","[Bombas de gasolina, Cerca centro comercial, C..."
3,87.5,3,Venta,406132500,82.9,Apartamento,3,5dbaf08db187b10d230379ee,https://www.fincaraiz.com.co/citte/medellin/pr...,31/10/2019,...,Desde $ 292.633.175 Hasta $ 504.651.335,,1,"58,79 a 101,16",4977601.21,4,Cálido,"[Acceso Pavimentado, Corrales, Cuarto de Escol...","[Acceso para camiones, Acceso para tractomulas...","[Bombas de gasolina, Cerca a sector comercial,..."
4,87.5,3,Venta,396632500,82.9,Apartamento,3,5dbaf08db187b10d230379ee,https://www.fincaraiz.com.co/citte/medellin/pr...,31/10/2019,...,Desde $ 569.575.530 Hasta $ 780.532.220,,3,"86,32 a 120,20",6598419.02,6,,"[Piscina, Salón Comunal]",[Balcón],"[Cómodas vias de acceso, Restaurantes, Segurid..."


#### Old Properties

In [61]:
ddf_old_properties.head(5)

Unnamed: 0,id_mongoose,id_property,scraping_date,scraping_hour,modify_hour,modify_date,code,active,type,new_property,...,price,private_area,rooms,square_meters,square_meters_price,stratum,weather,exterior_features,interior_features,sector_features
0,5dbaf08da724eb62520379ee,https://www.fincaraiz.com.co/apartamento-en-ve...,31/10/2019,09:22:46,19/10/2019,16:22:31,4868887,True,Venta,False,...,185000000,0.0,1,58.0,3189655.17,4,,"[Circuito cerrado de TV, Portería / Recepción,...","[Baño Auxiliar, Citófono, Instalación de gas, ...","[Colegios / Universidades, Parques cercanos, S..."
1,5dbaf08dae9d7e77530379ee,https://www.fincaraiz.com.co/apartamento-en-ve...,31/10/2019,09:22:46,30/09/2019,01:20:27,4913705,True,Venta,False,...,555000000,104.0,2,104.0,5336538.46,6,,"[En conjunto cerrado, Garaje / Parqueadero(s),...","[Balcón, Calentador, Closet, Cuarto de Servici...",[]
2,5dbaf08d29dd38ca5e0379ee,https://www.fincaraiz.com.co/apartamento-en-ve...,31/10/2019,09:22:46,30/10/2019,11:02:08,4031875,True,Venta,False,...,1420000000,220.0,3,220.0,6454545.45,6,,"[Ascensor, Canchas Deportivas, En conjunto cer...","[Balcón, Citófono, Closet, Cocina Integral, Sh...","[Área Urbana, Zona Residencial]"
3,5dbaf08d9319faf5bb0379ee,https://www.fincaraiz.com.co/casa-en-venta/med...,31/10/2019,09:22:46,25/10/2019,01:36:45,5027509,True,Venta,False,...,195000000,105.0,4,105.0,1857142.86,3,,"[En conjunto cerrado, Garaje / Parqueadero(s),...","[Balcón, Calentador, Closet, Instalación de ga...",[]
4,5dbaf08d9753b6ebb60379ee,https://www.fincaraiz.com.co/apartamento-en-ve...,31/10/2019,09:22:46,23/10/2019,16:50:18,5022378,True,Venta,False,...,225000000,64.0,2,64.0,3515625.0,3,,"[Acceso Pavimentado, Árboles frutales, Asador,...","[Balcón, Baño Auxiliar, Baño Independiente, Ba...","[Parques cercanos, Supermercados / C.Comercial..."
