## Data Preparation

In [24]:
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
from pymongo import MongoClient
import json, re
from re import sub
from decimal import Decimal
import dask.dataframe as dd
from dask.distributed import Client, progress
import dask.dataframe as dd
from dask.delayed import delayed

### Dask Client

In [25]:
client = Client(n_workers=4, threads_per_worker=1)
client

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:40683  Dashboard: http://127.0.0.1:36311/status,Cluster  Workers: 4  Cores: 4  Memory: 8.26 GB


### Database Connection

In [158]:
DB_USER = 'anutibara'
DB_PASS = 'anutibara'
DB_HOST = 'scraping-cluster-7dtgt.gcp.mongodb.net'
DB_NAME = 'test_scraping_db'

try:
    client = MongoClient(f'mongodb+srv://{DB_USER}:{DB_PASS}@{DB_HOST}/{DB_NAME}')
    print("Database connected successfully")
except Exception as e:
    print("Error to connect to database: ", e)
db = client.get_database(DB_NAME)
properties = db.properties

Database connected successfully


### Database Queries

In [159]:
properties.count_documents({})

68

### Convert Mongo Collection to DataFrame

#### New Properties

In [160]:
new_properties_json = list(properties.find({ 'use': 'Nuevo' }))

In [186]:
df_new_properties = json_normalize(new_properties_json, record_path='offersType', 
                                meta=['_id', 
                                    'urlProperty',
                                    'scrapingDate', 
                                    'scrapingHour', 
                                    'modifyDate',
                                    'modifyHour', 
                                    'code', 
                                    'status', 
                                    'type', 
                                    'use', 
                                    'nameProject', 
                                    'description',
                                     'country',
                                     'department',
                                     'city',
                                     'sector',
                                     'neighborhood',
                                     'address',
                                     'latitude',
                                     'longitude',
                                     'idOwnerProperty',
                                     'nameOwnerProperty',
                                     'contractType',
                                     'financing',
                                     'schedule',
                                     'price',
                                     'squareMeters',
                                     'rooms',
                                     'bathrooms',
                                     'garages',
                                     'privateArea',
                                     'constructionArea',
                                     'squareMetersPrice',
                                     'stratum',
                                     'condition',
                                     'antiquity',
                                     'floor',
                                     'interiorFloors',
                                     'weather',
                                     'includesAdministration',
                                     'admonPrice',
                                     'interiorFeatures',
                                     'exteriorFeatures',
                                     'sectorFeatures'])
ddf_new_properties = dd.from_pandas(df_new_properties, npartitions=10)

In [187]:
ddf_new_properties = ddf_new_properties.loc[:,~ddf_new_properties.columns.duplicated(keep='first')]

In [188]:
ddf_new_properties = ddf_new_properties.dropna()

#### Old Properties

In [224]:
old_properties_json = list(properties.find({ 'use': 'Usado' }))

In [225]:
df_general_info = json_normalize(old_properties_json,
                              meta=['_id', 
                                    'urlProperty',
                                    'scrapingDate', 
                                    'scrapingHour', 
                                    'modifyDate',
                                    'modifyHour', 
                                    'code', 
                                    'status', 
                                    'type', 
                                    'use', 
                                    'nameProject', 
                                    'description',
                                     'country',
                                     'department',
                                     'city',
                                     'sector',
                                     'neighborhood',
                                     'address',
                                     'latitude',
                                     'longitude',
                                     'idOwnerProperty',
                                     'nameOwnerProperty',
                                     'contractType',
                                     'financing',
                                     'schedule',
                                     'price',
                                     'squareMeters',
                                     'rooms',
                                     'bathrooms',
                                     'garages',
                                     'privateArea',
                                     'constructionArea',
                                     'squareMetersPrice',
                                     'stratum',
                                     'condition',
                                     'antiquity',
                                     'floor',
                                     'interiorFloors',
                                     'weather',
                                     'includesAdministration',
                                     'admonPrice'])
ddf_general_info = dd.from_pandas(df_general_info, npartitions=10)

In [226]:
ddf_general_info.columns

Index(['_id', 'address', 'admonPrice', 'antiquity', 'bathrooms', 'city',
       'code', 'condition', 'constructionArea', 'contractType', 'country',
       'department', 'description', 'financing', 'floor', 'garages',
       'idOwnerProperty', 'includesAdministration', 'interiorFloors',
       'latitude', 'longitude', 'modifyDate', 'modifyHour', 'moreFeatures',
       'moreFeatures.exteriorFeatures', 'moreFeatures.interiorFeatures',
       'moreFeatures.sectorFeatures', 'nameOwnerProperty', 'nameProject',
       'neighborhood', 'offersType', 'price', 'privateArea', 'rooms',
       'schedule', 'scrapingDate', 'scrapingHour', 'sector', 'squareMeters',
       'squareMetersPrice', 'status', 'stratum', 'type', 'urlProperty', 'use',
       'weather'],
      dtype='object')

In [221]:
df_more_features = json_normalize(old_properties_json, 'moreFeatures')
ddf_more_features = dd.from_pandas(df_more_features, npartitions=10)

In [222]:
ddf_list = [ddf_general_info, ddf_more_features]
ddf = dd.concat(ddf_list, axis=1)
ddf.reset_index()

Unnamed: 0_level_0,index,_id,address,admonPrice,antiquity,bathrooms,city,code,condition,constructionArea,contractType,country,department,description,financing,floor,garages,idOwnerProperty,includesAdministration,interiorFloors,latitude,longitude,modifyDate,modifyHour,moreFeatures,moreFeatures.exteriorFeatures,moreFeatures.interiorFeatures,moreFeatures.sectorFeatures,nameOwnerProperty,nameProject,neighborhood,offersType,price,privateArea,rooms,schedule,scrapingDate,scrapingHour,sector,squareMeters,squareMetersPrice,status,stratum,type,urlProperty,use,weather,0
npartitions=20,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
,int64,object,object,float64,object,int64,object,int64,object,object,object,object,object,object,object,object,object,int64,object,int64,float64,float64,object,object,object,object,object,object,object,object,object,object,object,object,int64,object,object,object,object,object,float64,object,object,object,object,object,object,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [193]:
ddf_old_properties = ddf.loc[:,~ddf.columns.duplicated()]

In [194]:
ddf_old_properties = ddf_old_properties.dropna()

In [201]:
ddf_old_properties.columns

Index([                  'id_mongoose',                       'address',
                         'admon_price',                     'antiquity',
                           'bathrooms',                          'city',
                                'code',                     'condition',
                   'construction_area',  'contract_type_owner_property',
                             'country',                    'department',
                         'description',      'financing_owner_property',
                               'floor',                       'garages',
                   'id_owner_property',       'includes_administration',
                     'interior_floors',                      'latitude',
                           'longitude',                   'modify_date',
                         'modify_hour',                  'moreFeatures',
       'moreFeatures.exteriorFeatures', 'moreFeatures.interiorFeatures',
         'moreFeatures.sectorFeatures',           '

### Rename DataFrame Columns

#### New Properties

In [195]:
ddf_new_properties = ddf_new_properties.rename(columns={
                        "_id": "id_mongoose", 
                        "urlProperty":"id_property",
                        "scrapingDate": "scraping_date",
                        "scrapingHour": "scraping_hour",
                        "modifyDate": "modify_date",
                        "modifyHour": "modify_hour",
                        "status": "active",
                        "use": "new_property",
                        "nameProject": "name_project",
                        "offersType": "offers_type",
                        "idOwnerProperty": "id_owner_property",
                        "nameOwnerProperty": "name_owner_property",
                        "contractType": "contract_type_owner_property",
                        "financing": "financing_owner_property",
                        "schedule": "schedule_owner_property",
                        "bathrooms": "general_bathrooms",
                        "rooms": "general_rooms",
                        "price": "range_prices",
                        "squareMeters": "range_square_meters",
                        "constructionArea": "range_construction_area",
                        "squareMetersPrice": "square_meters_price",
                        "interiorFloors": "interior_floors",
                        "includesAdministration": "includes_administration",
                        "admonPrice": "admon_price",
                        "interiorFeatures": "interior_features",
                        "exteriorFeatures": "exterior_features",
                        "sectorFeatures": "sector_features",
                        "offerType": "offer_type",
                        "privateArea": "range_private_area",
                        "areaOfferType": "area",
                        "bathroomsOfferType": "bathrooms",
                        "priceOfferType": "price",
                        "privateAreaOfferType": "private_area",
                        "roomsOfferType": "rooms"
})

#### Old Properties

In [196]:
ddf_old_properties = ddf_old_properties.rename(columns={
                        "_id": "id_mongoose", 
                        "urlProperty":"id_property",
                        "scrapingDate": "scraping_date",
                        "scrapingHour": "scraping_hour",
                        "modifyDate": "modify_date",
                        "modifyHour": "modify_hour",
                        "status": "active",
                        "use": "new_property",
                        "nameProject": "name_project",
                        "offersType": "offers_type",
                        "idOwnerProperty": "id_owner_property",
                        "nameOwnerProperty": "name_owner_property",
                        "contractType": "contract_type_owner_property",
                        "financing": "financing_owner_property",
                        "schedule": "schedule_owner_property",
                        "squareMeters": "square_meters",
                        "privateArea": "private_area",
                        "constructionArea": "construction_area",
                        "squareMetersPrice": "square_meters_price",
                        "interiorFloors": "interior_floors",
                        "includesAdministration": "includes_administration",
                        "admonPrice": "admon_price",
                        "offerType": "offer_type",
                        "areaOfferType": "area",
                        "bathroomsOfferType": "bathrooms",
                        "priceOfferType": "price",
                        "privateAreaOfferType": "private_area",
                        "roomsOfferType": "rooms"
})

### Data Cleaning

#### New Properties

In [197]:
ddf_new_properties['id_mongoose'] = ddf_new_properties['id_mongoose'].astype(str)
ddf_new_properties['code'] = ddf_new_properties['code'].astype(int)
ddf_new_properties['active'] = ddf_new_properties['active'].apply(lambda status: True if (status == 'Active') else False, meta=('active', 'bool'))
ddf_new_properties['new_property'] = ddf_new_properties['new_property'].apply(lambda status: True if (status == 'Nuevo') else False, meta=('new_property', 'bool'))
ddf_new_properties['includes_administration'] = ddf_new_properties['includes_administration'].apply(lambda status: True if (status == 'Nuevo') else False, meta=('includes_administration', 'bool'))
ddf_new_properties['garages'] = ddf_new_properties['garages'].replace('', value = 0, regex = True).astype(int)
ddf_new_properties['stratum'] = ddf_new_properties['stratum'].replace('', value = 0, regex = True).astype(int)
ddf_new_properties['floor'] = ddf_new_properties['floor'].astype(int)
ddf_new_properties['area'] = ddf_new_properties['area'].astype(float)
ddf_new_properties['private_area'] = ddf_new_properties['private_area'].replace('', value = 0, regex = True)
ddf_new_properties['private_area'] = ddf_new_properties['private_area'].astype(float)
ddf_new_properties['rooms'] = ddf_new_properties['rooms'].astype(int)
ddf_new_properties['bathrooms'] = ddf_new_properties['bathrooms'].astype(int)
ddf_new_properties['price'] = ddf_new_properties['price'].apply(lambda price: Decimal(sub(r'[^\d,]', '', price)), meta=('price', 'int'))
ddf_new_properties['price'] = ddf_new_properties['price'].astype(int)

#### Old Properties

In [198]:
ddf_old_properties['id_mongoose'] = ddf_old_properties['id_mongoose'].astype(str)
ddf_old_properties['code'] = ddf_old_properties['code'].astype(int)
ddf_old_properties['active'] = ddf_old_properties['active'].apply(lambda status: True if (status == 'Active') else False, meta=('active', 'bool'))
ddf_old_properties['new_property'] = ddf_old_properties['new_property'].apply(lambda status: True if (status == 'Nuevo') else False, meta=('new_property', 'bool'))
ddf_old_properties['includes_administration'] = ddf_old_properties['includes_administration'].apply(lambda status: True if (status == 'Nuevo') else False, meta=('includes_administration', 'bool'))
ddf_old_properties['garages'] = ddf_old_properties['garages'].replace('', value = 0, regex = True)
ddf_old_properties['garages'] = ddf_old_properties['garages'].replace('MÃ¡s de 10', value = 0, regex = True)
ddf_old_properties['garages'] = ddf_old_properties['garages'].astype(int)
ddf_old_properties['stratum'] = ddf_old_properties['stratum'].replace('', value = 0, regex = True)
ddf_old_properties['stratum'] = ddf_old_properties['stratum'].replace('Campestre', value = 0, regex = True)
ddf_old_properties['stratum'] = ddf_old_properties['stratum'].astype(int)
ddf_old_properties['price'] = ddf_old_properties['price'].apply(lambda price: Decimal(sub(r'[^\d,]', '', price)), meta=('price', 'int'))
ddf_old_properties['price'] = ddf_old_properties['price'].astype(int)
ddf_old_properties['square_meters'] = ddf_old_properties['square_meters'].str[0:-3]
ddf_old_properties['square_meters'] = ddf_old_properties['square_meters'].apply(lambda meters: Decimal(sub(r'[^\d,]', '', meters)), meta=('square_meters', 'float'))
ddf_old_properties['square_meters'] = ddf_old_properties['square_meters'].astype(float)
ddf_old_properties['private_area'] = ddf_old_properties['private_area'].str[0:-2]
ddf_old_properties['private_area'] = ddf_old_properties['private_area'].replace('', value = '0', regex = True)
ddf_old_properties['private_area'] = ddf_old_properties['private_area'].apply(lambda area: Decimal(sub(r'[^\d.]', '', area)), meta=('private_area', 'float'))
ddf_old_properties['private_area'] = ddf_old_properties['private_area'].astype(float)
ddf_old_properties['construction_area'] = ddf_old_properties['construction_area'].str[0:-3]
ddf_old_properties['construction_area'] = ddf_old_properties['construction_area'].apply(lambda area: Decimal(sub(r'[^\d,]', '', area)), meta=('construction_area', 'float'))
ddf_old_properties['construction_area'] = ddf_old_properties['construction_area'].astype(float)
ddf_old_properties['floor'] = ddf_old_properties['floor'].astype(int)

### Testing

#### New Properties

In [199]:
ddf_new_properties.describe().head()

Unnamed: 0,area,bathrooms,price,private_area,rooms,code,latitude,longitude,id_owner_property,general_rooms,general_bathrooms,garages,square_meters_price,stratum,floor,interior_floors,admon_price
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,85.865,2.333333,404486300.0,0.0,2.833333,2900167.0,0.0,0.0,519363.333333,1.666667,2.0,0.333333,4713455.0,4.333333,0.0,0.0,0.0
std,15.517403,0.516398,85157950.0,0.0,0.983192,785010.3,0.0,0.0,262356.40827,1.032796,0.0,0.516398,409214.0,0.516398,0.0,0.0,0.0
min,58.79,2.0,292633200.0,0.0,1.0,2393445.0,0.0,0.0,350013.0,1.0,2.0,0.0,4185162.0,4.0,0.0,0.0,0.0
25%,85.57,2.0,343750000.0,0.0,3.0,2393445.0,0.0,0.0,350013.0,1.0,2.0,0.0,4185162.0,4.0,0.0,0.0,0.0


#### Old Properties

In [200]:
ddf_old_properties.describe().head()

ValueError: No non-trivial arrays found