## Data Preparation

In [204]:
import json
import os
from urllib.parse import quote_plus

import pandas as pd
from pandas.io.json import json_normalize
from pymongo import MongoClient

### Database Connection

In [205]:
# Connection settings. Each value can be overridden through an environment
# variable of the same name, so credentials do not have to live in the notebook.
# SECURITY(review): the fallback literals are credentials committed to source.
# Rotate them and drop the fallbacks once the environment variables are set
# wherever this notebook runs.
DB_USER = os.environ.get('DB_USER', 'anutibara')
DB_PASS = os.environ.get('DB_PASS', 'anutibara')
DB_HOST = os.environ.get('DB_HOST', 'scraping-cluster-7dtgt.gcp.mongodb.net')
DB_NAME = os.environ.get('DB_NAME', 'scraping_db')

try:
    # User and password are percent-escaped because characters such as '@',
    # ':' or '/' would otherwise corrupt the connection URI.
    client = MongoClient(
        f'mongodb+srv://{quote_plus(DB_USER)}:{quote_plus(DB_PASS)}@{DB_HOST}/{DB_NAME}'
    )
    # MongoClient connects lazily; a ping forces a round trip so the success
    # message below is only printed when the server is actually reachable.
    client.admin.command('ping')
    print("Database connected successfully")
except Exception as e:
    print("Error to connect to database: ", e)
    # Re-raise: continuing without a usable client would only fail a few lines
    # later with an error that hides the real cause.
    raise

# Handles used by every following cell.
db = client.get_database(DB_NAME)
properties = db.properties

Database connected successfully


### Database Queries

In [206]:
# Total number of documents in the collection (the empty filter {} matches all).
properties.count_documents({})

13813

### Convert Mongo Collection to DataFrame

In [207]:
# Pull every document whose 'use' field is 'Nuevo' and materialise the cursor
# into a list so it can be traversed several times below.
new_properties_json = list(properties.find({'use': 'Nuevo'}))

# Top-level fields copied as-is into the general-information frame.
general_info_columns = [
    '_id', 'urlProperty', 'scrapingDate', 'scrapingHour',
    'modifyDate', 'modifyHour', 'code', 'status', 'type', 'use',
    'nameProject', 'description', 'offersType',
]
df_general_info = pd.DataFrame(new_properties_json, columns=general_info_columns)

# Each of these fields is expanded into its own frame by using it as the
# record path of json_normalize.
df_location = json_normalize(new_properties_json, record_path='location')
df_owner_property = json_normalize(new_properties_json, record_path='ownerProperty')
df_features = json_normalize(new_properties_json, record_path='features')
df_more_features = json_normalize(new_properties_json, record_path='moreFeatures')

# Put the pieces side by side; concat aligns them on the row index.
# NOTE(review): this assumes every expanded frame has exactly one row per
# document, in document order - confirm against the stored schema.
new_property_parts = [
    df_general_info,
    df_location,
    df_owner_property,
    df_features,
    df_more_features,
]
df_new_properties = pd.concat(new_property_parts, axis=1)

In [208]:
# Pull every document whose 'use' field is 'Usado' and materialise the cursor
# into a list so it can be traversed several times below.
old_properties_json = list(properties.find({'use': 'Usado'}))

# Top-level fields copied as-is into the general-information frame.
general_info_columns = [
    '_id', 'urlProperty', 'scrapingDate', 'scrapingHour',
    'modifyDate', 'modifyHour', 'code', 'status', 'type', 'use',
    'nameProject', 'description', 'offersType',
]
df_general_info = pd.DataFrame(old_properties_json, columns=general_info_columns)

# Each of these fields is expanded into its own frame by using it as the
# record path of json_normalize.
df_location = json_normalize(old_properties_json, record_path='location')
df_owner_property = json_normalize(old_properties_json, record_path='ownerProperty')
df_features = json_normalize(old_properties_json, record_path='features')
df_more_features = json_normalize(old_properties_json, record_path='moreFeatures')

# Put the pieces side by side; concat aligns them on the row index.
# NOTE(review): this assumes every expanded frame has exactly one row per
# document, in document order - confirm against the stored schema.
old_property_parts = [
    df_general_info,
    df_location,
    df_owner_property,
    df_features,
    df_more_features,
]
df_old_properties = pd.concat(old_property_parts, axis=1)

### Rename DataFrame Columns

In [209]:
# One mapping from the scraped field names to the snake_case column names,
# shared by both frames so the two can never drift apart.
# 'status' and 'use' receive boolean-style names because the Data Cleaning
# cell converts them to True/False.
column_renames = {
    "_id": "id_mongoose",
    "urlProperty": "id_property",
    "scrapingDate": "scraping_date",
    "scrapingHour": "scraping_hour",
    "modifyDate": "modify_date",
    "modifyHour": "modify_hour",
    "status": "active",
    "use": "new_property",
    "nameProject": "name_project",
    "offersType": "offers_type",
    "id": "id_owner_property",
    "name": "name_owner_property",
    "contractType": "contract_type_owner_property",
    "financing": "financing_owner_property",
    "schedule": "schedule_owner_property",
    "squareMeters": "square_meters",
    "privateArea": "private_area",
    "constructionArea": "construction_area",
    "squareMetersPrice": "square_meters_price",
    "interiorFloors": "interior_floors",
    "includesAdministration": "includes_administration",
    "admonPrice": "admon_price",
    "interiorFeatures": "interior_features",
    "exteriorFeatures": "exterior_features",
    "sectorFeatures": "sector_features",
}

# Rename in place so the existing frame objects are the ones that change.
for properties_frame in (df_new_properties, df_old_properties):
    properties_frame.rename(columns=column_renames, inplace=True)

### Data Cleaning

In [234]:
def to_flag(column, truthy_value):
    """Return a boolean Series that is True where ``column`` equals ``truthy_value``.

    A column that already has boolean dtype is returned unchanged. Without this
    guard, running the cell a second time compares True/False against the text
    label and silently turns every flag into False.

    Parameters
    ----------
    column : pandas.Series
        Column holding either the raw labels or already-converted booleans.
    truthy_value : object
        Label that should map to True; every other value (including missing
        values) maps to False.

    Returns
    -------
    pandas.Series
        Boolean Series with the same index as ``column``.
    """
    if pd.api.types.is_bool_dtype(column):
        return column
    return column.eq(truthy_value)


# Apply the same cleaning steps to both frames; column assignment modifies
# the existing frame objects.
for frame in (df_new_properties, df_old_properties):
    frame['active'] = to_flag(frame['active'], 'Active')
    frame['new_property'] = to_flag(frame['new_property'], 'Nuevo')
    # FIXME(review): 'Nuevo' looks copied from the new_property line above, and
    # it makes this flag False for every row shown in the preview. Replace it
    # with the label the scraper actually stores for "includes administration".
    frame['includes_administration'] = to_flag(frame['includes_administration'], 'Nuevo')
    # A missing garage count is treated as zero garages.
    frame['garages'] = frame['garages'].fillna(0).astype(int)

#### New Properties DataFrame

In [235]:
# Show the garages column of the new-properties frame.
df_new_properties['garages']

0       1
1       0
2       1
3       0
4       0
       ..
4453    1
4454    2
4455    2
4456    1
4457    1
Name: garages, Length: 4458, dtype: int64

#### Old Properties DataFrame

In [212]:
# Preview the first five rows of the frame built from the 'Usado' documents.
df_old_properties.head(5)

Unnamed: 0,id_mongoose,id_property,scraping_date,scraping_hour,modify_date,modify_hour,code,active,type,new_property,...,condition,antiquity,floors,interior_floors,weather,includes_administration,admon_price,interior_features,exterior_features,sector_features
0,5db702b51b085a1faacb7398,https://www.fincaraiz.com.co/apartamento-en-ve...,28/10/2019,09:53:56,26/10/2019,14:16:47,5029447,True,Venta,False,...,Bueno,,1º,0,,False,0.0,"[Balcón, Baño Auxiliar, Baño Independiente, Ca...","[Acceso Pavimentado, Árboles frutales, Asador,...","[Cerca de Zona Urbana, Colegios / Universidade..."
1,5db702b41c622f09edcb7398,https://www.fincaraiz.com.co/apartamento-en-ve...,28/10/2019,09:53:56,18/10/2019,20:46:07,4553024,True,Venta,False,...,Excelente,,10º,0,,False,285000.0,"[Balcón, Baño Auxiliar, Barra estilo americano...","[Ascensor, En conjunto cerrado, Garaje(s), Par...","[Colegios / Universidades, Parques cercanos, S..."
2,5db702b5d7ea0271dacb7398,https://www.fincaraiz.com.co/apartamento-en-ve...,28/10/2019,09:53:56,21/10/2019,20:34:17,4826576,True,Venta,False,...,Excelente,1 a 8 años,,0,,False,275000.0,"[Balcón, Cuarto de Servicio, Estudio, Sauna / ...","[En conjunto cerrado, Garaje / Parqueadero(s),...",[]
3,5db702b5a7c97f5afacb7398,https://www.fincaraiz.com.co/apartamento-en-ve...,28/10/2019,09:53:56,23/10/2019,20:06:28,4292477,True,Venta,False,...,,,,0,,False,0.0,"[Alarma, Balcón, Baño Auxiliar, Baño Independi...","[Acceso Pavimentado, Árboles frutales, Bósque ...","[Colegios / Universidades, Zona Campestre, Zon..."
4,5db702b4979c6c79fccb7398,https://www.fincaraiz.com.co/casa-en-venta/med...,28/10/2019,09:53:56,18/10/2019,20:44:05,4800198,True,Venta,False,...,Excelente,,1º,0,,False,685050.0,"[Balcón, Baño Auxiliar, Baño de Servicio, Cale...","[Ascensor, Portería / Recepción]","[Colegios / Universidades, Parques cercanos, S..."
