## Data Preparation

In [113]:
import json
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import regex
from pandas.io.json import json_normalize
from pymongo import MongoClient

### Database Connection

In [114]:
# Credentials come from the environment so they are never stored in the
# notebook. Export DB_USER and DB_PASS before starting the kernel; a missing
# variable fails immediately with a KeyError naming it.
DB_USER = os.environ['DB_USER']
DB_PASS = os.environ['DB_PASS']
# Host and database name are not secrets; keep the previous values as defaults.
DB_HOST = os.environ.get('DB_HOST', 'scraping-cluster-7dtgt.gcp.mongodb.net')
DB_NAME = os.environ.get('DB_NAME', 'scraping_db')

try:
    # User name and password must be percent-escaped inside a MongoDB URI.
    client = MongoClient(
        f'mongodb+srv://{quote_plus(DB_USER)}:{quote_plus(DB_PASS)}@{DB_HOST}/{DB_NAME}'
    )
    # MongoClient does not block on connecting, so force one round trip;
    # otherwise the success message below would print even if the server is
    # unreachable or the credentials are wrong.
    client.admin.command('ping')
    print("Database connected successfully")
except Exception as e:
    print("Error to connect to database: ", e)
    # Re-raise so the notebook stops here instead of failing later with a
    # confusing NameError on `client`.
    raise

# Handles used by every query cell below.
db = client.get_database(DB_NAME)
properties = db.properties

Database connected successfully


### Database Queries

In [115]:
# Sanity check: count every document in the collection (the empty filter matches all).
properties.count_documents({})

13813

### Convert Mongo Collection to DataFrame

In [116]:
# Top-level fields copied as-is from each Mongo document.
general_info_columns = [
    '_id', 'urlProperty',
    'scrapingDate', 'scrapingHour',
    'modifyDate', 'modifyHour',
    'code', 'status', 'type', 'use',
    'nameProject', 'description', 'offersType',
]

# Pull every document flagged as new ('Nuevo') into memory.
new_properties_json = list(properties.find({'use': 'Nuevo'}))

df_general_info = pd.DataFrame(new_properties_json, columns=general_info_columns)

# Each nested field is flattened into its own frame, using the field as the record path.
df_location = json_normalize(new_properties_json, record_path='location')
df_owner_property = json_normalize(new_properties_json, record_path='ownerProperty')
df_features = json_normalize(new_properties_json, record_path='features')
df_more_features = json_normalize(new_properties_json, record_path='moreFeatures')

# Column-wise concat aligns on the row index.
# NOTE(review): assumes every document yields exactly one record per nested
# field so the indexes line up — TODO confirm.
new_property_parts = [
    df_general_info,
    df_location,
    df_owner_property,
    df_features,
    df_more_features,
]
df_new_properties = pd.concat(new_property_parts, axis=1)

In [117]:
# Top-level fields copied as-is from each Mongo document.
general_info_columns = [
    '_id', 'urlProperty',
    'scrapingDate', 'scrapingHour',
    'modifyDate', 'modifyHour',
    'code', 'status', 'type', 'use',
    'nameProject', 'description', 'offersType',
]

# Pull every document flagged as used ('Usado') into memory.
old_properties_json = list(properties.find({'use': 'Usado'}))

df_general_info = pd.DataFrame(old_properties_json, columns=general_info_columns)

# Each nested field is flattened into its own frame, using the field as the record path.
df_location = json_normalize(old_properties_json, record_path='location')
df_owner_property = json_normalize(old_properties_json, record_path='ownerProperty')
df_features = json_normalize(old_properties_json, record_path='features')
df_more_features = json_normalize(old_properties_json, record_path='moreFeatures')

# Column-wise concat aligns on the row index.
# NOTE(review): assumes every document yields exactly one record per nested
# field so the indexes line up — TODO confirm.
old_property_parts = [
    df_general_info,
    df_location,
    df_owner_property,
    df_features,
    df_more_features,
]
df_old_properties = pd.concat(old_property_parts, axis=1)

### Rename DataFrame Columns

In [118]:
# Original field name -> column name used from here on. Both frames share the
# same mapping, so it is written once and applied to each.
column_renames = {
    # general info
    "_id": "id_mongoose",
    "urlProperty": "id_property",
    "scrapingDate": "scraping_date",
    "scrapingHour": "scraping_hour",
    "modifyDate": "modify_date",
    "modifyHour": "modify_hour",
    "status": "active",
    "use": "new_property",
    "nameProject": "name_project",
    "offersType": "offers_type",
    # owner
    "id": "id_owner_property",
    "name": "name_owner_property",
    "contractType": "contract_type_owner_property",
    "financing": "financing_owner_property",
    "schedule": "schedule_owner_property",
    # features
    "squareMeters": "square_meters",
    "privateArea": "private_area",
    "constructionArea": "construction_area",
    "squareMetersPrice": "square_meters_price",
    "interiorFloors": "interior_floors",
    "includesAdministration": "includes_administration",
    "admonPrice": "admon_price",
    # extra feature lists
    "interiorFeatures": "interior_features",
    "exteriorFeatures": "exterior_features",
    "sectorFeatures": "sector_features",
}

# Rename in place so the existing frame objects keep their identity.
for frame in (df_new_properties, df_old_properties):
    frame.rename(columns=column_renames, inplace=True)

# Show the resulting schema of the new-properties frame.
df_new_properties.dtypes

id_mongoose                      object
id_property                      object
scraping_date                    object
scraping_hour                    object
modify_date                      object
modify_hour                      object
code                              int64
active                           object
type                             object
new_property                     object
name_project                     object
description                      object
offers_type                      object
country                          object
department                       object
city                             object
sector                           object
neighborhood                     object
address                          object
latitude                        float64
longitude                       float64
id_owner_property                 int64
name_owner_property              object
contract_type_owner_property     object
financing_owner_property         object


### Data Cleaning

In [119]:
# Convert the label-valued flag columns to booleans.
# Maps each column to the raw label that should become True; anything else
# (including missing values) becomes False.
flag_labels = {
    'active': 'Active',
    'new_property': 'Nuevo',
    # FIXME(review): 'Nuevo' is the label of the `use` field; this comparison
    # looks copy-pasted from the `new_property` line. Unless the source really
    # stores 'Nuevo' here, every row ends up False. The correct label is not
    # visible in this notebook, so the value is left unchanged — confirm
    # against the raw data and replace it.
    'includes_administration': 'Nuevo',
}

for frame in (df_new_properties, df_old_properties):
    for column, label in flag_labels.items():
        # Leave columns that are already boolean untouched: re-running this
        # cell would otherwise compare booleans to a string and silently turn
        # every value into False.
        if pd.api.types.is_bool_dtype(frame[column]):
            continue
        # Vectorised equivalent of apply(lambda v: True if v == label else False).
        frame[column] = frame[column].eq(label)

# Columns that still need numeric cleaning.
# -> New Properties DataFrame
# price
# square_meters
# garages
# stratum
# floors

# -> Old Properties DataFrame
# price
# square_meters
# garages
# private_area
# construction_area
# stratum
# antiquity
# floors


#### New Properties DataFrame

In [120]:
# Peek at the raw `garages` values of the new-properties frame; the output
# below shows blank entries and an object dtype, so the column is not numeric yet.
df_new_properties['garages'].head(20)

0     1
1      
2     1
3      
4      
5     1
6      
7     2
8      
9      
10     
11     
12    1
13     
14     
15     
16    2
17    2
18     
19    1
Name: garages, dtype: object

#### Old Properties DataFrame

In [121]:
# Same inspection for the old-properties frame; the output below again shows
# blank entries and an object dtype.
df_old_properties['garages']

0        
1       1
2       2
3        
4       4
       ..
9350    1
9351     
9352     
9353    2
9354     
Name: garages, Length: 9355, dtype: object