## Data Preparation

In [393]:
import json, re
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from pymongo import MongoClient

### Database Connection

In [394]:
# Connection settings. The credentials are read from the environment so that
# no secret is stored in the notebook; export MONGO_DB_USER and MONGO_DB_PASS
# before starting the kernel (a missing variable raises KeyError here).
# NOTE(review): the password that used to be hardcoded in this cell should be
# treated as exposed and rotated.
DB_USER = os.environ['MONGO_DB_USER']
DB_PASS = os.environ['MONGO_DB_PASS']
DB_HOST = os.environ.get('MONGO_DB_HOST', 'scraping-cluster-7dtgt.gcp.mongodb.net')
DB_NAME = os.environ.get('MONGO_DB_NAME', 'scraping_db')

# User name and password are percent-escaped so that reserved characters
# (':', '@', '/', ...) cannot corrupt the connection URI.
MONGO_URI = f'mongodb+srv://{quote_plus(DB_USER)}:{quote_plus(DB_PASS)}@{DB_HOST}/{DB_NAME}'

try:
    client = MongoClient(MONGO_URI)
    # MongoClient connects lazily, so force one round trip to the server
    # before reporting success.
    client.admin.command('ping')
    print("Database connected successfully")
except Exception as e:
    print("Error to connect to database: ", e)
    # Re-raise: continuing would only fail later with a less helpful error
    # (for example a NameError on `client`).
    raise

# Handles used by the rest of the notebook.
db = client.get_database(DB_NAME)
properties = db.properties

Database connected successfully


### Database Queries

In [395]:
# Count the documents in the `properties` collection; the empty filter matches every document.
properties.count_documents({})

102

### Convert Mongo Collection to DataFrame

In [396]:
# Top-level document fields that go into the general-info frame.
general_info_columns = [
    '_id',
    'urlProperty',
    'scrapingDate',
    'scrapingHour',
    'modifyDate',
    'modifyHour',
    'code',
    'status',
    'type',
    'use',
    'nameProject',
    'description',
    'offersType',
]

# Fetch every document whose `use` field is 'Nuevo'.
new_properties_json = list(properties.find({'use': 'Nuevo'}))
df_general_info = pd.DataFrame(new_properties_json, columns=general_info_columns)

# One frame per nested field, each built by json_normalize with that field as
# the record path.
df_location, df_owner_property, df_features, df_more_features = [
    json_normalize(new_properties_json, nested_field)
    for nested_field in ('location', 'ownerProperty', 'features', 'moreFeatures')
]

# Side-by-side concatenation (axis=1) aligns rows on the index.
# NOTE(review): assumes every normalized frame has one row per document, in
# document order - confirm against the collection schema.
new_property_frames = [
    df_general_info,
    df_location,
    df_owner_property,
    df_features,
    df_more_features,
]
df_new_properties = pd.concat(new_property_frames, axis=1)

In [397]:
# Top-level document fields that go into the general-info frame.
general_info_columns = [
    '_id',
    'urlProperty',
    'scrapingDate',
    'scrapingHour',
    'modifyDate',
    'modifyHour',
    'code',
    'status',
    'type',
    'use',
    'nameProject',
    'description',
    'offersType',
]

# Fetch every document whose `use` field is 'Usado'.
old_properties_json = list(properties.find({'use': 'Usado'}))
df_general_info = pd.DataFrame(old_properties_json, columns=general_info_columns)

# One frame per nested field, each built by json_normalize with that field as
# the record path.
df_location, df_owner_property, df_features, df_more_features = [
    json_normalize(old_properties_json, nested_field)
    for nested_field in ('location', 'ownerProperty', 'features', 'moreFeatures')
]

# Side-by-side concatenation (axis=1) aligns rows on the index.
# NOTE(review): assumes every normalized frame has one row per document, in
# document order - confirm against the collection schema.
old_property_frames = [
    df_general_info,
    df_location,
    df_owner_property,
    df_features,
    df_more_features,
]
df_old_properties = pd.concat(old_property_frames, axis=1)

### Rename DataFrame Columns

In [398]:
# Single source of truth for the column renames: original (mostly camelCase)
# column name -> snake_case name used in the rest of the notebook.
column_renames = {
    "_id": "id_mongoose",
    "urlProperty": "id_property",
    "scrapingDate": "scraping_date",
    "scrapingHour": "scraping_hour",
    "modifyDate": "modify_date",
    "modifyHour": "modify_hour",
    "status": "active",
    "use": "new_property",
    "nameProject": "name_project",
    "offersType": "offers_type",
    "id": "id_owner_property",
    "name": "name_owner_property",
    "contractType": "contract_type_owner_property",
    "financing": "financing_owner_property",
    "schedule": "schedule_owner_property",
    "squareMeters": "square_meters",
    "privateArea": "private_area",
    "constructionArea": "construction_area",
    "squareMetersPrice": "square_meters_price",
    "interiorFloors": "interior_floors",
    "includesAdministration": "includes_administration",
    "admonPrice": "admon_price",
    "interiorFeatures": "interior_features",
    "exteriorFeatures": "exterior_features",
    "sectorFeatures": "sector_features",
}

# Both frames get exactly the same mapping; each one is renamed in place.
for properties_frame in (df_new_properties, df_old_properties):
    properties_frame.rename(columns=column_renames, inplace=True)

### Data Cleaning

#### New Properties

In [406]:
# Label columns -> booleans (True where the value equals the given label).
# A column that is already boolean is left untouched, so re-running this cell
# does not silently turn every flag into False.
# NOTE(review): 'includes_administration' is compared against 'Nuevo', the same
# label used for 'new_property'. That looks like a copy-paste slip; the label
# is kept as it was because the real value is not visible here - confirm which
# label means "administration included".
flag_labels = {
    'active': 'Active',
    'new_property': 'Nuevo',
    'includes_administration': 'Nuevo',
}
for flag_column, true_label in flag_labels.items():
    if df_new_properties[flag_column].dtype != bool:
        df_new_properties[flag_column] = df_new_properties[flag_column].eq(true_label)

# Blank (empty or whitespace-only) strings become 0 before the integer cast.
# The pattern is anchored on purpose: an empty regular expression matches
# every string, so the unanchored form replaced every string value with 0.
for count_column in ('garages', 'stratum'):
    df_new_properties[count_column] = (
        df_new_properties[count_column]
        .replace(r'^\s*$', value=0, regex=True)
        .astype(int)
    )

df_new_properties['floor'] = df_new_properties['floor'].astype(int)

#### Old Properties

In [400]:
# Label columns -> booleans (True where the value equals the given label).
# A column that is already boolean is left untouched, so re-running this cell
# does not silently turn every flag into False.
# NOTE(review): 'includes_administration' is compared against 'Nuevo', the same
# label used for 'new_property'. That looks like a copy-paste slip; the label
# is kept as it was because the real value is not visible here - confirm which
# label means "administration included".
flag_labels = {
    'active': 'Active',
    'new_property': 'Nuevo',
    'includes_administration': 'Nuevo',
}
for flag_column, true_label in flag_labels.items():
    if df_old_properties[flag_column].dtype != bool:
        df_old_properties[flag_column] = df_old_properties[flag_column].eq(true_label)

# Values replaced with 0 before the integer cast: blank (empty or
# whitespace-only) strings plus one non-numeric label per column.
# The blank pattern is anchored on purpose: an empty regular expression
# matches every string, so the unanchored form replaced every string value with 0.
# NOTE(review): mapping 'Más de 10' to 0 is kept from the original but looks
# questionable for a garage count - confirm the intended value.
zero_patterns = {
    'garages': [r'^\s*$', 'Más de 10'],
    'stratum': [r'^\s*$', 'Campestre'],
}
for count_column, patterns in zero_patterns.items():
    df_old_properties[count_column] = (
        df_old_properties[count_column]
        .replace(patterns, value=0, regex=True)
        .astype(int)
    )

# Remove the '$' and '.' characters from the price (literal matches, not
# regular expressions). The values remain strings.
df_old_properties['price'] = (
    df_old_properties['price']
    .str.replace('$', '', regex=False)
    .str.replace('.', '', regex=False)
)

# Swap the ',' in the area columns for '.'; the values remain strings.
for area_column in ('square_meters', 'private_area', 'construction_area'):
    df_old_properties[area_column] = (
        df_old_properties[area_column].str.replace(',', '.', regex=False)
    )

# Drop the '°' marker from the floor values. The original line had an
# unbalanced parenthesis and could not run.
df_old_properties['floor'] = df_old_properties['floor'].str.replace('°', '', regex=False)

#### New Properties DataFrame

In [408]:
# Display the `floor` column of the new-properties frame to check its values and dtype.
df_new_properties['floor']

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
Name: floor, dtype: int64

#### Old Properties DataFrame

In [405]:
# Display the `floor` column of the old-properties frame to check its values and dtype.
df_old_properties['floor']

0      0
1      0
2     11
3      2
4     15
      ..
61     0
62     0
63     0
64     0
65     0
Name: floor, Length: 66, dtype: object