## Data Preparation

In [284]:
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
from pymongo import MongoClient
import json, re

### Database Connection

In [285]:
DB_USER = 'anutibara'
DB_PASS = 'anutibara'
DB_HOST = 'scraping-cluster-7dtgt.gcp.mongodb.net'
DB_NAME = 'scraping_db'

try:
    client = MongoClient(f'mongodb+srv://{DB_USER}:{DB_PASS}@{DB_HOST}/{DB_NAME}')
    print("Database connected successfully")
except Exception as e:
    print("Error to connect to database: ", e)
db = client.get_database(DB_NAME)
properties = db.properties

Database connected successfully


### Database Queries

In [286]:
properties.count_documents({})

25220

### Convert Mongo Collection to DataFrame

In [287]:
new_properties_json = list(properties.find({ 'use': 'Nuevo' }))
df_general_info = pd.DataFrame(new_properties_json, 
                                    columns=['_id', 
                                    'urlProperty', 
                                    'scrapingDate', 
                                    'scrapingHour', 
                                    'modifyDate',
                                    'modifyHour', 
                                    'code', 
                                    'status', 
                                    'type', 
                                    'use', 
                                    'nameProject', 
                                    'description',
                                    'offersType'])
df_location = json_normalize(new_properties_json, 'location')
df_owner_property = json_normalize(new_properties_json, 'ownerProperty')
df_features = json_normalize(new_properties_json, 'features')
df_more_features = json_normalize(new_properties_json, 'moreFeatures')
df_new_properties = pd.concat([df_general_info, df_location, df_owner_property, 
                df_features, df_more_features], axis=1)

In [288]:
old_properties_json = list(properties.find({ 'use': 'Usado' }))
df_general_info = pd.DataFrame(old_properties_json, 
                                    columns=['_id', 
                                    'urlProperty', 
                                    'scrapingDate', 
                                    'scrapingHour', 
                                    'modifyDate',
                                    'modifyHour', 
                                    'code', 
                                    'status', 
                                    'type', 
                                    'use', 
                                    'nameProject', 
                                    'description',
                                    'offersType'])
df_location = json_normalize(old_properties_json, 'location')
df_owner_property = json_normalize(old_properties_json, 'ownerProperty')
df_features = json_normalize(old_properties_json, 'features')
df_more_features = json_normalize(old_properties_json, 'moreFeatures')
df_old_properties = pd.concat([df_general_info, df_location, df_owner_property, 
                df_features, df_more_features], axis=1)

### Rename DataFrame Columns

In [306]:
df_new_properties.rename(columns= {
                        "_id": "id_mongoose", 
                        "urlProperty":"id_property",
                        "scrapingDate": "scraping_date",
                        "scrapingHour": "scraping_hour",
                        "modifyDate": "modify_date",
                        "modifyHour": "modify_hour",
                        "status": "active",
                        "use": "new_property",
                        "nameProject": "name_project",
                        "offersType": "offers_type",
                        "id": "id_owner_property",
                        "name": "name_owner_property",
                        "contractType": "contract_type_owner_property",
                        "financing": "financing_owner_property",
                        "schedule": "schedule_owner_property",
                        "squareMeters": "square_meters",
                        "privateArea": "private_area",
                        "constructionArea": "construction_area",
                        "squareMetersPrice": "square_meters_price",
                        "interiorFloors": "interior_floors",
                        "includesAdministration": "includes_administration",
                        "admonPrice": "admon_price",
                        "interiorFeatures": "interior_features",
                        "exteriorFeatures": "exterior_features",
                        "sectorFeatures": "sector_features"
                    }, 
          inplace = True)

df_old_properties.rename(columns= {
                        "_id": "id_mongoose", 
                        "urlProperty":"id_property",
                        "scrapingDate": "scraping_date",
                        "scrapingHour": "scraping_hour",
                        "modifyDate": "modify_date",
                        "modifyHour": "modify_hour",
                        "status": "active",
                        "use": "new_property",
                        "nameProject": "name_project",
                        "offersType": "offers_type",
                        "id": "id_owner_property",
                        "name": "name_owner_property",
                        "contractType": "contract_type_owner_property",
                        "financing": "financing_owner_property",
                        "schedule": "schedule_owner_property",
                        "squareMeters": "square_meters",
                        "privateArea": "private_area",
                        "constructionArea": "construction_area",
                        "squareMetersPrice": "square_meters_price",
                        "interiorFloors": "interior_floors",
                        "includesAdministration": "includes_administration",
                        "admonPrice": "admon_price",
                        "interiorFeatures": "interior_features",
                        "exteriorFeatures": "exterior_features",
                        "sectorFeatures": "sector_features"
                    }, 
          inplace = True)

### Data Cleaning

#### New Properties

In [307]:
df_new_properties['active'] = df_new_properties['active'].apply(lambda status: True if (status == 'Active') else False)
df_new_properties['new_property'] = df_new_properties['new_property'].apply(lambda status: True if (status == 'Nuevo') else False)
df_new_properties['includes_administration'] = df_new_properties['includes_administration'].apply(lambda status: True if (status == 'Nuevo') else False)
df_new_properties['garages'] = df_new_properties['garages'].replace('', value = 0, regex = True).astype(int)
df_new_properties['stratum'] = df_new_properties['stratum'].replace('', value = 0, regex = True).astype(int)
df_new_properties['floors'] = df_new_properties['floors'].replace('', value = 0, regex = True).astype(int)

#### Old Properties

In [308]:
df_old_properties['active'] = df_old_properties['active'].apply(lambda status: True if (status == 'Active') else False)
df_old_properties['new_property'] = df_old_properties['new_property'].apply(lambda status: True if (status == 'Nuevo') else False)
df_old_properties['includes_administration'] = df_old_properties['includes_administration'].apply(lambda status: True if (status == 'Nuevo') else False)
df_old_properties['garages'] = df_old_properties['garages'].replace('' or 'Más de 10', value = 0, regex = True)
df_old_properties['stratum'] = df_old_properties['stratum'].replace('' or 'Campestre', value = 0, regex = True)

df_old_properties['price'] = df_old_properties['price'].str.split('$')

# square_meters
# private_area
# construction_area
# floors

AttributeError: Can only use .str accessor with string values!

#### New Properties DataFrame

In [309]:
#df_new_properties['floors'].head(20)

#### Old Properties DataFrame

In [310]:
df_old_properties['price'].head(20)

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
11   NaN
12   NaN
13   NaN
14   NaN
15   NaN
16   NaN
17   NaN
18   NaN
19   NaN
Name: price, dtype: float64