## Data Preparation

In [77]:
import pandas as pd
from pandas.io.json import json_normalize
from pymongo import MongoClient
import json

### Database Connection

In [78]:
DB_USER = 'anutibara'
DB_PASS = 'anutibara'
DB_HOST = 'scraping-cluster-7dtgt.gcp.mongodb.net'
DB_NAME = 'scraping_db'

try:
    client = MongoClient(f'mongodb+srv://{DB_USER}:{DB_PASS}@{DB_HOST}/{DB_NAME}')
    print("Database connected successfully")
except Exception as e:
    print("Error to connect to database: ", e)
db = client.get_database(DB_NAME)
properties = db.properties

Database connected successfully


### Database Queries

In [79]:
properties.count_documents({})

34

### Convert Mongo Collection to DataFrame

In [86]:
json_documents = list(properties.find({}))
df_general_info = pd.DataFrame(json_documents, 
                                    columns=['_id', 
                                    'urlProperty', 
                                    'scrapingDate', 
                                    'scrapingHour', 
                                    'modifyDate',
                                    'modifyHour', 
                                    'code', 
                                    'status', 
                                    'type', 
                                    'use', 
                                    'nameProject', 
                                    'description',
                                    'offersType'])
df_location = json_normalize(json_documents, 'location')
df_owner_property = json_normalize(json_documents, 'ownerProperty')
df_features = json_normalize(json_documents, 'features')
df_more_features = json_normalize(json_documents, 'moreFeatures')
df = pd.concat([df_general_info, df_location, df_owner_property, 
                df_features, df_more_features], axis=1)

### Rename DataFrame Columns

In [87]:
df.rename(columns= {
                        "_id": "id_mongoose", 
                        "urlProperty":"id_property",
                        "scrapingDate": "scraping_date",
                        "scrapingHour": "scraping_hour",
                        "modifyDate": "modify_date",
                        "modifyHour": "modify_hour",
                        "status": "active",
                        "use": "new_property",
                        "nameProject": "name_project",
                        "offersType": "offers_type",
                        "id": "id_owner_property",
                        "name": "name_owner_property",
                        "contractType": "contract_type_owner_property",
                        "financing": "financing_owner_property",
                        "schedule": "schedule_owner_property",
                        "squareMeters": "square_meters",
                        "privateArea": "private_area",
                        "constructionArea": "construction_area",
                        "squareMetersPrice": "square_meters_price",
                        "interiorFloors": "interior_floors",
                        "includesAdministration": "includes_administration",
                        "admonPrice": "admon_price",
                        "interiorFeatures": "interior_features",
                        "exteriorFeatures": "exterior_features",
                        "sectorFeatures": "sector_features"
                    }, 
          inplace = True)
df.columns

Index(['id_mongoose', 'id_property', 'scraping_date', 'scraping_hour',
       'modify_date', 'modify_hour', 'code', 'active', 'type', 'new_property',
       'name_project', 'description', 'offers_type', 'country', 'department',
       'city', 'sector', 'neighborhood', 'address', 'latitude', 'longitude',
       'id_owner_property', 'name_owner_property',
       'contract_type_owner_property', 'financing_owner_property',
       'schedule_owner_property', 'price', 'square_meters', 'rooms',
       'bathrooms', 'garages', 'private_area', 'construction_area',
       'square_meters_price', 'stratum', 'condition', 'antiquity', 'floors',
       'interior_floors', 'weather', 'includes_administration', 'admon_price',
       'interior_features', 'exterior_features', 'sector_features'],
      dtype='object')

### Separate in 2 DataFrames: New Properties and Old Properties

#### New Properties DataFrame

In [91]:
df_new_properties = df.drop(df.columns[df['active'].apply(lambda col: col.isnull().sum() > 3)], axis=1)

IndexError: boolean index did not match indexed array along dimension 0; dimension is 45 but corresponding boolean dimension is 34

#### Old Properties DataFrame

### Data Cleaning

In [88]:
df['active'] = df['active'].apply(lambda status: True if (status == 'Active') else False)
df['new_property'] = df['new_property'].apply(lambda status: True if (status == 'Nuevo') else False)
df['includes_administration'] = df['includes_administration'].apply(lambda status: True if (status == 'Nuevo') else False)
df.head(10)

0                                                    []
1                                                    []
2                                                    []
3                                                    []
4                                                    []
5                                                    []
6                                                    []
7                                                    []
8                                                    []
9                                                    []
10                                                   []
11                                                   []
12                                                   []
13                                                   []
14                                                   []
15                                                   []
16                                                   []
17                                              