In [1]:
import requests
import time
import datetime
import pandas as pd
import os
import json
import uuid
import sqlite3
import sys
from dotenv import load_dotenv
# Load variables from the project's .env file into the process environment.
load_dotenv()

# Project root: every path used in this notebook is built from it, so fail
# early with a clear message instead of a later TypeError inside os.path.join.
project_folder_path = os.getenv('PROJECT_FOLDER_PATH')
if not project_folder_path:
    raise RuntimeError(
        "Environment variable PROJECT_FOLDER_PATH is not set; "
        "define it in the .env file or in the environment."
    )
print(project_folder_path)

# Make the project's `utils` package importable. Guarded so that re-running
# this cell does not keep appending the same entry to sys.path.
if project_folder_path not in sys.path:
    sys.path.append(project_folder_path)

# Import only the helpers this notebook calls (instead of `import *`), so it
# is obvious where each name comes from.
from utils.db_functs import create_database, create_tables, fill_table_from_dataframe

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


/mnt/c/Users/rober/OneDrive/8_DEVELOPMENT/cev-data-lake


## 1) Create Database

### 1.1) Set Database file parameters
The database is stored in a single SQLite `.db` file.

In [2]:
# Version tag embedded in the database file name.
version = 'v1'
db_file_name = f'cev-database-coordinates-{version}.db'
print(db_file_name)

# The file lives under <project root>/data/sqlite/.
sqlite_dir = os.path.join(project_folder_path, 'data', 'sqlite')
db_file_path = os.path.join(sqlite_dir, db_file_name)
db_file_path

cev-database-coordinates-v1.db


'/mnt/c/Users/rober/OneDrive/8_DEVELOPMENT/cev-data-lake/data/sqlite/cev-database-coordinates-v1.db'

### 1.2) Create sqlite database file

In [3]:
# Create the SQLite database file at `db_file_path` using the project helper
# from utils.db_functs (it prints a confirmation message on success).
# NOTE(review): behaviour when the file already exists is defined by the
# helper and is not visible here — confirm before re-running.
create_database(db_file_path)

Database file '/mnt/c/Users/rober/OneDrive/8_DEVELOPMENT/cev-data-lake/data/sqlite/cev-database-coordinates-v1.db' created successfully.


## 2) Create Tables
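The database contains the following tables:
* `regiones`
* `comunas`
* `viewstate_region`
* `html_files_by_comuna_and_search`
* `evals_html_download_coordinates`
* `evals_html_downloaded_files`
* `evals_summary`
* `evals_pdf_reports_download_coordinates`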

### 2.1) Define all tables in a dictionary
1. Dictionary keys are the table names
2. Dictionary values are the corresponding `CREATE TABLE` queries

In [4]:
# Mapping of table name -> DDL statement used to create that table.
# Every statement uses `CREATE TABLE IF NOT EXISTS` so the notebook can be
# re-run against an existing database file without failing with
# "table ... already exists". Entries are ordered so that referenced tables
# (regiones, comunas, evals_summary) are defined before the tables whose
# foreign keys point at them.
table_queries = {
    # Lookup table: one row per region.
    'regiones': """
        CREATE TABLE IF NOT EXISTS regiones (
            region_id INTEGER PRIMARY KEY,
            region_name TEXT,
            UNIQUE (region_id, region_name)
        )
    """,
    # Lookup table: one row per commune, linked to its region.
    'comunas': """
        CREATE TABLE IF NOT EXISTS comunas (
            comuna_id INTEGER PRIMARY KEY,
            comuna_name TEXT,
            region_id INTEGER,
            FOREIGN KEY(region_id) REFERENCES regiones(region_id),
            UNIQUE(comuna_id, comuna_name, region_id)
        )
    """,
    # One viewstate string per region.
    'viewstate_region': """
        CREATE TABLE IF NOT EXISTS viewstate_region (
            region_id INTEGER PRIMARY KEY,
            viewstate TEXT,
            UNIQUE(region_id, viewstate)
        )
    """,
    # HTML files tracked per (commune, region, search).
    'html_files_by_comuna_and_search': """
        CREATE TABLE IF NOT EXISTS html_files_by_comuna_and_search (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            comuna_id INTEGER,
            region_id INTEGER,
            search_id TEXT,
            search_date TEXT,
            status TEXT,
            html_filename TEXT,
            downloaded_at TEXT,
            FOREIGN KEY(comuna_id) REFERENCES comunas(comuna_id),
            FOREIGN KEY(region_id) REFERENCES regiones(region_id),
            UNIQUE(comuna_id, region_id, search_id)
        )
    """,
    # Request parameters (eventtarget / viewstate / eventargument) per
    # (commune, region, evaluation type, page, search).
    'evals_html_download_coordinates': """
        CREATE TABLE IF NOT EXISTS evals_html_download_coordinates (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            comuna_id INTEGER,
            region_id INTEGER,
            tipo_evaluacion INTEGER,
            pagina INTEGER,
            search_id TEXT,
            search_date TEXT,
            eventtarget TEXT,
            viewstate TEXT,
            eventargument TEXT,
            total_evals INTEGER,
            total_pages INTEGER,
            FOREIGN KEY(comuna_id) REFERENCES comunas(comuna_id),
            FOREIGN KEY(region_id) REFERENCES regiones(region_id),
            UNIQUE(comuna_id, region_id, tipo_evaluacion, pagina, search_id)
        )
    """,
    # Download status and file name per
    # (commune, region, evaluation type, page, search).
    'evals_html_downloaded_files': """
        CREATE TABLE IF NOT EXISTS evals_html_downloaded_files (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            comuna_id INTEGER,
            region_id INTEGER,
            tipo_evaluacion INTEGER,
            pagina INTEGER,
            search_id TEXT,
            search_date TEXT,
            total_pages INTEGER,
            status TEXT,
            html_filename TEXT,
            downloaded_at TEXT,
            FOREIGN KEY(comuna_id) REFERENCES comunas(comuna_id),
            FOREIGN KEY(region_id) REFERENCES regiones(region_id),
            UNIQUE(comuna_id, region_id, tipo_evaluacion, pagina, search_id)
        )
    """,
    # One row per evaluation, keyed by a text eval_id.
    'evals_summary': """
        CREATE TABLE IF NOT EXISTS evals_summary (
            eval_id TEXT PRIMARY KEY,
            comuna_id INTEGER,
            region_id INTEGER,
            tipo_evaluacion INTEGER,
            identificacion_vivienda TEXT,
            tipologia TEXT,
            comuna TEXT,
            proyecto TEXT,
            CE TEXT,
            CEE TEXT,
            FOREIGN KEY(comuna_id) REFERENCES comunas(comuna_id),
            FOREIGN KEY(region_id) REFERENCES regiones(region_id)
        )
    """,
    # Request parameters for PDF reports, linked to evals_summary via eval_id.
    'evals_pdf_reports_download_coordinates': """
        CREATE TABLE IF NOT EXISTS evals_pdf_reports_download_coordinates (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            comuna_id INTEGER,
            region_id INTEGER,
            tipo_evaluacion INTEGER,
            eval_number INTEGER,
            pagina INTEGER,
            eval_number_pagina INTEGER,
            search_id TEXT,
            search_date TEXT,
            eval_id TEXT,
            codigo_informe TEXT,
            codigo_etiqueta TEXT,
            viewstate TEXT,
            eventargument TEXT,
            FOREIGN KEY(comuna_id) REFERENCES comunas(comuna_id),
            FOREIGN KEY(region_id) REFERENCES regiones(region_id),
            FOREIGN KEY(eval_id) REFERENCES evals_summary(eval_id),
            UNIQUE(comuna_id, region_id, tipo_evaluacion, eval_number, search_id)
        )
    """
}


### 2.2) Create Tables

In [5]:
# Create every table defined in `table_queries` inside the database at
# `db_file_path` (project helper; prints one confirmation line per table).
create_tables(db_file_path, table_queries)

Table 'regiones' created successfully.
Table 'comunas' created successfully.
Table 'viewstate_region' created successfully.
Table 'html_files_by_comuna_and_search' created successfully.
Table 'evals_html_download_coordinates' created successfully.
Table 'evals_html_downloaded_files' created successfully.
Table 'evals_summary' created successfully.
Table 'evals_pdf_reports_download_coordinates' created successfully.


### 2.3) Populating Tables

#### 2.3.1) Table `regiones`

In [6]:
# Region names and their corresponding ids (name -> id).
# The file is read explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
region_json_path = os.path.join(project_folder_path, 'utils/json_files/region_name__region_id.json')
with open(region_json_path, encoding='utf-8') as json_file:
    region_name__region_id_dict = json.load(json_file)

# Invert the mapping so that ids become the keys (id -> name).
region_id__region_name_dict = {value: key for key, value in region_name__region_id_dict.items()}

# One row per region; ids are cast to int to match the INTEGER column.
regiones_df = pd.DataFrame(list(region_id__region_name_dict.items()), columns=['region_id', 'region_name'])
regiones_df['region_id'] = regiones_df['region_id'].astype(int)

In [7]:
# Load the rows of `regiones_df` into the `regiones` table (project helper;
# prints a confirmation message on success).
# NOTE(review): insert mode (append vs. replace) is defined in utils.db_functs
# — confirm before re-running against an already populated database.
fill_table_from_dataframe(db_file_path, 'regiones', regiones_df)

Table 'regiones' filled successfully.


#### 2.3.2) Table `comunas`

In [8]:
# Commune ids and their corresponding names (id -> name).
# Commune names contain accented characters, so the file is read explicitly
# as UTF-8 instead of relying on the platform's default encoding.
commune_json_path = os.path.join(project_folder_path, 'utils/json_files/commune_id__commune_name.json')
with open(commune_json_path, encoding='utf-8') as json_file:
    commune_id__commune_name_dict = json.load(json_file)

# One row per commune; ids are cast to int to match the INTEGER column.
comunas_df = pd.DataFrame(list(commune_id__commune_name_dict.items()), columns=['comuna_id', 'comuna_name'])
comunas_df['comuna_id'] = comunas_df['comuna_id'].astype(int)
comunas_df.head()

Unnamed: 0,comuna_id,comuna_name
0,1,Algarrobo
1,2,Alhué
2,3,Alto Biobío
3,4,Alto del Carmen
4,5,Alto Hospicio


In [9]:
# Region ids and the list of commune ids belonging to each (region -> [communes]).
# Read explicitly as UTF-8 so decoding does not depend on the platform default.
region_commune_json_path = os.path.join(project_folder_path, 'utils/json_files/region_id__commune_id.json')
with open(region_commune_json_path, encoding='utf-8') as json_file:
    region_id__commune_id_dict = json.load(json_file)

# Flatten the mapping into one {comuna_id, region_id} record per commune.
data_list = [
    {'comuna_id': commune_id, 'region_id': region_id}
    for region_id, commune_ids in region_id__commune_id_dict.items()
    for commune_id in commune_ids
]

# Explicit columns keep the frame well-formed even if the mapping is empty.
comuna_region_df = pd.DataFrame(data_list, columns=['comuna_id', 'region_id'])
comuna_region_df['comuna_id'] = comuna_region_df['comuna_id'].astype(int)
comuna_region_df['region_id'] = comuna_region_df['region_id'].astype(int)

# Add the region_id column to the communes frame. Merging from the two base
# columns keeps this cell idempotent: re-running it does not produce
# region_id_x / region_id_y duplicates.
comunas_df = pd.merge(
    comunas_df[['comuna_id', 'comuna_name']],
    comuna_region_df,
    on='comuna_id',
    how='left',
)
comunas_df.head()

Unnamed: 0,comuna_id,comuna_name,region_id
0,1,Algarrobo,5
1,2,Alhué,13
2,3,Alto Biobío,8
3,4,Alto del Carmen,3
4,5,Alto Hospicio,1


In [10]:
# Load the rows of `comunas_df` (comuna_id, comuna_name, region_id) into the
# `comunas` table (project helper; prints a confirmation message on success).
# NOTE(review): insert mode (append vs. replace) is defined in utils.db_functs
# — confirm before re-running against an already populated database.
fill_table_from_dataframe(db_file_path, 'comunas', comunas_df)

Table 'comunas' filled successfully.


#### 2.3.3) Table `viewstate_region`

In [11]:
# Region ids and their corresponding viewstate argument (id -> viewstate).
# Read explicitly as UTF-8 so decoding does not depend on the platform default.
viewstate_json_path = os.path.join(project_folder_path, 'utils/json_files/viewstate.json')
with open(viewstate_json_path, encoding='utf-8') as json_file:
    region_id__viewstate_arg_dict = json.load(json_file)

# One row per region id.
viewstate_region_df = pd.DataFrame(list(region_id__viewstate_arg_dict.items()), columns=['region_id', 'viewstate'])
# Drop the '-1' entry before casting ids to int.
# NOTE(review): '-1' presumably is a placeholder rather than a real region — confirm.
viewstate_region_df = viewstate_region_df[viewstate_region_df['region_id'] != '-1'].reset_index(drop=True)
viewstate_region_df['region_id'] = viewstate_region_df['region_id'].astype(int)
viewstate_region_df.head()

Unnamed: 0,region_id,viewstate
0,1,ds+vjgdHrGUwuQ7Wtfp/As3TijOJzWl4Am50L+MDzxvCmr...
1,2,lcOqEJheZeG8vd/rp8p1BHYU1s1YGJK7nW9VFbYJ4VI125...
2,3,SNPooV+wLykHbTfoFObmoAJDyFVIlmHpTxSR23R7R2dIT2...
3,4,QNFNTRngIyTWphwxY8ZrPhncTggApLaQxkJrZbrtGt2ptA...
4,5,G526bK3zKCD6OoX+Bzsh4skc8hNblOitiwFLPKk4JlcvUu...


In [12]:
# Load the rows of `viewstate_region_df` into the `viewstate_region` table
# (project helper; prints a confirmation message on success).
# NOTE(review): insert mode (append vs. replace) is defined in utils.db_functs
# — confirm before re-running against an already populated database.
fill_table_from_dataframe(db_file_path, 'viewstate_region', viewstate_region_df)

Table 'viewstate_region' filled successfully.


**The remaining tables are filled in later steps.**

## END