In [None]:
import requests
import time
from datetime import datetime
import pandas as pd
import os
import json
import uuid
import sqlite3
import sys
from dotenv import load_dotenv
# Load the .env file so its variables are visible through os.getenv
load_dotenv()
# Project root folder, taken from the PROJECT_FOLDER_PATH environment variable
project_folder_path = os.getenv('PROJECT_FOLDER_PATH')
if not project_folder_path:
    # Fail here with an actionable message instead of letting None reach the
    # os.path.join calls in later cells.
    raise RuntimeError(
        "PROJECT_FOLDER_PATH is not set; define it in the environment or in the .env file"
    )
print(project_folder_path)
# Put the project root on sys.path so the `utils` imports below can resolve
# against it; the membership check avoids duplicate entries on cell re-runs.
if project_folder_path not in sys.path:
    sys.path.append(project_folder_path)
# NOTE(review): later cells use `html`, `create_dataframe_from_query` and
# `insert_unique_rows_from_dataframe` without importing them by name;
# presumably these star imports provide them — confirm and import explicitly.
from utils.db_functs import *
from utils.requests_functs import *

### Database and directories

In [None]:
# Database file name and its full path under <project_folder_path>/data/sqlite
db_file_name = 'cev-database-coordinates-v1.db'
db_file_path = os.path.join(project_folder_path, 'data', 'sqlite', db_file_name)

## 1) Define parameters to run a `search`
This can be a new search, or a previous search that is being resumed.

In [None]:
# Identifier of the search to process; used to filter the database rows below
search_id = '4bd70bab-b20c-4e27-b514-92ff6fa70351'
# Alternative search id kept for reference (presumably an earlier run — confirm)
#search_id = '010576e8-90f1-42c4-88bc-3965b5c4c238'
# Date of the search, written as YYYY-MM-DD
search_date = '2024-05-02'
# Alternative search date kept for reference (presumably pairs with the id above — confirm)
#search_date = '2024-04-21'

## 2) Define a dataframe with download coordinates
The dataframe must contain one row per `comuna`, i.e. 348 rows in total.

In [None]:
# Select every row of html_files_by_comuna_and_search that belongs to the
# chosen search.
# The search id is written as a single-quoted SQL string literal: in SQL,
# double quotes denote identifiers, and SQLite treats a double-quoted token as
# a string only as a legacy fallback that can be disabled.
# NOTE(review): the value is interpolated into the SQL text, which is only
# acceptable while search_id is a constant defined in this notebook; switch to
# a bound parameter if create_dataframe_from_query supports one.
query = f"""
        SELECT * FROM html_files_by_comuna_and_search
        WHERE search_id = '{search_id}';
        """
print(query)

In [None]:
# Build the dataframe from the query (presumably executed against the database
# file by the project helper — confirm in utils.db_functs).
df = create_dataframe_from_query(db_file_path, query)
# Parse both date columns into datetimes using their explicit formats
df = df.assign(
    search_date=pd.to_datetime(df['search_date'], format='%Y-%m-%d'),
    downloaded_at=pd.to_datetime(df['downloaded_at'], format='%Y-%m-%d %H:%M:%S'),
)
# Columns that are not needed downstream
columns_to_drop = ['id']
df = df.drop(columns=columns_to_drop)
df.head()

In [None]:
# Keep only the rows whose status is 'Successful' and that belong to the
# selected search, then renumber the index from 0.
is_successful = df['status'] == 'Successful'
is_current_search = df['search_id'] == search_id
df = df[is_successful & is_current_search].reset_index(drop=True)
df.head()

## 3) Read all HTML files 
Parse each file to get the total number of evaluated dwellings (viviendas) per comuna and per evaluation type.

In [None]:
# Page size used to turn a total count into a page count (the value 10 that
# the page-count columns are derived from).
RESULTS_PER_PAGE = 10


def _read_total(parsed, span_id):
    """Return the count shown in the result-message span with id `span_id`.

    Collects the text nodes under ``//strong/span[@id=span_id]`` and parses the
    second one as an integer. Returns 0 when the span yields no text at all.
    """
    texts = parsed.xpath(f'//strong/span[@id="{span_id}"]/descendant-or-self::*/text()')
    # NOTE(review): a non-empty message with a single text node raises
    # IndexError here — confirm the page never renders such a message.
    return int(texts[1]) if texts else 0


def _page_count(total):
    """Return how many pages of RESULTS_PER_PAGE items are needed for `total`."""
    # Integer ceiling division; avoids depending on a `math` import.
    return (total + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE


# Collect the values per row first and assign each column once afterwards,
# instead of writing into the dataframe cell by cell inside the loop.
viewstates = []
precal_totals = []
cal_totals = []
for _, row in df.iterrows():
    # Folder layout: data/raw/1_total_evals_comuna/<search_date>_<search_id>/html_files
    html_path = os.path.join(project_folder_path, 'data', 'raw', '1_total_evals_comuna', f"{row['search_date'].strftime('%Y-%m-%d')}_{row['search_id']}", 'html_files')
    html_file_path = os.path.join(html_path, row['html_filename'])
    # NOTE(review): `html` is not imported by name in this notebook; presumably
    # it is lxml's html module exposed by the utils star imports — confirm.
    parsed = html.parse(html_file_path)
    # Value of the hidden __VIEWSTATE input, stored per row for later cells
    viewstates.append(parsed.xpath('//input[@name="__VIEWSTATE"]/@value')[0])
    # Viviendas Precalificadas
    precal_totals.append(_read_total(parsed, 'ContentPlaceHolder1_ResultadoGrillaPre'))
    # Viviendas Calificadas
    cal_totals.append(_read_total(parsed, 'ContentPlaceHolder1_ResultadoGrillaCal'))

# Totals and page counts are stored as strings, in the same column order as before.
df['viewstate'] = viewstates
df['total_viviendas_precalificadas'] = [str(total) for total in precal_totals]
df['total_paginas_viviendas_precalificadas'] = [str(_page_count(total)) for total in precal_totals]
df['total_viviendas_calificadas'] = [str(total) for total in cal_totals]
df['total_paginas_viviendas_calificadas'] = [str(_page_count(total)) for total in cal_totals]
df.head()

### Viviendas Pre Calificadas

In [None]:
# One record per (comuna, page) of the "viviendas precalificadas" grid
# (tipo_evaluacion '1'). Records are accumulated in a list and turned into a
# dataframe once; concatenating inside the loop copies the frame on every
# iteration and scales quadratically.
precal_records = []
for _, row in df.iterrows():
    total_pages = int(row['total_paginas_viviendas_precalificadas'])
    for page in range(1, total_pages + 1):
        precal_records.append({
            'comuna_id': str(row['comuna_id']),
            'region_id': str(row['region_id']),
            'tipo_evaluacion': '1',
            'pagina': str(page),
            'search_id': row['search_id'],
            'search_date': row['search_date'],
            # Grid control name and page argument (presumably the postback
            # target/argument used when requesting that page — confirm)
            'eventtarget': 'ctl00$ContentPlaceHolder1$grdViviendasPre',
            'viewstate': row['viewstate'],
            'eventargument': 'Page$' + str(page),
            'total_evals': str(row['total_viviendas_precalificadas']),
            'total_pages': str(row['total_paginas_viviendas_precalificadas']),
        })
# With no records this is an empty, column-less frame, as before.
precal_df = pd.DataFrame(precal_records)
precal_df.head()

### Viviendas Calificadas

In [None]:
# One record per (comuna, page) of the "viviendas calificadas" grid
# (tipo_evaluacion '2'). Records are accumulated in a list and turned into a
# dataframe once; concatenating inside the loop copies the frame on every
# iteration and scales quadratically.
cal_records = []
for _, row in df.iterrows():
    total_pages = int(row['total_paginas_viviendas_calificadas'])
    for page in range(1, total_pages + 1):
        cal_records.append({
            'comuna_id': str(row['comuna_id']),
            'region_id': str(row['region_id']),
            'tipo_evaluacion': '2',
            'pagina': str(page),
            'search_id': row['search_id'],
            'search_date': row['search_date'],
            # Grid control name and page argument (presumably the postback
            # target/argument used when requesting that page — confirm)
            'eventtarget': 'ctl00$ContentPlaceHolder1$grdViviendasCal',
            'viewstate': row['viewstate'],
            'eventargument': 'Page$' + str(page),
            'total_evals': str(row['total_viviendas_calificadas']),
            'total_pages': str(row['total_paginas_viviendas_calificadas']),
        })
# With no records this is an empty, column-less frame, as before.
cal_df = pd.DataFrame(cal_records)
cal_df.head()

## Dataframe to save into Database

In [None]:
# Stack the precalificadas and calificadas coordinates into one frame with a
# fresh 0..n-1 index. The earlier empty-DataFrame initialisation was removed
# because it was overwritten immediately.
out_df = pd.concat([precal_df, cal_df], ignore_index=True)
out_df.head()

In [None]:
# Inspect the column dtypes of the combined frame
out_df.dtypes

In [None]:
# Cast the id / counter columns, which were built as strings, to integers.
out_df = out_df.astype({
    'comuna_id': int,
    'region_id': int,
    'tipo_evaluacion': int,
    'pagina': int,
    'total_evals': int,
    'total_pages': int,
})
# Store the date as a 'YYYY-MM-DD' string. Going through pd.to_datetime keeps
# this cell re-runnable: on a second run the column already holds strings, and
# calling .dt directly on it would raise.
out_df['search_date'] = pd.to_datetime(out_df['search_date']).dt.strftime('%Y-%m-%d')

In [None]:
# (rows, columns) of the frame that is about to be saved
out_df.shape

### Save to database
Save only the records that are not already in the table.

In [None]:
# Write out_df to the evals_html_download_coordinates table through the project
# helper, passing the columns that identify a record as `unique_columns`
# (presumably used to skip rows that already exist — confirm in utils.db_functs).
insert_unique_rows_from_dataframe(
    db_file_path,
    'evals_html_download_coordinates',
    out_df,
    unique_columns=[
        'comuna_id',
        'region_id',
        'tipo_evaluacion',
        'pagina',
        'search_id',
    ],
)