In [None]:
import requests
import lxml.html as html
import time
from datetime import datetime
import pandas as pd
import os
import json
import uuid
import sqlite3
import sys
from dotenv import load_dotenv
# Load the variables defined in the local .env file into the process environment.
load_dotenv()
# Root folder of the project: the `utils` imports and every data path used in
# this notebook are resolved relative to it.
project_folder_path = os.getenv('PROJECT_FOLDER_PATH')
if not project_folder_path:
    # os.getenv returns None when the variable is missing. Stop here with an
    # actionable message instead of letting None surface later as an opaque
    # import error or a TypeError raised inside os.path.join.
    raise RuntimeError(
        'PROJECT_FOLDER_PATH is not set; define it in the .env file or in the environment.'
    )
print(project_folder_path)
# Make the project's `utils` package importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
if project_folder_path not in sys.path:
    sys.path.append(project_folder_path)
from utils.db_functs import *
from utils.requests_functs import *
from utils.html_functs import *
from utils.utils_functs import *

### Database and directories

In [None]:
# SQLite database used by this notebook, located at <project>/data/sqlite/.
db_file_name = 'cev-database-coordinates-v1.db'
sqlite_dir = os.path.join(project_folder_path, 'data', 'sqlite')
db_file_path = os.path.join(sqlite_dir, db_file_name)

## 1) Define parameters to run a `search`
This can be a new search or a previous one being resumed.

In [None]:
# Identifier of the search run to process; used below to filter the database
# rows and to build the name of the folder that holds the HTML files.
search_id = '4bd70bab-b20c-4e27-b514-92ff6fa70351'
# Alternative search run (disabled):
#search_id = '010576e8-90f1-42c4-88bc-3965b5c4c238'
# Date of the search run; combined with search_id in the HTML folder name.
search_date = '2024-05-02'
# Presumably the date of the alternative search run above (disabled) - confirm:
#search_date = '2024-04-21'

## 2) Define a dataframe with download coordinates
The dataframe must contain 1 row per `comuna`, meaning 348 rows in total.

In [None]:
# Select every downloaded-HTML record that belongs to the current search.
# The value is wrapped in single quotes: in SQL a double-quoted token is an
# identifier, and SQLite only treats it as a string through a legacy fallback.
# NOTE(review): search_id is interpolated directly into the statement, so it
# must remain a trusted constant; bind it as a parameter instead if
# create_dataframe_from_query supports that.
query = f"""
        SELECT * FROM evals_html_downloaded_files
        WHERE search_id = '{search_id}';
        """
print(query)

In [None]:
# Run the query and discard the `id` column, which is not used downstream.
columns_to_drop = ['id']
df = create_dataframe_from_query(db_file_path, query).drop(columns=columns_to_drop)
# Keep only the files that were downloaded successfully for the current search.
downloaded_ok = df['status'] == 'Successful'
same_search = df['search_id'] == search_id
df = df.loc[downloaded_ok & same_search].reset_index(drop=True)
print(f'{len(df)} rows loaded...')
df.head()

## 3) Read/scrape the HTML files to get all `Evaluaciones`
- 1 row per `vivienda`, `tipo_evaluacion` and `comuna`

In [None]:
# Parameters
chunk_size = 2500  # Adjust based on your memory and performance needs

# Folder that holds the HTML files of this search; the files are grouped in
# one sub-folder per region_id. Built once because it does not change per row.
html_root_path = os.path.join(project_folder_path, 'data', 'raw', '2_evals_comuna_page', f'{search_date}_{search_id}', 'html_files')

# One parsed dataframe per HTML file, accumulated across all chunks
all_chunks_results = []

total_rows = df.shape[0]
chunk_starts = range(0, total_rows, chunk_size)

# Process the dataframe in chunks
for chunk_number, start in enumerate(chunk_starts, start=1):
    print(f'Chunk {chunk_number} out of {len(chunk_starts)}')
    chunk = df.iloc[start:start+chunk_size]
    chunk_results = []
    for index, row in chunk.iterrows():
        # `index` is the row label in `df`, which was reset to 0..n-1 when the
        # dataframe was loaded, so it doubles as a global progress counter.
        if (index + 1) % 250 == 0:
            print(f'Processing evaluacion {index+1} out of {total_rows}')
        html_file_path = os.path.join(html_root_path, str(row['region_id']), row['html_filename'])
        evals_df_i = read_single_html_file(html_file_path)
        # Tag every parsed evaluation with the coordinates of its source page
        evals_df_i['comuna_id'] = row['comuna_id']
        evals_df_i['region_id'] = row['region_id']
        evals_df_i['tipo_evaluacion'] = row['tipo_evaluacion']
        evals_df_i['pagina'] = row['pagina']
        # 1-based position of the evaluation within its page
        evals_df_i['eval_number_pagina'] = list(range(1, len(evals_df_i) + 1))
        evals_df_i['search_id'] = row['search_id']
        evals_df_i['search_date'] = row['search_date']
        chunk_results.append(evals_df_i)

    all_chunks_results.extend(chunk_results)

# pd.concat raises an uninformative ValueError on an empty list; raise the same
# exception type with a message that points at the actual cause.
if not all_chunks_results:
    raise ValueError(f'No successfully downloaded HTML files found for search_id {search_id}; nothing to parse.')

# Concatenate all results into a single dataframe
evals_df = pd.concat(all_chunks_results, ignore_index=True)

In [None]:
# Evaluation id: the four identifying columns are joined with '_' and the
# resulting key is passed through string_to_uuid.
eval_id_columns = ['comuna_id', 'region_id', 'tipo_evaluacion', 'identificacion_vivienda']
eval_key = evals_df[eval_id_columns[0]].astype(str)
for column_name in eval_id_columns[1:]:
    eval_key = eval_key + '_' + evals_df[column_name].astype(str)
evals_df['eval_id'] = eval_key.apply(string_to_uuid)
evals_df['eval_id'] = evals_df['eval_id'].astype(str)

# 1-based running number of the evaluation across the whole dataframe
evals_df['eval_number'] = list(range(1, len(evals_df) + 1))

# Postback argument in the form 'Page$<pagina>'
page_labels = evals_df['pagina'].astype(str)
evals_df['eventargument'] = 'Page$' + page_labels

print(f'{len(evals_df)} evaluaciones loaded ...')

# Keep only the columns used downstream, in a fixed order
output_columns = [
    'eval_id', 'comuna_id', 'region_id', 'tipo_evaluacion',
    'eval_number', 'pagina', 'eval_number_pagina',
    'search_id', 'search_date',
    'identificacion_vivienda', 'tipologia', 'comuna', 'proyecto',
    'CE', 'CEE',
    'codigo_informe', 'codigo_etiqueta', 'viewstate', 'eventargument',
]
evals_df = evals_df[output_columns]

### Looking for duplicates
Each row should be unique.

In [None]:
# Collect every row whose eval_id occurs more than once; keep=False marks all
# copies, not just the repeats, so each duplicated id shows up in full.
is_duplicated = evals_df.duplicated(subset=['eval_id'], keep=False)
duplicated_evals_df = (
    evals_df[is_duplicated]
    .sort_values(by=['region_id', 'comuna_id', 'tipo_evaluacion', 'eval_id'])
    .reset_index(drop=True)
)
print(f'{len(duplicated_evals_df)} duplicated rows... half of them must be removed')
# NOTE(review): this preview is not rendered because it is not the last
# expression of the cell.
duplicated_evals_df.head()
# Inspect the copies of one specific eval_id
is_inspected_eval = duplicated_evals_df['eval_id'] == '1db2ba01-952f-5c70-8145-c6bfb35b5484'
duplicated_evals_df[is_inspected_eval]

### 3.1) Create a DataFrame with `Summary` data.

In [None]:
# Columns that describe an evaluation, independent of where it was scraped
summary_columns = [
    'eval_id', 'comuna_id', 'region_id', 'tipo_evaluacion',
    'identificacion_vivienda', 'tipologia', 'comuna', 'proyecto',
    'CE', 'CEE',
]
# Keep the first occurrence of every eval_id so each evaluation appears once
evals_summary_df = (
    evals_df[summary_columns]
    .drop_duplicates(subset=['eval_id'], keep='first')
    .reset_index(drop=True)
)
print(f'Total rows: {len(evals_summary_df)}')
evals_summary_df.head()

In [None]:
# Occurrences of each eval_id in the summary; every count should be 1 because
# duplicates were dropped when evals_summary_df was built.
evals_summary_df['eval_id'].value_counts()

In [None]:
# Column dtypes of the summary dataframe, shown before it is written to the database.
evals_summary_df.dtypes

In [None]:
# Write the summary rows into the `evals_summary` table.
# NOTE(review): presumably rows whose eval_id already exists in the table are
# skipped - confirm against insert_unique_rows_from_dataframe in utils.
insert_unique_rows_from_dataframe(
    db_file_path,
    'evals_summary',
    evals_summary_df,
    unique_columns=['eval_id'],
)

### 3.2) Create a DataFrame with `Download Coordinates` to then download the `PDF Reports`.

In [None]:
# List the columns available in evals_df before selecting the PDF download coordinates.
evals_df.columns

In [None]:
# Columns needed to locate and request the PDF report of each evaluation
pdf_coordinate_columns = [
    'comuna_id', 'region_id', 'tipo_evaluacion',
    'eval_number', 'pagina', 'eval_number_pagina',
    'search_id', 'search_date', 'eval_id',
    'codigo_informe', 'codigo_etiqueta', 'viewstate', 'eventargument',
]
# Columns that together identify one download coordinate
pdf_coordinate_key = ['comuna_id', 'region_id', 'tipo_evaluacion', 'eval_number', 'search_id']
pdf_coordinates_df = (
    evals_df[pdf_coordinate_columns]
    .drop_duplicates(subset=pdf_coordinate_key, keep='first')
    .reset_index(drop=True)
)
print(f'Total rows: {len(pdf_coordinates_df)}')
pdf_coordinates_df.head()

In [None]:
# Write the download coordinates into `evals_pdf_reports_download_coordinates`.
# NOTE(review): presumably rows that already exist for the same key columns
# are skipped - confirm against insert_unique_rows_from_dataframe in utils.
insert_unique_rows_from_dataframe(
    db_file_path,
    'evals_pdf_reports_download_coordinates',
    pdf_coordinates_df,
    unique_columns=['comuna_id', 'region_id', 'tipo_evaluacion', 'eval_number', 'search_id'],
)

In [None]:
# Column dtypes of the PDF download-coordinates dataframe.
pdf_coordinates_df.dtypes