In [None]:
import requests
import time
from datetime import datetime
import pandas as pd
import os
import json
import uuid
import sqlite3
import sys
from dotenv import load_dotenv
# Load the .env file so PROJECT_FOLDER_PATH can be read from the environment
load_dotenv()
# Access the environment variables
project_folder_path = os.getenv('PROJECT_FOLDER_PATH')
if project_folder_path is None:
    # Fail fast: a missing variable would otherwise put None on sys.path and
    # only surface later as an unrelated import error or a TypeError in os.path.join.
    raise EnvironmentError(
        "PROJECT_FOLDER_PATH is not set; define it in the .env file or in the environment."
    )
print(project_folder_path)
# The project folder is added to sys.path before the `utils.*` imports that follow
sys.path.append(project_folder_path)
from utils.db_functs import *
from utils.requests_functs import *
from utils.html_functs import *

### Database and directories

In [None]:
# SQLite database file, located under <project folder>/data/sqlite/
db_file_name = 'cev-database-coordinates-v1.db'
sqlite_dir = os.path.join(project_folder_path, 'data', 'sqlite')
db_file_path = os.path.join(sqlite_dir, db_file_name)

## 1) Define parameters to run a `search`
This can be a new search or the continuation of a previous one.

In [None]:
# Search being processed. `search_id` filters the coordinates query and, together
# with `search_date`, names the folder where the html files are stored.
search_date = '2024-05-02'
search_id = '4bd70bab-b20c-4e27-b514-92ff6fa70351'

# Alternative values kept for reference (presumably an earlier search -- confirm):
#search_date = '2024-04-21'
#search_id = '010576e8-90f1-42c4-88bc-3965b5c4c238'

## 2) Define a dataframe with download coordinates
The dataframe must contain one row per `comuna`, i.e. 348 rows in total.

In [None]:
# Select every download coordinate stored for the current search.
# The value is wrapped in single quotes: in SQL, double quotes denote an
# identifier, and SQLite only treats them as a string through a legacy fallback.
query = f"""
        SELECT * FROM evals_html_download_coordinates
        WHERE search_id = '{search_id}';
        """
print(query)

In [None]:
# Run the query and discard the columns that are not needed downstream
columns_to_drop = ['id']
df = create_dataframe_from_query(db_file_path, query).drop(columns=columns_to_drop)
df.head()

Add columns to track each row's download status, download time and html file name.

In [None]:
# Tracking columns start empty; they are filled in as files are found or downloaded
for tracking_column in ('status', 'html_filename', 'downloaded_at'):
    df[tracking_column] = None
df.head()

## 3) Check first whether any html files have already been downloaded
This is useful when a search was started but did not complete.

### 3.1) Read all html files for the corresponding `search_date` and `search_id`

In [None]:
# Folder holding the html files of this search:
# <project folder>/data/raw/2_evals_comuna_page/<search_date>_<search_id>/html_files
search_folder_name = f'{search_date}_{search_id}'
html_files_dir = os.path.join(
    project_folder_path, 'data', 'raw', '2_evals_comuna_page', search_folder_name, 'html_files'
)

# Create the folder the first time this search is run
directory_is_missing = not os.path.exists(html_files_dir)
if directory_is_missing:
    os.makedirs(html_files_dir)
    print(f"Directory '{html_files_dir}' created.")

# Collect the html files already present for this search
html_file_paths = find_html_files(html_files_dir)
print(f'The directory: {html_files_dir} contains {len(html_file_paths)} out of {df.shape[0]}')

### 3.2) Fill in the dataframe rows that correspond to files already downloaded

In [None]:
# Temporary lookup key per row: <region_id>_<comuna_id>_<tipo_evaluacion>_<eventargument>
df['combined_key'] = (
    df['region_id'].astype(str) + '_' + df['comuna_id'].astype(str) + '_'
    + df['tipo_evaluacion'].astype(str) + '_' + df['eventargument'].astype(str)
)

for html_file_path in html_file_paths:
    html_file_name = os.path.split(html_file_path)[-1]
    # File names are written by the download loop as:
    # <region>_<comuna>_<tipo>_<arg first>-<arg last>-<pages>_<YYYY-MM-DD>_<HH-MM-SS>.html
    html_file_list = html_file_name.split('_')

    region_id = int(html_file_list[0])
    comuna_id = int(html_file_list[1])
    tipo_evaluacion = int(html_file_list[2])
    # Rebuild the '$'-separated event argument from its '-'-separated form
    eventargument = '$'.join(html_file_list[3].split('-')[:2])
    # Named *_part so the imported `time` module is not shadowed: the download
    # loop later calls time.sleep(), which would fail if `time` were a string.
    date_part = html_file_list[4]
    time_part = html_file_list[5].split('.')[0].replace('-', ':')
    date_time = date_part + ' ' + time_part

    combined_key = f"{region_id}_{comuna_id}_{tipo_evaluacion}_{eventargument}"

    # Mark the matching row(s), if any, as already downloaded
    row_index = df.index[df['combined_key'] == combined_key]
    if not row_index.empty:
        df.loc[row_index, 'status'] = 'Successful'
        df.loc[row_index, 'html_filename'] = html_file_name
        df.loc[row_index, 'downloaded_at'] = date_time

# Drop the combined_key column at the end
df.drop(columns=['combined_key'], inplace=True)

df.head()


In [None]:
# Tally rows per download status; rows without a status are counted too
status_counts = df['status'].value_counts(dropna=False)
status_counts


### Download html files: 1 file per commune

In [None]:
# URL the download loop POSTs each commune's form data to
HOME_URL = 'http://calificacionenergeticaweb.minvu.cl/Publico/BusquedaVivienda.aspx'

In [None]:
MAX_ATTEMPTS = 5
DELAY_BETWEEN_ATTEMPTS = 15  # seconds to wait before retrying the rows that failed
REQUEST_TIMEOUT = 60  # seconds; without a timeout a stalled connection would block forever

# Each attempt walks the whole dataframe and requests only the rows that are not
# yet 'Successful'. Failed rows are left untouched so the next attempt retries them.
attempts = 0
while attempts < MAX_ATTEMPTS:
    communes_to_process = df[df['status'] != 'Successful'].shape[0]
    print(f"Attempt {attempts + 1} of {MAX_ATTEMPTS}: {communes_to_process} communes to process")

    for index, row in df.iterrows():
        if df.loc[index, 'status'] == 'Successful':
            continue  # Already downloaded (earlier attempt or earlier run)

        region_id = str(row['region_id'])
        commune_id = str(row['comuna_id'])
        tipo_evaluacion = str(row['tipo_evaluacion'])
        eventtarget = row['eventtarget']
        eventargument = row['eventargument']
        number_of_pages = str(row['total_pages'])
        viewstate = row['viewstate']
        form_data = form_data_evaluacion(eventtarget, eventargument, region_id, commune_id, tipo_evaluacion, viewstate)

        print(f" * Processing {region_id}, {commune_id}")

        try:
            response = requests.post(HOME_URL, data=form_data, timeout=REQUEST_TIMEOUT)

            if response.status_code != 200:
                print(f"Request failed for region {region_id}, commune {commune_id}. Skipping to the next row.")
                continue  # Skip to the next row; it will be retried on the next attempt

            # Record the current time after making the request
            request_time = datetime.now()
            downloaded_at = request_time.strftime("%Y-%m-%d %H:%M:%S")

            # Html files are stored in one sub-folder per region
            html_path = os.path.join(html_files_dir, region_id)
            os.makedirs(html_path, exist_ok=True)

            # <region>_<comuna>_<tipo>_<arg first>-<arg last>-<pages>_<date>_<time>.html
            event_parts = eventargument.split('$')
            html_filename = (
                region_id + "_" + commune_id + "_" + tipo_evaluacion + "_"
                + event_parts[0] + '-' + event_parts[-1] + '-' + number_of_pages
                + '_' + request_time.strftime("%Y-%m-%d_%H-%M-%S") + '.html'
            )

            with open(os.path.join(html_path, html_filename), "wb") as f:
                f.write(response.content)

            # Mark the row only once the file is on disk, so a failed write is retried
            df.loc[index, 'status'] = 'Successful'
            df.loc[index, 'html_filename'] = html_filename
            df.loc[index, 'downloaded_at'] = downloaded_at

        except Exception as e:
            # Best effort: report the problem and leave the row pending for the next attempt
            print(f"An error occurred for region {region_id}, commune {commune_id}: {str(e)}")
            continue  # Skip to the next iteration of the loop

    if (df['status'] == 'Successful').all():
        break  # Exit the loop if all rows are successful

    attempts += 1
    if attempts < MAX_ATTEMPTS:
        time.sleep(DELAY_BETWEEN_ATTEMPTS)  # Delay between attempts


In [None]:
# Remove the request-only columns, then keep only rows with no missing values
columns_to_drop = ['eventtarget', 'eventargument', 'viewstate', 'total_evals']
out_df = df.drop(columns=columns_to_drop)
out_df = out_df.dropna()
out_df = out_df.reset_index(drop=True)
out_df

In [None]:
# Store the download log in the database.
# NOTE(review): unique_columns presumably lets the helper skip rows that are
# already in the table -- confirm in utils.db_functs.
unique_columns = ['comuna_id', 'region_id', 'tipo_evaluacion', 'pagina', 'search_id']
insert_unique_rows_from_dataframe(
    db_file_path,
    'evals_html_downloaded_files',
    out_df,
    unique_columns=unique_columns,
)