In [None]:
import requests
import time
from datetime import datetime
import pandas as pd
import os
import json
import uuid
import sqlite3
import sys
from dotenv import load_dotenv
# Load variables from the local .env file into the process environment.
load_dotenv()

# Root folder of the project: the data paths used below and the local `utils`
# package are resolved relative to it.
project_folder_path = os.getenv('PROJECT_FOLDER_PATH')
if not project_folder_path:
    # Fail early with an actionable message instead of a later TypeError raised
    # by os.path.join(None, ...).
    raise RuntimeError(
        "PROJECT_FOLDER_PATH is not set; define it in the environment or in the .env file."
    )
print(project_folder_path)

# Make the project's `utils` package importable; the membership check avoids
# appending a duplicate entry every time this cell is re-run.
if project_folder_path not in sys.path:
    sys.path.append(project_folder_path)
from utils.db_functs import *
from utils.requests_functs import *
from utils.html_functs import *
from utils.pdf_functs import *

### Database and directories

In [None]:
# SQLite database file, stored under <project>/data/sqlite.
db_file_name = 'cev-database-coordinates-v1.db'
sqlite_dir = os.path.join(project_folder_path, 'data', 'sqlite')
db_file_path = os.path.join(sqlite_dir, db_file_name)

## 1) Define parameters to run a `search`
This can be a new search or the resumption of a previous one.

In [None]:
# Date and identifier of the search to process.
# Alternative values (commented out):
#   search_date = '2024-04-21'
#   search_id = '010576e8-90f1-42c4-88bc-3965b5c4c238'
search_date = '2024-05-02'
search_id = '4bd70bab-b20c-4e27-b514-92ff6fa70351'

## 2) Define a dataframe with download coordinates
The dataframe must contain 1 row per `comuna` meaning 348 rows in total.

In [None]:
# Maximum number of coordinates to sample in this run.
limit = 250

# Random sample of the download coordinates of the selected search that have a
# report code. The filter uses the `search_id` variable defined in the parameters
# cell, so changing it there actually changes the rows that are selected.
# NOTE(review): the values are interpolated into the SQL text because the signature
# of create_dataframe_from_query is not visible here; `limit` is coerced to int and
# `search_id` must remain a trusted constant.
query = f"""
        SELECT * FROM evals_pdf_reports_download_coordinates
        WHERE search_id = '{search_id}'
         AND codigo_informe IS NOT NULL
        ORDER BY RANDOM()
        LIMIT {int(limit)};
        """
print(query)

In [None]:
# Run the query and discard the `id` column in a single chained step.
columns_to_drop = ['id']
df = create_dataframe_from_query(db_file_path, query).drop(columns=columns_to_drop)
df.head()

Set additional features to track the status of downloads as well as times and file names

In [None]:
# Tracking columns, empty until a report is found on disk or downloaded.
for tracking_column in ('status', 'pdf_filename'):
    df[tracking_column] = None
df.head()

## 3) Check first whether any pdf files have already been downloaded
This is useful in case a search was started but did not complete

### 3.1) Read all pdf files for the corresponding `search_date` and `search_id`

In [None]:
# Folder that holds the downloaded PDF reports.
pdf_dir_parts = ('data', 'raw', '3_evals_reports', 'pdf_files')
pdf_files_dir = os.path.join(project_folder_path, *pdf_dir_parts)

# Create the folder the first time the notebook runs.
directory_missing = not os.path.exists(pdf_files_dir)
if directory_missing:
    os.makedirs(pdf_files_dir)
    print(f"Directory '{pdf_files_dir}' created.")

# Collect the PDF files already present and compare against the rows to process.
pdf_file_paths = find_pdf_files(pdf_files_dir)
print(f'The directory: {pdf_files_dir} contains {len(pdf_file_paths)} out of {df.shape[0]}')

### 3.2) Mark in the dataframe all rows corresponding to files already downloaded

In [None]:
# Key identifying each report, built the same way as the downloaded file names:
#   <region_id>_<comuna_id>_<tipo_evaluacion>_<eval_id>
# It is kept as a standalone Series, so no temporary column has to be added to
# (and later dropped from) the dataframe.
report_keys = (
    df['region_id'].astype(str) + '_'
    + df['comuna_id'].astype(str) + '_'
    + df['tipo_evaluacion'].astype(str) + '_'
    + df['eval_id'].astype(str)
)

for pdf_file_path in pdf_file_paths:
    pdf_file_name = os.path.basename(pdf_file_path)
    # Strip only the extension, then split on the first three underscores so an
    # eval_id that itself contains '_' or '.' is preserved intact.
    file_stem = os.path.splitext(pdf_file_name)[0]
    name_parts = file_stem.split('_', 3)
    if len(name_parts) != 4:
        print(f"Skipping file with unexpected name: {pdf_file_name}")
        continue

    try:
        # int() normalizes the numeric parts (e.g. zero-padded values) before
        # they are compared with the dataframe values.
        region_id, comuna_id, tipo_evaluacion = (int(part) for part in name_parts[:3])
    except ValueError:
        # An unrelated PDF in the folder must not abort the whole check.
        print(f"Skipping file with unexpected name: {pdf_file_name}")
        continue
    eval_id = name_parts[3]

    combined_key = f"{region_id}_{comuna_id}_{tipo_evaluacion}_{eval_id}"

    already_downloaded = report_keys == combined_key
    if already_downloaded.any():
        df.loc[already_downloaded, 'status'] = 'Successful'
        df.loc[already_downloaded, 'pdf_filename'] = pdf_file_name

df.head()

In [None]:
# Rows per download status; rows without a status are counted too (dropna=False).
status_counts = df['status'].value_counts(dropna=False)
status_counts

### Download pdf files: 1 file per evaluacion

In [None]:
# URL that the report download requests are sent to.
HOME_URL = "http://calificacionenergeticaweb.minvu.cl/Publico/BusquedaVivienda.aspx"

In [None]:
# Keep only the rows whose report is not on disk yet, renumbered from 0.
pending_mask = df['status'] != 'Successful'
df = df.loc[pending_mask].reset_index(drop=True)
print(f'{df.shape[0]} reports to download')
df.head()

In [None]:
MAX_ATTEMPTS = 5
DELAY_BETWEEN_ATTEMPTS = 15  # seconds to wait before retrying the rows that failed
REQUEST_TIMEOUT = 60  # seconds; requests has no default timeout, so a stalled connection would hang the loop

# Retry loop: each attempt walks the rows that are not yet 'Successful', posts the
# form for that report and stores the response body as a PDF file. The loop stops
# as soon as every row is 'Successful' or after MAX_ATTEMPTS attempts.
for attempt in range(1, MAX_ATTEMPTS + 1):
    reports_to_download = df[df['status'] != 'Successful'].shape[0]
    print(f"Attempt {attempt} of {MAX_ATTEMPTS}: {reports_to_download} reports to download")

    for index, row in df.iterrows():
        # Read the status from df (not from `row`, which is a copy) so rows
        # completed earlier are skipped.
        if df.loc[index, 'status'] == 'Successful':
            continue

        region_id = str(row['region_id'])
        comuna_id = str(row['comuna_id'])
        tipo_evaluacion = str(row['tipo_evaluacion'])
        eventargument = 'Page$' + str(row['pagina'])
        viewstate = row['viewstate']
        target_report_label_code = row['codigo_informe']

        # The grid control that receives the postback depends on the evaluation type.
        if tipo_evaluacion == '1':
            eventtarget = 'ctl00$ContentPlaceHolder1$grdViviendasPre'
        else:
            eventtarget = 'ctl00$ContentPlaceHolder1$grdViviendasCal'

        form_data_pdf = form_data_pdf_report(eventtarget, eventargument, viewstate, region_id, comuna_id, tipo_evaluacion, target_report_label_code)

        print(f" * Downloading report {index+1} out of {df.shape[0]}")

        try:
            response = requests.post(HOME_URL, data=form_data_pdf, timeout=REQUEST_TIMEOUT)

            if response.status_code != 200:
                df.loc[index, 'status'] = 'Failed'
                print(f"Request failed for region {region_id}, commune {comuna_id}. Skipping to the next row.")
                continue

            # NOTE(review): a 200 response is stored without checking that the body
            # is actually a PDF - confirm whether the server can answer 200 with an
            # HTML error page.

            # Store the file under the project's pdf folder (one sub-folder per
            # region) instead of a machine-specific absolute path, so the
            # "already downloaded" check at the top of the notebook can find it.
            # NOTE(review): assumes find_pdf_files also searches sub-folders - confirm.
            pdf_path = os.path.join(pdf_files_dir, region_id)
            os.makedirs(pdf_path, exist_ok=True)

            # Formatting (rather than '+' concatenation) also works when eval_id
            # is not stored as a string.
            pdf_filename = f"{region_id}_{comuna_id}_{tipo_evaluacion}_{row['eval_id']}.pdf"

            with open(os.path.join(pdf_path, pdf_filename), "wb") as f:
                f.write(response.content)

            # Mark the row only after the file has been written successfully.
            df.loc[index, 'pdf_filename'] = pdf_filename
            df.loc[index, 'status'] = 'Successful'

        except Exception as e:
            # Best effort: record the failure and let the next attempt retry this row.
            print(f"An error occurred for region {region_id}, commune {comuna_id}: {str(e)}")
            df.loc[index, 'status'] = 'Failed'

    if (df['status'] == 'Successful').all():
        break  # Nothing left to retry

    # Wait before the next attempt; no need to sleep after the last one.
    if attempt < MAX_ATTEMPTS:
        time.sleep(DELAY_BETWEEN_ATTEMPTS)

# Columns that a later clean-up step could drop; currently unused because the
# result is kept as-is.
columns_to_drop = ['codigo_etiqueta', 'eventargument', 'viewstate']
out_df = df
out_df

In [None]:
#insert_unique_rows_from_dataframe(db_file_path, 'evals_html_downloaded_files', out_df, unique_columns=['comuna_id', 'region_id', 'tipo_evaluacion', 'pagina', 'search_id'])