In [None]:
import requests
import time
from datetime import datetime
import pandas as pd
import os
import json
import uuid
import sqlite3
import sys
from dotenv import load_dotenv
# Load variables from the local .env file into the process environment
load_dotenv()
# Root folder of the project: used to import the `utils` package and to build data paths
project_folder_path = os.getenv('PROJECT_FOLDER_PATH')
if not project_folder_path:
    # Fail here with a clear message; a missing value would otherwise surface later
    # as a confusing import error or a TypeError inside os.path.join.
    raise RuntimeError("PROJECT_FOLDER_PATH is not set; define it in the .env file or in the environment.")
print(project_folder_path)
# Make the project's `utils` package importable (skip if already added on a previous run)
if project_folder_path not in sys.path:
    sys.path.append(project_folder_path)
from utils.db_functs import *
from utils.requests_functs import *

### Database and directories

In [None]:
# SQLite file that the query below reads from and that the final cell writes to
db_file_name = 'cev-database-coordinates-v1.db'
# Full path: <project folder>/data/sqlite/<db_file_name>
db_file_path = os.path.join(project_folder_path, 'data', 'sqlite', db_file_name)

## 1) Define parameters to run a `search`
This can be a new search or the resumption of a previous one

#### 1.1) New search


# New search: a random identifier plus today's date (YYYY-MM-DD).
# The id is kept as a plain string so it has the same type as the hard-coded ids used
# when resuming a search, and so it can be stored in SQLite (sqlite3 natively binds
# only None, int, float, str and bytes, not UUID objects).
search_id = str(uuid.uuid4())
search_date = datetime.now().strftime("%Y-%m-%d")

#### 1.2) Previous `search`

In [None]:
# Resume a previous search by pinning its id and date.
# NOTE: running this cell after the "new search" assignments overwrites them,
# so skip it when starting a new search.
search_id = '4bd70bab-b20c-4e27-b514-92ff6fa70351'
#search_id = '010576e8-90f1-42c4-88bc-3965b5c4c238'
search_date = '2024-05-02'
#search_date = '2024-04-21'

## 2) Define a dataframe with download coordinates
The dataframe must contain 1 row per `comuna` meaning 348 rows in total.

In [None]:
# Selects each comuna with its region and that region's viewstate value,
# ordered by region id and then comuna id.
# NOTE(review): the LEFT JOIN on viewstate_region would duplicate comunas if a region
# had more than one viewstate row — confirm the table holds one row per region.
query = """
        SELECT c.comuna_id, c.comuna_name, r.region_id, r.region_name, v.viewstate
        FROM comunas AS c
        LEFT JOIN regiones AS r ON c.region_id = r.region_id
        LEFT JOIN viewstate_region AS v ON r.region_id = v.region_id
        ORDER BY r.region_id ASC, c.comuna_id ASC;
        """

In [None]:
# Build the working dataframe from the query (helper imported from the project's utils;
# it is expected to run `query` against the SQLite file at `db_file_path` — confirm)
df = create_dataframe_from_query(db_file_path, query)
df.head()

Set additional features to track the status of downloads as well as times and file names

In [None]:
# Tag every row with the search it belongs to
df['search_id'] = search_id
df['search_date'] = search_date

# Download-tracking columns start empty; they are filled in as files are found or fetched
for tracking_column in ('status', 'html_filename', 'downloaded_at'):
    df[tracking_column] = None

df.head()

## 3) Check first if any HTML files have been downloaded
This is useful in case a search has been run and has not completed

### 3.1) Read all html files for the corresponding `search_date` and `search_id`

In [None]:
# Folder holding the raw HTML pages of this search:
# <project>/data/raw/1_total_evals_comuna/<search_date>_<search_id>/html_files
html_files_dir = os.path.join(
    project_folder_path,
    'data',
    'raw',
    '1_total_evals_comuna',
    f'{search_date}_{search_id}',
    'html_files',
)

# A brand-new search has no folder yet: create it together with any missing parents
if os.path.exists(html_files_dir) is False:
    os.makedirs(html_files_dir)
    print(f"Directory '{html_files_dir}' created.")

# Whatever is already in the folder counts as downloaded for the progress message
html_files = os.listdir(html_files_dir)
print(f'The directory: {html_files_dir} contains {len(html_files)} out of {df.shape[0]}')

### 3.2) Fill in the dataframe rows corresponding to files already downloaded

In [None]:
# Mark the communes whose HTML file is already on disk so they are not downloaded again.
# File names follow '<region_id>_<comuna_id>_<YYYY-MM-DD>_<HH-MM-SS>.html'.

# One lookup key per dataframe row, kept as a separate Series instead of a temporary column
row_keys = df['region_id'].astype(str) + '_' + df['comuna_id'].astype(str)

# Sorted so that, if a commune has several files, the one with the latest timestamp is applied last
for html_file in sorted(html_files):
    name_parts = html_file.split('_')
    try:
        region_id = int(name_parts[0])
        comuna_id = int(name_parts[1])
        download_date = name_parts[2]
        # The local name deliberately differs from the imported `time` module:
        # rebinding `time` to a string here would break time.sleep() in the download loop.
        download_time = name_parts[3].split('.')[0].replace('-', ':')
    except (ValueError, IndexError):
        # Not a file produced by the download loop (e.g. a hidden or checkpoint entry)
        print(f"Skipping unexpected file name: {html_file}")
        continue

    matching_rows = row_keys == f"{region_id}_{comuna_id}"
    if matching_rows.any():
        df.loc[matching_rows, 'status'] = 'Successful'
        df.loc[matching_rows, 'html_filename'] = html_file
        df.loc[matching_rows, 'downloaded_at'] = download_date + ' ' + download_time

df.head()

## 4) Download HTML files per Comuna: Only those that have not been downloaded yet
For a new run, all files must be downloaded

In [None]:
# URL that the download loop sends its POST requests to
HOME_URL = 'http://calificacionenergeticaweb.minvu.cl/Publico/BusquedaVivienda.aspx'

In [None]:
# Download one HTML page per commune that is not yet marked 'Successful'.
# Failed rows are retried in up to MAX_ATTEMPTS passes over the dataframe.
MAX_ATTEMPTS = 5
DELAY_BETWEEN_ATTEMPTS = 15  # seconds to wait before retrying the communes that failed
REQUEST_TIMEOUT = 120  # seconds; without a timeout a stalled connection would block the loop forever

# The target folder is normally created by an earlier cell; make sure it exists before writing
os.makedirs(html_files_dir, exist_ok=True)

for attempt in range(1, MAX_ATTEMPTS + 1):
    # Only rows not yet marked as downloaded are requested in this pass
    pending = df[df['status'] != 'Successful']
    communes_to_process = pending.shape[0]
    print(f"Attempt {attempt} of {MAX_ATTEMPTS}: {communes_to_process} communes to process")

    # `position` counts within the pending rows so the progress message matches the total shown
    for position, (index, row) in enumerate(pending.iterrows(), start=1):
        region_id = row['region_id']
        comuna_id = row['comuna_id']
        rating_type = '-1'
        viewstate = row['viewstate']
        form_data = form_data_consulta(region_id, comuna_id, rating_type, viewstate)

        print(f" * Processing {row['region_name']}, {row['comuna_name']} / file {position} out of {communes_to_process}")

        try:
            response = requests.post(HOME_URL, data=form_data, timeout=REQUEST_TIMEOUT)

            if response.status_code != 200:
                df.loc[index, 'status'] = 'Failed'
                print(f"Request failed for region {row['region_name']}, commune {row['comuna_name']}. Skipping to the next row.")
                continue

            # The timestamp taken right after the request goes into both the file name and the dataframe
            request_time = datetime.now()
            html_filename = str(region_id) + "_" + str(comuna_id) + "_" + request_time.strftime("%Y-%m-%d_%H-%M-%S") + '.html'

            with open(os.path.join(html_files_dir, html_filename), "wb") as f:
                f.write(response.content)

            # Mark the row as done only after the file has been written
            df.loc[index, 'html_filename'] = html_filename
            df.loc[index, 'downloaded_at'] = request_time.strftime("%Y-%m-%d %H:%M:%S")
            df.loc[index, 'status'] = 'Successful'

        except Exception as e:
            # Best effort: any network or file error marks the row as failed so a later pass retries it
            print(f"An error occurred for region {row['region_name']}, commune {row['comuna_name']}: {str(e)}")
            df.loc[index, 'status'] = 'Failed'

    if (df['status'] == 'Successful').all():
        break  # Nothing left to retry
    if attempt < MAX_ATTEMPTS:
        time.sleep(DELAY_BETWEEN_ATTEMPTS)  # Pause only when another pass will follow

# Report the final outcome so unfinished downloads are not silently stored
remaining = int((df['status'] != 'Successful').sum())
if remaining:
    print(f"{remaining} communes could not be downloaded after {MAX_ATTEMPTS} attempts")
else:
    print("All communes downloaded successfully")


In [None]:
# These columns are only needed for the requests and progress messages; they are removed
# before the records are saved
columns_to_drop = ['region_name', 'comuna_name', 'viewstate']
# errors='ignore' keeps the cell re-runnable: on a second run the columns are already gone
df = df.drop(columns=columns_to_drop, errors='ignore')
df.head()

### Save to database
Save only those records that are not yet there.

In [None]:
# Write the download records to table 'html_files_by_comuna_and_search'.
# Presumably rows whose (comuna_id, region_id, search_id) combination already exists are
# skipped — confirm against the helper's implementation.
insert_unique_rows_from_dataframe(db_file_path, 'html_files_by_comuna_and_search', df, unique_columns=['comuna_id', 'region_id', 'search_id'])

### END