In [None]:
import requests
import time
from datetime import datetime
import pandas as pd
import os
import json
import uuid
import sqlite3
import sys
from dotenv import load_dotenv
# Load the .env file
load_dotenv()
# Access the environment variables
project_folder_path = os.getenv('PROJECT_FOLDER_PATH')
if project_folder_path is None:
    # Fail here with a clear message: every path below is built with
    # os.path.join(project_folder_path, ...), which raises an opaque TypeError on None.
    raise RuntimeError(
        "Environment variable PROJECT_FOLDER_PATH is not set; "
        "define it in the .env file or in the environment before running this notebook."
    )
print(project_folder_path)
# Make the project-local `utils` package importable; the guard keeps sys.path
# free of duplicates when this cell is re-run in the same kernel.
if project_folder_path not in sys.path:
    sys.path.append(project_folder_path)
from utils.db_functs import *
from utils.requests_functs import *
from utils.html_functs import *
from utils.pdf_functs import *
from utils.utils_functs import *

### Database and directories

In [None]:
# SQLite database file, located under <project folder>/data/sqlite.
db_file_name = 'cev-database-reports-v1.db'
db_file_path = os.path.join(
    project_folder_path,
    'data',
    'sqlite',
    db_file_name,
)

## 1) Define a dataframe with download coordinates
The dataframe must contain 1 row per `comuna` meaning 348 rows in total.

In [None]:
# Select every column of every row in the reports_summary table.
# Plain string literal: the text has no placeholders to interpolate.
query = (
    "\n"
    "        SELECT * FROM reports_summary;\n"
    "        "
)
print(query)

In [None]:
# Run the query through the project helper; the result is used as a DataFrame
# by the cells below (it is iterated row by row and exported to Excel).
summary_df = create_dataframe_from_query(db_file_path, query)
# Preview only the first five rows instead of the full table.
summary_df.head(5)

Set additional features to track the status of downloads as well as times and file names

## 3) Check first if any pdf files have been downloaded
This is useful in case a search has been run and has not completed

### 3.1) Read all html files for the corresponding `search_date` and `search_id`

In [None]:
# Root folder that holds the downloaded PDF reports
pdf_files_dir = os.path.join(
    project_folder_path, 'data', 'raw', '3_evals_reports', 'pdf_files'
)

# Create the folder when it is missing, and say so
if os.path.exists(pdf_files_dir) is False:
    os.makedirs(pdf_files_dir)
    print(f"Directory '{pdf_files_dir}' created.")

# Collect the PDF files found by the project helper and compare the count
# with the number of rows in the summary table
pdf_file_paths = find_pdf_files(pdf_files_dir)
print(f'The directory: {pdf_files_dir} contains {len(pdf_file_paths)} out of {len(summary_df.index)}')

### 3.2) Fill in the dataframe rows that correspond to files already downloaded

### Scrape pdf files: 1 file per evaluacion

In [None]:
### PDF Summary

In [None]:
start_time = time.time()

# SQLite table name -> helper that scrapes that part of a v2 report.
# Insertion order is the order in which pages are scraped and inserted.
v2_page_scrapers = {
    'informe_v2_pagina1': scrape_informe_cev_v2_pagina1,
    'informe_v2_pagina2': scrape_informe_cev_v2_pagina2,
    'informe_v2_pagina3_consumos': scrape_informe_cev_v2_pagina3_consumos,
    'informe_v2_pagina3_envolvente': scrape_informe_cev_v2_pagina3_envolvente,
    'informe_v2_pagina4': scrape_informe_cev_v2_pagina4,
    'informe_v2_pagina5': scrape_informe_cev_v2_pagina5,
    'informe_v2_pagina6': scrape_informe_cev_v2_pagina6,
    'informe_v2_pagina7': scrape_informe_cev_v2_pagina7,
}

# The reports root is the same for every row, so build it once.
pdf_reports_root = os.path.join(project_folder_path, 'data', 'raw', '3_evals_reports', 'pdf_files')

# Per-report frames are collected in lists and concatenated once after the loop;
# calling pd.concat on a growing frame inside the loop re-copies all earlier rows.
v2_page_frames = {table_name: [] for table_name in v2_page_scrapers}

total_reports = summary_df.shape[0]
for index, row in summary_df.iterrows():
    print(f'Report {index+1} out of {total_reports}')

    # Only version-2 reports are scraped. No PDF handle is opened here: the
    # scrape helpers receive the file path, so other versions are skipped
    # without touching the file and no handle is left unclosed.
    if str(row['version_evaluacion']) != '2':
        continue

    pdf_file_path = os.path.join(pdf_reports_root, str(row['region_id']), str(row['pdf_file_name']))
    print(pdf_file_path)

    # Scrape every page of this report before writing anything to the database.
    report_frames = {}
    for table_name, scrape_page in v2_page_scrapers.items():
        page_df = scrape_page(pdf_file_path)
        page_df.insert(0, 'eval_id', row['eval_id'])
        report_frames[table_name] = page_df
        # Keep a snapshot so the accumulated data cannot be affected if the
        # insert helper modifies the frame it is given.
        v2_page_frames[table_name].append(page_df.copy())

    # Insert into data base
    for table_name, page_df in report_frames.items():
        insert_unique_rows_from_dataframe(db_file_path, table_name, page_df, unique_columns=['eval_id'])

# Build each accumulated frame with a fresh 0..n-1 index. pd.concat raises on an
# empty list, so fall back to an empty frame when no v2 report was found.
v2_tables = {
    table_name: (pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame())
    for table_name, frames in v2_page_frames.items()
}

# Names used by the cells below.
informe_v2_pagina1_df = v2_tables['informe_v2_pagina1']
informe_v2_pagina2_df = v2_tables['informe_v2_pagina2']
informe_v2_pagina3_consumos_df = v2_tables['informe_v2_pagina3_consumos']
informe_v2_pagina3_envolvente_df = v2_tables['informe_v2_pagina3_envolvente']
informe_v2_pagina4_df = v2_tables['informe_v2_pagina4']
informe_v2_pagina5_df = v2_tables['informe_v2_pagina5']
informe_v2_pagina6_df = v2_tables['informe_v2_pagina6']
informe_v2_pagina7_df = v2_tables['informe_v2_pagina7']

end_time = time.time()
execution_time = end_time - start_time
print("Script execution time:", execution_time, "seconds")

In [None]:
informe_v2_pagina1_df

In [None]:
informe_v2_pagina2_df 

In [None]:
informe_v2_pagina3_consumos_df

In [None]:
informe_v2_pagina4_df

In [None]:
informe_v2_pagina5_df

In [None]:
informe_v2_pagina6_df

In [None]:
informe_v2_pagina7_df

In [None]:
informe_v2_pagina1_df

# Insert the accumulated frames, one database table per scraped page.
# NOTE(review): presumably rows whose eval_id is already stored are skipped by
# the helper — confirm in utils.db_functs.
accumulated_v2_tables = [
    ('informe_v2_pagina1', informe_v2_pagina1_df),
    ('informe_v2_pagina2', informe_v2_pagina2_df),
    ('informe_v2_pagina3_consumos', informe_v2_pagina3_consumos_df),
    ('informe_v2_pagina3_envolvente', informe_v2_pagina3_envolvente_df),
    ('informe_v2_pagina4', informe_v2_pagina4_df),
    ('informe_v2_pagina5', informe_v2_pagina5_df),
    ('informe_v2_pagina6', informe_v2_pagina6_df),
    ('informe_v2_pagina7', informe_v2_pagina7_df),
]
for table_name, table_df in accumulated_v2_tables:
    insert_unique_rows_from_dataframe(db_file_path, table_name, table_df, unique_columns=['eval_id'])

In [None]:
## Save Excel

In [None]:
# Output workbook; the './' prefix makes the path relative to the current working directory.
excel_file_path = './reports_summary.xlsx'

In [None]:
# Write the summary and every accumulated page frame to its own sheet of the
# workbook, in the same order as the report pages.
excel_sheets = [
    ('summary', summary_df),
    ('informe_v2_pagina1', informe_v2_pagina1_df),
    ('informe_v2_pagina2', informe_v2_pagina2_df),
    ('informe_v2_pagina3_consumos', informe_v2_pagina3_consumos_df),
    ('informe_v2_pagina3_envolvente', informe_v2_pagina3_envolvente_df),
    ('informe_v2_pagina4', informe_v2_pagina4_df),
    ('informe_v2_pagina5', informe_v2_pagina5_df),
    ('informe_v2_pagina6', informe_v2_pagina6_df),
    ('informe_v2_pagina7', informe_v2_pagina7_df),
]
for sheet_name, sheet_df in excel_sheets:
    replace_sheet_content(excel_file_path, sheet_name, sheet_df)