In [None]:
import requests
import time
from datetime import datetime
import pandas as pd
import os
import json
import uuid
import sqlite3
import sys
from dotenv import load_dotenv
# Load the .env file
load_dotenv()
# Access the environment variables
project_folder_path = os.getenv('PROJECT_FOLDER_PATH')
print(project_folder_path)
# Fail fast with a clear message: the database and PDF paths built later are joined
# onto this value, and os.path.join(None, ...) would otherwise raise an opaque TypeError.
if project_folder_path is None:
    raise EnvironmentError(
        "PROJECT_FOLDER_PATH is not set; define it in the environment or in the .env file"
    )
# Add the project folder to the import path (the `utils.*` imports that follow are
# presumably resolved from it). Skip the append on re-runs so sys.path does not
# accumulate duplicate entries.
if project_folder_path not in sys.path:
    sys.path.append(project_folder_path)
from utils.db_functs import *
from utils.requests_functs import *
from utils.html_functs import *
from utils.pdf_functs import *
from utils.utils_functs import *

### Database and directories

In [None]:
# SQLite database file, located under <project folder>/data/sqlite/.
db_file_name = 'cev-database-reports-v1.db'
sqlite_dir = os.path.join(project_folder_path, 'data', 'sqlite')
db_file_path = os.path.join(sqlite_dir, db_file_name)

## **1) Define all reports to be scraped (from `reports_summary`)**

### 1.1) Get all reports stored in table `reports_summary`

In [None]:
# Select every column of every row in reports_summary.
# (Plain string: there are no placeholders to interpolate.)
query = """
        SELECT * FROM reports_summary;
        """
print(query)

In [None]:
# Run the summary query against the SQLite file and preview the first rows.
summary_df = create_dataframe_from_query(db_file_path, query)
n_summary_rows = len(summary_df)
print(f'{n_summary_rows} evals loaded')
summary_df.head()

### 1.2) Filter only those corresponding to version_evaluacion 2

In [None]:
# Keep only the rows whose version_evaluacion equals 2 and renumber them from 0.
is_version_2 = summary_df['version_evaluacion'] == 2
summary_df = summary_df.loc[is_version_2].reset_index(drop=True)
print(f'{len(summary_df)} rows corresponding to version_evaluacion 2')
# Cell output: number of remaining rows per tipo_evaluacion.
summary_df['tipo_evaluacion'].value_counts()

### 1.3) Get all reports that have already been scraped

In [None]:
# Select the identifiers of the rows already stored in informe_v2_pagina7.
# (Plain string: there are no placeholders to interpolate.)
query2 = """
        SELECT eval_id, codigo_evaluacion FROM informe_v2_pagina7;
        """
print(query2)

In [None]:
# Load the identifiers already present in informe_v2_pagina7 and preview them.
scraped_df = create_dataframe_from_query(db_file_path, query2)
n_scraped_rows = len(scraped_df)
print(f'{n_scraped_rows} evals loaded')
scraped_df.head()

### 1.4) Remove the reports that have already been scraped

In [None]:
# Drop every summary row whose eval_id already appears in scraped_df.
eval_ids = scraped_df['eval_id'].to_list()
already_scraped = summary_df['eval_id'].isin(eval_ids)
to_scrape_df = summary_df.loc[~already_scraped].reset_index(drop=True)
print(f'{len(to_scrape_df)} evals to be scraped')

## **2) Scrape PDF files**

In [None]:
# Shuffle the pending reports, renumber them, and keep only the first 5 rows.
# NOTE(review): the shuffle is unseeded, so the 5 reports kept differ between runs.
to_scrape_df = (
    to_scrape_df
    .sample(frac=1)
    .reset_index(drop=True)
    .head(5)
)
to_scrape_df.head()

In [None]:
# Scrape every pending PDF report in chunks: each chunk's per-page dataframes are
# built in memory, inserted into their SQLite tables, and then released.

# Parameters
chunk_size = 5  # Adjust based on your memory and performance needs

# Initialize an empty list to store results
# (kept because other cells may read this name; nothing in this cell appends to it)
all_chunks_results = []

# One entry per destination table:
#   (table name, scraper called with the open PDF, columns identifying a unique row).
# The order is the order in which pages are scraped and tables are written.
page_scrapers = [
    ('informe_v2_pagina1', scrape_informe_cev_v2_pagina1, ['eval_id']),
    ('informe_v2_pagina2', scrape_informe_cev_v2_pagina2, ['eval_id']),
    ('informe_v2_pagina3_consumos', scrape_informe_cev_v2_pagina3_consumos, ['eval_id']),
    ('informe_v2_pagina3_envolvente', scrape_informe_cev_v2_pagina3_envolvente, ['eval_id', 'codigo_evaluacion', 'orientacion']),
    ('informe_v2_pagina4', scrape_informe_cev_v2_pagina4, ['eval_id', 'codigo_evaluacion', 'mes_id']),
    ('informe_v2_pagina5', scrape_informe_cev_v2_pagina5, ['eval_id']),
    ('informe_v2_pagina6', scrape_informe_cev_v2_pagina6, ['eval_id']),
    ('informe_v2_pagina7', scrape_informe_cev_v2_pagina7, ['eval_id']),
]

# PDFs are read from <project folder>/data/raw/3_evals_reports/pdf_files/<region_id>/<pdf_file_name>.
pdf_root = os.path.join(project_folder_path, 'data', 'raw', '3_evals_reports', 'pdf_files')
total_reports = to_scrape_df.shape[0]
chunk_starts = range(0, total_reports, chunk_size)

# Start the clock once, before the first chunk, so the reported time covers the
# whole run (and the name exists even when there is nothing to scrape).
start_time = time.time()

# Process the dataframe in chunks
for chunk_number, start in enumerate(chunk_starts, start=1):
    print(f'Chunk {chunk_number} out of {len(chunk_starts)}')
    chunk = to_scrape_df.iloc[start:start+chunk_size]

    # Per-table list of scraped frames; concatenated once per chunk instead of
    # growing a dataframe row group by row group.
    frames_by_table = {table_name: [] for table_name, _, _ in page_scrapers}

    for index, row in chunk.iterrows():
        if (index + 1) % max(chunk_size // 10, 1) == 0:
            print(f'    * Report {index+1} out of {total_reports}')

        # Only version-2 reports are scraped; skip anything else before opening its file.
        if str(row['version_evaluacion']) != '2':
            continue

        pdf_path = os.path.join(pdf_root, str(row['region_id']), str(row['pdf_file_name']))
        pdf_report = fitz.open(pdf_path)
        try:
            for table_name, scrape_page, _ in page_scrapers:
                page_df = scrape_page(pdf_report)
                # Tag every scraped row with the evaluation it came from.
                page_df.insert(0, 'eval_id', row['eval_id'])
                frames_by_table[table_name].append(page_df)
        finally:
            # Close the PDF document even if one of the scrapers raises.
            pdf_report.close()

    # Insert into data base
    for table_name, _, unique_columns in page_scrapers:
        frames = frames_by_table[table_name]
        # An empty DataFrame is passed when nothing was scraped, as before.
        table_df = pd.concat(frames, axis=0).reset_index(drop=True) if frames else pd.DataFrame()
        insert_unique_rows_from_dataframe(db_file_path, table_name, table_df, unique_columns=unique_columns)

    # memory release
    del frames_by_table

end_time = time.time()
execution_time = end_time - start_time
print("Script execution time:", execution_time, "seconds")

In [None]:
# NOTE(review): on a top-to-bottom run this name is not defined once the scraping loop
# has finished (the per-chunk data is released after being inserted), so this display
# raises NameError — confirm whether this cell is still needed.
informe_v2_pagina4_df

# Manual insert of the per-page dataframes into their SQLite tables.
# NOTE(review): the scraping loop already inserts every chunk and releases its per-page
# data afterwards, so these dataframe names are only defined if they were created
# separately — confirm this cell is still needed.
# The unique-row keys below match the ones the scraping loop uses for the same tables;
# informe_v2_pagina3_envolvente and informe_v2_pagina4 are keyed on more than eval_id
# there (presumably several rows per evaluation — verify against the table schema).
insert_unique_rows_from_dataframe(db_file_path, 'informe_v2_pagina1', informe_v2_pagina1_df, unique_columns=['eval_id'])
insert_unique_rows_from_dataframe(db_file_path, 'informe_v2_pagina2', informe_v2_pagina2_df, unique_columns=['eval_id'])
insert_unique_rows_from_dataframe(db_file_path, 'informe_v2_pagina3_consumos', informe_v2_pagina3_consumos_df, unique_columns=['eval_id'])
insert_unique_rows_from_dataframe(db_file_path, 'informe_v2_pagina3_envolvente', informe_v2_pagina3_envolvente_df, unique_columns=['eval_id', 'codigo_evaluacion', 'orientacion'])
insert_unique_rows_from_dataframe(db_file_path, 'informe_v2_pagina4', informe_v2_pagina4_df, unique_columns=['eval_id', 'codigo_evaluacion', 'mes_id'])
insert_unique_rows_from_dataframe(db_file_path, 'informe_v2_pagina5', informe_v2_pagina5_df, unique_columns=['eval_id'])
insert_unique_rows_from_dataframe(db_file_path, 'informe_v2_pagina6', informe_v2_pagina6_df, unique_columns=['eval_id'])
insert_unique_rows_from_dataframe(db_file_path, 'informe_v2_pagina7', informe_v2_pagina7_df, unique_columns=['eval_id'])

### Save Excel

# Write the summary dataframe and each per-page dataframe to its own sheet of a
# workbook in the current working directory.
# NOTE(review): the per-page dataframe names are not defined once the scraping loop has
# finished on a top-to-bottom run, so every call after the 'summary' one would raise
# NameError — confirm where these frames are expected to come from.
excel_file_path = './reports_summary.xlsx'

# One call per sheet; replace_sheet_content is a project helper (presumably it
# overwrites the named sheet with the given dataframe — verify).
replace_sheet_content(excel_file_path, 'summary', summary_df)
replace_sheet_content(excel_file_path, 'informe_v2_pagina1', informe_v2_pagina1_df)
replace_sheet_content(excel_file_path, 'informe_v2_pagina2', informe_v2_pagina2_df)
replace_sheet_content(excel_file_path, 'informe_v2_pagina3_consumos', informe_v2_pagina3_consumos_df)
replace_sheet_content(excel_file_path, 'informe_v2_pagina3_envolvente', informe_v2_pagina3_envolvente_df)
replace_sheet_content(excel_file_path, 'informe_v2_pagina4', informe_v2_pagina4_df)
replace_sheet_content(excel_file_path, 'informe_v2_pagina5', informe_v2_pagina5_df)
replace_sheet_content(excel_file_path, 'informe_v2_pagina6', informe_v2_pagina6_df)
replace_sheet_content(excel_file_path, 'informe_v2_pagina7', informe_v2_pagina7_df)