# Informe CEV (v.2) - All Pages

### Imports and project setup (`PyMuPDF` and project utilities)

In [None]:
import requests
import time
from datetime import datetime
import pandas as pd
import os
import json
import uuid
import sqlite3
import sys
from dotenv import load_dotenv
# Load the .env file so PROJECT_FOLDER_PATH can be configured per machine.
load_dotenv()
# Prefer the configured project folder. The hard-coded path is only a fallback
# for when the variable is missing or empty; previously it always overwrote
# whatever the .env file provided.
project_folder_path = os.getenv('PROJECT_FOLDER_PATH') or '/mnt/c/Users/rober/OneDrive/8_DEVELOPMENT/cev-database'
print(project_folder_path)
# Make the project's `utils` package importable; skip the append when the cell
# is re-run so sys.path does not accumulate duplicates.
if project_folder_path not in sys.path:
    sys.path.append(project_folder_path)
from utils.db_functs import *
from utils.requests_functs import *
from utils.html_functs import *
from utils.pdf_functs import *

In [None]:
# Folder (under the project root) that is searched for PDF files.
pdf_subfolders = ('data', 'raw', '3_evals_reports', 'pdf_files')
directory = os.path.join(project_folder_path, *pdf_subfolders)

### Find the PDF files

In [None]:
def find_pdf_files(directory):
    """Recursively collect the paths of all ``*.pdf`` files under *directory*.

    Args:
        directory: Root folder to walk.

    Returns:
        A list of paths (the walked folder joined with the file name) for
        every file whose name matches ``*.pdf``, in ``os.walk`` order. The
        list is empty when nothing matches or *directory* does not exist.
    """
    # Imported locally because the notebook's import cell never imports
    # fnmatch; without this the function only works if another star import
    # happens to leak the module into the namespace.
    import fnmatch

    return [
        os.path.join(root, filename)
        for root, _, files in os.walk(directory)
        for filename in fnmatch.filter(files, '*.pdf')
    ]

In [None]:
# Discover every PDF under `directory`, keeping only the first 30 results for this run.
pdf_files_paths = find_pdf_files(directory)[:30]
# Alternative: an explicit list of files (disabled).
#pdf_files_paths = ['/mnt/c/Users/rober/OneDrive/8_DEVELOPMENT/cev-database-reports/data/raw/2/pdf_files/2_9_1_a14a0dec57b8c6dbfa82bff0f43fac88612d0728.pdf',
#                  '/mnt/c/Users/rober/OneDrive/8_DEVELOPMENT/cev-database-reports/data/raw/4/pdf_files/4_68_1_9ac6b36c0433976d78d0eb4e67b9c404444e4d88.pdf']

In [None]:
# Summary

In [None]:
# Build one summary row per PDF and derive the identifier columns from the
# file name, which is expected to look like
# <region>_<comuna>_<tipo_evaluacion>_<evaluacion>.pdf.
summary_df = pd.DataFrame(data=pdf_files_paths, columns=['pdf_file_path'])

# Split with os.path instead of a hard-coded '/' so the separator always
# matches the one os.path.join used to build the paths. Unlike
# str.rsplit(..., expand=True), this also works when the file list is empty.
summary_df['directory'] = summary_df['pdf_file_path'].map(os.path.dirname)
summary_df['pdf_file_name'] = summary_df['pdf_file_path'].map(os.path.basename)

id_columns = ['region_id', 'comuna_id', 'tipo_evaluacion_id', 'evaluacion_id']
# regex=False makes '.pdf' a literal on every pandas version; older releases
# default to regex=True, where '.' matches any character.
file_stems = summary_df['pdf_file_name'].str.replace('.pdf', '', regex=False)
# Cap the split at four parts and force exactly four columns so an empty frame
# or an irregular file name cannot break the multi-column assignment. Missing
# parts become NaN, and any extra underscores stay in `evaluacion_id`.
id_parts = file_stems.str.split('_', n=len(id_columns) - 1, expand=True)
id_parts = id_parts.reindex(columns=range(len(id_columns)))
summary_df[id_columns] = id_parts
#summary_df.drop(columns=['pdf_file_path', 'directory'], inplace=True)
summary_df.T

In [None]:
# Placeholder columns, filled in per report by the validation loop.
for pending_column in ('is_pdf_file_valid', 'version_evaluacion', 'codigo_evaluacion'):
    summary_df[pending_column] = None

In [None]:
# Peek at the file name stored under index label 0.
summary_df.loc[0, 'pdf_file_name']

In [None]:
# Classify every PDF by page count and read its evaluation code.
#
# Known layouts, keyed by page count:
#   version        -> value stored in `version_evaluacion`
#   page_number    -> zero-based page holding the evaluation code
#   area           -> (x1, y1, x2, y2) region passed to extract_text_from_area
#   last_line_only -> keep only the last line of the extracted text
report_layouts = {
    4: {'version': '1', 'page_number': 1,
        'area': (60.2, 21.5, 80.3, 25.3), 'last_line_only': True},
    7: {'version': '2', 'page_number': 2,
        'area': (62.3, 30.7, 88.1, 35.1), 'last_line_only': False},
}

total_reports = summary_df.shape[0]
for index, row in summary_df.iterrows():
    print(f'Report {index+1} out of {total_reports}')
    pdf_file_path = row['pdf_file_path']

    # Defaults describe an invalid report; they are only replaced once the
    # document has been opened and its code extracted successfully.
    is_valid, version, codigo_evaluacion = False, None, None
    try:
        # The context manager closes the document even when extraction fails;
        # previously every opened document was left open.
        with fitz.open(pdf_file_path) as pdf_report:
            layout = report_layouts.get(pdf_report.page_count)
            if layout is not None:
                page = pdf_report[layout['page_number']]
                # Evaluation code ("Código evaluación energética")
                extracted_text = extract_text_from_area(page, layout['area'])
                if layout['last_line_only']:
                    codigo_evaluacion = extracted_text.split('\n')[-1]
                else:
                    codigo_evaluacion = extracted_text
                is_valid, version = True, layout['version']
    except Exception as e:
        # Any failure (unreadable file, missing page, extraction error) marks
        # the report as invalid. Report the real exception type instead of
        # always claiming a FileDataError.
        print(f"Could not process {pdf_file_path} ({type(e).__name__}): {e}")
        is_valid, version, codigo_evaluacion = False, None, None

    summary_df.loc[index, 'is_pdf_file_valid'] = is_valid
    summary_df.loc[index, 'version_evaluacion'] = version
    summary_df.loc[index, 'codigo_evaluacion'] = codigo_evaluacion

summary_df.T

In [None]:
# Keep only the rows whose PDF was flagged as valid.
valid_mask = summary_df['is_pdf_file_valid'].eq(True)
summary_df = summary_df.loc[valid_mask]
#summary_df = summary_df[summary_df['version_evaluacion'] == '2']    
summary_df

In [None]:
# Scrape every version-2 report into one DataFrame per page/table and time the run.
start_time = time.time()

# One scraper per output table. Insertion order is the order in which each
# report is scraped (pages 1 to 7, with page 3 producing two tables).
v2_scrapers = {
    'pagina1': scrape_informe_cev_v2_pagina1,
    'pagina2': scrape_informe_cev_v2_pagina2,
    'pagina3_consumos': scrape_informe_cev_v2_pagina3_consumos,
    'pagina3_envolvente': scrape_informe_cev_v2_pagina3_envolvente,
    'pagina4': scrape_informe_cev_v2_pagina4,
    'pagina5': scrape_informe_cev_v2_pagina5,
    'pagina6': scrape_informe_cev_v2_pagina6,
    'pagina7': scrape_informe_cev_v2_pagina7,
}
# Collect the per-report frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies the accumulated data on every iteration.
scraped_frames = {table_name: [] for table_name in v2_scrapers}

total_reports = summary_df.shape[0]
# Count positions rather than index labels: after filtering, the labels are no
# longer contiguous, so "index + 1" could exceed the reported total.
for position, (_, row) in enumerate(summary_df.iterrows(), start=1):
    print(f'Report {position} out of {total_reports}')

    # Only version-2 reports are scraped here.
    if row['version_evaluacion'] != '2':
        continue

    # The scrapers receive the path; the document is deliberately not opened
    # here (the previous fitz.open handle was never used or closed).
    pdf_file_path = row['pdf_file_path']
    print(pdf_file_path)
    for table_name, scrape_page in v2_scrapers.items():
        page_df = scrape_page(pdf_file_path)
        # pd.concat silently drops None entries, so skipping them here keeps
        # the earlier behaviour if a scraper returns nothing.
        if page_df is not None:
            scraped_frames[table_name].append(page_df)

# ignore_index=True gives the same 0..n-1 index as concat + reset_index(drop=True);
# a table with no scraped reports stays an empty DataFrame.
combined_frames = {
    table_name: pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()
    for table_name, frames in scraped_frames.items()
}
informe_v2_pagina1_df = combined_frames['pagina1']
informe_v2_pagina2_df = combined_frames['pagina2']
informe_v2_pagina3_consumos_df = combined_frames['pagina3_consumos']
informe_v2_pagina3_envolvente_df = combined_frames['pagina3_envolvente']
informe_v2_pagina4_df = combined_frames['pagina4']
informe_v2_pagina5_df = combined_frames['pagina5']
informe_v2_pagina6_df = combined_frames['pagina6']
informe_v2_pagina7_df = combined_frames['pagina7']

end_time = time.time()
execution_time = end_time - start_time
print("Script execution time:", execution_time, "seconds")

## Pagina 1

In [None]:
# Notebook display: combined output of scrape_informe_cev_v2_pagina1.
informe_v2_pagina1_df 

## Pagina 2

In [None]:
# Notebook display: combined output of scrape_informe_cev_v2_pagina2.
informe_v2_pagina2_df 

## Pagina 3

In [None]:
# Notebook display: combined output of scrape_informe_cev_v2_pagina3_consumos.
informe_v2_pagina3_consumos_df

In [None]:
# Notebook display: combined output of scrape_informe_cev_v2_pagina3_envolvente.
informe_v2_pagina3_envolvente_df

## Pagina 4

In [None]:
# Notebook display: combined output of scrape_informe_cev_v2_pagina4.
informe_v2_pagina4_df

## Pagina 5

In [None]:
# Notebook display: combined output of scrape_informe_cev_v2_pagina5.
informe_v2_pagina5_df

## Pagina 6

In [None]:
# Notebook display: combined output of scrape_informe_cev_v2_pagina6.
informe_v2_pagina6_df

## Pagina 7

In [None]:
# Notebook display: combined output of scrape_informe_cev_v2_pagina7.
informe_v2_pagina7_df

In [None]:
## Save Excel

In [None]:
# Workbook path handed to replace_sheet_content; relative, so it resolves
# against the notebook's current working directory.
excel_file_path = './pdf_summary.xlsx'

In [None]:
# Sheet name -> DataFrame, passed to replace_sheet_content in this order.
sheets_to_write = {
    'summary': summary_df,
    'informe_v2_pagina1': informe_v2_pagina1_df,
    'informe_v2_pagina2': informe_v2_pagina2_df,
    'informe_v2_pagina3_consumos': informe_v2_pagina3_consumos_df,
    'informe_v2_pagina3_envolvente': informe_v2_pagina3_envolvente_df,
    'informe_v2_pagina4': informe_v2_pagina4_df,
    'informe_v2_pagina5': informe_v2_pagina5_df,
    'informe_v2_pagina6': informe_v2_pagina6_df,
    'informe_v2_pagina7': informe_v2_pagina7_df,
}
for sheet_name, sheet_df in sheets_to_write.items():
    replace_sheet_content(excel_file_path, sheet_name, sheet_df)

# END