In [None]:
import requests
import time
from datetime import datetime
import pandas as pd
import os
import json
import uuid
import sqlite3
import sys
from dotenv import load_dotenv
# Load environment variables from the .env file.
load_dotenv()

# PROJECT_FOLDER_PATH is the base for every data path built in this notebook
# and is added to sys.path so the local `utils` package can be imported.
project_folder_path = os.getenv('PROJECT_FOLDER_PATH')
if project_folder_path is None:
    # Fail here with a clear message instead of a TypeError later, the first
    # time the path is passed to os.path.join.
    raise EnvironmentError(
        "PROJECT_FOLDER_PATH is not set; define it in the environment or in the .env file."
    )
print(project_folder_path)

# Re-running this cell must not keep appending the same entry to sys.path.
if project_folder_path not in sys.path:
    sys.path.append(project_folder_path)
from utils.db_functs import *
from utils.requests_functs import *
from utils.html_functs import *
from utils.pdf_functs import *

### Database and directories

In [None]:
# Both SQLite databases live in <project>/data/sqlite.
sqlite_dir = os.path.join(project_folder_path, 'data', 'sqlite')

# Source database: read in this notebook.
source_db_file_name = 'cev-database-coordinates-v1.db'
source_db_file_path = os.path.join(sqlite_dir, source_db_file_name)

# Destination database: written to in this notebook.
destination_db_file_name = 'cev-database-reports-v1.db'
destination_db_file_path = os.path.join(sqlite_dir, destination_db_file_name)

## 1) Define a dataframe with download coordinates
The dataframe must contain 1 row per `comuna` meaning 348 rows in total.

In [None]:
# Select every row and column of the evals_summary table.
query = """
        SELECT * FROM evals_summary;
        """
print(query)

In [None]:
# Run the query against the source database and keep the result as a dataframe.
df = create_dataframe_from_query(source_db_file_path, query)
loaded_row_count = df.shape[0]
print(f'{loaded_row_count} rows loaded')
df.head()

### Write table to database
Only if the table exists and is empty.

In [None]:
# Copy the loaded rows into the destination table, but only when that table
# already exists and holds no rows yet.
table_name = 'evals_summary'
if not check_table_exists(destination_db_file_path, table_name):
    print(f"Table '{table_name}' does not exist in the database.")
else:
    print(f"Table '{table_name}' exists in the database.")
    if is_table_empty(destination_db_file_path, table_name):
        # Use table_name here as well so the check and the insert can never
        # point at different tables.
        insert_unique_rows_from_dataframe(destination_db_file_path, table_name, df, unique_columns=['eval_id'])
    else:
        # Make the skipped insert visible instead of silently doing nothing.
        print(f"Table '{table_name}' already contains rows; nothing inserted.")

## 3) Check all pdf files that have been downloaded
This is useful in case a search has been run and has not completed.

### 3.1) Get all `pdf_file_paths`

In [None]:
# Folder where the downloaded PDF reports are kept.
pdf_files_dir = os.path.join(project_folder_path, 'data', 'raw', '3_evals_reports', 'pdf_files')

# Create the folder on first use so the listing below has something to scan.
directory_is_missing = not os.path.exists(pdf_files_dir)
if directory_is_missing:
    os.makedirs(pdf_files_dir)
    print(f"Directory '{pdf_files_dir}' created.")

# Collect the PDF paths found in the folder and compare the count with the
# number of rows loaded into `df`.
pdf_file_paths = find_pdf_files(pdf_files_dir)
downloaded_count = len(pdf_file_paths)
expected_count = df.shape[0]
print(f'The directory: {pdf_files_dir} contains {downloaded_count} out of {expected_count}')

### 3.2) Fill the dataframe with the rows corresponding to files already downloaded

In [None]:
# Build one row per downloaded PDF and decode the identifiers embedded in its
# file name. The column order below implies the pattern
# <region_id>_<comuna_id>_<tipo_evaluacion>_<eval_id>.pdf -- TODO confirm
# against the code that writes the files.
summary_df = pd.DataFrame(data=pdf_file_paths, columns=['pdf_file_path'])

# os.path splits on the platform's own separator (a hard-coded '/' does not),
# and mapping element-wise also works when no PDF has been downloaded yet.
summary_df['directory'] = summary_df['pdf_file_path'].map(os.path.dirname)
summary_df['pdf_file_name'] = summary_df['pdf_file_path'].map(os.path.basename)

id_columns = ['region_id', 'comuna_id', 'tipo_evaluacion', 'eval_id']

# regex=False: on pandas < 2.0 the default is a regular expression, where the
# '.' of '.pdf' would match any character.
file_stems = summary_df['pdf_file_name'].str.replace('.pdf', '', regex=False)
name_parts = file_stems.str.split('_', expand=True)

if name_parts.shape[1] > len(id_columns):
    # Same exception type pandas raises on the column-count mismatch, but
    # naming the offending files.
    unexpected_names = summary_df.loc[
        file_stems.str.count('_') >= len(id_columns), 'pdf_file_name'
    ].tolist()
    raise ValueError(
        f"PDF file names with more than {len(id_columns)} '_'-separated parts: {unexpected_names[:5]}"
    )

# Pad to exactly four columns so an empty listing (or names with fewer parts)
# still yields every identifier column, filled with missing values.
name_parts = name_parts.reindex(columns=range(len(id_columns)))
summary_df[id_columns] = name_parts

# Show a bounded preview rather than the whole transposed frame.
summary_df.head().T

In [None]:
# Add the two columns as empty placeholders (None in every row).
for placeholder_column in ('is_pdf_file_valid', 'version_evaluacion'):
    summary_df[placeholder_column] = None

In [None]:
# Size of every PDF on disk, in bytes, looked up from its path.
file_sizes_bytes = summary_df['pdf_file_path'].apply(os.path.getsize)

# Store the size in kilobytes (1 KB = 1024 bytes).
summary_df['file_size_kb'] = file_sizes_bytes / 1024
summary_df.head().T

### Version Evaluacion: v1

In [None]:
# Files of at least 500 KB and under 4000 KB are labelled as version 1.
file_sizes_kb = summary_df['file_size_kb']
is_v1_size = (file_sizes_kb >= 500.0) & (file_sizes_kb < 4000.0)
v1_indexes = summary_df.loc[is_v1_size].index
print(f'{len(v1_indexes)} version evaluacion v1 found...')
summary_df.loc[v1_indexes, 'version_evaluacion'] = 1

### Version Evaluacion: v2

In [None]:
# Files of 4000 KB or more are labelled as version 2.
is_v2_size = summary_df['file_size_kb'] >= 4000.0
v2_indexes = summary_df.loc[is_v2_size].index
print(f'{len(v2_indexes)} version evaluacion v2 found...')
summary_df.loc[v2_indexes, 'version_evaluacion'] = 2

In [None]:
# Rows per version label, including rows that received no label (NaN/None).
version_counts = summary_df['version_evaluacion'].value_counts(dropna=False)
print(f"Total rows: {version_counts.sum()}")
version_counts

In [None]:
# Rows that never received a version label are treated as non valid PDFs:
# their files are deleted from disk and their rows dropped from the dataframe.
non_valid_pdf_index = summary_df[summary_df['version_evaluacion'].isna()].index
print(f"{len(non_valid_pdf_index)} non valid pdf files found...")

# Delete file by file so that one failure neither stops the remaining
# deletions nor goes unnoticed.
for non_valid_pdf_path in summary_df.loc[non_valid_pdf_index, 'pdf_file_path']:
    try:
        os.remove(non_valid_pdf_path)
    except FileNotFoundError:
        # Already gone (e.g. the cell was re-run): nothing left to do.
        pass
    except OSError as error:
        # Still best effort, but report what could not be removed.
        print(f"Could not remove '{non_valid_pdf_path}': {error}")

# Remove non valid pdf files from dataframe
summary_df = summary_df.dropna(subset=['version_evaluacion']).reset_index(drop=True)
print(f'Summary dataframe now has {summary_df.shape[0]} rows')

In [None]:
# Shape the summary into the layout that gets stored: drop the path columns,
# keep the listed columns in this order, and cast the id/version columns to int.
columns_to_drop = ['pdf_file_path', 'directory']
stored_columns = ['eval_id', 'comuna_id', 'region_id', 'tipo_evaluacion', 'version_evaluacion', 'pdf_file_name']
integer_columns = ['comuna_id', 'region_id', 'tipo_evaluacion', 'version_evaluacion']

to_store_df = (
    summary_df
    .drop(columns=columns_to_drop)
    .reset_index(drop=True)
    .loc[:, stored_columns]
    .astype({column_name: int for column_name in integer_columns})
)
print(f"Total rows: {to_store_df['version_evaluacion'].value_counts(dropna=False).sum()}")
to_store_df

In [None]:
# Total number of missing cells across the whole frame.
missing_per_column = to_store_df.isna().sum()
missing_per_column.sum()

### Get all those rows already stored in the database

In [None]:
# Select every row and column of the reports_summary table.
query = """
        SELECT * FROM reports_summary;
        """
print(query)

In [None]:
# Run the query against the destination database to get the rows stored so far.
stored_df = create_dataframe_from_query(destination_db_file_path, query)
stored_row_count = stored_df.shape[0]
print(f'{stored_row_count} rows loaded')
stored_df.head()

In [None]:
# Keep only the rows of to_store_df whose eval_id is not in the database yet.
# concat + drop_duplicates(keep=False) is a symmetric difference: it also
# returns rows that exist only in stored_df, and it treats every row as new
# whenever the two frames differ in dtypes or columns.
if 'eval_id' in stored_df.columns:
    # Compare as strings: eval_id comes from file names on one side and from
    # SQLite on the other, so the dtypes may differ.
    stored_eval_ids = set(stored_df['eval_id'].astype(str))
else:
    stored_eval_ids = set()

is_new_row = ~to_store_df['eval_id'].astype(str).isin(stored_eval_ids)
out_df = to_store_df.loc[is_new_row].drop_duplicates().reset_index(drop=True)
print(f'{out_df.shape[0]} rows to be loaded into the database')

In [None]:
# Write to reports_summary only when there is at least one row to insert.
has_rows_to_insert = not out_df.empty
if has_rows_to_insert:
    insert_unique_rows_from_dataframe(
        destination_db_file_path,
        'reports_summary',
        out_df,
        unique_columns=['eval_id'],
    )