In [None]:
import requests
import time
from datetime import datetime
import pandas as pd
import os
import json
import uuid
import sqlite3
import sys
from dotenv import load_dotenv
# Load the .env file so that PROJECT_FOLDER_PATH (if declared there) is visible via os.getenv
load_dotenv()
# Resolve the project root. The environment variable takes precedence; the
# hardcoded path is kept only as a fallback for machines without a .env entry
# (previously it unconditionally overwrote the value read from the environment).
DEFAULT_PROJECT_FOLDER_PATH = '/mnt/c/Users/rober/OneDrive/8_DEVELOPMENT/cev-database'
project_folder_path = os.getenv('PROJECT_FOLDER_PATH') or DEFAULT_PROJECT_FOLDER_PATH
print(project_folder_path)
# Make the project's `utils` package importable; skip if this cell was already run
if project_folder_path not in sys.path:
    sys.path.append(project_folder_path)
from utils.db_functs import *
from utils.requests_functs import *
from utils.html_functs import *
from utils.pdf_functs import *
from utils.utils_functs import *

### Database and directories

In [None]:
# SQLite database: <project root>/data/sqlite/<db_file_name>
db_file_name = 'cev-database-coordinates-v1.db'
sqlite_dir = os.path.join(project_folder_path, 'data', 'sqlite')
db_file_path = os.path.join(sqlite_dir, db_file_name)

# CSV with the ratings that drive this notebook
csv_file_path = '/mnt/c/Users/rober/OneDrive/8_DEVELOPMENT/cev-database-reports/pipeline/CEV-Chile-2022-September-ratings.csv'

## 1) Define parameters to run a `search`
This can be a new search or the resumption of a previous one.

In [None]:
# Load the ratings CSV into the working dataframe used by every later cell
df = pd.read_csv(csv_file_path)
# Preview the first rows
df.head()

Set additional features to track the status of downloads as well as times and file names

In [None]:
# Fetch the whole `comunas` lookup table from the SQLite database
query = """
        SELECT * FROM comunas;
        """
print(query)
comunas_df = create_dataframe_from_query(db_file_path, query)

# Build name -> id dictionaries from the lookup table (indexed by comuna name once)
comunas_by_name = comunas_df.set_index('comuna_name')
comuna_id_by_name = comunas_by_name['comuna_id'].to_dict()
region_id_by_name = comunas_by_name['region_id'].to_dict()

# Numeric code for each evaluation status found in the CSV
tipo_evaluacion_by_status = {'Pre-calificación': 1, 'Calificación': 2}

# Add the id columns to the ratings dataframe.
# NOTE(review): values absent from a mapping become NaN, which would turn the
# column into floats (and later into strings such as '5.0') — confirm every
# `Comuna` / `Status` value is covered.
df['comuna_id'] = df['Comuna'].map(comuna_id_by_name)
df['region_id'] = df['Comuna'].map(region_id_by_name)
df['tipo_evaluacion'] = df['Status'].map(tipo_evaluacion_by_status)

df.head()

In [None]:
# Evaluation id: produced by applying string_to_uuid to four columns joined
# with '_' in this order: comuna_id, region_id, tipo_evaluacion, Identificación Vivienda.
eval_key_columns = ['comuna_id', 'region_id', 'tipo_evaluacion', 'Identificación Vivienda']
eval_key = df[eval_key_columns[0]].astype(str)
for key_column in eval_key_columns[1:]:
    eval_key = eval_key + '_' + df[key_column].astype(str)

# Store the id as plain strings so it can be concatenated into file names
df['eval_id'] = eval_key.apply(string_to_uuid).astype(str)

In [None]:
# Preview the dataframe with the new `eval_id` column
df.head()

In [None]:
# Both file names share the '<region_id>_<comuna_id>_<tipo_evaluacion>_' prefix;
# they differ only in the trailing identifier (Hash Id vs. eval_id).
file_name_prefix = (
    df['region_id'].astype(str) + '_'
    + df['comuna_id'].astype(str) + '_'
    + df['tipo_evaluacion'].astype(str) + '_'
)
df['old_file_name'] = file_name_prefix + df['Hash Id'].astype(str) + '.pdf'
df['new_file_name'] = file_name_prefix + df['eval_id'].astype(str) + '.pdf'

In [None]:
# Bookkeeping columns for the download state; every row starts out empty (None)
for tracking_column in ('status', 'pdf_filename', 'downloaded_at'):
    df[tracking_column] = None
df.head()

## 3) Check first whether any PDF files have already been downloaded
This is useful when a search was started but did not complete.

### 3.1) Read all html files for the corresponding `search_date` and `search_id`

In [None]:
# Define the directory path
pdf_files_dir = '/mnt/c/Users/rober/OneDrive/8_DEVELOPMENT/cev-database-reports/data'

# Check if the directory exists
if not os.path.exists(pdf_files_dir):
    # If it doesn't exist, create the directory
    os.makedirs(pdf_files_dir)
    print(f"Directory '{pdf_files_dir}' created.")

# List all files in the directory
pdf_file_paths = find_pdf_files(pdf_files_dir)
print(f'The directory: {pdf_files_dir} contains {len(pdf_file_paths)} out of {df.shape[0]}')

### 3.2) Mark in the dataframe the rows corresponding to files already downloaded

In [None]:
# Composite key of every dataframe row, laid out like the downloaded file names:
# '<region_id>_<comuna_id>_<tipo_evaluacion>_<Hash Id>'.
# It is built once here instead of being recomputed for every file in the loop.
row_keys = (
    df['region_id'].astype(str) + '_'
    + df['comuna_id'].astype(str) + '_'
    + df['tipo_evaluacion'].astype(str) + '_'
    + df['Hash Id'].astype(str)
)
# key -> list of dataframe index labels carrying that key (constant-time lookup per file)
rows_by_key = {}
for row_label, row_key in row_keys.items():
    rows_by_key.setdefault(row_key, []).append(row_label)

# Later cells read 'pdf_file_path'; create it up front so it exists even when no file matches
if 'pdf_file_path' not in df.columns:
    df['pdf_file_path'] = None

count = 0
for count, pdf_file_path in enumerate(pdf_file_paths, start=1):
    print(f'file {count} out of {len(pdf_file_paths)}')
    pdf_file_name = os.path.basename(pdf_file_path)
    # Drop the extension, then split into at most 4 parts so that a Hash Id
    # containing '_' stays in one piece
    pdf_file_stem = os.path.splitext(pdf_file_name)[0]
    pdf_file_list = pdf_file_stem.split('_', 3)
    if len(pdf_file_list) != 4:
        print(f'Skipping {pdf_file_name}: name does not have 4 "_"-separated parts')
        continue
    try:
        # int() round-trip normalises the numeric parts (e.g. '05' -> '5')
        region_id = int(pdf_file_list[0])
        comuna_id = int(pdf_file_list[1])
        tipo_evaluacion = int(pdf_file_list[2])
    except ValueError:
        # A stray file with a non-numeric prefix must not abort the whole scan
        print(f'Skipping {pdf_file_name}: non-numeric region/comuna/tipo prefix')
        continue
    hash_id = pdf_file_list[3]

    file_key = f'{region_id}_{comuna_id}_{tipo_evaluacion}_{hash_id}'
    matched_rows = rows_by_key.get(file_key, [])
    if matched_rows:
        df.loc[matched_rows, 'status'] = 'Successful'
        df.loc[matched_rows, 'pdf_filename'] = pdf_file_name
        df.loc[matched_rows, 'pdf_file_path'] = pdf_file_path

In [None]:
# Peek at the stored path of the first row marked 'Successful'
# (.iloc[0] raises IndexError when no row has that status)
df[df['status'] == 'Successful'][['pdf_file_path']].reset_index(drop=True).iloc[0].values

In [None]:
# Count rows without (True) and with (False) a recorded PDF file name
df['pdf_filename'].isna().value_counts(dropna=False)

In [None]:
# Work on an independent copy of the rows whose PDF was found on disk
new_names_df = df[df['status'] == 'Successful'].reset_index(drop=True).copy()
new_names_df['rename_status'] = None

# Rename each PDF from its Hash-Id-based name to its eval_id-based name and
# record the outcome per row in 'rename_status'.
for index, row in new_names_df.iterrows():
    print(f'file {index + 1} out of {new_names_df.shape[0]}')
    old_pdf_file_path = row['pdf_file_path']
    # Keep the file in its own directory (the target directory is taken from
    # this row's path, not from a variable left over from an earlier loop)
    new_pdf_file_path = os.path.join(os.path.dirname(old_pdf_file_path), row['new_file_name'])

    if old_pdf_file_path == new_pdf_file_path:
        rename_status = 'unchanged'
    elif os.path.exists(new_pdf_file_path):
        # os.rename silently replaces an existing file on POSIX; never overwrite
        # another report when two rows resolve to the same new name
        rename_status = 'skipped: target exists'
    else:
        try:
            os.rename(old_pdf_file_path, new_pdf_file_path)
            rename_status = 'renamed'
        except OSError as error:
            # Record the failure and continue with the remaining files
            rename_status = f'failed: {error}'
            print(f'Could not rename {old_pdf_file_path}: {error}')

    new_names_df.at[index, 'rename_status'] = rename_status
    

In [None]:
# Preview the rows that were selected for renaming
new_names_df.head()

In [None]:
# Show the destination path computed in the last iteration of the rename loop
new_pdf_file_path