# In [None]:
import pandas as pd
import datetime
from pathlib import Path
from src.utils import is_index_exists
import hashlib


# Creates the header-only tracking CSV ('file_name.csv') when it is missing.
def build_dataframe():
    """Create ``file_name.csv`` containing only a header row, if it is missing.

    The file is written relative to the current working directory with the
    columns ``file_name``, ``hash_id`` and ``time_stamp``. An existing file is
    left untouched.
    """
    target = Path('file_name.csv')
    if target.exists():
        return

    header_only = pd.DataFrame(columns=['file_name', 'hash_id', 'time_stamp'])
    header_only.to_csv(target, index=False)

def is_ingested(file_path: str):
    """Tell whether a file's content hash is already recorded; record it if not.

    The record is kept in ``file_name.csv`` in the current working directory.
    When ``is_index_exists()`` returns a falsy value the CSV is deleted, and a
    missing CSV is recreated through ``build_dataframe()`` before being read.

    Args:
        file_path: Path of the file whose contents are hashed with
            ``compute_file_hash``.

    Returns:
        ``True`` if the hash is already present in the ``hash_id`` column
        (nothing is written in that case); ``False`` after a new row has been
        added and the CSV rewritten.
    """
    ledger_path = Path('file_name.csv')

    # NOTE(review): is_index_exists comes from src.utils; presumably it checks
    # the remote vector index -- confirm. A falsy result discards the ledger.
    index_present = is_index_exists()
    if not index_present:
        ledger_path.unlink(missing_ok=True)
    if not ledger_path.exists():
        build_dataframe()

    records = pd.read_csv(ledger_path)
    digest = compute_file_hash(file_path=file_path)

    already_recorded = digest in records['hash_id'].values
    if already_recorded:
        return True

    # Only the base name is stored; the timestamp is naive local time.
    new_row = pd.DataFrame([{
        "file_name": Path(file_path).name,
        "hash_id": digest,
        "time_stamp": datetime.datetime.now()
    }])
    updated = pd.concat([records, new_row], ignore_index=True)
    updated.to_csv(ledger_path, index=False)
    return False


def compute_file_hash(file_path, algorithm='sha256'):
    """Return the hexadecimal digest of a file's contents.

    Args:
        file_path: Path of the file to read; it is opened in binary mode.
        algorithm: Name of the hash algorithm, passed to ``hashlib.new``.

    Returns:
        The digest as a hexadecimal string.
    """
    digest = hashlib.new(algorithm)

    with open(file_path, 'rb') as stream:
        # iter() with a sentinel keeps calling read() until it returns b''
        # (end of file), so the file is consumed in 8192-byte chunks.
        for block in iter(lambda: stream.read(8192), b''):
            digest.update(block)
    return digest.hexdigest()

"""

### The earlier approach was to keep track of ingested files in a CSV: for every uploaded file we stored its hash value, along with other columns, in a CSV file (it could also be a DB).
## BUT
#### build_dataframe and is_ingested were commented out because they did not serve the purpose:
#### if the CSV is deleted, the same document could be re-uploaded;
#### if the namespace is deleted in Pinecone, a file already recorded in the CSV will not be uploaded again because of the hash check.
##  



"""