In [74]:
%pwd

'/home/milad/projects/medical-nlp-pipeline'

In [69]:
import os
os.chdir('../')

In [3]:
%pwd

'/home/milad/projects/medical-nlp-pipeline'

In [None]:
# import opendatasets as od

# # Assign the Kaggle data set URL into variable
# dataset = 'https://www.kaggle.com/datasets/prasad22/ca-independent-medical-review'
# # Using opendatasets let's download the data sets
# od.download(dataset)

In [4]:
from dataclasses import dataclass
from pathlib import Path

# specifies the type of value related to the key in yaml file
@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    source_URL: str

In [5]:
from medical_nlp.constants import *
from medical_nlp.utils.common import read_yaml, create_directories

In [6]:
class configurationManager:
    def __init__(self, config_file_path = CONFIG_FILE_PATH,
                 params_file_path = PARAMS_FILE_PATH):
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)
        
        create_directories([self.config.artifacts_root])
        
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion
        
        create_directories([config.root_dir])
        
        data_ingestion_config = DataIngestionConfig(
            root_dir = config.root_dir,
            source_URL = config.source_URL
        )
        
        return data_ingestion_config

In [60]:
import opendatasets as od
from medical_nlp import logger
from medical_nlp.utils.common import get_size
import os
import shutil
import pandas as pd
import glob

In [73]:
class DataIngestion():
    def __init__(self, config: DataIngestionConfig):
        self.config = config
    
    def download_file(self) -> str:
        
        try:
            file_name = 'kaggle.json'
            dataset_url = self.config.source_URL
            # Check if the folder path exists
            files_in_folder = os.listdir(self.config.root_dir)
            
            # Check if the specified file exists in the folder
            if file_name not in files_in_folder:
                shutil.copy(os.path.join('research/', file_name), self.config.root_dir)
            logger.info(f"Downloading data from {dataset_url} to {str(self.config.root_dir)}")
            os.chdir(self.config.root_dir)
            od.download(dataset_url)
            logger.info(f"Downloaded data from {dataset_url} to {str(self.config.root_dir)}")
            os.chdir('../../')
            csv_files = glob.glob(str(self.config.root_dir) + '/' +  dataset_url.split('/')[-1] + '/' + '*.csv')
            base_file = [file for file in csv_files if 'Custom' not in os.path.basename(file)][0]
            df = pd.read_csv(base_file)
            df_customzied = df[['Findings', 'Type']]
            df_customzied.to_csv(str(self.config.root_dir) + '/' +  dataset_url.split('/')[-1] + '/' + os.path.basename(base_file).split('.')[0] + '_Custom.csv' ,index=False)
            logger.info(f"Saved custom dataset to {str(self.config.root_dir) +  dataset_url.split('/')[-1]}")
        
        except Exception as e:
            raise e

In [76]:
# update the pipeline
try:
    config = configurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.download_file()
except Exception as e:
    raise e

[2024-05-06 04:05:12,452: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-05-06 04:05:12,454: INFO: common: yaml file: params.yaml loaded successfully]
[2024-05-06 04:05:12,455: INFO: common: created directory at: artifacts]
[2024-05-06 04:05:12,456: INFO: common: created directory at: artifacts/data_ingestion]
[2024-05-06 04:05:12,457: INFO: 1556454531: Downloading data from https://www.kaggle.com/datasets/prasad22/ca-independent-medical-review to artifacts/data_ingestion]
Skipping, found downloaded files in "./ca-independent-medical-review" (use force=True to force download)
[2024-05-06 04:05:12,459: INFO: 1556454531: Downloaded data from https://www.kaggle.com/datasets/prasad22/ca-independent-medical-review to artifacts/data_ingestion]
[2024-05-06 04:05:13,155: INFO: 1556454531: Saved custom dataset to artifacts/data_ingestionca-independent-medical-review]
