In [15]:
import os

In [16]:
%pwd

'c:\\Users\\hp\\Desktop\\ML\\project\\Thyroid_Detection'

In [17]:
# Move up to the project root so that relative paths such as config/config.yaml
# and artifacts/... resolve against it.
# Guarded because an unconditional chdir is not idempotent: re-running this cell
# in a live kernel climbs one more level each time, after which the relative
# paths used below no longer exist.
# NOTE(review): assumes the project root is the directory that holds
# config/config.yaml — confirm against the repository layout.
if not os.path.exists(os.path.join('config', 'config.yaml')):
    os.chdir('../')

In [18]:
%pwd

'c:\\Users\\hp\\Desktop\\ML\\project'

In [19]:
import pandas as pd

# Show every column when a frame is displayed. Uses the public pd.set_option;
# the former pd.pandas.set_option only worked through an undocumented
# self-reference inside the pandas package.
pd.set_option('display.max_columns', None)

# Path is relative to the current working directory (expected: project root).
data = pd.read_csv('artifacts/data_ingestion/Thyroid_Detection.csv')
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'artifacts/data_ingestion/Thyroid_Detection.csv'

In [None]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Read-only settings for the data-transformation stage.

    Instances are built by ConfigurationManager.get_data_transformation_config
    from the ``data_transformation`` section of the loaded config. The values
    are passed through exactly as they were loaded.
    NOTE(review): the fields are annotated as Path but nothing here converts
    them — confirm what type the config loader actually yields.
    """
    # Directory the stage writes into (train/test CSVs and the saved encoder).
    root_dir: Path
    # CSV file that DataTransformation.get_data reads.
    data_path: Path
    # File name, joined onto root_dir, under which the fitted label encoder is dumped.
    encoder_name: str

In [None]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
import numpy as np
from mlProject import logger
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

In [None]:
class ConfigurationManager:
    """Loads the project's YAML files and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema files and prepare the artifacts root.

        Args:
            config_filepath: location of the main config YAML.
            params_filepath: location of the params YAML.
            schema_filepath: location of the schema YAML.
        """
        # Same order as before: config, then params, then schema.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        # Hand the top-level artifacts directory to create_directories.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the DataTransformationConfig from the ``data_transformation`` section.

        The stage's root_dir is passed to create_directories before the
        config object is returned.
        """
        stage_settings = self.config.data_transformation

        create_directories([stage_settings.root_dir])

        return DataTransformationConfig(
            root_dir=stage_settings.root_dir,
            data_path=stage_settings.data_path,
            encoder_name=stage_settings.encoder_name,
        )

In [None]:
class DataTransformation:
    """Pre-processing steps that turn the ingested CSV into train/test CSV files.

    Each method is one step (load, drop columns, null out '?', encode, impute,
    separate label, oversample, split); the caller chains them in order.
    """

    def __init__(self, config: "DataTransformationConfig"):
        """Keep the stage settings (data path, output directory, encoder file name)."""
        self.config = config

    def get_data(self):
        """Read the CSV at ``config.data_path`` and return it as a DataFrame."""
        return pd.read_csv(self.config.data_path)

    def dropUnnecessaryColumns(self, data, columnNameList):
        """Return a new frame without the columns listed in ``columnNameList``.

        Raises:
            KeyError: if a listed column is not present (pandas' default for drop).
        """
        return data.drop(columnNameList, axis=1)

    def replaceInvalidValuesWithNull(self, data):
        """Replace the '?' placeholder with NaN in every column that contains it.

        The passed frame is modified in place and is also returned.
        """
        for column in data.columns:
            if (data[column] == '?').any():
                data[column] = data[column].replace('?', np.nan)
        return data

    def encodeCategoricalValues(self, data):
        """Turn the categorical columns into numbers and persist the label encoder.

        * ``sex``: 'F' -> 0, 'M' -> 1.
        * every column whose non-null values are only 'f' / 't': 'f' -> 0, 't' -> 1.
        * ``referral_source``: one-hot encoded with pandas.get_dummies.
        * ``Class``: label-encoded; the fitted encoder is dumped with joblib to
          ``<root_dir>/<encoder_name>`` so predictions can be decoded later.

        The ``sex`` and flag columns are overwritten on the passed frame; the
        returned frame is the new object produced by get_dummies.
        """
        data['sex'] = data['sex'].map({'F': 0, 'M': 1})

        # Decide by the values actually present, not by how many distinct values
        # there are. Counting "exactly two unique values" also matched columns
        # that are not f/t flags (e.g. an already-mapped 0/1 column or a
        # two-class label) and replaced them entirely with NaN, while missing
        # flag columns that are constant or contain NaN.
        flag_values = {'f', 't'}
        for column in data.columns:
            observed = set(data[column].dropna().unique())
            if observed and observed <= flag_values:
                data[column] = data[column].map({'f': 0, 't': 1})

        # The remaining multi-category feature is expanded into indicator columns.
        data = pd.get_dummies(data, columns=['referral_source'])

        encode = LabelEncoder().fit(data['Class'])
        data['Class'] = encode.transform(data['Class'])

        # Saved so the predicted class indices can be mapped back to the
        # original labels at prediction time.
        joblib.dump(encode, os.path.join(self.config.root_dir, self.config.encoder_name))

        return data

    def impute_missing_values(self, data):
        """Fill NaNs with a 3-neighbour, uniformly weighted KNNImputer.

        Returns a new DataFrame with the same column names, built from the
        imputed array after ``np.round``.
        NOTE(review): the rounding is applied to every column, not only the
        encoded categorical ones — confirm that losing the fractional part of
        any continuous feature is intended.
        """
        imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
        imputed = imputer.fit_transform(data)
        return pd.DataFrame(data=np.round(imputed), columns=data.columns)

    def separate_label_feature(self, data, label_column_name):
        """Split ``data`` into features and label.

        Returns:
            (X, Y): ``data`` without ``label_column_name``, and that column itself.
        """
        X = data.drop(labels=label_column_name, axis=1)
        Y = data[label_column_name]
        return X, Y

    def handleImbalanceDataset(self, X, Y, random_state=None):
        """Resample ``X`` / ``Y`` with imblearn's RandomOverSampler.

        Args:
            X: feature frame.
            Y: label column.
            random_state: optional seed forwarded to RandomOverSampler so the
                resampling can be reproduced; ``None`` keeps the previous
                unseeded behaviour.

        Returns:
            (X_sampled, Y_sampled) as returned by ``fit_resample``.
        """
        sampler = RandomOverSampler(random_state=random_state)
        X_sampled, Y_sampled = sampler.fit_resample(X, Y)
        return X_sampled, Y_sampled

    def train_test_spliting(self, X_sampled, Y_sampled, test_size=0.25, random_state=None):
        """Split into train/test sets and write them as CSVs under ``config.root_dir``.

        Writes x_train.csv, x_test.csv, y_train.csv and y_test.csv (no index
        column), then logs and prints the feature-set shapes.

        Args:
            X_sampled: feature frame to split.
            Y_sampled: matching labels.
            test_size: share of rows held out for testing; 0.25 is what
                train_test_split uses when nothing is specified.
            random_state: optional seed for a reproducible split; ``None``
                keeps the previous unseeded behaviour.

        Returns:
            (x_train, x_test, y_train, y_test), so callers do not have to
            re-read the files.

        NOTE(review): the parameter names suggest the data is oversampled
        before it is split; if so, duplicated rows can land in both sets —
        consider splitting first and resampling only the training part.
        """
        x_train, x_test, y_train, y_test = train_test_split(
            X_sampled, Y_sampled, test_size=test_size, random_state=random_state
        )

        outputs = (
            (x_train, "x_train.csv"),
            (x_test, "x_test.csv"),
            (y_train, "y_train.csv"),
            (y_test, "y_test.csv"),
        )
        for split, file_name in outputs:
            split.to_csv(os.path.join(self.config.root_dir, file_name), index = False)

        logger.info("Splited data into training and test sets")
        logger.info(x_train.shape)
        logger.info(x_test.shape)

        print(x_train.shape)
        print(x_test.shape)

        return x_train, x_test, y_train, y_test


      
    
    

        
        


In [None]:
# Run the data-transformation stage end to end:
# load -> drop columns -> null out '?' -> encode -> impute -> separate label -> oversample -> split.
# No try/except wrapper: catching an exception only to re-raise it added nothing.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_trnsformation = DataTransformation(config=data_transformation_config)

data = data_trnsformation.get_data()
data = data_trnsformation.dropUnnecessaryColumns(
    data,
    ['TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
     'FTI_measured', 'TBG_measured', 'TBG', 'TSH'],
)
data = data_trnsformation.replaceInvalidValuesWithNull(data)
data = data_trnsformation.encodeCategoricalValues(data)
data = data_trnsformation.impute_missing_values(data)

# 'Class' is the column encodeCategoricalValues label-encodes as the target.
# The former literal 'TARGET_COLUMN' looks like a schema key rather than a
# column name, and would make the drop in separate_label_feature raise KeyError.
X, Y = data_trnsformation.separate_label_feature(data, label_column_name='Class')
X_sampled, Y_sampled = data_trnsformation.handleImbalanceDataset(X, Y)

# Writes the four split CSVs under the stage's root_dir; the return value is
# not assigned to `data`, so the imputed frame stays available for inspection.
data_trnsformation.train_test_spliting(X_sampled, Y_sampled)

[2023-11-04 18:30:35,259: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-11-04 18:30:35,264: INFO: common: yaml file: params.yaml loaded successfully]
[2023-11-04 18:30:35,273: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-11-04 18:30:35,276: INFO: common: created directory at: artifacts]
[2023-11-04 18:30:35,280: INFO: common: created directory at: artifacts/data_transformation]


FileNotFoundError: [Errno 2] No such file or directory: 'artifacts/data_ingestion/Thyroid_Detection.csv'