In [1]:
import os

In [2]:
%pwd

'c:\\Users\\hp\\Desktop\\ML\\project\\Thyroid_Detection\\research'

In [3]:
# The notebook is started from the `research/` folder; move up to the project
# root so the relative `artifacts/...` paths used below resolve.
# Guarded on the folder name so that re-running this cell does not climb a
# second level and silently break every relative path that follows.
if os.path.basename(os.getcwd()) == 'research':
    os.chdir('../')

In [4]:
%pwd

'c:\\Users\\hp\\Desktop\\ML\\project\\Thyroid_Detection'

In [5]:
import pandas as pd

# Show every column when a frame is displayed. `pd.set_option` is the
# documented entry point (`pd.pandas` is only an accidental self-reference).
pd.set_option('display.max_columns', None)

# Raw dataset produced by the data-ingestion stage; path is relative to the
# project root, so the working directory must already be the project root.
data = pd.read_csv('artifacts/data_ingestion/Thyroid_Detection.csv')
data.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class
0,41,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,1.3,t,2.5,t,125,t,1.14,t,109,f,?,SVHC,negative
1,23,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,4.1,t,2,t,102,f,?,f,?,f,?,other,negative
2,46,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.98,f,?,t,109,t,0.91,t,120,f,?,other,negative
3,70,F,t,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.16,t,1.9,t,175,f,?,f,?,f,?,other,negative
4,70,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.72,t,1.2,t,61,t,0.87,t,70,f,?,SVI,negative


In [25]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage.

    Attributes:
        root_dir: Output directory of the stage; the fitted label encoder and
            the ``train.csv`` / ``test.csv`` files are written under it.
        data_path: Location of the CSV file the stage reads its input from.
        encoder_name: File name (joined onto ``root_dir``) under which the
            fitted label encoder is persisted.
    """

    # Where the stage writes its artifacts.
    root_dir: Path
    # Input CSV for the stage.
    data_path: Path
    # File name of the persisted label encoder.
    encoder_name: str

In [26]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml, create_directories
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
import numpy as np
from mlProject import logger
from sklearn.model_selection import train_test_split

In [27]:
class ConfigurationManager:
    """Loads the project's YAML settings and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Loaded in this fixed order: config, then params, then schema.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, yaml_path in yaml_sources:
            setattr(self, attribute, read_yaml(yaml_path))

        # Make sure the top-level artifacts directory exists before any stage runs.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings and create that stage's output directory.

        Returns:
            A ``DataTransformationConfig`` populated from the
            ``data_transformation`` section of the main configuration.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            encoder_name=section.encoder_name,
        )

In [28]:
class DataTransformation:
    """Preprocessing steps for the thyroid dataset.

    The methods are meant to be chained: drop columns, turn ``'?'`` markers
    into NaN, encode categorical columns, impute the gaps and finally write
    the train/test split to ``config.root_dir``.
    """

    # Literal codes the raw CSV uses for boolean flag columns.
    _FLAG_CODES = {'f': 0, 't': 1}

    def __init__(self, config: "DataTransformationConfig"):
        """Keep the stage configuration (input path, output directory, encoder file name)."""
        self.config = config

    def dropUnnecessaryColumns(self, data, columnNameList):
        """Return a copy of ``data`` without the columns in ``columnNameList``.

        Args:
            data: Frame to trim. When ``None`` the frame is read from
                ``self.config.data_path`` instead.
            columnNameList: Column labels to remove.

        Returns:
            A new DataFrame; the frame passed in is left untouched.

        Raises:
            KeyError: If a requested column is not present (raised by pandas).
        """
        # The passed frame used to be overwritten by an unconditional re-read of
        # the CSV, so the argument was silently ignored. Honour it, and only
        # fall back to the configured file when the caller supplies nothing.
        if data is None:
            data = pd.read_csv(self.config.data_path)
        return data.drop(columns=columnNameList)

    def replaceInvalidValuesWithNull(self, data):
        """Return a copy of ``data`` in which every ``'?'`` cell is replaced by NaN.

        Args:
            data: Frame that may contain the ``'?'`` missing-value marker.

        Returns:
            A new DataFrame; the frame passed in is left untouched.
        """
        return data.replace('?', np.nan)

    @classmethod
    def _encode_binary_flags(cls, data):
        """Map every column holding only 'f'/'t' (missing values aside) to 0/1, in place.

        Selecting columns by their actual values rather than by "has exactly two
        unique values" keeps already-numeric two-valued columns (such as the
        encoded ``sex`` column) from being wiped to NaN, and still encodes flag
        columns that contain missing entries or only one of the two codes.
        """
        flag_markers = set(cls._FLAG_CODES)
        for column in data.columns:
            present = set(data[column].dropna().unique())
            if present and present <= flag_markers:
                data[column] = data[column].map(cls._FLAG_CODES)
        return data

    def encodeCategoricalValues(self, data):
        """Encode the categorical columns and persist the label encoder for ``Class``.

        Steps: ``sex`` is mapped F/M -> 0/1, the 'f'/'t' flag columns are mapped
        to 0/1, ``referral_source`` is one-hot encoded and ``Class`` is
        label-encoded. The fitted encoder is dumped with joblib to
        ``config.root_dir / config.encoder_name`` so predictions can be decoded
        back to the original class names later.

        Args:
            data: Frame containing at least ``sex``, ``referral_source`` and ``Class``.

        Returns:
            A new, encoded DataFrame; the frame passed in is left untouched.
        """
        # Work on a copy so the caller's frame is not modified by the column assignments.
        data = data.copy()

        data['sex'] = data['sex'].map({'F': 0, 'M': 1})
        data = self._encode_binary_flags(data)

        # The remaining multi-category feature is expanded into indicator columns.
        data = pd.get_dummies(data, columns=['referral_source'])

        encode = LabelEncoder().fit(data['Class'])
        data['Class'] = encode.transform(data['Class'])

        # Saved so that predicted labels can be decoded back to the original values.
        joblib.dump(encode, os.path.join(self.config.root_dir, self.config.encoder_name))

        return data

    def impute_missing_values(self, data):
        """Fill NaN cells with a 3-nearest-neighbour imputer and round the result.

        Args:
            data: Fully numeric frame (NaN marks the gaps).

        Returns:
            A new DataFrame with the same column labels and a fresh default index.
        """
        imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
        imputed = imputer.fit_transform(data)
        # NOTE(review): np.round is applied to every column, so fractional
        # measurements are rounded to whole numbers too - confirm this is intended.
        return pd.DataFrame(data=np.round(imputed), columns=data.columns)

    def train_test_spliting(self, data, random_state=None):
        """Split ``data`` into train/test sets and write them as CSV files.

        Uses the default 0.75 / 0.25 split and writes ``train.csv`` and
        ``test.csv`` (without the index) into ``config.root_dir``.

        Args:
            data: Frame to split.
            random_state: Optional seed forwarded to ``train_test_split`` to make
                the split reproducible. ``None`` (the default) keeps the previous
                unseeded behaviour.

        Returns:
            The ``(train, test)`` frames that were written.
        """
        train, test = train_test_split(data, random_state=random_state)

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        return train, test
    

        
        


In [29]:
# Build the stage configuration; this loads the YAML files and creates the
# stage's output directory.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)

# Columns excluded from the feature set before encoding and imputation.
COLUMNS_TO_DROP = [
    'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
    'FTI_measured', 'TBG_measured', 'TBG', 'TSH',
]

# Load the stage input explicitly from the configured path instead of relying
# on whatever `data` happens to hold in the kernel, and give every stage its
# own name so the cell can be re-run safely.
thyroid_raw = pd.read_csv(data_transformation_config.data_path)
thyroid_reduced = data_transformation.dropUnnecessaryColumns(thyroid_raw, COLUMNS_TO_DROP)
thyroid_nulls = data_transformation.replaceInvalidValuesWithNull(thyroid_reduced)
thyroid_encoded = data_transformation.encodeCategoricalValues(thyroid_nulls)
thyroid_imputed = data_transformation.impute_missing_values(thyroid_encoded)

# Writes train.csv / test.csv under the stage's root_dir and logs their shapes.
# The trailing semicolon keeps the notebook from echoing any return value.
data_transformation.train_test_spliting(thyroid_imputed);

[2023-10-30 17:28:49,244: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-10-30 17:28:49,246: INFO: common: yaml file: params.yaml loaded successfully]
[2023-10-30 17:28:49,251: INFO: common: yaml file: schema.yaml loaded successfully]
[2023-10-30 17:28:49,257: INFO: common: created directory at: artifacts]
[2023-10-30 17:28:49,257: INFO: common: created directory at: artifacts/data_transformation]


[2023-10-30 17:28:50,199: INFO: 1380358785: Splited data into training and test sets]
[2023-10-30 17:28:50,199: INFO: 1380358785: (2829, 26)]
[2023-10-30 17:28:50,205: INFO: 1380358785: (943, 26)]
(2829, 26)
(943, 26)
