In [1]:
#Import packages
import logging
import os
from copy import deepcopy

import pandas as pd
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


#Define global vars
# Location of the raw CSV. Set the THYROID_DATA_PATH environment variable to run the
# notebook on another machine; the default keeps the original author-local path.
fpath = os.environ.get(
    'THYROID_DATA_PATH',
    '/Users/richardmiller/Downloads/thyroid_cancer_risk_data.csv',
)
# Container for per-model results; filled in by later cells.
model_results = {}
# Raw, unprocessed dataset. Downstream cells work on copies of this frame.
df = pd.read_csv(fpath)

In [17]:
import logging

# Root logger configuration: records at INFO and above are written both to
# pipeline.log and to the notebook's console stream.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.FileHandler("pipeline.log"), logging.StreamHandler()],
)

# Module-level logger handed to the pipeline classes defined below.
logger = logging.getLogger(__name__)

# Data Processor

In [30]:
class DataProcessor:
    def __init__(self,data,params,logger):
        self.logger = logger
        self.logger.info('Initlizizing DataProcessor class')
        
        self.data = data
        self.params = params
        self.logger = logger

        
    def feat_ratios(self,top,bottom):
        self.logger.debug('Before ratio encoding the features are: {self.data.columns}')
        self.logger.debug('Before ratio encoding the number of NaNs is: {self.data.isna().sum()}')
        '''Creates a new features by dividing two numerical features.

        Inputs: top (string) - name of feature in dataset to be in the numerator of the ratio.
                bottom (string) - name of feature in dataset to be in the denominator of the ratio.

        Outputs: None - Appends new feature to the dataset, self.data.
        
        '''
        #Divide top/bottom features
        new_feat = top+':'+bottom
        self.data[new_feat] = self.data[top]/self.data[bottom]
        self.logger.debug('After ratio encoding the features are: {self.data.columns}')
        self.logger.debug('After ratio encoding the number of NaNs is: {self.data.isna().sum()}')
        
        
    def encode_binaries(self,binary_feats,binary_map):
        '''Encodes binary feats in a data set using the binary map dictionary. Entries in each feature
            must be keys in the dictionary and will be replaced by the corresponding value in the dictionary.
        
        Inputs:
            binary_feats (list of strings) - list containing strings with binary features 
            from the data set to be encoded.
            
            binary_maps (dictionary) - dictionary that has keys (string) corresponding to 
            entries in a binary feature. The values (int) of the dictionary will replace 
            the corresponding keys.
        
        Outputs:
            None - Replaces the feature with a binary encoded feature in the dataset (self.data).
        '''
        self.logger.debug('Before binary encoding the features are: {self.data.columns}')
        self.logger.debug('Before binary encoding the number of NaNs is: {self.data.isna().sum()}')
        self.data[binary_feats] = self.data[binary_feats].map(lambda x: binary_map[x])
        self.logger.debug('After binary encoding the features are: {self.data.columns}')
        self.logger.debug('After binary encoding the number of NaNs is: {self.data.isna().sum()}')
        
    def encode_catnoms(self, catnom_feats, encode_type='one_hot'):
        '''
        Performs encoding on categorical nominal features. Currently only supports one hot encoding.
        
        Inputs:
            catnom_feats (list of strings) - Each string in the list is the name of a categorical 
            nominal feature to be encoded.
            
            encode_type (str) - Type of encoding to be performed on categorical nominal features.
                Currently only one hot encoding is implimented.
                
        Outputs:
            None - appends the encoded features to the data set (self.data) and drops the unencoded
            features.
        
        '''
        self.logger.debug('Before categorical nominal encoding the features are: {self.data.columns}')
        self.logger.debug('Before categorical nominal encoding the number of NaNs is: {self.data.isna().sum()}')
        if encode_type == 'one_hot':
            #Initialize encoder and fit transform
            ohe_encoder = OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore')
            ohe_encoded = ohe_encoder.fit_transform(self.data[catnom_feats])
            col_names = ohe_encoder.get_feature_names_out(catnom_feats)
            encoded_feats = pd.DataFrame(ohe_encoded,columns=col_names)

            #Reset indices to ensure alignment of the dataframe
            #then concatenate data with encoded feats
            self.data.reset_index(drop=True,inplace=True)
            encoded_feats.reset_index(drop=True,inplace=True)
            self.data = pd.concat([self.data,encoded_feats],axis=1)
            
            #Drop unencoded features to prevent issues with resampling and model fitting.
            self.data.drop(columns=catnom_feats,axis=1,inplace=True)
        self.logger.debug('After categorical nominal encoding the features are: {self.data.columns}')
        self.logger.debug('After categorical nominal encoding the number of NaNs is: {self.data.isna().sum()}')
        
        
    def encode_catords(self,catord_feats,catord_map):
        '''
        Encodes categorical ordinal features using a mapping dictionary. Replaces the unencoded features
        with encoded features.
        
        Inputs: 
            catord_feats (list of strings) - Each string in the list must be a feature name in
                the dataset.
                
            catord_map (dict) - Dictionary of features containing keys (str) that are in the unencoded
                feature and corresponding values (int) that will be replacing the unencoded entries.
                
        Outputs:
            None - Replaces the unencoded feature in the dataset (self.data)
        '''
        self.logger.debug('Before categorical ordinal encoding the features are: {self.data.columns}')
        self.logger.debug('Before categorical ordinal encoding the number of NaNs is: {self.data.isna().sum()}')
        #Apply encoding
        self.data[catord_feats] = self.data[catord_feats].map(lambda x: catord_map[x])
        self.logger.debug('After categorical ordinal encoding the features are: {self.data.columns}')
        self.logger.debug('After categorical ordinal encoding the number of NaNs is: {self.data.isna().sum()}')
        
        
    def smote_tomek(self,target_name):
        '''
        Performs SMOTETomek resampling to prevent model bias towards a majority feature.
        
        This function MUST be used AFTER encoding all features. Non-numeric features will raise
        an error.
        
        Inputs:
            target_name (str) - Name of the feature to be predicted (truth or target).
            
        Outputs:
            None - Resampled replaces the old data.
        '''
        self.logger.debug('Before SMOTE-Tomek resampling the features are: {self.data.columns}')
        self.logger.debug('Before SMOTE-Tomek resampling the number of NaNs is: {self.data.isna().sum()}')
        #Split data into features and target
        X = self.data
        y = self.data.pop(target_name)
        
        #Initialize and fit data
        smote_tomek = SMOTETomek()
        X_resampled, y_resampled = smote_tomek.fit_resample(X,y)
        
        #Concatenate data and replaced ata set.
        self.data = pd.concat([X_resampled,y_resampled],axis=1)
    
        self.logger.debug('After SMOTE-Tomek resampling the features are: {self.data.columns}')
        self.logger.debug('After SMOTE-Tomek resampling the number of NaNs is: {self.data.isna().sum()}')

# Pipeline class

In [26]:
class PipeLine:
    def __init__(self,raw_data,logger,run=True,ratios=None):
        '''
        Initializes PipeLine class that will pass data to DataProcessor class. After a successful run
        all data should be encoded and the data set should be resampled (if necessary).

        Inputs:
            raw_data (pd.DataFrame) - Data set that will be encoded and resampled. Stored to retain
                the dataset for future use and testing; processing is done on a copy.

            logger (logging.Logger) - Logger used by the pipeline and handed to DataProcessor.
                If None, the module logger is used.

            run (bool) - If true, the pipeline will pass the data through the encoding class.

            ratios (list/tuple of list/tuples of strings, or 'all') - The features to be turned
                into a ratio are placed into a list/tuple where position 0 is the numerator and
                position 1 is the denominator. If multiple ratios need to be taken then they
                can be passed as a list of lists.

                [(feat1,feat2),(feat3,feat4)] will give the ratios feat1/feat2 and feat3/feat4.

                The string 'all' selects every pair listed under 'feat_ratio_names' in the
                parameter dictionary.

        '''
        #Honour the injected logger instead of silently replacing it.
        self.logger = logger if logger is not None else logging.getLogger(__name__)
        self.logger.info('Initializing PipeLine class')

        self.raw_data = raw_data
        self.ratios = ratios
        #Processed data set; populated by run_pipeline().
        self.data = None

        #Logged only after raw_data has been stored on the instance.
        self.logger.debug('Raw Data Shape: %s', self.raw_data.shape)
        self.logger.debug('Raw Data Features: %s', self.raw_data.columns)

        self.parameter_loader()

        if isinstance(self.ratios, str) and self.ratios == 'all':
            self.ratios = list(self.params['feat_ratio_names'].values())

        if run:
            self.run_pipeline()

    def parameter_loader(self):
        '''
        Generates parameter dictionary. This will be replaced later with a json or yaml loading function.

        Outputs:
            None - Stores the dictionary in self.params.
        '''
        self.logger.info('Loading parameter dictionary.')
        self.params = {
            'feat_categories':{
                'numerical_continuous':['TSH_Level','T3_Level','T4_Level','Nodule_Size',],
                'ordinal':['Age'],
                'binary':['Gender','Family_History','Radiation_Exposure','Iodine_Deficiency',
                    'Smoking','Obesity','Diabetes','Diagnosis',],
                'categorical_nominal':['Country','Ethnicity',],
                'categorical_ordinal':['Thyroid_Cancer_Risk',],
            },
            'encoding_utils':{
                'binary':{'Yes':1.0,'No':0.0,'Male':1.0,'Female':0.0,'Benign':0.0,'Malignant':1.0},
                'categorical_ordinal':{'Low':0.0,'Medium':1.0,'High':2.0},
            },
            'train_test':{
                'test_size':0.2,
            },
            'feat_ratio_names':{
                'ratio_1':['TSH_Level','T3_Level'],
                'ratio_2':['Nodule_Size','T4_Level'],
                'ratio_3':['TSH_Level','T4_Level'],
                'ratio_4':['Nodule_Size','T3_Level']
            },
            'resample':'SMOTETomek',
            'target_name':'Diagnosis',

        }
        self.logger.info('Parameter file loaded successfully.')

    def run_pipeline(self):
        '''
        Runs every processing step in order: ratio features (if requested), binary encoding,
        categorical nominal encoding, categorical ordinal encoding and, when the parameter
        dictionary asks for it, SMOTETomek resampling.

        Outputs:
            None - Stores the processed data set in self.data.
        '''
        self.logger.info('Initializing DataProcessor class.')
        #Initialize data processing class on a copy so that self.raw_data stays
        #unprocessed, as promised in the constructor documentation.
        data_processor = DataProcessor(
            data=self.raw_data.copy(),
            logger=self.logger,
            params=self.params,
        )
        self.logger.info('DataProcessor class initialized successfully.')

        #Make ratio features
        if self.ratios:
            self.logger.info('Creating ratios of features.')
            for numerator, denominator in ((ratio[0], ratio[1]) for ratio in self.ratios):
                data_processor.feat_ratios(numerator, denominator)
            self.logger.info('Ratio features created successfully.')

        #Encode binary features
        self.logger.info('Encoding binary features.')
        data_processor.encode_binaries(
            binary_feats=self.params['feat_categories']['binary'],
            binary_map=self.params['encoding_utils']['binary'],
        )
        self.logger.info('Binary features encoded successfully.')

        #Encode categorical nominal features
        self.logger.info('Begin encoding categorical nominal features.')
        data_processor.encode_catnoms(
            catnom_feats=self.params['feat_categories']['categorical_nominal'],
        )
        self.logger.info('Categorical nominal features encoded successfully.')

        #Encode categorical ordinal features
        self.logger.info('Begin encoding catagorical ordinal features.')
        data_processor.encode_catords(
            catord_feats=self.params['feat_categories']['categorical_ordinal'],
            catord_map=self.params['encoding_utils']['categorical_ordinal'],
        )
        self.logger.info('Categorical ordinal features encoded successfully.')

        #Resample only after all features are numeric.
        if self.params['resample'] == 'SMOTETomek':
            self.logger.info('Begin resampling of data.')
            data_processor.smote_tomek(target_name=self.params['target_name'])
            self.logger.info('Resampling of data completed successfully.')

        self.data = data_processor.data

In [34]:
# Run the full preprocessing pipeline on a deep copy of the loaded frame,
# building every ratio feature listed in the parameter dictionary.
pipeline = PipeLine(
    raw_data=deepcopy(df),
    logger=logger,
    ratios='all',
)

2025-03-28 13:24:08,478 - INFO - Initializing PipeLine class
2025-03-28 13:24:08,479 - INFO - Loading parameter dictionary.
2025-03-28 13:24:08,480 - INFO - Parameter file loaded successfully.
2025-03-28 13:24:08,481 - INFO - Initializing DataProcessor class.
2025-03-28 13:24:08,482 - INFO - Initlizizing DataProcessor class
2025-03-28 13:24:08,483 - INFO - DataProcessor class initialized successfully.
2025-03-28 13:24:08,484 - INFO - Creating ratios of features.
2025-03-28 13:24:08,492 - INFO - Ratio features created successfully.
2025-03-28 13:24:08,493 - INFO - Encoding binary features.
2025-03-28 13:24:08,918 - INFO - Binary features encoded successfully.
2025-03-28 13:24:08,919 - INFO - Begin encoding categorical nominal features.
2025-03-28 13:24:09,224 - INFO - Categorical nominal features encoded successfully.
2025-03-28 13:24:09,225 - INFO - Begin encoding catagorical ordinal features.
2025-03-28 13:24:09,284 - INFO - Categorical ordinal features encoded successfully.
2025-03-2

In [36]:
# Preview the first five rows of the processed data set.
pipeline.data.head(n=5)

Unnamed: 0,Patient_ID,Age,Gender,Family_History,Radiation_Exposure,Iodine_Deficiency,Smoking,Obesity,Diabetes,TSH_Level,...,Country_Nigeria,Country_Russia,Country_South Korea,Country_UK,Country_USA,Ethnicity_Asian,Ethnicity_Caucasian,Ethnicity_Hispanic,Ethnicity_Middle Eastern,Diagnosis
0,1,66,1.0,0.0,1.0,0.0,0.0,0.0,0.0,9.37,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2,29,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.83,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,3,86,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.26,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,4,75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5,35,0.0,1.0,1.0,0.0,0.0,0.0,0.0,9.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model 0 - All Features

In [37]:
# Fixed seed so the split and the forest are reproducible across kernel restarts.
SEED = 42

# NOTE(review): the pipeline applies SMOTE-Tomek to the whole data set before this
# split, so synthetic rows in the training set can be interpolated from rows that end
# up in the test set. The scores below are therefore likely optimistic - consider
# splitting first and resampling only the training portion.
# NOTE(review): Patient_ID looks like a row identifier rather than a predictor -
# confirm whether it should be kept as a feature.
target_col = pipeline.params['target_name']
X = pipeline.data.drop(columns=target_col)
y = pipeline.data[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=pipeline.params['train_test']['test_size'],
    random_state=SEED,
)

model_1 = RandomForestClassifier(random_state=SEED)
model_1.fit(X_train, y_train)

preds_1 = model_1.predict(X_test)

# classification_report returns a pre-formatted string, so print() is what renders it.
report_1 = classification_report(y_test, preds_1)
print(report_1)

              precision    recall  f1-score   support

         0.0       0.85      0.94      0.89     30551
         1.0       0.93      0.83      0.88     30366

    accuracy                           0.89     60917
   macro avg       0.89      0.89      0.89     60917
weighted avg       0.89      0.89      0.89     60917



In [39]:
def model_data_select(data, drop_list, ohe_feats, target):
    '''
    Removes unwanted features from a processed data set before model fitting.

    Inputs:
        data (pd.DataFrame) - Processed data set. It is modified in place.

        drop_list (list of strings) - Names of columns to drop as they are.

        ohe_feats (list of strings) - Names of the original (unencoded) categorical
            features. Every column whose name starts with '<feature>_', i.e. the
            columns produced by one hot encoding that feature, is dropped.

        target (str) - Name of the target feature. Currently unused; kept so that
            existing calls keep working.

    Outputs:
        pd.DataFrame - The same object that was passed in as data, with the
            selected columns removed.

    Raises:
        KeyError - If a name in drop_list is not a column of data.
    '''
    data.drop(columns=drop_list, inplace=True)

    #Look the encoded columns up in the frame being modified, not in the
    #module-level raw frame, which never contains one hot encoded columns.
    prefixes = tuple(feat + '_' for feat in ohe_feats)
    ohe_cols = [col for col in data.columns if prefixes and str(col).startswith(prefixes)]
    data.drop(columns=ohe_cols, inplace=True)

    return data

In [None]:
model