In [None]:
import pandas
import copy
import joblib
import json
import sklearn
import imblearn
import matplotlib
import seaborn
import numpy

In [None]:
print(f"pandas=={pandas.__version__}")
print(f"joblib=={joblib.__version__}")
print(f"json=={json.__version__}")
print(f"scikit-learn=={sklearn.__version__}")
print(f"imbalanced-learn=={imblearn.__version__}")
print(f"matplotlib=={matplotlib.__version__}")
print(f"seaborn=={seaborn.__version__}")
print(f"numpy=={numpy.__version__}")

In [1]:
#Import packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from imblearn.combine import SMOTETomek
from copy import deepcopy
import joblib
import json

#Define global vars
fpath = '/Users/richardmiller/Downloads/thyroid_cancer_risk_data.csv'
model_results = {}
df = pd.read_csv(fpath)

# Data Processor:
<ol type="1">
<li> Take in data, params, and drop_list. Check input data types and raise errors if necessary.
<li> Transform the different data types in the data frame (one-hot encoding, ordinal encoding, etc.)
<li> Perform data engineering tasks (e.g., dividing the TSH_Levels/T4_Levels or Nodule_Size/T3_Levels).
<li> Drop any useless features (id numbers) and any transformed features (e.g. drop TSH_Levels and T4_Levels because of the feature engineering.
<li> Return a dataframe with the cleaned data set.
</ol>

In [2]:
##PACKAGE LIST
#pandas
#logging
#sklearn.preprocessing OneHotEncoder
#imblearn.combine SMOTETomek
class DataProcessor:
    def __init__(self,data,params,logger):
        self.logger = logger
        self.logger.info('Initlizizing DataProcessor class')
        
        self.data = data
        self.params = params
        self.logger = logger

        
    def feat_ratios(self,top,bottom):
        self.logger.debug('Before ratio encoding the features are: {self.data.columns}')
        self.logger.debug('Before ratio encoding the number of NaNs is: {self.data.isna().sum()}')
        '''Creates a new features by dividing two numerical features.

        Inputs: top (string) - name of feature in dataset to be in the numerator of the ratio.
                bottom (string) - name of feature in dataset to be in the denominator of the ratio.

        Outputs: None - Appends new feature to the dataset, self.data.
        
        '''
        #Divide top/bottom features
        new_feat = top+':'+bottom
        self.data[new_feat] = self.data[top]/self.data[bottom]
        self.logger.debug('After ratio encoding the features are: {self.data.columns}')
        self.logger.debug('After ratio encoding the number of NaNs is: {self.data.isna().sum()}')
        
        
    def encode_binaries(self,binary_feats,binary_map):
        '''Encodes binary feats in a data set using the binary map dictionary. Entries in each feature
            must be keys in the dictionary and will be replaced by the corresponding value in the dictionary.
        
        Inputs:
            binary_feats (list of strings) - list containing strings with binary features 
            from the data set to be encoded.
            
            binary_maps (dictionary) - dictionary that has keys (string) corresponding to 
            entries in a binary feature. The values (int) of the dictionary will replace 
            the corresponding keys.
        
        Outputs:
            None - Replaces the feature with a binary encoded feature in the dataset (self.data).
        '''
        self.logger.debug('Before binary encoding the features are: {self.data.columns}')
        self.logger.debug('Before binary encoding the number of NaNs is: {self.data.isna().sum()}')
        self.data[binary_feats] = self.data[binary_feats].map(lambda x: binary_map[x])
        self.logger.debug('After binary encoding the features are: {self.data.columns}')
        self.logger.debug('After binary encoding the number of NaNs is: {self.data.isna().sum()}')
        
    def encode_catnoms(self, catnom_feats, encode_type='one_hot'):
        '''
        Performs encoding on categorical nominal features. Currently only supports one hot encoding.
        
        Inputs:
            catnom_feats (list of strings) - Each string in the list is the name of a categorical 
            nominal feature to be encoded.
            
            encode_type (str) - Type of encoding to be performed on categorical nominal features.
                Currently only one hot encoding is implimented.
                
        Outputs:
            None - appends the encoded features to the data set (self.data) and drops the unencoded
            features.
        
        '''
        self.logger.debug('Before categorical nominal encoding the features are: {self.data.columns}')
        self.logger.debug('Before categorical nominal encoding the number of NaNs is: {self.data.isna().sum()}')
        if encode_type == 'one_hot':
            #Initialize encoder and fit transform
            ohe_encoder = OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore')
            ohe_encoded = ohe_encoder.fit_transform(self.data[catnom_feats])
            col_names = ohe_encoder.get_feature_names_out(catnom_feats)
            encoded_feats = pd.DataFrame(ohe_encoded,columns=col_names)

            #Reset indices to ensure alignment of the dataframe
            #then concatenate data with encoded feats
            self.data.reset_index(drop=True,inplace=True)
            encoded_feats.reset_index(drop=True,inplace=True)
            self.data = pd.concat([self.data,encoded_feats],axis=1)
            
            #Drop unencoded features to prevent issues with resampling and model fitting.
            self.data.drop(columns=catnom_feats,axis=1,inplace=True)
        self.logger.debug('After categorical nominal encoding the features are: {self.data.columns}')
        self.logger.debug('After categorical nominal encoding the number of NaNs is: {self.data.isna().sum()}')
        
        
    def encode_catords(self,catord_feats,catord_map):
        '''
        Encodes categorical ordinal features using a mapping dictionary. Replaces the unencoded features
        with encoded features.
        
        Inputs: 
            catord_feats (list of strings) - Each string in the list must be a feature name in
                the dataset.
                
            catord_map (dict) - Dictionary of features containing keys (str) that are in the unencoded
                feature and corresponding values (int) that will be replacing the unencoded entries.
                
        Outputs:
            None - Replaces the unencoded feature in the dataset (self.data)
        '''
        self.logger.debug('Before categorical ordinal encoding the features are: {self.data.columns}')
        self.logger.debug('Before categorical ordinal encoding the number of NaNs is: {self.data.isna().sum()}')
        #Apply encoding
        self.data[catord_feats] = self.data[catord_feats].map(lambda x: catord_map[x])
        self.logger.debug('After categorical ordinal encoding the features are: {self.data.columns}')
        self.logger.debug('After categorical ordinal encoding the number of NaNs is: {self.data.isna().sum()}')
        
        
    def smote_tomek(self,target_name):
        '''
        Performs SMOTETomek resampling to prevent model bias towards a majority feature.
        
        This function MUST be used AFTER encoding all features. Non-numeric features will raise
        an error.
        
        Inputs:
            target_name (str) - Name of the feature to be predicted (truth or target).
            
        Outputs:
            None - Resampled replaces the old data.
        '''
        self.logger.debug('Before SMOTE-Tomek resampling the features are: {self.data.columns}')
        self.logger.debug('Before SMOTE-Tomek resampling the number of NaNs is: {self.data.isna().sum()}')
        #Split data into features and target
        X = self.data
        y = self.data.pop(target_name)
        
        #Initialize and fit data
        smote_tomek = SMOTETomek()
        X_resampled, y_resampled = smote_tomek.fit_resample(X,y)
        
        #Concatenate data and replaced ata set.
        self.data = pd.concat([X_resampled,y_resampled],axis=1)
    
        self.logger.debug('After SMOTE-Tomek resampling the features are: {self.data.columns}')
        self.logger.debug('After SMOTE-Tomek resampling the number of NaNs is: {self.data.isna().sum()}')

# Pipeline: 

In [3]:
## PACKAGE LIST
#logging
#pandas
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("pipeline.log"),  # writes to file
        logging.StreamHandler()               # prints to console
    ]
)
logger = logging.getLogger(__name__)

class EncodingPipeLine:
    def __init__(self,raw_data,logger,run=True,ratios=None):
        '''
        Initializes PipeLine class that will pass data to DataProcessor class. After a successful run
        all data should be encoded and the data set should be resampled (if necessary).
        
        Inputs: 
            raw_data (pd.DataFrame) - Data set that will be encoded and resampled. Stored to retain
                the dataset for future use and testing.
                
            run (bool) - If true, the pipeline will pass the data through the encoding class.
            
            ratios (list/tuple of list/tuples of strings) - The features to be turned into a ratio 
                are placed into a list/tuple where position 0 is the numerator and position 1 is 
                the denominator. If multiple ratios need to be taken then they can be passed 
                as a list of lists.
                
                [(feat1,feat2),(feat3,feat4)] will give the rations feat1/feat2 and feat3/feat4.
    
        '''
        self.logger = logging.getLogger(__name__)
        logger.info('Initializing PipeLine class')
        logger.debug('Raw Data Shape: {self.raw_data.shape}')
        logger.debug('Raw Data Features: {self.raw_data.columns}')
        
        self.raw_data = raw_data
        self.ratios = ratios
        self.parameter_loader()
        
        if self.ratios == 'all':
            self.ratios = [v for k,v in self.params['feat_ratio_names'].items()]
            
        if run == True:
            self.run_pipeline()
            
    def parameter_loader(self):
        self.logger.info('Loading parameter dictionary.')
        '''
        Generates parameter dictionary. This will be replaced later with a json or yaml loading function.
        
        '''
        self.params = {
            'feat_categories':{
                'numerical_continuous':['TSH_Level','T3_Level','T4_Level','Nodule_Size',],
                'ordinal':['Age'],
                'binary':['Gender','Family_History','Radiation_Exposure','Iodine_Deficiency',
                    'Smoking','Obesity','Diabetes','Diagnosis',],
                'categorical_nominal':['Country','Ethnicity',],
                'categorical_ordinal':['Thyroid_Cancer_Risk',],
            },
            'encoding_utils':{
                'binary':{'Yes':1.0,'No':0.0,'Male':1.0,'Female':0.0,'Benign':0.0,'Malignant':1.0},
                'categorical_ordinal':{'Low':0.0,'Medium':1.0,'High':2.0},
            },
            'train_test':{
                'test_size':0.2,
            },
            'feat_ratio_names':{
                'ratio_1':['TSH_Level','T3_Level'],
                'ratio_2':['Nodule_Size','T4_Level'],
                'ratio_3':['TSH_Level','T4_Level'],
                'ratio_4':['Nodule_Size','T3_Level']
            },
            'resample':'SMOTETomek',
            'target_name':'Diagnosis',
            
        }
        self.logger.info('Parameter file loaded successfully.')
        
    def run_pipeline(self):
        self.logger.info('Initializing DataProcessor class.')
        #Initialize data processing class
        data_processor = DataProcessor(data=self.raw_data,logger=self.logger, params=self.params)
        self.logger.info('DataProcessor class initialized successfully.')
        
        
        #Make ratio features
        if self.ratios:
            self.logger.info('Creating ratios of features.')
            for ratio in self.ratios:
                data_processor.feat_ratios(ratio[0],ratio[1])
            self.logger.info('Ratio features created successfully.')
        
        #Encode binary features
        self.logger.info('Encoding binary features.')
        data_processor.encode_binaries(
            binary_feats=self.params['feat_categories']['binary'],
            binary_map=self.params['encoding_utils']['binary'],
        )
        self.logger.info('Binary features encoded successfully.')
        
        #Encode categorical nominal features
        self.logger.info('Begin encoding categorical nominal features.')
        data_processor.encode_catnoms(
            catnom_feats=self.params['feat_categories']['categorical_nominal'],
        )
        self.logger.info('Categorical nominal features encoded successfully.')
        
        self.logger.info('Begin encoding catagorical ordinal features.')
        data_processor.encode_catords(
            catord_feats=self.params['feat_categories']['categorical_ordinal'],
            catord_map=self.params['encoding_utils']['categorical_ordinal'],
        )
        self.logger.info('Categorical ordinal features encoded successfully.')
        
        
        if self.params['resample'] == 'SMOTETomek':
            self.logger.info('Begin resampling of data.')
            data_processor.smote_tomek(target_name=self.params['target_name'])
            self.logger.info('Resampling of data completed successfully.')
        else:
            pass
        
        self.data = data_processor.data

# Model Saver:

In [None]:
##Packages:
#pandas
#joblib
#json
#pathlib
class ModelSaver:
    def __init__(self,algorithm,X_valid,y_valid,output_dir,iteration_start=0,notes=None,save_params=False,params=None):
        self.algorithm = algorithm
        self.notes = notes
        self.output_dir = output_dir
        self.save_params = save_params
        self.params = params
        self.iteration = iteration_start
        self.model_save = {}
        
        if self.params != None:
            self.model_save['params'] = params

    def record_state(self, model, X_test, X_train, y_test, y_train, model_preds,model_scores,drop_list,params=None,model_notes=None):
        current_model = {
            'model':model,
            'X_test':X_test,
            'X_train':X_train,
            'y_test':y_test,
            'y_train':y_train,
            'model_preds':model_preds,
            'model_notes':model_notes,
            'model_scores':model_scores,
        }
        
        if self.save_params == True:
            current_model['params'] = params
            
        
        self.model_save[self.algorithm+'_'+str(self.iteration)] = current_model
        self.iteration += 1
            
            
    def save_state(self,output_fname,output_dir):
            
        dir_path = Path(output_dir)
        fpath = dir_path/output_fname
        joblib.dump(self.model_save, fpath)
    
    def update_state(self, algorithm, iteration):
        self.algorithm = algorithm
        self.iteration = iteration

    

In [5]:
fpath = '/Users/richardmiller/Downloads/thyroid_cancer_risk_data.csv'
X_raw = pd.read_csv(fpath)
y_raw = X_raw.pop('Diagnosis')

X, X_valid, y, y_valid = train_test_split(X_raw,y_raw,test_size=0.2,stratify=y_raw)

data = pd.concat([X,y],axis=1)

pipeline = EncodingPipeLine(raw_data=deepcopy(data),logger=logger,ratios='all')

2025-03-29 14:01:40,804 - INFO - Initializing PipeLine class
2025-03-29 14:01:40,807 - INFO - Loading parameter dictionary.
2025-03-29 14:01:40,808 - INFO - Parameter file loaded successfully.
2025-03-29 14:01:40,809 - INFO - Initializing DataProcessor class.
2025-03-29 14:01:40,809 - INFO - Initlizizing DataProcessor class
2025-03-29 14:01:40,810 - INFO - DataProcessor class initialized successfully.
2025-03-29 14:01:40,811 - INFO - Creating ratios of features.
2025-03-29 14:01:40,822 - INFO - Ratio features created successfully.
2025-03-29 14:01:40,823 - INFO - Encoding binary features.
2025-03-29 14:01:41,185 - INFO - Binary features encoded successfully.
2025-03-29 14:01:41,186 - INFO - Begin encoding categorical nominal features.
2025-03-29 14:01:41,465 - INFO - Categorical nominal features encoded successfully.
2025-03-29 14:01:41,466 - INFO - Begin encoding catagorical ordinal features.
2025-03-29 14:01:41,525 - INFO - Categorical ordinal features encoded successfully.
2025-03-2

In [7]:
import pickle
data_save = {
    'X_valid':X_valid,
    'y_valid':y_valid,
    'data':pipeline.data,
}

with open("resampled_data.pkl", "wb") as f:
    pickle.dump(data_save, f)

In [8]:
def model_data_select(data, drop_list,target,ohe_feats=None,test_size=None):
    data.drop(columns=drop_list,inplace=True,axis=1)
    if ohe_feats:
        for feat in ohe_feats:
            data.drop(columns=data.filter(like=feat+'_').columns, inplace=True, axis=1)
        
    X = data
    y = data.pop(target)
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size)

    return [X_train, X_test, y_train, y_test]

# Model 1 - Nodule_Size:T3_Level and TSH_Level:T4_Level

## Does Not Include: Gender, Ethnicity

In [26]:
drop_1 = [
    'Nodule_Size:T4_Level',
    'TSH_Level:T3_Level',
    'T4_Level',
    'T3_Level',
    'Nodule_Size',
    'TSH_Level',
    'Patient_ID',
    'Gender'
]

X_train_1, X_test_1, y_train_1, y_test_1 = model_data_select(
    data=deepcopy(pipeline.data),
    drop_list=drop_1,
    ohe_feats=['Ethnicity'],
    target='Diagnosis'
)

rf_model_1 = RandomForestClassifier()
rf_model_1.fit(X_train_1, y_train_1)

preds_1 = rf_model_1.predict(X_test_1)
report_1 = classification_report(y_test_1,preds_1)

print(report_1)

              precision    recall  f1-score   support

         0.0       0.84      0.94      0.89     30193
         1.0       0.93      0.82      0.87     30320

    accuracy                           0.88     60513
   macro avg       0.89      0.88      0.88     60513
weighted avg       0.89      0.88      0.88     60513



# Model 2 - Nodule_Size:T4_Level and TSH_Level:T3_Level

## Does Not Include: Gender, Ethnicity

In [25]:
drop_2 = [
    'T4_Level',
    'Gender',
    'T3_Level',
    'Nodule_Size',
    'TSH_Level',
    'Patient_ID',
    'Nodule_Size:T3_Level',
    'TSH_Level:T4_Level',
]

X_train_2, X_test_2, y_train_2, y_test_2 = model_data_select(
    data=deepcopy(pipeline.data),
    drop_list=drop_2,
    ohe_feats=['Ethnicity'],
    target='Diagnosis'
)

rf_model_2 = RandomForestClassifier()
rf_model_2.fit(X_train_2, y_train_2)

preds_2 = rf_model_2.predict(X_test_2)
report_2 = classification_report(y_test_2,preds_2)


print(report_2)

              precision    recall  f1-score   support

         0.0       0.84      0.94      0.89     30082
         1.0       0.93      0.82      0.87     30431

    accuracy                           0.88     60513
   macro avg       0.89      0.88      0.88     60513
weighted avg       0.89      0.88      0.88     60513



# Model 3 - No Ratios

In [27]:
drop_3 = [
    'Gender',
    'Patient_ID',
    'Nodule_Size:T3_Level',
    'TSH_Level:T4_Level',
    'Nodule_Size:T4_Level',
    'TSH_Level:T3_Level',
]

X_train_3, X_test_3, y_train_3, y_test_3 = model_data_select(
    data=deepcopy(pipeline.data),
    drop_list=drop_3,
    ohe_feats=['Ethnicity'],
    target='Diagnosis'
)
rf_model_3 = RandomForestClassifier()
rf_model_3.fit(X_train_3, y_train_3)

preds_3 = rf_model_3.predict(X_test_3)
report_3 = classification_report(y_test_3,preds_3)
model_results['model_3'] = report_3

print(report_3)

              precision    recall  f1-score   support

         0.0       0.85      0.94      0.89     30302
         1.0       0.94      0.83      0.88     30211

    accuracy                           0.88     60513
   macro avg       0.89      0.88      0.88     60513
weighted avg       0.89      0.88      0.88     60513



# Model 4 - Nodule_Size:T3_Level and TSH_Level:T4_Level

## Includes: Ethnicity
## Does Note Include: Gender

In [28]:
drop_4 = [
    'Nodule_Size:T4_Level',
    'TSH_Level:T3_Level',
    'T4_Level',
    'T3_Level',
    'Nodule_Size',
    'TSH_Level',
    'Patient_ID',
    'Gender'
]

X_train_4, X_test_4, y_train_4, y_test_4 = model_data_select(
    data=deepcopy(pipeline.data),
    drop_list=drop_4,
    ohe_feats=None,
    target='Diagnosis'
)

rf_model_4 = RandomForestClassifier()
rf_model_4.fit(X_train_4, y_train_4)

preds_4 = rf_model_4.predict(X_test_4)
report_4 = classification_report(y_test_4,preds_4)
model_results['model_4'] = report_4

print(report_4)

              precision    recall  f1-score   support

         0.0       0.84      0.94      0.89     30122
         1.0       0.93      0.82      0.87     30391

    accuracy                           0.88     60513
   macro avg       0.89      0.88      0.88     60513
weighted avg       0.89      0.88      0.88     60513



# Model 5 - Nodule_Size:T4_Level and TSH_Level:T3_Level

## Includes: Ethnicity
## Does Note Include: Gender

In [31]:
drop_5 = [
    'T4_Level',
    'Gender',
    'T3_Level',
    'Nodule_Size',
    'TSH_Level',
    'Patient_ID',
    'Nodule_Size:T3_Level',
    'TSH_Level:T4_Level',
]

X_train_5, X_test_5, y_train_5, y_test_5 = model_data_select(
    data=deepcopy(pipeline.data),
    drop_list=drop_5,
    ohe_feats=None,
    target='Diagnosis'
)

rf_model_5 = RandomForestClassifier()
rf_model_5.fit(X_train_5, y_train_5)

preds_5 = rf_model_5.predict(X_test_5)
report_5 = classification_report(y_test_5,preds_5)


print(report_5)

              precision    recall  f1-score   support

         0.0       0.84      0.94      0.89     30227
         1.0       0.93      0.82      0.87     30286

    accuracy                           0.88     60513
   macro avg       0.89      0.88      0.88     60513
weighted avg       0.89      0.88      0.88     60513



# Model 6 - No Ratios

## Includes: Ethnicity
## Does Note Include: Gender

In [32]:
drop_6 = [
    'Gender',
    'Patient_ID',
    'Nodule_Size:T3_Level',
    'TSH_Level:T4_Level',
    'Nodule_Size:T4_Level',
    'TSH_Level:T3_Level',
]

X_train_6, X_test_6, y_train_6, y_test_6 = model_data_select(
    data=deepcopy(pipeline.data),
    drop_list=drop_6,
    ohe_feats=None,
    target='Diagnosis'
)
rf_model_6 = RandomForestClassifier()
rf_model_6.fit(X_train_6, y_train_6)

preds_6 = rf_model_6.predict(X_test_6)
report_6 = classification_report(y_test_6,preds_6)

print(report_6)

              precision    recall  f1-score   support

         0.0       0.85      0.94      0.89     30112
         1.0       0.94      0.83      0.88     30401

    accuracy                           0.89     60513
   macro avg       0.89      0.89      0.89     60513
weighted avg       0.89      0.89      0.89     60513



# Model 7 - Nodule_Size:T3_Level and TSH_Level:T4_Level

## Includes: Gender
## Does Note Include: Ethnicity

In [33]:
drop_7 = [
    'Nodule_Size:T4_Level',
    'TSH_Level:T3_Level',
    'T4_Level',
    'T3_Level',
    'Nodule_Size',
    'TSH_Level',
    'Patient_ID',
]

X_train_7, X_test_7, y_train_7, y_test_7 = model_data_select(
    data=deepcopy(pipeline.data),
    drop_list=drop_7,
    ohe_feats=['Ethnicity'],
    target='Diagnosis'
)

rf_model_7 = RandomForestClassifier()
rf_model_7.fit(X_train_7, y_train_7)

preds_7 = rf_model_7.predict(X_test_7)
report_7 = classification_report(y_test_7,preds_7)

print(report_7)

              precision    recall  f1-score   support

         0.0       0.84      0.94      0.89     30430
         1.0       0.93      0.82      0.87     30083

    accuracy                           0.88     60513
   macro avg       0.89      0.88      0.88     60513
weighted avg       0.89      0.88      0.88     60513



# Model 8 - Nodule_Size:T4_Level and TSH_Level:T3_Level

## Includes: Gender
## Does Note Include: Ethnicity

In [36]:
drop_8 = [
    'T4_Level',
    'T3_Level',
    'Nodule_Size',
    'TSH_Level',
    'Patient_ID',
    'Nodule_Size:T3_Level',
    'TSH_Level:T4_Level',
]

X_train_8, X_test_8, y_train_8, y_test_8 = model_data_select(
    data=deepcopy(pipeline.data),
    drop_list=drop_8,
    ohe_feats=['Ethnicity'],
    target='Diagnosis'
)

rf_model_8 = RandomForestClassifier()
rf_model_8.fit(X_train_8, y_train_8)

preds_8 = rf_model_8.predict(X_test_8)
report_8 = classification_report(y_test_8,preds_8)


print(report_8)

              precision    recall  f1-score   support

         0.0       0.84      0.94      0.89     30071
         1.0       0.94      0.82      0.87     30442

    accuracy                           0.88     60513
   macro avg       0.89      0.88      0.88     60513
weighted avg       0.89      0.88      0.88     60513



# Model 9 - No Ratios

## Includes: Gender
## Does Note Include: Ethnicity

In [37]:
drop_9 = [
    'Patient_ID',
    'Nodule_Size:T3_Level',
    'TSH_Level:T4_Level',
    'Nodule_Size:T4_Level',
    'TSH_Level:T3_Level',
]

X_train_9, X_test_9, y_train_9, y_test_9 = model_data_select(
    data=deepcopy(pipeline.data),
    drop_list=drop_9,
    ohe_feats=['Ethnicity'],
    target='Diagnosis'
)
rf_model_9 = RandomForestClassifier()
rf_model_9.fit(X_train_9, y_train_9)

preds_9 = rf_model_9.predict(X_test_9)
report_9 = classification_report(y_test_9,preds_9)
model_results['model_9'] = report_9

print(report_9)

              precision    recall  f1-score   support

         0.0       0.84      0.94      0.89     30109
         1.0       0.94      0.83      0.88     30404

    accuracy                           0.89     60513
   macro avg       0.89      0.89      0.89     60513
weighted avg       0.89      0.89      0.89     60513



# Model 10 - Nodule_Size:T3_Level and TSH_Level:T4_Level

## Includes: Gender, Ethnicity
## Does Note Include: NONE

In [39]:
drop_10 = [
    'Nodule_Size:T4_Level',
    'TSH_Level:T3_Level',
    'T4_Level',
    'T3_Level',
    'Nodule_Size',
    'TSH_Level',
    'Patient_ID',
]

X_train_10, X_test_10, y_train_10, y_test_10 = model_data_select(
    data=deepcopy(pipeline.data),
    drop_list=drop_10,
    ohe_feats=None,
    target='Diagnosis'
)

rf_model_10 = RandomForestClassifier()
rf_model_10.fit(X_train_10, y_train_10)

preds_10 = rf_model_10.predict(X_test_10)
report_10 = classification_report(y_test_10,preds_10)

print(report_10)

              precision    recall  f1-score   support

         0.0       0.84      0.94      0.89     30284
         1.0       0.93      0.82      0.87     30229

    accuracy                           0.88     60513
   macro avg       0.89      0.88      0.88     60513
weighted avg       0.89      0.88      0.88     60513



# Model 11 - Nodule_Size:T4_Level and TSH_Level:T3_Level

## Includes: Gender, Ethnicity
## Does Note Include: None

In [40]:
drop_11 = [
    'T4_Level',
    'T3_Level',
    'Nodule_Size',
    'TSH_Level',
    'Patient_ID',
    'Nodule_Size:T3_Level',
    'TSH_Level:T4_Level',
]

X_train_11, X_test_11, y_train_11, y_test_11 = model_data_select(
    data=deepcopy(pipeline.data),
    drop_list=drop_11,
    ohe_feats=None,
    target='Diagnosis'
)

rf_model_11 = RandomForestClassifier()
rf_model_11.fit(X_train_11, y_train_11)

preds_11 = rf_model_11.predict(X_test_11)
report_11 = classification_report(y_test_11,preds_11)


print(report_11)

              precision    recall  f1-score   support

         0.0       0.84      0.94      0.89     30052
         1.0       0.94      0.82      0.87     30461

    accuracy                           0.88     60513
   macro avg       0.89      0.88      0.88     60513
weighted avg       0.89      0.88      0.88     60513



# Model 12 - No Ratios

## Includes: Gender, Ethnicity
## Does Note Include: None

In [41]:
drop_12 = [
    'Patient_ID',
    'Nodule_Size:T3_Level',
    'TSH_Level:T4_Level',
    'Nodule_Size:T4_Level',
    'TSH_Level:T3_Level',
]

X_train_12, X_test_12, y_train_12, y_test_12 = model_data_select(
    data=deepcopy(pipeline.data),
    drop_list=drop_12,
    ohe_feats=None,
    target='Diagnosis'
)
rf_model_12 = RandomForestClassifier()
rf_model_12.fit(X_train_12, y_train_12)

preds_12 = rf_model_12.predict(X_test_12)
report_12 = classification_report(y_test_12,preds_12)

print(report_12)

              precision    recall  f1-score   support

         0.0       0.85      0.94      0.89     30082
         1.0       0.93      0.83      0.88     30431

    accuracy                           0.89     60513
   macro avg       0.89      0.89      0.89     60513
weighted avg       0.89      0.89      0.89     60513



In [52]:
models = {
    'model_1':{
        'ohe_feats':'Ethnicity',
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender'],
        'model_notes':'Ratios are Nodule_Size:T3_Level and TSH_Level:T4_Level. Does not include: Gender, Ethnicity',
    },
    'model_2':{
        'ohe_feats':'Ethnicity',
        'algorithm':'RandomForestClassifier',
        'drop_list':['T4_Level','Gender','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Nodule_Size:T3_Level','TSH_Level:T4_Level',],
        'model_notes':'Ratios are Nodule_Size:T4_Level and TSH_Level:T3_Level. Does not include: Gender, Ethnicity',
    },
    'model_3':{
        'ohe_feats':'Ethnicity',
        'algorithm':'RandomForestClassifier',
        'drop_list':['Gender','Patient_ID','Nodule_Size:T3_Level','TSH_Level:T4_Level','Nodule_Size:T4_Level','TSH_Level:T3_Level',],
        'model_notes':'No ratios. Does not include: Gender, Ethnicity',
    },
    'model_4':{
        'ohe_feats':None,
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'model_notes':'Ratios are Nodule_Size:T3_Level and TSH_Level:T4_Level. Does not include: Gender',
    },
    'model_5':{
        'ohe_feats':None,
        'algorithm':'RandomForestClassifier',
        'drop_list':['T4_Level','Gender','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Nodule_Size:T3_Level','TSH_Level:T4_Level',],
        'model_notes':'Ratios are Nodule_Size:T4_Level and TSH_Level:T3_Level. Does not include: Gender',
    },
    'model_6':{
        'ohe_feats':None,
        'algorithm':'RandomForestClassifier',
        'drop_list':['Gender','Patient_ID','Nodule_Size:T3_Level','TSH_Level:T4_Level','Nodule_Size:T4_Level','TSH_Level:T3_Level',],
        'model_notes':'No ratios. Does not include: Gender',
    },
    'model_7':{
        'ohe_feats':'Ethnicity',
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'model_notes':'Ratios are Nodule_Size:T3_Level and TSH_Level:T4_Level. Does not include: Ethnicity',
    },
    'model_8':{
        'ohe_feats':'Ethnicity',
        'algorithm':'RandomForestClassifier',
        'drop_list':['T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Nodule_Size:T3_Level','TSH_Level:T4_Level',],
        'model_notes':'Ratios are Nodule_Size:T4_Level and TSH_Level:T3_Level. Does not include: Ethnicity',
    },
    'model_9':{
        'ohe_feats':'Ethnicity',
        'algorithm':'RandomForestClassifier',
        'drop_list':['Patient_ID','Nodule_Size:T3_Level','TSH_Level:T4_Level','Nodule_Size:T4_Level','TSH_Level:T3_Level',],
        'model_notes':'No ratios. Does not include: Ethnicity',
    },
    'model_10':{
        'ohe_feats':None,
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'model_notes':'Ratios are Nodule_Size:T3_Level and TSH_Level:T4_Level. Includes all feats.',
    },
    'model_11':{
        'ohe_feats':None,
        'algorithm':'RandomForestClassifier',
        'drop_list':['T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Nodule_Size:T3_Level','TSH_Level:T4_Level',],
        'model_notes':'Ratios are Nodule_Size:T4_Level and TSH_Level:T3_Level. Includes all feats.',
    },
    'model_12':{
        'ohe_feats':None,
        'algorithm':'RandomForestClassifier',
        'drop_list':['Patient_ID','Nodule_Size:T3_Level','TSH_Level:T4_Level','Nodule_Size:T4_Level','TSH_Level:T3_Level',],
        'model_notes':'No ratios. Includes all feats.',
    },
}

In [57]:
def model_iterator(data,X_valid,y_valid,params,model_dict,fname,dirname,target):
    for i, (k,v) in enumerate(model_dict.items()):

        #Initialize model saver class
        if i == 0:
            logging.info('Initializing ModelSaver class')
        
            model_saver = ModelSaver(
            v['algorithm'],
            X_valid,
            y_valid,
            output_dir='/Users/richardmiller/Downloads/',
            iteration_start=i,
            params=params,
        )
            algorithm_ = v['algorithm']
            
        #Check if model changed between iterations
        if v['algorithm'] != algorithm_:
            model_saver.update_state(v['algorithm'],i)

            
        logging.info('Training Model.')

        if v['algorithm'] == 'RandomForestClassifier':
            #Split data into train and test sets
            X_train, X_test, y_train, y_test = model_data_select(
                data=deepcopy(data),
                drop_list=v['drop_list'],
                ohe_feats=v['ohe_feats'],
                target=target
            )
    
            #Fit model
            rf_model = RandomForestClassifier()
            rf_model.fit(X_train, y_train)
    
            #Make predictions and evaluate with report metric
            preds = rf_model.predict(X_test)
            report = classification_report(y_test,preds)
            logging.info('Model trained.')
            if verbose == True:
                print(report)
    
            logging.info('Saving state.')
    
            #Append current model data to all other data.
            model_saver.record_save(
            model=rf_model,
            X_test=X_test,
            X_train=X_train,
            y_test=y_test,
            y_train=y_train,
            model_scores=report,
            drop_list=v['drop_list'],
            model_notes=v['model_notes'],
        )
            logging.info('State saved successfully')
    
            #Set the test variable
            algorithm_ = v['algorithm']

        if i == len(model_dict.keys()):
            model_saver.save_state(output_fname=fname, output_dir=dirname)
    

In [58]:
thingy = model_iterator(data,X_valid,y_valid,pipeline.params,models,'thyroid_cancer_models.pkl','/Users/richardmiller/Downloads/','Diagnosis')

2025-03-29 21:47:53,342 - INFO - Initializing ModelSaver class
2025-03-29 21:47:53,344 - INFO - Training Model.


KeyError: "['Nodule_Size:T4_Level', 'TSH_Level:T3_Level'] not found in axis"