# Thyroid Cancer Detection Using Machine Learning Classifiers

In this notebook I will be prototyping several machine learning algorithms and data cleaning/engineering methods to predict whether a patient has thyroid cancer from several features in their medical records such as: Thyroid Stimulating Hormone (TSH), T3 and T4 hormone levels, family history, size of nodules on the thyroid, radiation exposure, and other features.

I will start by doing some simple informative exploratory data analysis, followed by data cleaning, model building, and finally compairson of models. I will build a class to use as a pipeline for preprocessing data, a wrapper class to call the preprocessing pipeline, a class and function for model training, and finally a function for model comparison.

# Environment Setup

In [55]:
#Functionality packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import joblib
import math
import dill
from copy import deepcopy

#Preprocessing packages
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from imblearn.combine import SMOTETomek

#Import Classifiers
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier

#Import classifier utilities
from sklearn.model_selection import GridSearchCV

#Import calculators and metrics
from scipy.stats import ttest_ind
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

#Other Utilities 
from pathlib import Path

### Check Dataset for Irregularities and NaNs



In [None]:
mac_fpath = '/Users/richardmiller/Downloads/thyroid_cancer_risk_data.csv'
pc_fpath = 'C:\\Users\\rwmil\\Downloads\\thyroid_cancer_risk_data.csv'
data = pd.read_csv(mac_fpath)
data.head()

## Ensure the data is correctly formatted

### Check for duplicate features

In [None]:
data_duplicates = data[data.duplicated()].shape
print(f"There are {data_duplicates[0]} duplicate entries and {data_duplicates[1]} features in the data set.")

### Define parameter dictionary

In [None]:
params = {
    'feat_categories':{
        'numerical_continuous':['TSH_Level','T3_Level','T4_Level','Nodule_Size'],
        'ordinal':['Age'],
        'binary':['Gender','Family_History','Radiation_Exposure','Iodine_Deficiency','Smoking','Obesity','Diabetes'],
        'categorical_nominal':['Country','Ethnicity'],
        'categorical_ordinal':['Thyroid_Cancer_Risk']
        },
    'data_types':{
        'TSH_Level':float,
        'T3_Level':float,
        'T4_Level':float,
        'Nodule_Size':float,
        'Age':int,
        'Gender':str,
        'Family_History':str,
        'Radiation_Exposure':str,
        'Iodine_Deficiency':str,
        'Smoking':str,
        'Obesity':str,
        'Diabetes':str,
        'Country':str,
        'Ethnicity':str,
        'Thyroid_Cancer_Risk':str,
    },
    'encoding_utils':{
        'binary':{'Yes':1.0,'No':0.0,'Male':1.0,'Female':0.0,'Benign':0.0,'Malignant':1.0},
        'categorical_ordinal':{'Low':0.0,'Medium':1.0,'High':2.0},
    },
    'train_test':{
        'test_size':0.2,
    },
    'feat_ratio_names':{
        'ratio_1':['TSH_Level','T3_Level'],
        'ratio_2':['Nodule_Size','T4_Level'],
        'ratio_3':['TSH_Level','T4_Level'],
        'ratio_4':['Nodule_Size','T3_Level']
    },
    'resample':'SMOTETomek',
    'target_name':'Diagnosis',
    'Thyroid_Cancer_Risk':['Low','Medium','High',],
    'allowables':{
        'TSH_Level':None,
        'T3_Level':None,
        'T4_Level':None,
        'Nodule_Size':None,
        'Age':None,
        'Gender':['Male','Female'],
        'Family_History':['Yes','No'],
        'Radiation_Exposure':['Yes','No'],
        'Iodine_Deficiency':['Yes','No'],
        'Smoking':['Yes','No'],
        'Obesity':['Yes','No'],
        'Diabetes':['Yes','No'],
        'Country':[
            'Russia',
            'Germany',
            'Nigeria',
            'India',
            'UK',
            'South Korea',
            'Brazil',
            'China',
            'Japan',
            'USA',
        ],
        'Ethnicity':[
            'Caucasian',
            'Hispanic',
            'Asian',
            'African',
            'Middle Eastern',
        ],
        'Thyroid_Cancer_Risk':['Low','Medium','High']
    },
}

### Define function to check data types

In [None]:
def data_type_check(data, params):
    '''
    Iterates through features list. Checks for nans, entry data types, and 
    allowed values in each feature column.
    
    Prints results.
    Returns nothing.
    '''
    for key,value in params['feat_categories'].items():
        for item in value:
            data_type = params['data_types'][item]
            allowables = params['allowables'][item]

            print(f'{item} has nan values? {data[item].isna().any()}')
            print(f"{item} has invalid data types? {(~data[item].map(lambda x: isinstance(x, data_type))).any()}")
            if allowables is not None:
                print(f"{item} has invalid values? {(~data[item].map(lambda x: x in allowables)).any()}")
                print(' ')
            else:
                print(' ')

data_type_check(data, params)

There are no missing/NaN values and all feature entries are of the correct type or are within range. Moving on to exploratory data analysis.

# Exploratory Data Analysis

## Continuous Data EDA


### Histplots with Kernel-Density

In [None]:
#Define plot size variables
X = deepcopy(data)
y = X.pop('Diagnosis')
num_feats = len(params['feat_categories']['numerical_continuous'])
num_rows = math.ceil(math.sqrt(num_feats))
num_cols = math.ceil(num_feats/num_rows)
n=3
#Initialize plot
fig,axs = plt.subplots(nrows=num_rows,ncols=num_cols,sharey=False,constrained_layout=True,figsize=((n+1)*num_rows,n*num_cols))

#Plot histogram
for i,ax in enumerate(axs.flat):
    feat_name = params['feat_categories']['numerical_continuous'][i]
    x = data[feat_name].to_numpy()
    sns.histplot(
        x=feat_name,
        hue='Diagnosis',
        data=df,
        ax=ax,
        color='orchid',
        kde=True,
        bins=int(np.log2(len(x))+1),
        element='step',
    )
    sns.kdeplot(
        data=df,
        x=feat_name,
        hue='Diagnosis',
        ax=ax,
        common_norm=False
    )
    
    ax.set_title(feat_name)
    ax.set_xlabel('concentration')
    

plt.show()
plt.tight_layout()

### Box and Whisker Plot

In [None]:
#Define plot size variables
y = df['Diagnosis']
num_feats = len(params['feat_categories']['numerical_continuous'])
num_rows = math.ceil(math.sqrt(num_feats))
num_cols = math.ceil(num_feats/num_rows)
#Initialize subplots
fig,axs = plt.subplots(nrows=num_rows,ncols=num_cols,constrained_layout=True,)

for i,ax in enumerate(axs.flat):
    feat_name = params['feat_categories']['numerical_continuous'][i]
    x = df[feat_name].to_numpy()
    sns.boxplot(x=x,y=y,ax=ax,color='coral')
    ax.set_title(feat_name)
    ax.set_xlabel('concentration')

### t-test on continuous variables

In [None]:
benign = df[df['Diagnosis'] == 'Benign']
malignant = df[df['Diagnosis'] == 'Malignant']

for feat in params['feat_categories']['numerical_continuous']:
    stat,p = ttest_ind(benign[feat],malignant[feat],equal_var=False)
    
    print(f"{feat}: t-test score = {stat:.4f}, p-value is {p:.4f} ")

The distribution of TSH_Level, T3_Level, and T4_Level all appear to be pretty uniform for both malignant and benign. I don't believe there is much predictive power in these three features. But histplots and box-and-whisker plots only find linear relationships between feature and target. But, it looks like Nodule_Size might hold some predictive power, the p-value for nodule size seems to indicate this.

**Next steps: Attempt to transform these features using log2, log10, or sqrt.**


**Future Steps: During model training, I will try ratios of these feats and see if they show any predictive power.**

### Pairplots (Non-Transformed Data)

In [None]:
sns.pairplot(data.sample(frac=0.01),corner=True,hue='Diagnosis',vars=params['feat_categories']['numerical_continuous'],plot_kws={'alpha':0.5})
plt.show()

In [None]:
#Transform data with log2 and sqrt
feat_names_log_transform = [i+'_log_transformed' for i in params['feat_categories']['numerical_continuous']]
feat_names_sqrt_transform = [i+'_sqrt_transformed' for i in params['feat_categories']['numerical_continuous']]

data[feat_names_log_transform] = data[params['feat_categories']['numerical_continuous']].apply(lambda x: np.log2(x))
data[feat_names_sqrt_transform] = data[params['feat_categories']['numerical_continuous']].apply(lambda x: np.sqrt(x))

### Pairplots (Log2 Transformed Data)

In [None]:
sns.pairplot(data.sample(frac=0.01),corner=True,hue='Diagnosis',vars=feat_names_log_transform,plot_kws={'alpha':0.5})
plt.show()

### Pairplots (Sqrt Transformed)

In [None]:
sns.pairplot(data.sample(frac=0.01),corner=True,hue='Diagnosis',vars=feat_names_sqrt_transform,plot_kws={'alpha':0.5})
plt.show()

The transforms show some skew in the data, except for T4_Level. It might be worth training a few models with log2 and sqrt transforms on the data. Log2 gives a more skewed Gaussian look than sqrt doe,s and I think models like RandomForestClassifier will have an easier time finding patterns in those.

**Future Steps: Train a model with log2 feats and compare to other models.**

## Categorical Ordinal Feature EDA

In [None]:
#Get categorical feat names
ordinal_feats = params['feat_categories']['categorical_ordinal']
data_ = pd.read_csv(pc_fpath)
data[ordinal_feats] = data_[ordinal_feats]
#Encode ordinal feats
ordinal_encoding_dict = {'Low':0,'Medium':1,'High':2}
data[ordinal_feats] = data[ordinal_feats].map(lambda x:ordinal_encoding_dict[x])

fig,ax = plt.subplots(1,2,figsize=(10,6))

sns.barplot(
    data=data,
    x='Thyroid_Cancer_Risk',
    y=(data['Diagnosis']=='Malignant').astype(int),
    order=[0,1,2],
    ax=ax[0]
)

ax[0].set_ylabel('Malignant Counts')
ax[0].set_title('Malignancy Rate by Thyroid Cancer Risk')

sns.barplot(
    data=data,
    x='Thyroid_Cancer_Risk',
    y=(data['Diagnosis']=='Benign').astype(int),
    order=[0,1,2],
    ax=ax[1]
)

ax[1].set_ylabel('Benign Counts')
ax[1].set_title('Benign Rate by Thyroid Cancer Risk')



It's pretty clear form the two graphs above that the Thyroid_Cancer_Risk feature is a good feature. If you are at a 2 (high) risk, then you have a high chance of malignancy. Where as being at a 0 (low) or a 1 (modelate) your risk of malignancy is much lower. This feature can be kept without any transformations other than encoding.

# Categorical Nominal Feature EDA

### Frequency Plots

### Define function to make frequency plots

In [None]:
def normalized_bar_plotter(data,target,feats_list,normalize=True):
    
    #Get names and counts of all unique target values
    unique_targets = list(data[target].unique())
    unique_target_counts = [data[data[target]==i].shape[0] for i in unique_targets]
    num_counts = data.shape[0] 
    results = []

    #Initialize plot area
    num_rows = math.ceil(math.sqrt(len(feats_list)*len(unique_targets)))
    num_cols = math.ceil((len(feats_list)*len(unique_targets))/num_rows)
    n=3
    fig,axs = plt.subplots(num_rows, num_cols, figsize=((n+1)*num_rows,n*num_cols))

    
    for feat in feats_list:
        #Get labels and ticks for a feat.
        xlabels = list(data[feat].unique())
        xticks = [list(range(len(i))) for i in xlabels]
        
        #Calculate normalized percentage
        if normalize == True:
            
            #Calculate values for all targets
            num_counts = data.shape[0] #total number of counts
            unique_target_vals = data[target].unique()
            unique_feat_vals = data[feat].unique()
            unique_feat_counts = data[feat].value_counts().to_numpy() #number of counts for each unique entry in the feature for all target values
            unique_feat_idxs = list(range(len(unique_feat_counts)))
            #Get values for each unique target value
            #Calculate values
            for idx,val in enumerate(unique_target_vals):
                num_unique_target_counts = len(data[data[target]==val]) #number of total counts for each unique feat entry for one of the target values
                target_counts_by_feat = data[data[target]==val][feat].value_counts().to_numpy() #number of counts for each unique feat entry for each unique target value

                #Calculations
                top = target_counts_by_feat/num_unique_target_counts
                bottom = unique_feat_counts/num_counts
                result = top/bottom

                #Store calculations and their results
                results.append((unique_feat_vals,feat,target,val,result,))

    else:
        for feat in feats_list:
            unique_feat_vals = data[feat].unique()
            for val in unique_targets:
                results.append((unique_feat_vals,feat,target,val,data[feat].to_numpy()))
        

    for i,ax in enumerate(axs.flat):
        sns.countplot(x=results[i][-1],ax=ax)
        ax.set_xticks(list(range(len(results[i][0]))))
        ax.set_xticklabels(results[i][0],rotation=90)
        ax.set_title(results[i][2]+':'+results[i][3]+' Counts by '+results[i][1])
        if normalize == True:
            ax.set_ylim(min(results[i][-1])-0.01,max(results[i][-1]+0.01))
            ax.set_ylabel('Normalized Counts')

        else:
            ax.set_ylabel('Raw Counts')

    plt.tight_layout()
    plt.show()

    return results

In [None]:
results = normalized_bar_plotter(data=data,target='Diagnosis',feats_list=['Country','Ethnicity'],normalize=False)

The number of benign and malignant counts is highest in India, so it might be tempting to think that being from India increases your chance of being diagnosed with thyroid cancer. But at the same time, it increases your chances of a benign result, too, and that doesn't make much sense. India does account for the majority of the data points by far. So, I think it would be better to get a weighted average based on how much each country or ethnicity contributes to the total dataset.

When I saw these results I was initially inspired by the idea of a weighted average similar to how the average atomic mass is calculated for each element. Avg. Mass = $\Sigma _i^Np_im_i$

Where $p_i$ is the percent abundance of an isotope and $m_i$ is the mass of the isotope.



For this problem, $p_i$ is the percentage of diagnosis counts divided by the total number of counts for that diagnosis across all countries: $p_i = \frac{d_i}{D}$ 

and the "mass" (or more appropriately the average risk) of each country is the percentage of counts (both malignant and benign) out of all country counts (both malignant or benign) $m_i=\frac{C}{c_i}$. 

Where $d_i$ is the number of counts for a diagnosis in the $i-$th country, $D$ is the total counts for that diagnosis (e.g., Malignant or Benign), $c_i$ is the total number of all counts from that country, and $C$ is the total counts of all countries.

And the average risk can be calculated by $\frac{d_i\cdot C}{c_i\cdot D}$


In [None]:
results = normalized_bar_plotter(data=data,target='Diagnosis',feats_list=['Country','Ethnicity'],normalize=True)

We can see that plotting the count frequency by weight gives a relatively flat distribution. In other words, there isn't much difference between any two countries or ethnicities. Therefore, I don't believe that these features have any predictive power, so I will be dropping them.

**Next Step: Move on to binary feature EDA**

## Binary Features EDA

### Correlation Heatmap

In [None]:
binary_list = deepcopy(params['feat_categories']['binary'])
binary_data = data[binary_list].map(lambda x:params['encoding_utils']['binary'][x])
sns.heatmap(binary_data.corr(),cmap='coolwarm',annot=True,fmt='.3f')

It looks like there is some predictive power in the family history, radiation exposure, and iodine deficiency features. But the rest seems really small. However, the correlation coefficient only shows linear relationships. Because the features are easy to encode and work with I will keep them all. I might train a model without Gender, Smoking, Obesity, and Diagnosis to compare them. But I think leaving them in will at least add some randomness and reduce overfitting.

**Future Steps: Train models with and without smoking, obesity, gender, and diabetes. Compare the results**

The only remaining feature to explore is age. But I will not be doing any analysis because cancer is probabilistic in nature and as time increases, the number of cancer-causing events also increases. In general, it's safe to say that as you get older, your risk of developing cancer increases. So, I will keep the age feature without any transformation or analysis.

# Feature Selection

## Reload Data Set

In [None]:
mac_fpath = '/Users/richardmiller/Downloads/thyroid_cancer_risk_data.csv'
pc_fpath = 'C:\\Users\\rwmil\\Downloads\\thyroid_cancer_risk_data.csv'
data = pd.read_csv(mac_fpath)

## Define Data Processing Class

In [2]:
##PACKAGE LIST
#pandas
#logging
#sklearn.preprocessing OneHotEncoder
#imblearn.combine SMOTETomek
class DataProcessor:
    def __init__(self,data,params,logger):
        self.logger = logger
        self.logger.info('Initlizizing DataProcessor class')
        
        self.data = data
        self.params = params
        self.logger = logger

        
    def feat_ratios(self,top,bottom):
        self.logger.debug(f'Before ratio encoding the features are: {self.data.columns}')
        self.logger.debug(f'Before ratio encoding the number of NaNs is: {self.data.isna().sum()}')
        '''Creates a new features by dividing two numerical features.

        Inputs: top (string) - name of feature in dataset to be in the numerator of the ratio.
                bottom (string) - name of feature in dataset to be in the denominator of the ratio.

        Outputs: None - Appends new feature to the dataset, self.data.
        
        '''
        #Divide top/bottom features
        new_feat = top+':'+bottom
        self.data[new_feat] = self.data[top]/self.data[bottom]
        self.logger.debug(f'After ratio encoding the features are: {self.data.columns}')
        self.logger.debug(f'After ratio encoding the number of NaNs is: {self.data.isna().sum()}')
        
        
    def encode_binaries(self,binary_feats,binary_map):
        '''Encodes binary feats in a data set using the binary map dictionary. Entries in each feature
            must be keys in the dictionary and will be replaced by the corresponding value in the dictionary.
        
        Inputs:
            binary_feats (list of strings) - list containing strings with binary features 
            from the data set to be encoded.
            
            binary_maps (dictionary) - dictionary that has keys (string) corresponding to 
            entries in a binary feature. The values (int) of the dictionary will replace 
            the corresponding keys.
        
        Outputs:
            None - Replaces the feature with a binary encoded feature in the dataset (self.data).
        '''
        self.logger.debug(f'Before binary encoding the features are: {self.data.columns}')
        self.logger.debug(f'Before binary encoding the number of NaNs is: {self.data.isna().sum()}')
        self.data[binary_feats] = self.data[binary_feats].map(lambda x: binary_map[x])
        self.logger.debug(f'After binary encoding the features are: {self.data.columns}')
        self.logger.debug(f'After binary encoding the number of NaNs is: {self.data.isna().sum()}')
        
    def encode_catnoms(self, catnom_feats, encode_type='one_hot'):
        '''
        Performs encoding on categorical nominal features. Currently only supports one hot encoding.
        
        Inputs:
            catnom_feats (list of strings) - Each string in the list is the name of a categorical 
            nominal feature to be encoded.
            
            encode_type (str) - Type of encoding to be performed on categorical nominal features.
                Currently only one hot encoding is implimented.
                
        Outputs:
            None - appends the encoded features to the data set (self.data) and drops the unencoded
            features.
        
        '''
        self.logger.debug(f'Before categorical nominal encoding the features are: {self.data.columns}')
        self.logger.debug(f'Before categorical nominal encoding the number of NaNs is: {self.data.isna().sum()}')
        if encode_type == 'one_hot':
            #Initialize encoder and fit transform
            ohe_encoder = OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore')
            ohe_encoded = ohe_encoder.fit_transform(self.data[catnom_feats])
            col_names = ohe_encoder.get_feature_names_out(catnom_feats)
            encoded_feats = pd.DataFrame(ohe_encoded,columns=col_names)

            #Reset indices to ensure alignment of the dataframe
            #then concatenate data with encoded feats
            self.data.reset_index(drop=True,inplace=True)
            encoded_feats.reset_index(drop=True,inplace=True)
            self.data = pd.concat([self.data,encoded_feats],axis=1)
            
            #Drop unencoded features to prevent issues with resampling and model fitting.
            self.data.drop(columns=catnom_feats,axis=1,inplace=True)
        self.logger.debug('After categorical nominal encoding the features are: {self.data.columns}')
        self.logger.debug('After categorical nominal encoding the number of NaNs is: {self.data.isna().sum()}')
        
        
    def encode_catords(self,catord_feats,catord_map):
        '''
        Encodes categorical ordinal features using a mapping dictionary. Replaces the unencoded features
        with encoded features.
        
        Inputs: 
            catord_feats (list of strings) - Each string in the list must be a feature name in
                the dataset.
                
            catord_map (dict) - Dictionary of features containing keys (str) that are in the unencoded
                feature and corresponding values (int) that will be replacing the unencoded entries.
                
        Outputs:
            None - Replaces the unencoded feature in the dataset (self.data)
        '''
        self.logger.debug(f'Before categorical ordinal encoding the features are: {self.data.columns}')
        self.logger.debug('fBefore categorical ordinal encoding the number of NaNs is: {self.data.isna().sum()}')
        #Apply encoding
        self.data[catord_feats] = self.data[catord_feats].map(lambda x: catord_map[x])
        self.logger.debug(f'After categorical ordinal encoding the features are: {self.data.columns}')
        self.logger.debug(f'After categorical ordinal encoding the number of NaNs is: {self.data.isna().sum()}')

    def sqrt_transform(self,feat_list):
        transformed_feats = ['sqrt_transformed_'+feat for feat in feat_list]
        for i, feat in enumerate(feat_list):
            self.data[transformed_feats[i]] = self.data[feat].map(lambda x:np.sqrt(x))

    def log1p_transform(self,feat_list):
        transformed_feats = ['log1p_transformed_'+feat for feat in feat_list]
        for i, feat in enumerate(feat_list):
            self.data[transformed_feats[i]] = self.data[feat].map(lambda x:np.log1p(x))
            
    def smote_tomek(self,target_name):
        '''
        Performs SMOTETomek resampling to prevent model bias towards a majority feature.
        
        This function MUST be used AFTER encoding all features. Non-numeric features will raise
        an error.
        
        Inputs:
            target_name (str) - Name of the feature to be predicted (truth or target).
            
        Outputs:
            None - Resampled replaces the old data.
        '''
        self.logger.debug(f'Before SMOTE-Tomek resampling the features are: {self.data.columns}')
        self.logger.debug(f'Before SMOTE-Tomek resampling the number of NaNs is: {self.data.isna().sum()}')
        
        #Split data into features and target
        X = self.data
        y = self.data.pop(target_name)

        #Initialize and fit data
        smote_tomek = SMOTETomek()
        X_resampled, y_resampled = smote_tomek.fit_resample(X,y)
        
        #Concatenate data and replaced ata set.
        self.data = pd.concat([X_resampled,y_resampled],axis=1)
    
        self.logger.debug(f'After SMOTE-Tomek resampling the features are: {self.data.columns}')
        self.logger.debug(f'After SMOTE-Tomek resampling the number of NaNs is: {self.data.isna().sum()}')


## Define Data Processing Wrapper Class

In [3]:
## PACKAGE LIST
#logging
#pandas
import logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("pipeline.log"),  # writes to file
        logging.StreamHandler()               # prints to console
    ]
)
logger = logging.getLogger(__name__)

class EncodingPipeLine:
    def __init__(self,raw_data,logger,run=True,ratios=None):
        '''
        Initializes PipeLine class that will pass data to DataProcessor class. After a successful run
        all data should be encoded and the data set should be resampled (if necessary).
        
        Inputs: 
            raw_data (pd.DataFrame) - Data set that will be encoded and resampled. Stored to retain
                the dataset for future use and testing.
                
            run (bool) - If true, the pipeline will pass the data through the encoding class.
            
            ratios (list/tuple of list/tuples of strings) - The features to be turned into a ratio 
                are placed into a list/tuple where position 0 is the numerator and position 1 is 
                the denominator. If multiple ratios need to be taken then they can be passed 
                as a list of lists.
                
                [(feat1,feat2),(feat3,feat4)] will give the rations feat1/feat2 and feat3/feat4.
    
        '''
        self.logger = logging.getLogger(__name__)
        logger.info('Initializing PipeLine class')
        logger.debug('Raw Data Shape: {self.raw_data.shape}')
        logger.debug('Raw Data Features: {self.raw_data.columns}')
        
        self.raw_data = raw_data
        self.ratios = ratios
        self.parameter_loader()
        
        if self.ratios == 'all':
            self.ratios = [v for k,v in self.params['feat_ratio_names'].items()]
            
        if run == True:
            self.run_pipeline()
            
    def parameter_loader(self):
        self.logger.info('Loading parameter dictionary.')
        '''
        Generates parameter dictionary. This will be replaced later with a json or yaml loading function.
        
        '''
        self.params = {
            'feat_categories':{
                'numerical_continuous':['TSH_Level','T3_Level','T4_Level','Nodule_Size',],
                'ordinal':['Age'],
                'binary':['Gender','Family_History','Radiation_Exposure','Iodine_Deficiency',
                    'Smoking','Obesity','Diabetes','Diagnosis',],
                'categorical_nominal':['Country','Ethnicity',],
                'categorical_ordinal':['Thyroid_Cancer_Risk',],
            },
            'encoding_utils':{
                'binary':{'Yes':1.0,'No':0.0,'Male':1.0,'Female':0.0,'Benign':0.0,'Malignant':1.0},
                'categorical_ordinal':{'Low':0.0,'Medium':1.0,'High':2.0},
            },
            'train_test':{
                'test_size':0.2,
            },
            'feat_ratio_names':{
                'ratio_1':['TSH_Level','T3_Level'],
                'ratio_2':['Nodule_Size','T4_Level'],
                'ratio_3':['TSH_Level','T4_Level'],
                'ratio_4':['Nodule_Size','T3_Level']
            },
            'resample':'SMOTETomek',
            'target_name':'Diagnosis',
            
        }
        self.logger.info('Parameter file loaded successfully.')
        
    def run_pipeline(self):
        self.logger.info('Initializing DataProcessor class.')
        #Initialize data processing class
        data_processor = DataProcessor(data=self.raw_data,logger=self.logger, params=self.params)
        self.logger.info('DataProcessor class initialized successfully.')
        
        
        #Make ratio features
        if self.ratios:
            self.logger.info('Creating ratios of features.')
            for ratio in self.ratios:
                data_processor.feat_ratios(ratio[0],ratio[1])
            self.logger.info('Ratio features created successfully.')
        
        #Encode binary features
        self.logger.info('Encoding binary features.')
        data_processor.encode_binaries(
            binary_feats=self.params['feat_categories']['binary'],
            binary_map=self.params['encoding_utils']['binary'],
        )
        self.logger.info('Binary features encoded successfully.')
        
        #Encode categorical nominal features
        self.logger.info('Begin encoding categorical nominal features.')
        data_processor.encode_catnoms(
            catnom_feats=self.params['feat_categories']['categorical_nominal'],
        )
        self.logger.info('Categorical nominal features encoded successfully.')
        
        self.logger.info('Begin encoding catagorical ordinal features.')
        data_processor.encode_catords(
            catord_feats=self.params['feat_categories']['categorical_ordinal'],
            catord_map=self.params['encoding_utils']['categorical_ordinal'],
        )
        self.logger.info('Categorical ordinal features encoded successfully.')

        #Transform numerical continuous features
        self.logger.info('Begin encoding numerical continuous features.')
        data_processor.log1p_transform(self.params['feat_categories']['numerical_continuous'])
        data_processor.sqrt_transform(self.params['feat_categories']['numerical_continuous'])
        self.logger.info('Numerical continuous feats transformed successfully.')
        
        if self.params['resample'] == 'SMOTETomek':
            self.logger.info('Begin resampling of data.')
            data_processor.smote_tomek(target_name=self.params['target_name'])
            self.logger.info('Resampling of data completed successfully.')
        else:
            pass
        
        self.data = data_processor.data

## Define Class to Save Model Results

In [4]:
##Packages:
#pandas
#dill
#json
#pathlib
#copy
class ModelSaver:
    def __init__(self,algorithm,X_valid,y_valid,output_dir,iteration_start=0,notes=None,save_params=False,params=None):
        self.algorithm = algorithm
        self.notes = notes
        self.output_dir = output_dir
        self.save_params = save_params
        self.params = params
        self.iteration = iteration_start
        self.model_save = {}
        self.model_out = {}
        self.all_model_data = {
            'X_valid':X_valid,
            'y_valid':y_valid,
            'notes':notes,
            'algorithm':[algorithm],
        }
        
        if self.params != None:
            self.all_model_data['params'] = params
        
        

    def record_state(self, model, X_test, X_train, y_test, y_train, model_preds,model_scores,drop_list,params=None,model_notes=None):
        current_model = {
            'model':model,
            'X_test':X_test,
            'X_train':X_train,
            'y_test':y_test,
            'y_train':y_train,
            'model_preds':model_preds,
            'model_notes':model_notes,
            'model_scores':model_scores,
        }
        
        if self.save_params == True:
            current_model['params'] = params
            
        
        self.model_save[self.algorithm+'_'+str(self.iteration)] = current_model
        self.iteration += 1
            
            
    def save_state(self,output_fname,output_dir):
            
        self.model_out[self.algorithm] = deepcopy(self.model_save)
        dir_path = Path(output_dir)
        fpath = dir_path/output_fname
        with open(fpath,'wb') as f:
            dill.dump(self.model_out,f)
        logging.info('Data Saved to Disk')
    
    def update_state(self, algorithm, iteration):
        self.algorithm = algorithm
        self.iteration = iteration
        self.model_out[self.algorithm] = deepcopy(self.model_save)
        self.model_save = {}
        

    

## Define Function to Select Encoded Features

In [5]:
def model_data_select(data, drop_list,target,ohe_feats_drop=None,test_size=None):
    data.drop(columns=drop_list,inplace=True,axis=1)
    if ohe_feats_drop:
        for feat in ohe_feats_drop:
            data.drop(columns=data.filter(like=feat+'_').columns, inplace=True, axis=1)
        
    X = data
    y = data.pop(target)
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size)

    return [X_train, X_test, y_train, y_test]

## Define Function to Iterate Through Models

In [6]:
def model_iterator(data,X_valid,y_valid,params,model_dict,fname,dirname,target):
    for i, (k,v) in enumerate(model_dict.items()):
        #Print iteration number
        logger.info(f"Training model {i+1}/{len(model_dict)}")
        #Initialize model saver class
        if i == 0:
            logging.info('Initializing ModelSaver class')
        
            model_saver = ModelSaver(
            v['algorithm'],
            X_valid,
            y_valid,
            output_dir='/Users/richardmiller/Downloads/',
            iteration_start=i,
            params=params,
        )
            algorithm_ = v['algorithm']
            
        #Check if model changed between iterations
        if v['algorithm'] != algorithm_:
            model_saver.update_state(v['algorithm'],i)

            
        logging.info('Training Model.')

        if v['algorithm'] == 'RandomForestClassifier':
            #Split data into train and test sets
            X_train, X_test, y_train, y_test = model_data_select(
                data=deepcopy(data),
                drop_list=v['drop_list'],
                ohe_feats_drop=v['ohe_feats_drop'],
                target=target
            )
    
            #Fit model
            rf_model = RandomForestClassifier()
            rf_model.fit(X_train, y_train)
    
            #Make predictions and evaluate with report metric
            preds = rf_model.predict(X_test)
            report = classification_report(y_test,preds,output_dict=True)
            logging.info('Model trained.')

        elif v['algorithm'] == 'XGBClassifier':
            xgb_model = XGBClassifier()
            xgb_model.fit(X_train, y_train)

            preds = xgb_model.predict(X_test)
            report = classification_report(y_test,preds)
            
        logging.info('Saving state.')

        #Append current model data to all other data.
        model_saver.record_state(
        model=rf_model,
        X_test=X_test,
        X_train=X_train,
        y_test=y_test,
        y_train=y_train,
        model_scores=report,
        model_preds=preds,
        drop_list=v['drop_list'],
        model_notes=v['model_notes'],
    )
        logging.info('State saved successfully')

        #Set the test variable
        algorithm_ = v['algorithm']

        if i == len(model_dict)-1:
            model_saver.save_state(output_fname=fname, output_dir=dirname)

            logger.info('--------------------------------------------------')
            logger.info('-ALL MODELS TRAINED AND SAVED TO DISK SUCCESSFULLY-')
            logger.info('--------------------------------------------------')
    return model_saver

## Load Raw Data Set from disk, split off validation set, pass data to pipeline for encoding/transforming.

In [None]:
#Import Data
fpath = '/Users/richardmiller/Downloads/thyroid_cancer_risk_data.csv'
X_raw = pd.read_csv(fpath)
y_raw = X_raw.pop('Diagnosis')

#Split off validation set
X, X_valid, y, y_valid = train_test_split(X_raw,y_raw,test_size=0.2,stratify=y_raw)

#Load and encode/transform data
data = pd.concat([X,y],axis=1)
pipeline = EncodingPipeLine(raw_data=deepcopy(data),logger=logger,ratios='all')

## Save Encoded/Transformed Data Set

In [None]:
import joblib
data_save = {
    'X_valid':X_valid,
    'y_valid':y_valid,
    'data':pipeline.data,
}

with open("resampled_data.pkl", "wb") as f:
    joblib.dump(data_save, f)

## Load Encoded/Transformed Data Set

In [25]:
import joblib
with open("resampled_data.pkl","rb") as f:
    encoded_data = joblib.load(f)

## RandomForestModel:

1. Ratios: Nodule_Size:T4_Level, TSH_Level:T3_Level; No Ethnicity, No Country, No Gender; Transforms: None
2. Ratios: Nodule_Size:T4_Level, TSH_Level:T3_Level; Yes Ethnicity, No Country, No Gender; Transforms: None
3. Ratios: Nodule_Size:T4_Level, TSH_Level:T3_Level; No Ethnicity, Yes Country, No Gender; Transforms: None
4. Ratios: Nodule_Size:T4_Level, TSH_Level:T3_Level; No Ethnicity, No Country, Yes Gender; Transforms: None
5. Ratios: Nodule_Size:T4_Level, TSH_Level:T3_Level; Yes Ethnicity, Yes Country, No Gender; Transforms: None
6. Ratios: Nodule_Size:T4_Level, TSH_Level:T3_Level; Yes Ethnicity, No Country, Yes Gender; Transforms: None
7. Ratios: Nodule_Size:T4_Level, TSH_Level:T3_Level; No Ethnicity, Yes Country, Yes Gender; Transforms: None
8. Ratios: Nodule_Size:T3_Level, TSH_Level:T4_Level; No Ethnicity, No Country, No Gender; Transforms: None
9. Ratios: Nodule_Size:T3_Level, TSH_Level:T4_Level; Yes Ethnicity, No Country, No Gender; Transforms: None
10. Ratios: Nodule_Size:T3_Level, TSH_Level:T4_Level; No Ethnicity, Yes Country, No Gender; Transforms: None
11. Ratios: Nodule_Size:T3_Level, TSH_Level:T4_Level; No Ethnicity, No Country, Yes Gender; Transforms: None
12. Ratios: Nodule_Size:T3_Level, TSH_Level:T4_Level; Yes Ethnicity, Yes Country, No Gender; Transforms: None
13. Ratios: Nodule_Size:T3_Level, TSH_Level:T4_Level; Yes Ethnicity, No Country, Yes Gender; Transforms: None
14. Ratios: Nodule_Size:T3_Level, TSH_Level:T4_Level; No Ethnicity, Yes Country, Yes Gender; Transforms: None
15. Ratios: None; No Ethnicity, No Country, No Gender; Transforms:sqrt
16. Ratios: None; Yes Ethnicity, No Country, No Gender; Transforms:sqrt
17. Ratios: None; No Ethnicity, Yes Country, No Gender; Transforms:sqrt
18. Ratios: None; No Ethnicity, No Country, Yes Gender; Transforms:sqrt
19. Ratios: None; Yes Ethnicity, Yes Country, No Gender; Transforms:sqrt
20. Ratios: None; Yes Ethnicity, No Country, Yes Gender; Transforms:sqrt
21. Ratios: None; No Ethnicity, Yes Country, Yes Gender; Transforms:sqrt
22. Ratios: None; Yes Ethnicity, Yes Country, Yes Gender; Transforms:sqrt
23. Ratios: None; No Ethnicity, No Country, No Gender; Transforms:log
24. Ratios: None; Yes Ethnicity, No Country, No Gender; Transforms:log
25. Ratios: None; No Ethnicity, Yes Country, No Gender; Transforms:log
26. Ratios: None; No Ethnicity, No Country, Yes Gender; Transforms:log
27. Ratios: None; Yes Ethnicity, Yes Country, No Gender; Transforms:log
28. Ratios: None; Yes Ethnicity, No Country, Yes Gender; Transforms:log
29. Ratios: None; No Ethnicity, Yes Country, Yes Gender; Transforms:log
30. Ratios: None; Yes Ethnicity, Yes Country, Yes Gender; Transforms:log

## Define dictionary with model info.

In [7]:
rf_models = {
    #RandomForestClassifiers
    'model_1':{
        'ohe_feats_drop':['Country','Ethnicity','log','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':None,
        'model_notes':{
            'Ratios': 'Nodule_Size:T4_Level, TSH_Level:T3_Level',
            'Ethnicity':'No',
            'Country':'No',
            'Gender':'No',
            'Transforms': None,
        },
    },
    'model_2':{
        'ohe_feats_drop':['Country','log','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':None,
        'model_notes':{
            'Ratios':'Nodule_Size:T4_Level, TSH_Level:T3_Level',
            'Ethnicity':'Yes', 
            'Country':'No',
            'Gender':'No',
            'Transforms': None
        },
    },
    
    'model_3':{
        'ohe_feats_drop':['Ethnicity','log','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':None,
        'model_notes':{
            'Ratios': 'Nodule_Size:T4_Level, TSH_Level:T3_Level',
            'Ethnicity':'No',
            'Country':'Yes',
            'Gender':'No',
            'Transforms': None,
        },
    },
    
    'model_4':{
        'ohe_feats_drop':['Country','Ethnicity','log','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'continuous_transforms':None,
        'model_notes':{
            'Ratios': 'Nodule_Size:T4_Level, TSH_Level:T3_Level',
            'Ethnicity':'No',
            'Country':'No', 
            'Gender':'Yes',
            'Transforms': None,
        },
    },
    
    'model_5':{
        'ohe_feats_drop':['log','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':None,
        'model_notes':{
            'Ratios': 'Nodule_Size:T4_Level, TSH_Level:T3_Level',
            'Ethnicity':'Yes',
            'Country':'Yes', 
            'Gender':'No',
            'Transforms': None,
        },
    },
    
    'model_6':{
        'ohe_feats_drop':['Country','log','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'continuous_transforms':None,
        'model_notes':{
            'Ratios': 'Nodule_Size:T4_Level, TSH_Level:T3_Level',
            'Ethnicity':'Yes',
            'Country':'No', 
            'Gender':'Yes',
            'Transforms': None,
        },
    },
    
    'model_7':{
        'ohe_feats_drop':['Ethnicity','log','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'continuous_transforms':None,
        'model_notes':{
            'Ratios': 'Nodule_Size:T4_Level, TSH_Level:T3_Level',
            'Ethnicity':'No',
            'Country':'Yes', 
            'Gender':'Yes',
            'Transforms': None,
        },
    },
    
    'model_8':{
        'ohe_feats_drop':['Ethnicity','Country','log','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':None,
        'model_notes':{
            'Ratios': 'Nodule_Size:T3_Level, TSH_Level:T4_Level',
            'Ethnicity':'No',
            'Country':'No', 
            'Gender':'No',
            'Transforms': None,
        },
    },
    
    'model_9':{
        'ohe_feats_drop':['Country','log','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':None,
        'model_notes':{
            'Ratios': 'Nodule_Size:T3_Level, TSH_Level:T4_Level',
            'Ethnicity':'Yes',
            'Country':'No', 
            'Gender':'No',
            'Transforms': None,
        },
    },
    
    'model_10':{
        'ohe_feats_drop':['Ethnicity','log','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':None,
        'model_notes':{
            'Ratios': 'Nodule_Size:T3_Level, TSH_Level:T4_Level',
            'Ethnicity':'No',
            'Country':'Yes', 
            'Gender':'No',
            'Transforms': None,
        },
    },
    
    'model_11':{
        'ohe_feats_drop':['Ethnicity','Country','log','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'continuous_transforms':None,
        'model_notes':{
            'Ratios': 'Nodule_Size:T3_Level, TSH_Level:T4_Level',
            'Ethnicity':'No',
            'Country':'No', 
            'Gender':'Yes',
            'Transforms': None,
        },
    },
    
    'model_12':{
        'ohe_feats_drop':['log','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':None,
        'model_notes':{
            'Ratios': 'Nodule_Size:T3_Level, TSH_Level:T4_Level',
            'Ethnicity':'Yes',
            'Country':'Yes', 
            'Gender':'No',
            'Transforms': None,
        },
    },
    'model_13':{
        'ohe_feats_drop':['Country','log','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'continuous_transforms':None,
        'model_notes':{
            'Ratios': 'Nodule_Size:T3_Level, TSH_Level:T4_Level',
            'Ethnicity':'Yes',
            'Country':'No', 
            'Gender':'Yes',
            'Transforms': None,
        },
    },
    
    'model_14':{
        'ohe_feats_drop':['Ethnicity','log','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'continuous_transforms':None,
        'model_notes':{
            'Ratios': 'Nodule_Size:T3_Level, TSH_Level:T4_Level',
            'Ethnicity':'No',
            'Country':'Yes', 
            'Gender':'Yes',
            'Transforms': None,
        },
    },
    
    'model_15':{
        'ohe_feats_drop':['Ethnicity','Country','log'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':'sqrt',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'No',
            'Country':'No', 
            'Gender':'No',
            'Transforms': 'sqrt',
        },
    },
    'model_16':{
        'ohe_feats_drop':['Country','log'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':'sqrt',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'Yes',
            'Country':'No', 
            'Gender':'No',
            'Transforms': 'sqrt',
        },
    },
    
    'model_17':{
        'ohe_feats_drop':['Ethnicity','log'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':'sqrt',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'No',
            'Country':'Yes', 
            'Gender':'No',
            'Transforms': 'sqrt',
        },
    },
    
    'model_18':{
        'ohe_feats_drop':['Ethnicity','Country','log'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'continuous_transforms':'sqrt',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'No',
            'Country':'No', 
            'Gender':'Yes',
            'Transforms': 'sqrt',
        },
    },
    
    'model_19':{
        'ohe_feats_drop':['log'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':'sqrt',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'Yes',
            'Country':'Yes', 
            'Gender':'No',
            'Transforms': 'sqrt',
        },
    },
    
    'model_20':{
        'ohe_feats_drop':['Country','log'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'continuous_transforms':'sqrt',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'Yes',
            'Country':'No', 
            'Gender':'Yes',
            'Transforms': 'sqrt',
        },
    },
    
    'model_21':{
        'ohe_feats_drop':['Ethnicity','log'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'continuous_transforms':'sqrt',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'No',
            'Country':'Yes', 
            'Gender':'Yes',
            'Transforms': 'sqrt',
        },
    },
    
    'model_22':{
        'ohe_feats_drop':['log'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'continuous_transforms':'sqrt',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'Yes',
            'Country':'Yes', 
            'Gender':'Yes',
            'Transforms': 'sqrt',
        },
    },
    
    'model_23':{
        'ohe_feats_drop':['Ethnicity','Country','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':'log',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'No',
            'Country':'No', 
            'Gender':'No',
            'Transforms': 'log',
        },
    },
    'model_24':{
        'ohe_feats_drop':['Country','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':'log',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'Yes',
            'Country':'No', 
            'Gender':'No',
            'Transforms': 'log',
        },
    },
    
    'model_25':{
        'ohe_feats_drop':['Ethnicity','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':'log',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'No',
            'Country':'Yes', 
            'Gender':'No',
            'Transforms': 'log',
        },
    },
    
    'model_26':{
        'ohe_feats_drop':['Ethnicity','Country','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'continuous_transforms':'log',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'No',
            'Country':'No', 
            'Gender':'Yes',
            'Transforms': 'log',
        },
    },
    
    'model_27':{
        'ohe_feats_drop':['sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID','Gender',],
        'continuous_transforms':'log',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'Yes',
            'Country':'Yes', 
            'Gender':'No',
            'Transforms': 'log',
        },
    },
    
    'model_28':{
        'ohe_feats_drop':['Country','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'continuous_transforms':'log',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'Yes',
            'Country':'No', 
            'Gender':'Yes',
            'Transforms': 'log',
        },
    },
    
    'model_29':{
        'ohe_feats_drop':['Ethnicity','sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'continuous_transforms':'log',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'No',
            'Country':'Yes', 
            'Gender':'Yes',
            'Transforms': 'log',
        },
    },
    
    'model_30':{
        'ohe_feats_drop':['sqrt'],
        'algorithm':'RandomForestClassifier',
        'drop_list':['Nodule_Size:T4_Level','TSH_Level:T3_Level','Nodule_Size:T3_Level','TSH_Level:T4_Level','T4_Level','T3_Level','Nodule_Size','TSH_Level','Patient_ID',],
        'continuous_transforms':'log',
        'model_notes':{
            'Ratios': None,
            'Ethnicity':'Yes',
            'Country':'Yes', 
            'Gender':'Yes',
            'Transforms': 'log',
        },
    },
}

## Code to check model dictionary

In [None]:
for i,(k,v) in enumerate(models.items()):
    not_included = []
    data_ = deepcopy(encoded_data['data'])
    
    not_included = deepcopy(models['model_'+str(i+1)]['drop_list'])
    ohe_feats = v['ohe_feats_drop']
    if ohe_feats != None:
        not_included.extend(models['model_'+str(i+1)]['ohe_feats_drop'])
    X, _,_,_ = model_data_select(data=data_,drop_list=v['drop_list'],target='Diagnosis',ohe_feats_drop=v['ohe_feats_drop'],)
    for col in X.columns:
        if col in not_included:
            print('model_'+str(i+1))

In [8]:
params = {
            'feat_categories':{
                'numerical_continuous':['TSH_Level','T3_Level','T4_Level','Nodule_Size',],
                'ordinal':['Age'],
                'binary':['Gender','Family_History','Radiation_Exposure','Iodine_Deficiency',
                    'Smoking','Obesity','Diabetes','Diagnosis',],
                'categorical_nominal':['Country','Ethnicity',],
                'categorical_ordinal':['Thyroid_Cancer_Risk',],
            },
            'encoding_utils':{
                'binary':{'Yes':1.0,'No':0.0,'Male':1.0,'Female':0.0,'Benign':0.0,'Malignant':1.0},
                'categorical_ordinal':{'Low':0.0,'Medium':1.0,'High':2.0},
            },
            'train_test':{
                'test_size':0.2,
            },
            'feat_ratio_names':{
                'ratio_1':['TSH_Level','T3_Level'],
                'ratio_2':['Nodule_Size','T4_Level'],
                'ratio_3':['TSH_Level','T4_Level'],
                'ratio_4':['Nodule_Size','T3_Level']
            },
            'resample':'SMOTETomek',
            'target_name':'Diagnosis',
            
        }

In [None]:
thing = model_iterator(
    data=deepcopy(encoded_data['data']),
    X_valid=encoded_data['X_valid'],y_valid=encoded_data['y_valid'],
    params=params,
    model_dict=rf_models,
    fname='rf_model_out.dill',dirname='/Users/richardmiller/Downloads/',
    target='Diagnosis',
)

In [None]:
import dill
with open("/Users/richardmiller/Downloads/rf_model_out.dill", "wb") as f:
    dill.dump(thing.model_out, f)

In [9]:
with open("/Users/richardmiller/Downloads/rf_model_out.dill", "rb") as f:
    models = dill.load(f)

In [10]:
selected_models = {}
for i,model in enumerate(models['RandomForestClassifier'].keys()):
    if models['RandomForestClassifier'][model]['model_scores']['0.0']['precision'] > 0.845:
        selected_models[model] = {
            'benign_precision':models['RandomForestClassifier'][model]['model_scores']['0.0']['precision'],
            'malignant_precision':models['RandomForestClassifier'][model]['model_scores']['1.0']['precision'],
            'model_notes':models['RandomForestClassifier'][model]['model_notes'],
        }

In [23]:
model_keys = list(selected_models.keys())
for k,v in selected_models.items():
    print(k)
    print(f"Benign Precision: {v['benign_precision']}")
    print(f"Malignant Precision: {v['malignant_precision']}")

RandomForestClassifier_1
Benign Precision: 0.8471464167062174
Malignant Precision: 0.9379772769603278
RandomForestClassifier_2
Benign Precision: 0.847591128483018
Malignant Precision: 0.9363100126931979
RandomForestClassifier_4
Benign Precision: 0.8481102712316585
Malignant Precision: 0.933860263962419
RandomForestClassifier_5
Benign Precision: 0.8470367278797997
Malignant Precision: 0.9359937807722207
RandomForestClassifier_6
Benign Precision: 0.8455660687102647
Malignant Precision: 0.9365343387299475
RandomForestClassifier_8
Benign Precision: 0.8454446375445521
Malignant Precision: 0.9387026458208058
RandomForestClassifier_10
Benign Precision: 0.8464939428773544
Malignant Precision: 0.9344348479158844
RandomForestClassifier_11
Benign Precision: 0.8502006241640659
Malignant Precision: 0.9341929250891795
RandomForestClassifier_13
Benign Precision: 0.847419373997743
Malignant Precision: 0.9346055127775917
RandomForestClassifier_16
Benign Precision: 0.8482797518330514
Malignant Precision

The most promising model looks to be the one that includes Ethnicity, Country, Gender, and performs a log transform on the numeric continuous features. I will use those features to do a grid search.

In [52]:
#Get encoded train and test data
drop_list = [
    'Nodule_Size:T4_Level',
    'TSH_Level:T3_Level',
    'Nodule_Size:T3_Level',
    'TSH_Level:T4_Level',
    'T4_Level',
    'T3_Level',
    'Nodule_Size',
    'TSH_Level',
    'Patient_ID',
]
data = pd.DataFrame(encoded_data['data'])
[X_test, X_train, y_test, y_train] = model_data_select(
    data=data,
    target='Diagnosis',
    drop_list=drop_list,
    ohe_feats_drop=['sqrt'],
    test_size=0.2
)

#Define grid params
grid_params = {
    'n_estimators':[50, 100, 200,],
    'max_depth':[None, 10, 20, 30,],
    'min_samples_split':[2,5,10,],
    'min_samples_leaf':[1,2,4,],
    'bootstrap':[True,False,],
}

#Define model and grid search
random_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=random_forest,
    param_grid = grid_params,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='accuracy',
)

gs = grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  24.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  12.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   6.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  12.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   6.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=  24.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   5.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total 

[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   6.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   5.7s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   5.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  25.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  12.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   6.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=  24.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   5.8s
[CV] END bootstrap=True, max_depth=None, min_samples_

[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  12.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  11.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  25.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  23.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=  12.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  22.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=  22.8s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.4s
[CV] END bootstrap=True, max_depth=10, min_sample

[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  12.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  11.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   6.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  12.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   6.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  24.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   6.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  23.5s
[CV] END bootstrap=True, max_depth=None, min_sa

[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   6.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   5.7s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  11.7s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   6.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  12.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   6.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  11.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  11.8s
[CV] END bootstrap=True, max_depth=None, min_sampl

[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  24.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   6.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=  24.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  23.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=  12.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   5.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=  22.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=50; total time=   5.9s
[CV] END bootstrap=True, max_depth=None, min_sa

[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  12.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=  24.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=  24.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  11.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   6.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=  12.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=  10.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=  22.4s
[CV] END bootstrap=True, max_depth=None, min_s

In [53]:
joblib.dump(gs,'random_forest_gridsearch_accuracy.pkl')

['random_forest_gridsearch_accuracy.pkl']

[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   6.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  24.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=  24.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   5.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  12.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=50; total time=   5.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=  23.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=4, min_samples_split=5, n_estimators=50; total time=   5.3s
[CV] END bootstrap=True, max_depth=None, min_samp

In [50]:
print(gs.best_score_)
print(gs.best_params_)

0.9102673976224086
{'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}


In [47]:
gs_acc = joblib.load('random_forest_gridsearch_accuracy.pkl')