In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
import joblib


In [2]:
# Load the dataset (assuming it's a CSV file)
# Replace 'your_dataset.csv' with the actual dataset file path
df = pd.read_csv('thyroidDF.csv')


In [3]:
# Drop unneccesary col
df.drop([ 'query_on_thyroxine',
        'sick', 'pregnant', 'thyroid_surgery',
        'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH_measured',
       'T3_measured','TT4_measured','T4U_measured', 'FTI_measured', 'TBG_measured', 'TBG', 'referral_source',
       'target', 'patient_id','FTI','T4U'], axis=1, inplace=True)


In [4]:
# Display initial dataset information
print("Initial Dataset Shape:", df.shape)
print(df.head())


Initial Dataset Shape: (9172, 8)
   age sex on_thyroxine on_antithyroid_meds I131_treatment  TSH   T3    TT4
0   29   F            f                   f              f  0.3  NaN    NaN
1   29   F            f                   f              f  1.6  1.9  128.0
2   41   F            f                   f              f  NaN  NaN    NaN
3   36   F            f                   f              f  NaN  NaN    NaN
4   32   F            f                   f              f  NaN  NaN    NaN


In [5]:
# Analyze missing values
print("Missing values per column:")
print(df.isnull().sum())


Missing values per column:
age                       0
sex                     307
on_thyroxine              0
on_antithyroid_meds       0
I131_treatment            0
TSH                     842
T3                     2604
TT4                     442
dtype: int64


In [6]:
# Handle missing numerical features
numerical_columns = ['TSH', 'T3', 'TT4']  # Add more numerical columns if needed
for column in numerical_columns:
    df[column].fillna(df[column].median(), inplace=True)  # Impute with median


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)  # Impute with median


In [7]:
# Handle missing categorical features
categorical_columns = ['sex', 'on_thyroxine', 'on_antithyroid_meds', 'I131_treatment']
for column in categorical_columns:
    df[column].fillna(df[column].mode()[0], inplace=True)  # Impute with mode


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode()[0], inplace=True)  # Impute with mode


In [8]:
# Classify thyroid stage based on TSH, T3, and TT4 values
def classify_thyroid(row):
    TSH = row['TSH']
    T3 = row['T3']
    TT4 = row['TT4']

    # Handle missing values
    if pd.isna(TSH) or pd.isna(T3) or pd.isna(TT4):
        return "Unknown"

    # Broadened ranges for "Normal" classification
    if 0.35 <= TSH <= 5.0 and 0.9 <= T3 <= 3.0 and 50 <= TT4 <= 180:
        return "Normal"

    # Hypothyroidism classifications
    elif TSH > 5.0 and (0.9 <= T3 <= 3.0 and 50 <= TT4 <= 180):
        return "Subclinical Hypothyroidism"
    elif TSH > 5.0 and (T3 < 0.9 or TT4 < 50):
        return "Overt Hypothyroidism"

    # Hyperthyroidism classifications
    elif TSH < 0.35 and (0.9 <= T3 <= 3.0 and 50 <= TT4 <= 180):
        return "Subclinical Hyperthyroidism"
    elif TSH < 0.35 and (T3 > 3.0 or TT4 > 180):
        return "Overt Hyperthyroidism"

    # Reduced reliance on "Borderline" category
    else:
        return "Unclassified"


In [9]:
# Apply the classification function
df['Thyroid Stage'] = df.apply(classify_thyroid, axis=1)

# View updated classification counts
print("Thyroid Stage Classification Counts:")
print(df['Thyroid Stage'].value_counts())


Thyroid Stage Classification Counts:
Thyroid Stage
Normal                         5789
Subclinical Hyperthyroidism    1420
Subclinical Hypothyroidism      768
Unclassified                    590
Overt Hyperthyroidism           356
Overt Hypothyroidism            249
Name: count, dtype: int64


In [10]:
# Encode the target column with LabelEncoder
label_encoder = LabelEncoder()
df['Thyroid Stage'] = label_encoder.fit_transform(df['Thyroid Stage'])


In [11]:
# Verify the transformed DataFrame
print("Transformed Dataset Shape:", df.shape)
print(df.head())


Transformed Dataset Shape: (9172, 9)
   age sex on_thyroxine on_antithyroid_meds I131_treatment  TSH   T3    TT4  \
0   29   F            f                   f              f  0.3  1.9  104.0   
1   29   F            f                   f              f  1.6  1.9  128.0   
2   41   F            f                   f              f  1.4  1.9  104.0   
3   36   F            f                   f              f  1.4  1.9  104.0   
4   32   F            f                   f              f  1.4  1.9  104.0   

   Thyroid Stage  
0              3  
1              0  
2              0  
3              0  
4              0  


In [49]:
"Columns in the DataFrame:", df.columns.tolist()


('Columns in the DataFrame:',
 ['age',
  'sex',
  'on_thyroxine',
  'on_antithyroid_meds',
  'I131_treatment',
  'TSH',
  'T3',
  'TT4',
  'Thyroid Stage'])

In [13]:
# Encode categorical columns using LabelEncoder
categorical_columns = ['sex', 'on_thyroxine', 'on_antithyroid_meds', 'I131_treatment', 'Thyroid Stage']
label_encoders = {}

for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le


In [14]:
# Split the data into features (X) and target (y)
X = df.drop('Thyroid Stage', axis=1)
y = df['Thyroid Stage']


In [15]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


In [16]:
# Check class distribution in the training set
print("Class distribution in training set:", Counter(y_train))


Class distribution in training set: Counter({0: 4052, 3: 994, 4: 538, 5: 413, 1: 249, 2: 174})


In [17]:
# Train a Random Forest Classifier
clf = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10)
clf.fit(X_train, y_train)


In [18]:
# Evaluate the Random Forest model
y_pred = clf.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred))


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1737
           1       1.00      1.00      1.00       107
           2       1.00      1.00      1.00        75
           3       1.00      1.00      1.00       426
           4       1.00      1.00      1.00       230
           5       1.00      0.99      0.99       177

    accuracy                           1.00      2752
   macro avg       1.00      1.00      1.00      2752
weighted avg       1.00      1.00      1.00      2752



In [19]:
# Train an XGBoost Classifier
xgb_clf = XGBClassifier(random_state=42)
xgb_clf.fit(X_train, y_train)


In [20]:
# Evaluate the XGBoost model
y_pred_xgb = xgb_clf.predict(X_test)
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))


XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1737
           1       1.00      1.00      1.00       107
           2       1.00      0.97      0.99        75
           3       0.99      1.00      0.99       426
           4       0.98      0.98      0.98       230
           5       1.00      0.97      0.99       177

    accuracy                           0.99      2752
   macro avg       0.99      0.99      0.99      2752
weighted avg       0.99      0.99      0.99      2752



In [21]:
# Train a K-Nearest Neighbors (KNN) Classifier
knn_clf = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn_clf.fit(X_train, y_train)


In [22]:
# Evaluate the KNN model
y_pred_knn = knn_clf.predict(X_test)
print("KNN Classification Report:")
print(classification_report(y_test, y_pred_knn))


KNN Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.94      0.85      1737
           1       0.71      0.59      0.64       107
           2       0.88      0.75      0.81        75
           3       0.54      0.34      0.41       426
           4       0.87      0.64      0.74       230
           5       0.57      0.20      0.29       177

    accuracy                           0.75      2752
   macro avg       0.72      0.57      0.62      2752
weighted avg       0.73      0.75      0.73      2752



In [23]:
# Save the Random Forest model using joblib
joblib.dump(clf, 'RF1_model.pkl')
print("Random Forest model saved as 'RF1_model.pkl'")


Random Forest model saved as 'RF1_model.pkl'
