In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import joblib

# Load the dataset
file_path = 'thyroidDF.csv'
data = pd.read_csv(file_path)

# Convert 'F' and 'M' to binary values
data['sex'] = data['sex'].map({'F': 0, 'M': 1})

# Convert 'f' and 't' to binary values
binary_columns = ['on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_meds', 'sick', 'pregnant',
                  'thyroid_surgery', 'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid',
                  'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH_measured', 'T3_measured',
                  'TT4_measured', 'T4U_measured', 'FTI_measured', 'TBG_measured']

for column in binary_columns:
    data[column] = data[column].map({'f': 0, 't': 1})

# Drop the columns 'patient_id' and 'referral_source'
data.drop(columns=['patient_id', 'referral_source'], inplace=True)

# Handling missing values
# Impute missing numerical values with the mean
num_imputer = SimpleImputer(strategy='mean')
data[['TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']] = num_imputer.fit_transform(data[['TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG']])

# Impute missing categorical values with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
data[['sex']] = cat_imputer.fit_transform(data[['sex']])

# Separate features and target variable
X = data.drop(columns=['target'])
y = data['target']

# Encode the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

# Ensure the target names match the classes in y_test
unique_classes = sorted(set(y_test))
target_names = label_encoder.inverse_transform(unique_classes)

report = classification_report(y_test, y_pred, target_names=target_names)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

# Save the trained model
model_path = 'random_forest_model.pkl'
joblib.dump(rf_model, model_path)
label_encoder_path = 'label_encoder.pkl'
joblib.dump(label_encoder, label_encoder_path)

Accuracy: 0.9307901907356948
Classification Report:
              precision    recall  f1-score   support

           -       0.95      0.97      0.96      1328
           A       0.63      0.81      0.71        21
          AK       0.80      0.80      0.80        10
           B       0.00      0.00      0.00         4
           F       0.93      0.93      0.93        40
          FK       0.00      0.00      0.00         1
           G       0.95      1.00      0.97        69
          GI       0.00      0.00      0.00         1
          GK       0.86      1.00      0.92         6
           I       0.85      0.71      0.77        82
           J       1.00      0.25      0.40        12
           K       0.87      0.92      0.89       106
          KJ       1.00      0.50      0.67         2
           L       0.67      0.50      0.57        28
           M       1.00      1.00      1.00        25
          MK       1.00      1.00      1.00         6
           N       0.71      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['label_encoder.pkl']

In [10]:
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

Confusion Matrix:
[[1291    4    2    0    1    0    2    0    0    9    0   11    0    5
     0    0    0    1    0    1    1]
 [   4   17    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0]
 [   0    0    8    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    2    0]
 [   4    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0]
 [   0    0    0    0   37    0    2    0    1    0    0    0    0    0
     0    0    0    0    0    0    0]
 [   0    0    0    0    1    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0]
 [   0    0    0    0    0    0   69    0    0    0    0    0    0    0
     0    0    0    0    0    0    0]
 [   1    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0    0    6    0    0    0    0    0
     0    0    0    0    0    