In [1]:
import os

import pandas as pd

# Location of the ThyroCare CSV file.
# The THYROCARE_DATA_PATH environment variable, when set, overrides the path so
# the notebook can run on machines that do not have this exact folder layout.
# When it is unset, the original hard-coded path is used unchanged.
data_path = os.environ.get(
    'THYROCARE_DATA_PATH',
    r'C:\Users\asusz\OneDrive\Робочий стіл\СА\ThyroCare\data_new.csv',
)

# Load the dataset into `df`.
df = pd.read_csv(data_path)

In [3]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the frame loaded from disk (`df`) is left untouched.
df_encoded = df.copy()

# One LabelEncoder per categorical column, each kept in its own variable.
label_enc_country = LabelEncoder()
label_enc_ethnicity = LabelEncoder()
label_encoder = LabelEncoder()  # encoder used for the Gender column

df_encoded['Country'] = label_enc_country.fit_transform(df_encoded['Country'])
df_encoded['Ethnicity'] = label_enc_ethnicity.fit_transform(df_encoded['Ethnicity'])

# Encode Gender on the copy, like the other two columns. Previously the codes
# were written into `df` after the copy had been taken, so on a fresh kernel
# `df_encoded['Gender']` was never encoded and the raw frame was modified as a
# side effect.
df_encoded['Gender'] = label_encoder.fit_transform(df_encoded['Gender'])

# Preview the encoded frame.
df_encoded.head()

Unnamed: 0,Age,Gender,Country,Ethnicity,Family_History,Radiation_Exposure,Iodine_Deficiency,Smoking,Obesity,Diabetes,TSH_Level,T3_Level,T4_Level,Nodule_Size,Thyroid_Cancer_Risk,Diagnosis
0,66,0,6,2,0,1,0,0,0,0,9.37,1.67,6.16,1.08,1,0
1,29,0,2,3,0,1,0,0,0,0,1.83,1.73,10.54,4.05,1,0
2,86,0,5,2,0,0,0,0,0,0,6.26,2.59,10.57,4.61,1,0
3,75,1,3,1,0,0,0,0,0,0,4.1,2.62,11.04,2.46,2,0
4,35,1,2,0,1,1,0,0,0,0,9.1,2.11,10.71,2.11,3,0


In [5]:
# Build a single integer feature from the three lifestyle flags: the string
# forms of Smoking, Obesity and Diabetes are concatenated in that order and the
# resulting text is parsed as a base-2 number (Smoking is the leading digit).
flag_columns = ['Smoking', 'Obesity', 'Diabetes']
bit_strings = df_encoded[flag_columns[0]].astype(str)
for flag_column in flag_columns[1:]:
    bit_strings = bit_strings + df_encoded[flag_column].astype(str)

df_encoded['Combination'] = bit_strings.map(lambda bits: int(bits, 2))

In [7]:
from sklearn.utils import resample

def _downsample_to(frame, target_rows, seed=42):
    """Return `target_rows` rows of `frame`, drawn without replacement using a fixed seed."""
    return resample(frame, replace=False, n_samples=target_rows, random_state=seed)


# Split the encoded data by risk level; level 3 is the group whose size the
# other two are reduced to.
risk_level = df_encoded['Thyroid_Cancer_Risk']
major1, major2, minor = (df_encoded[risk_level == level] for level in (1, 2, 3))

# Shrink levels 1 and 2 to the row count of level 3 (sampling without repeats).
minority_rows = len(minor)
major1_downsampled = _downsample_to(major1, minority_rows)
major2_downsampled = _downsample_to(major2, minority_rows)

# Stack the three groups in level order and show the resulting class counts.
df_balanced = pd.concat([major1_downsampled, major2_downsampled, minor])
print(df_balanced['Thyroid_Cancer_Risk'].value_counts())


Thyroid_Cancer_Risk
1    31903
2    31903
3    31903
Name: count, dtype: int64


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Features: every column except the target, the Diagnosis column, and the three
# lifestyle flags (Smoking / Obesity / Diabetes) that are dropped individually.
X = df_balanced.drop(columns=['Thyroid_Cancer_Risk','Diagnosis','Smoking', 'Obesity', 'Diabetes'])
y = df_balanced['Thyroid_Cancer_Risk']

# Re-encode the risk labels as consecutive integers starting at 0; these are
# the class ids that appear in the report and confusion matrix below.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Take explicit copies before writing scaled values back. Assigning into the
# frames returned by train_test_split directly is chained assignment and raises
# SettingWithCopyWarning on pandas versions without Copy-on-Write.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize the continuous lab measurements. The scaler is fitted on the
# training rows only and then applied to the test rows.
scaler = StandardScaler()
cols_to_scale = ['TSH_Level', 'T3_Level', 'T4_Level', 'Nodule_Size']
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

# Three-class gradient-boosted tree classifier with fixed hyperparameters.
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_estimators=200,
    max_depth=15,
    learning_rate=0.1,
    subsample = 1,
    colsample_bytree = 1,
    random_state=42
)

xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# Report hold-out performance.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.6708285445616968

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.51      0.51      6380
           1       0.51      0.51      0.51      6381
           2       1.00      1.00      1.00      6381

    accuracy                           0.67     19142
   macro avg       0.67      0.67      0.67     19142
weighted avg       0.67      0.67      0.67     19142


Confusion Matrix:
[[3232 3148    0]
 [3153 3228    0]
 [   0    0 6381]]


In [13]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values; every combination is evaluated.
param_grid = dict(
    n_estimators=[100, 150, 200],
    max_depth=[10, 15],
    learning_rate=[0.05, 0.1],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Exhaustive search with 3-fold cross-validation on the training split, using
# all available cores and per-fit progress logging. `fit` returns the search
# object itself, so the call can be chained onto the constructor.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 150, 'subsample': 1.0}
Best score: 0.6699753514443518


In [19]:
# Pair each training column with the importance score reported by the fitted
# model and list them from most to least important.
feature_names = X.columns
feature_importances = xgb_model.feature_importances_

importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
importance_df

Unnamed: 0,Feature,Importance
5,Radiation_Exposure,0.271743
4,Family_History,0.244393
6,Iodine_Deficiency,0.201251
3,Ethnicity,0.131946
2,Country,0.073021
7,TSH_Level,0.011651
9,T4_Level,0.011597
10,Nodule_Size,0.011561
8,T3_Level,0.011392
0,Age,0.011341


In [23]:
import joblib

# Serialize the fitted XGBoost classifier to 'mdl_risk.pkl' (a relative path, so
# it is resolved against the current working directory).
# NOTE(review): only the model object is written here. The StandardScaler and
# the label encoders fitted in earlier cells are not persisted, so code that
# loads this file would need them from elsewhere to prepare inputs the same
# way. Confirm how the consumer handles preprocessing.
joblib.dump(xgb_model, 'mdl_risk.pkl')

['mdl_risk.pkl']

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Same experiment as the earlier training cell, but with Ethnicity and Country
# removed from the features as well. Note that this cell rebinds X, the
# train/test splits, scaler and xgb_model.
X = df_balanced.drop(columns=['Thyroid_Cancer_Risk','Diagnosis','Smoking', 'Obesity', 'Diabetes', 'Ethnicity','Country'])
y = df_balanced['Thyroid_Cancer_Risk']

# Re-encode the risk labels as consecutive integers starting at 0; these are
# the class ids that appear in the report and confusion matrix below.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Take explicit copies before writing scaled values back. Assigning into the
# frames returned by train_test_split directly is chained assignment and raises
# SettingWithCopyWarning on pandas versions without Copy-on-Write.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize the continuous lab measurements. The scaler is fitted on the
# training rows only and then applied to the test rows.
scaler = StandardScaler()
cols_to_scale = ['TSH_Level', 'T3_Level', 'T4_Level', 'Nodule_Size']
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

# Three-class gradient-boosted tree classifier with fixed hyperparameters.
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_estimators=200,
    max_depth=15,
    learning_rate=0.1,
    subsample = 1,
    colsample_bytree = 1,
    random_state=42
)

xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# Report hold-out performance.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.4817678403510605

Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.34      0.38      6380
           1       0.43      0.33      0.37      6381
           2       0.54      0.78      0.64      6381

    accuracy                           0.48     19142
   macro avg       0.47      0.48      0.46     19142
weighted avg       0.47      0.48      0.46     19142


Confusion Matrix:
[[2154 2119 2107]
 [2173 2103 2105]
 [ 726  690 4965]]
