In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the thyroid cancer risk table from a path relative to the notebook.
csv_path = r'../dataset/thyroid_cancer_risk_data.csv'
dataset = pd.read_csv(csv_path)
# Preview the first rows to check the columns that were read.
dataset.head()

Unnamed: 0,Patient_ID,Age,Gender,Country,Ethnicity,Family_History,Radiation_Exposure,Iodine_Deficiency,Smoking,Obesity,Diabetes,TSH_Level,T3_Level,T4_Level,Nodule_Size,Thyroid_Cancer_Risk,Diagnosis
0,1,66,Male,Russia,Caucasian,No,Yes,No,No,No,No,9.37,1.67,6.16,1.08,Low,Benign
1,2,29,Male,Germany,Hispanic,No,Yes,No,No,No,No,1.83,1.73,10.54,4.05,Low,Benign
2,3,86,Male,Nigeria,Caucasian,No,No,No,No,No,No,6.26,2.59,10.57,4.61,Low,Benign
3,4,75,Female,India,Asian,No,No,No,No,No,No,4.1,2.62,11.04,2.46,Medium,Benign
4,5,35,Female,Germany,African,Yes,Yes,No,No,No,No,9.1,2.11,10.71,2.11,High,Benign


In [3]:
# Patient_ID, Country and Ethnicity are removed before modelling.
dropped_columns = ['Patient_ID', 'Country', 'Ethnicity']
dataset = dataset.drop(dropped_columns, axis='columns')
dataset.head()

Unnamed: 0,Age,Gender,Family_History,Radiation_Exposure,Iodine_Deficiency,Smoking,Obesity,Diabetes,TSH_Level,T3_Level,T4_Level,Nodule_Size,Thyroid_Cancer_Risk,Diagnosis
0,66,Male,No,Yes,No,No,No,No,9.37,1.67,6.16,1.08,Low,Benign
1,29,Male,No,Yes,No,No,No,No,1.83,1.73,10.54,4.05,Low,Benign
2,86,Male,No,No,No,No,No,No,6.26,2.59,10.57,4.61,Low,Benign
3,75,Female,No,No,No,No,No,No,4.1,2.62,11.04,2.46,Medium,Benign
4,35,Female,Yes,Yes,No,No,No,No,9.1,2.11,10.71,2.11,High,Benign


In [4]:
# Count missing values per column, largest count first.
missing_per_column = dataset.isna().sum()
missing_per_column.sort_values(ascending=False)

Age                    0
Gender                 0
Family_History         0
Radiation_Exposure     0
Iodine_Deficiency      0
Smoking                0
Obesity                0
Diabetes               0
TSH_Level              0
T3_Level               0
T4_Level               0
Nodule_Size            0
Thyroid_Cancer_Risk    0
Diagnosis              0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,Age,TSH_Level,T3_Level,T4_Level,Nodule_Size
count,212691.0,212691.0,212691.0,212691.0,212691.0
mean,51.918497,5.045102,2.001727,8.246204,2.503403
std,21.632815,2.860264,0.866248,2.164188,1.444631
min,15.0,0.1,0.5,4.5,0.0
25%,33.0,2.57,1.25,6.37,1.25
50%,52.0,5.04,2.0,8.24,2.51
75%,71.0,7.52,2.75,10.12,3.76
max,89.0,10.0,3.5,12.0,5.0


In [6]:
from sklearn.preprocessing import OrdinalEncoder

In [7]:
# Columns whose values are the strings 'Yes' / 'No' and get encoded as 1 / 0 later.
binaryColumns = [
    'Family_History',
    'Radiation_Exposure',
    'Iodine_Deficiency',
    'Smoking',
    'Obesity',
    'Diabetes',
]

In [8]:
# Encode the Yes/No flag columns as integers.
# DataFrame.replace from strings to ints relies on pandas silently downcasting
# the object columns, which newer pandas versions flag with a FutureWarning.
# Mapping each column and casting explicitly gives the same 0/1 values without
# depending on that deprecated behaviour.
yes_no_codes = {'Yes': 1, 'No': 0}
encoded_flags = dataset[binaryColumns].apply(lambda column: column.map(yes_no_codes))

# Series.map turns every value missing from the mapping into NaN; stop here
# instead of training on silently corrupted flags.
if encoded_flags.isna().any().any():
    raise ValueError("Binary columns contain values other than 'Yes'/'No'")
dataset[binaryColumns] = encoded_flags.astype(int)

# Gender becomes a 0/1 indicator: 1 for Female, 0 for Male.
dataset['Gender'] = dataset['Gender'].map({'Female':1, 'Male':0})

# Risk is ordered, so the category list fixes the codes: Low=0, Medium=1, High=2.
oe = OrdinalEncoder(categories=[['Low', 'Medium', 'High']])
dataset['Thyroid_Cancer_Risk'] = oe.fit_transform(dataset[['Thyroid_Cancer_Risk']])

  dataset[binaryColumns] = dataset[binaryColumns].replace({'Yes':1, 'No':0})


In [9]:
# 'Gender' now holds 1 for Female and 0 for Male, so it is renamed 'Female';
# 'Thyroid_Cancer_Risk' is shortened to 'Cancer_Risk'.
renamed_columns = {'Gender': 'Female', 'Thyroid_Cancer_Risk': 'Cancer_Risk'}
dataset = dataset.rename(columns=renamed_columns)

In [10]:
# Preview the frame after encoding and renaming.
dataset.head()

Unnamed: 0,Age,Female,Family_History,Radiation_Exposure,Iodine_Deficiency,Smoking,Obesity,Diabetes,TSH_Level,T3_Level,T4_Level,Nodule_Size,Cancer_Risk,Diagnosis
0,66,0,0,1,0,0,0,0,9.37,1.67,6.16,1.08,0.0,Benign
1,29,0,0,1,0,0,0,0,1.83,1.73,10.54,4.05,0.0,Benign
2,86,0,0,0,0,0,0,0,6.26,2.59,10.57,4.61,0.0,Benign
3,75,1,0,0,0,0,0,0,4.1,2.62,11.04,2.46,1.0,Benign
4,35,1,1,1,0,0,0,0,9.1,2.11,10.71,2.11,2.0,Benign


In [11]:
# Target: the 'Diagnosis' label. Features: every column except the last one
# (selection is positional, so 'Diagnosis' is assumed to be the final column).
y = dataset['Diagnosis']
feature_count = dataset.shape[1] - 1
x = dataset.iloc[:, :feature_count]

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Random forest of 200 entropy-split trees.
# random_state pins the bootstrap samples and feature subsets so the scores and
# feature importances reported further down are reproducible across kernel
# restarts (same seed as the train/test split). n_jobs=-1 only parallelises
# tree building across cores; it does not change the fitted model.
rdc = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    random_state=42,
    n_jobs=-1,
)
rdc.fit(x_train, y_train)

In [16]:
# Accuracy in percent on the training rows and on the held-out rows.
train_accuracy_pct = rdc.score(x_train, y_train) * 100
test_accuracy_pct = rdc.score(x_test, y_test) * 100
train_accuracy_pct, test_accuracy_pct

(100.0, 82.54607572718155)

In [17]:
# Predict the diagnosis for one hand-written patient record.
# The model was fitted on a DataFrame, so the sample is wrapped in a one-row
# DataFrame carrying the training column names. That keeps each value tied to
# its feature by name instead of by list position, and avoids scikit-learn's
# warning about input without valid feature names.
sample_values = [29, 0, 0, 1, 0, 0, 0, 0, 1.83, 1.73, 10.54, 4.05, 0.0]
sample = pd.DataFrame([sample_values], columns=x.columns)
rdc.predict(sample)



array(['Benign'], dtype=object)

In [18]:
# Rank the features by the importance the fitted forest assigns to them.
importances = pd.Series(data=rdc.feature_importances_, index=x.columns)
impFeatures = importances.sort_values(ascending=False)
impFeatures

Cancer_Risk           0.165503
TSH_Level             0.160010
T4_Level              0.159470
Nodule_Size           0.155853
T3_Level              0.151741
Age                   0.128238
Family_History        0.014054
Female                0.013404
Obesity               0.012746
Smoking               0.011165
Iodine_Deficiency     0.010234
Diabetes              0.008976
Radiation_Exposure    0.008606
dtype: float64

In [19]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [21]:
# Candidate values for tuning the random forest.
# NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as the same
# Shannon information gain criterion — confirm whether both are needed.
hyperparameters = {
    "n_estimators": list(range(100, 1000)),            # 100 through 999 trees
    "criterion": ['gini', 'entropy', 'log_loss'],      # split quality measures
    "max_depth": list(range(5, 100)),                  # depth limits 5 through 99
    "max_features": list(range(3, 13)),                # 3 through 12 features per split
}


In [None]:
# Tune the forest by sampling from `hyperparameters` instead of trying every
# combination. The full grid is 900 * 3 * 95 * 10 = 2,565,000 candidates, each
# fitted 5 times by cross-validation, which cannot finish in practice.
# RandomizedSearchCV evaluates a fixed budget of n_iter sampled candidates and
# exposes the same best_params_ / best_score_ attributes, so `gs` is used the
# same way afterwards. Both seeds are fixed so the search is repeatable.
gs = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=hyperparameters,
    n_iter=20,        # raise for a more thorough (and slower) search
    cv=5,
    n_jobs=-1,
    random_state=42,
)
gs.fit(x_train, y_train)

In [None]:
# Hyperparameter combination with the best cross-validated score.
gs.best_params_

In [None]:
# Mean cross-validated score achieved by the best combination.
gs.best_score_