### Download data set from here
#### https://www.kaggle.com/dariomandarino/hypothyroid-multi-dataset

In [1]:
!pip install catboost



In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from  xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
from scipy import stats
from sklearn import metrics
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier


import os




# Introduction
### In this notebook I want to study thyroid problems and I want to train and prepare some classifiers that could recognize any kind of thyroid problem in a patient.<br>To do this, I will use six datasets I got from -> https://archive.ics.uci.edu/ml/datasets/Thyroid+Disease:
* allhyperTestEDIT and allhyperTrainEDIT present the classes 'hyperthyroid', 'T3 toxic', 'goitre', 'secondary toxic' and 'negative'
* allhypoDATA and allhypoTEST present classes 'hypothyroid','primary hypothyroid','compensated hypothyroid','secondary hypothyroid' and 'negative'
* hypothyroid present classes 'hypothyroid' and 'negative'
* sick-euthyroid present classes 'sick-euthyroid' and 'negative'
* thyroid0387 present classes hyperthyroid conditions (A, B, C, D), hypothyroid conditions (E, F, G, H), binding protein (I, J), general health (K), replacement therapy (L, M, N), discordant results (R) 
* ann-test and ann-train present classes normal (not hypothyroid), hyperfunction and subnormal functioning

### <br> You can find the full documentation at the link above. I want to build a single dataset by merging the six above, with only three classes: hypothyroid, hyperthyroid and negative. Once this is done, I will go on with the data pre-processing and then train and test the classifiers.

# Part 1: Data Integration
### I have to integrate these six different datasets. I will start from the 'all' series because they have the same scheme. 

In [3]:
allHyperTest = pd.read_csv("allhyperTestEDIT.CSV")
allHyperTrain = pd.read_csv("allhyperTrainEDIT.CSV")
allHypoTest = pd.read_csv("allhypoTEST.csv")
allHypoTrain = pd.read_csv("allhypoDATA.CSV")

display(allHypoTest.head(10))
display(allHypoTrain.dtypes)

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Target,ID
0,35,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,?,f,?,f,?,f,?,f,?,f,?,other,negative,219
1,63,M,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,3.5,t,2.5,t,108,t,0.96,t,113,f,?,SVI,negative,2059
2,25,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,4.6,t,2.4,t,61,t,0.82,t,75,f,?,SVHD,negative,399
3,53,F,f,f,f,f,f,f,f,t,f,f,f,f,f,f,t,0.25,t,2.1,t,145,t,1.03,t,141,f,?,other,negative,1911
4,92,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.7,t,1.3,t,120,t,0.84,t,143,f,?,SVI,negative,487
5,67,M,f,f,f,f,f,f,f,t,f,f,f,f,f,f,t,0.81,f,?,t,84,t,0.83,t,101,f,?,other,negative,1234
6,60,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,1.2,t,2.6,t,117,t,1.31,t,90,f,?,other,negative,1113
7,60,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,27,t,1.8,t,65,t,0.99,t,66,f,?,SVI,compensated_hypothyroid,1344
8,48,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,2.8,f,?,t,112,t,0.92,t,121,f,?,other,negative,2758
9,27,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,2.6,t,2.2,t,94,t,0.89,t,106,f,?,SVI,negative,3230


age                          object
sex                          object
on_thyroxine                 object
query_on_thyroxine           object
on_antithyroid_medication    object
sick                         object
pregnant                     object
thyroid_surgery              object
I131_treatment               object
query_hypothyroid            object
query_hyperthyroid           object
lithium                      object
goitre                       object
tumor                        object
hypopituitary                object
psych                        object
TSH_measured                 object
TSH                          object
T3_measured                  object
T3                           object
TT4_measured                 object
TT4                          object
T4U_measured                 object
T4U                          object
FTI_measured                 object
FTI                          object
TBG_measured                 object
TBG                         

### ID is an identifier, so I have to check whether any instances share the same value for this attribute:

In [4]:
def handleDuplicated(df):
    """Report duplicated ``ID`` values in ``df`` and drop them when they are rare.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``ID`` column. It is modified in place when duplicates
        are dropped.

    Behaviour
    ---------
    * no duplicated ``ID``: only a message is printed;
    * duplicated rows are fewer than 1% of ``len(df)``: every row whose ``ID``
      was already seen is removed from ``df`` (first occurrence kept);
    * otherwise: nothing is removed and the index labels of the duplicated
      rows are printed.

    Returns
    -------
    None
    """
    duplicated_mask = df["ID"].duplicated()
    n_duplicated = int(duplicated_mask.sum())

    if n_duplicated == 0:
        print("There aren't duplicates")
    elif n_duplicated < len(df) / 100:
        # Drop whole rows of the caller's frame; dropping on df["ID"] alone
        # would only touch the extracted column and leave df unchanged.
        df.drop_duplicates(subset="ID", keep="first", inplace=True)
        print("duplicates were less than the 1% of all the data, they have been dropped")
    else:
        # Select the labels of the duplicated rows only (the mask's own
        # .index would be the complete index of df).
        index_duplicated = df.index[duplicated_mask]
        print("duplicates are more than the 1% of all the data, they have been preserved")
        print(index_duplicated)

handleDuplicated(allHyperTest)
handleDuplicated(allHyperTrain)
handleDuplicated(allHypoTest)
handleDuplicated(allHypoTrain)

There aren't duplicates
There aren't duplicates
There aren't duplicates
There aren't duplicates


 ### Now it's possible to drop the ID attribute because it's useless for the classification:

In [5]:
del allHyperTest["ID"]
del allHyperTrain["ID"]
del allHypoTest["ID"]
del allHypoTrain["ID"]

### From these four datasets I will keep all the instances whose class is different from 'negative':

In [6]:
def notCorrect_TargetFilter(df,correct_Target,target):
    """Keep the rows whose ``Target`` is in ``correct_Target`` and relabel them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Target`` column. It is not modified.
    correct_Target : list
        Labels to keep.
    target : str
        Single label written into ``Target`` for every kept row.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows with ``Target`` set to
        ``target``; all other columns are left untouched.
    """
    # .copy() makes the result independent of df, so the assignment below
    # does not trigger SettingWithCopyWarning.
    selected = df.loc[df["Target"].isin(correct_Target)].copy()
    # Every kept row has a label from correct_Target, so relabelling is a plain
    # assignment restricted to the Target column.
    selected["Target"] = target
    return selected
    
allHyperTest = notCorrect_TargetFilter(allHyperTest,["hyperthyroid","T3_toxic","goitre","secondary_toxic"],"hyperthyroid")
allHyperTrain = notCorrect_TargetFilter(allHyperTrain,["hyperthyroid","T3_toxic","goitre","secondary_toxic"],"hyperthyroid")
allHypoTest = notCorrect_TargetFilter(allHypoTest,["hypothyroid", "primary_hypothyroid", "compensated_hypothyroid", "secondary_hypothyroid"],"hypothyroid")
allHypoTrain = notCorrect_TargetFilter(allHypoTrain,["hypothyroid", "primary_hypothyroid", "compensated_hypothyroid", "secondary_hypothyroid"],"hypothyroid")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


### Now I will merge the four datasets:

In [7]:
allDataset = pd.concat([allHyperTest,allHyperTrain,allHypoTest,allHypoTrain], ignore_index = True)
display(allDataset.shape)

(393, 30)

### That's all for the 'all' series. Let's go on with thyroid0387:

In [8]:
thyroid0387 = pd.read_csv("thyroid0387EDIT.CSV")
display(thyroid0387.head(10))
display(thyroid0387.dtypes)

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Target,ID
0,29,F,f,f,f,f,f,f,f,t,f,f,f,f,f,f,t,0.3,f,?,f,?,f,?,f,?,f,?,other,-,840801013
1,29,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,1.6,t,1.9,t,128,f,?,f,?,f,?,other,-,840801014
2,41,F,f,f,f,f,f,f,f,f,t,f,f,f,f,f,f,?,f,?,f,?,f,?,f,?,t,11,other,-,840801042
3,36,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,?,f,?,f,?,f,?,f,?,t,26,other,-,840803046
4,32,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,?,f,?,f,?,f,?,f,?,t,36,other,S,840803047
5,60,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,?,f,?,f,?,f,?,f,?,t,26,other,-,840803048
6,77,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,f,?,f,?,f,?,f,?,f,?,t,21,other,-,840803068
7,28,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,0.7,t,2.6,t,116,f,?,f,?,f,?,SVI,-,840807019
8,28,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,1.2,t,1.8,t,76,f,?,f,?,f,?,other,-,840808060
9,28,F,f,f,f,f,f,f,f,f,f,f,f,f,f,f,t,1.9,t,1.7,t,83,f,?,f,?,f,?,other,-,840808073


age                           int64
sex                          object
on_thyroxine                 object
query_on_thyroxine           object
on_antithyroid_medication    object
sick                         object
pregnant                     object
thyroid_surgery              object
I131_treatment               object
query_hypothyroid            object
query_hyperthyroid           object
lithium                      object
goitre                       object
tumor                        object
hypopituitary                object
psych                        object
TSH_measured                 object
TSH                          object
T3_measured                  object
T3                           object
TT4_measured                 object
TT4                          object
T4U_measured                 object
T4U                          object
FTI_measured                 object
FTI                          object
TBG_measured                 object
TBG                         

### We have the ID attribute here too, so:

In [9]:
handleDuplicated(thyroid0387)

There aren't duplicates


In [10]:
del thyroid0387["ID"]

### This dataset has several classes of interest: A, B, C, D, E, F, G, H. All the others should be considered 'negative'. I have to be careful because 'F' and 'M' are used in the 'sex' attribute too, so before any substitution I have to handle this problem:

In [11]:
# Encode sex numerically (F -> 1, M -> 0); any other value becomes NaN.
thyroid0387['sex'] = thyroid0387['sex'].map({'F': 1, 'M': 0})

# Collapse the diagnosis codes into the three classes used by the notebook:
# A-D -> hyperthyroid, E-H -> hypothyroid, every other label -> negative.
# The mapping is applied to the Target column only, so identical strings in
# other columns are never rewritten and a single pass replaces the per-label loop.
thyroid0387_label_map = {code: 'hyperthyroid' for code in 'ABCD'}
thyroid0387_label_map.update({code: 'hypothyroid' for code in 'EFGH'})
thyroid0387['Target'] = (
    thyroid0387['Target']
    .map(thyroid0387_label_map)   # labels outside A-H become NaN ...
    .fillna('negative')           # ... and are then marked negative
)

### Let's continue with the 'hypothyroid' dataset:

In [12]:
hypothyroid = pd.read_csv("hypothyroid.csv")
display(hypothyroid.shape)
display(hypothyroid.head(10))
display(hypothyroid.dtypes)

(3163, 26)

Unnamed: 0.1,Unnamed: 0,Age,Sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,sick,tumor,lithium,goitre,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG
0,hypothyroid,72,M,f,f,f,f,f,f,f,f,f,f,f,y,30.0,y,0.6,y,15.0,y,1.48,y,10.0,n,?
1,hypothyroid,15,F,t,f,f,f,f,f,f,f,f,f,f,y,145.0,y,1.7,y,19.0,y,1.13,y,17.0,n,?
2,hypothyroid,24,M,f,f,f,f,f,f,f,f,f,f,f,y,0.0,y,0.2,y,4.0,y,1.0,y,0.0,n,?
3,hypothyroid,24,F,f,f,f,f,f,f,f,f,f,f,f,y,430.0,y,0.4,y,6.0,y,1.04,y,6.0,n,?
4,hypothyroid,77,M,f,f,f,f,f,f,f,f,f,f,f,y,7.3,y,1.2,y,57.0,y,1.28,y,44.0,n,?
5,hypothyroid,85,F,f,f,f,f,t,f,f,f,f,f,f,y,138.0,y,1.1,y,27.0,y,1.19,y,23.0,n,?
6,hypothyroid,64,F,f,f,f,t,f,f,f,f,f,f,f,y,7.7,y,1.3,y,54.0,y,0.86,y,63.0,n,?
7,hypothyroid,72,F,f,f,f,f,f,f,f,f,f,f,f,y,21.0,y,1.9,y,34.0,y,1.05,y,32.0,n,?
8,hypothyroid,20,F,f,f,f,f,t,f,f,f,f,f,f,y,92.0,n,?,y,39.0,y,1.21,y,32.0,n,?
9,hypothyroid,42,F,f,f,f,f,f,f,f,f,f,f,f,y,48.0,n,?,y,7.6,y,1.02,y,7.5,n,?


Unnamed: 0                   object
Age                          object
Sex                          object
on_thyroxine                 object
query_on_thyroxine           object
on_antithyroid_medication    object
thyroid_surgery              object
query_hypothyroid            object
query_hyperthyroid           object
pregnant                     object
sick                         object
tumor                        object
lithium                      object
goitre                       object
TSH_measured                 object
TSH                          object
T3_measured                  object
T3                           object
TT4_measured                 object
TT4                          object
T4U_measured                 object
T4U                          object
FTI_measured                 object
FTI                          object
TBG_measured                 object
TBG                          object
dtype: object

### The 'Unnamed' attribute indicates the class of the instance, so I have to rename it. Then I will keep only the 'hypothyroid' class instances. This dataset does not have the 'I131_treatment', 'hypopituitary', 'psych' and 'referral_source' attributes.

In [13]:
# Name the first three columns (class label, age, sex) like the other datasets,
# then keep only the rows labelled 'hypothyroid'.
hypothyroid = (
    hypothyroid
    .rename(columns={
        hypothyroid.columns[0]: "Target",
        hypothyroid.columns[1]: "age",
        hypothyroid.columns[2]: "sex",
    })
    .loc[lambda frame: frame["Target"].isin(['hypothyroid'])]
)

### From 'sick-euthyroid' I keep only the 'negative' instances:

In [14]:
sick_euthyroid = pd.read_csv("sick-euthyroid.CSV")
display(sick_euthyroid.shape)
display(sick_euthyroid.head(10))
display(sick_euthyroid.dtypes)

(3163, 26)

Unnamed: 0,Target,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,sick,tumor,lithium,goitre,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG
0,sick-euthyroid,72,M,f,f,f,f,f,f,f,f,f,f,f,n,?,y,1.0,y,83,y,0.95,y,87,n,?
1,sick-euthyroid,45,F,f,f,f,f,f,f,f,f,f,f,f,y,1.90,y,1.0,y,82,y,0.73,y,112,n,?
2,sick-euthyroid,64,F,f,f,f,f,f,f,f,t,f,f,f,y,0.09,y,1.0,y,101,y,0.82,y,123,n,?
3,sick-euthyroid,56,M,f,f,f,f,f,f,f,f,f,f,f,y,0,y,0.8,y,76,y,0.77,y,99,n,?
4,sick-euthyroid,78,F,t,f,f,f,t,f,f,f,f,f,f,y,2.60,y,0.3,y,87,y,0.95,y,91,n,?
5,sick-euthyroid,80,M,f,f,f,f,f,f,f,f,f,f,f,y,1.40,y,0.8,y,105,y,0.88,y,120,n,?
6,sick-euthyroid,74,F,f,f,f,f,f,f,f,f,f,f,f,y,0,y,0.7,y,98,y,0.81,y,121,n,?
7,sick-euthyroid,?,F,f,f,f,f,f,f,f,f,f,f,f,y,1.40,y,1.1,y,121,y,1.11,y,109,n,?
8,sick-euthyroid,42,F,f,f,f,f,f,f,f,f,f,f,f,y,2.30,y,1.1,y,93,y,0.73,y,127,n,?
9,sick-euthyroid,89,M,f,f,f,f,f,f,f,f,f,f,f,y,0.80,y,0.8,y,111,y,0.68,y,165,n,?


Target                       object
age                          object
sex                          object
on_thyroxine                 object
query_on_thyroxine           object
on_antithyroid_medication    object
thyroid_surgery              object
query_hypothyroid            object
query_hyperthyroid           object
pregnant                     object
sick                         object
tumor                        object
lithium                      object
goitre                       object
TSH_measured                 object
TSH                          object
T3_measured                  object
T3                           object
TT4_measured                 object
TT4                          object
T4U_measured                 object
T4U                          object
FTI_measured                 object
FTI                          object
TBG_measured                 object
TBG                          object
dtype: object

### For this dataset I don't have 'I131_treatment', 'hypopituitary', 'psych' and 'referral_source' attributes.

In [15]:
sick_euthyroid = sick_euthyroid[sick_euthyroid.Target.isin(['negative'])]
display(sick_euthyroid.shape)

(2870, 26)

### Now it's time to work on the "ann" series:

In [16]:
ann_train = pd.read_csv("ann-train.CSV")
ann_test = pd.read_csv("ann-test.CSV")
display(ann_test.head(10))
display(ann_test.dtypes)

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,lithium,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,Target
0,0.29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0061,0.028,0.111,0.131,0.085,2
1,0.32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0013,0.019,0.084,0.078,0.107,3
2,0.35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.031,0.239,0.1,0.239,3
3,0.21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.001,0.018,0.087,0.088,0.099,3
4,0.22,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.0004,0.022,0.134,0.135,0.099,3
5,0.22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0016,0.02,0.123,0.113,0.109,3
6,0.39,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0016,0.036,0.133,0.144,0.093,3
7,0.77,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00081,0.02,0.08,0.096,0.08316,3
8,0.23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00025,0.014,0.113,0.096,0.11746,3
9,0.23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0026,0.011,0.104,0.104,0.099,3


age                          float64
sex                            int64
on_thyroxine                   int64
query_on_thyroxine             int64
on_antithyroid_medication      int64
sick                           int64
pregnant                       int64
thyroid_surgery                int64
I131_treatment                 int64
query_hypothyroid              int64
query_hyperthyroid             int64
lithium                        int64
goitre                         int64
tumor                          int64
hypopituitary                  int64
psych                          int64
TSH                          float64
T3                           float64
TT4                          float64
T4U                          float64
FTI                          float64
Target                         int64
dtype: object

### The 'ann' series lacks the 'measured' attributes as well as 'TBG' and 'referral_source'. I will create the 'measured' attributes based on the other columns.

In [17]:
target1 = pd.Series(ann_test[ann_test.columns[-1]].values)
display(target1.value_counts())
target2 = pd.Series(ann_train[ann_train.columns[-1]].values)
display(target2.value_counts())

3    3178
2     177
1      73
dtype: int64

3    3488
2     191
1      93
dtype: int64

### Looking at the distribution of the values for the 'Target' attribute, we can infer that:
* 3 is referring to the 'negative' class
* 2 is referring to the 'hypothyroid' class
* 1 is referring to the 'hyperthyroid' class

### I should analyze the distribution of the 'sex' attribute in the other datasets to understand how I should treat it in the 'ann' series:

In [18]:
print("Sex thyroid0387 1=F,0=M:")
sex_series1 = pd.Series(thyroid0387[thyroid0387.columns[1]].values)
display(sex_series1.value_counts())
print("Sick-euthyroid:")
sex_series2 = pd.Series(sick_euthyroid[sick_euthyroid.columns[2]].values)
display(sex_series2.value_counts())

Sex thyroid0387 1=F,0=M:


1.0    6073
0.0    2792
dtype: int64

Sick-euthyroid:


F    2003
M     800
?      67
dtype: int64

### So, there are more female than male patients in these datasets. Looking at the "ann" series I got:

In [19]:
sex1 = pd.Series(ann_test[ann_test.columns[1]].values)
display(sex1.value_counts())
sex2 = pd.Series(ann_train[ann_train.columns[1]].values)
display(sex2.value_counts())

0    2380
1    1048
dtype: int64

0    2629
1    1143
dtype: int64

### I can assume that '0' refers to female patients and '1' refers to male patients. Another important thing to do is to multiply all the continuous numerical attributes by 100 and to add the 'measured' attributes.

In [20]:
for column in ann_train.columns:
    listOfValues=set(ann_train[column])
    print(column,": ",listOfValues)

age :  {0.73, 0.24, 0.47, 0.69, 0.48, 0.76, 0.5, 0.25, 0.26, 0.51, 0.75, 0.23, 0.22, 0.54, 0.79, 0.21, 0.63, 0.38, 0.13, 0.88, 0.72, 0.42, 0.18, 0.43, 0.44, 0.19, 0.2, 0.45, 0.46, 0.41, 0.94, 0.15, 0.16, 0.17, 0.55, 0.8, 0.39, 0.89, 0.14, 0.515, 0.57, 0.82, 0.64, 0.85, 0.67, 0.59, 0.58, 0.84, 0.92, 0.68, 0.35, 0.6, 0.93, 0.11, 0.77, 0.27, 0.52, 0.91, 0.61, 0.86, 0.36, 0.7, 0.1, 0.08, 0.56, 0.81, 0.65, 0.4, 0.9, 0.49, 0.74, 0.83, 0.34, 0.29, 0.3, 0.31, 0.32, 0.33, 0.07, 0.06, 0.05, 0.04, 0.53, 0.78, 0.28, 0.62, 0.37, 0.12, 0.87, 0.71, 0.02, 0.01, 0.66}
sex :  {0, 1}
on_thyroxine :  {0, 1}
query_on_thyroxine :  {0, 1}
on_antithyroid_medication :  {0, 1}
sick :  {0, 1}
pregnant :  {0, 1}
thyroid_surgery :  {0, 1}
I131_treatment :  {0, 1}
query_hypothyroid :  {0, 1}
query_hyperthyroid :  {0, 1}
lithium :  {0, 1}
goitre :  {0, 1}
tumor :  {0, 1}
hypopituitary :  {0, 1}
psych :  {0, 1}
TSH :  {0.139, 0.0, 0.46799999999999997, 0.14300000000000002, 0.478, 0.00061, 0.53, 0.11699999999999999, 0.

In [21]:
# Stack the two 'ann' files and decode the integer codes for sex and class.
ann = pd.concat([ann_train, ann_test], ignore_index=True)
ann = ann.assign(
    sex=ann['sex'].map({0:'F',1:'M'}),
    Target=ann['Target'].map({3:'negative',2:'hypothyroid',1:'hyperthyroid'}),
)

# Rescale the fractional columns by 100 in one vectorised step.
# NOTE(review): the same x100 factor is applied to age and to every lab value;
# confirm it matches the units used by the other source files.
continuos_attributes = ['age','TSH','T3','TT4','T4U','FTI']
ann[continuos_attributes] = ann[continuos_attributes] * 100

def fillNewAttributes(row,attribute):
    """Return 'y' when ``row[attribute]`` is strictly greater than 0, else 'n'.

    ``row`` is anything indexable by ``attribute`` (here a DataFrame row).
    """
    return 'y' if row[attribute] > 0 else 'n'

ann['TSH_measured'] = ann.apply(lambda row: fillNewAttributes(row,'TSH'), axis=1)
ann['T3_measured'] = ann.apply(lambda row: fillNewAttributes(row,'T3'), axis=1)
ann['TT4_measured'] = ann.apply(lambda row: fillNewAttributes(row,'TT4'), axis=1)
ann['T4U_measured'] = ann.apply(lambda row: fillNewAttributes(row,'T4U'), axis=1)
ann['FTI_measured'] = ann.apply(lambda row: fillNewAttributes(row,'FTI'), axis=1)
display(ann.dtypes)

age                          float64
sex                           object
on_thyroxine                   int64
query_on_thyroxine             int64
on_antithyroid_medication      int64
sick                           int64
pregnant                       int64
thyroid_surgery                int64
I131_treatment                 int64
query_hypothyroid              int64
query_hyperthyroid             int64
lithium                        int64
goitre                         int64
tumor                          int64
hypopituitary                  int64
psych                          int64
TSH                          float64
T3                           float64
TT4                          float64
T4U                          float64
FTI                          float64
Target                        object
TSH_measured                  object
T3_measured                   object
TT4_measured                  object
T4U_measured                  object
FTI_measured                  object
d

### Now I can merge all the datasets in one:

In [22]:
data = pd.concat([allDataset,thyroid0387,hypothyroid,sick_euthyroid,ann], ignore_index = True)
display(data.shape)
display(data.dtypes)

(19786, 30)

age                          object
sex                          object
on_thyroxine                 object
query_on_thyroxine           object
on_antithyroid_medication    object
sick                         object
pregnant                     object
thyroid_surgery              object
I131_treatment               object
query_hypothyroid            object
query_hyperthyroid           object
lithium                      object
goitre                       object
tumor                        object
hypopituitary                object
psych                        object
TSH_measured                 object
TSH                          object
T3_measured                  object
T3                           object
TT4_measured                 object
TT4                          object
T4U_measured                 object
T4U                          object
FTI_measured                 object
FTI                          object
TBG_measured                 object
TBG                         

# Part 2: Data pre processing
### I will start the data pre processing observing the set of possible values for each attribute:

In [23]:
for column in data.columns:
    listOfValues=set(data[column])
    print(column,": ",listOfValues)

age :  {1, 2, 3, '23', 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, '36', 21, 20, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, '60', 51, 53, 54, 55, 56, 52, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, '87', 77, 79, 80, '51', 78, 83, '75', '18', 85, 82, 88, 86, 90, 84, 92, 89, 87, 93, 91, 97, 94, 95, '78', 7.000000000000001, '14', 4, '2', 22, '50', '84', '27', '30', '43', '68', '33', '45', '71', '53', '9', '69', '12', '7', '85', '5', '54', '73', '72', '82', '55', '61', '13', '17', 14.000000000000002, '49', '21', '19', '32', '35', '47', '8', '83', '10', '39', '81', '98', 51.5, '80', 52.190000000000005, '64', 55.00000000000001, '63', '46', '42', '11', 56.99999999999999, 56.00000000000001, '59', 57, 57.99999999999999, '6', '15', '34', '25', '48', '62', '20', '90', '77', '28', '44', '26', '31', '57', '22', '76', 28.000000000000004, '40', 28.999999999999996, '65', '79', 81, '1', '56',

### Sometimes '?' has been used instead of 'nan', so before counting how many NaNs are present, I need to do a substitution:

In [24]:
# '?' marks a missing value in the source files; turn it into a real NaN so
# pandas can count and handle it. np.nan is the documented spelling; the
# upper-case alias is not guaranteed to exist in NumPy >= 2.0.
data = data.replace({"?": np.nan})
data.isna().sum()

age                            409
sex                            394
on_thyroxine                     0
query_on_thyroxine               0
on_antithyroid_medication        0
sick                             0
pregnant                         0
thyroid_surgery                  0
I131_treatment                3021
query_hypothyroid                0
query_hyperthyroid               0
lithium                          0
goitre                           0
tumor                            0
hypopituitary                 3021
psych                         3021
TSH_measured                     0
TSH                           1321
T3_measured                      0
T3                            3372
TT4_measured                     0
TT4                            696
T4U_measured                     0
T4U                           1083
FTI_measured                     0
FTI                           1075
TBG_measured                  7200
TBG                          19174
referral_source     

### The 'TBG', 'referral_source' and 'TBG_measured' attributes have too many nan values, I have to drop them. Let's try to drop the 'sex' attribute too:

In [25]:
del data['TBG']
del data['referral_source']
del data['TBG_measured']
del data['sex']

### Nine attributes can contain NaN, so a row can have at most nine NaN values. I will drop all the rows with more than six NaN values (fewer than 20 non-null values out of 26 columns) because they carry too little information for the classification:

In [26]:
data.dropna(axis = 0, thresh = 20, inplace = True)
data.isna().sum()

age                           380
on_thyroxine                    0
query_on_thyroxine              0
on_antithyroid_medication       0
sick                            0
pregnant                        0
thyroid_surgery                 0
I131_treatment               2773
query_hypothyroid               0
query_hyperthyroid              0
lithium                         0
goitre                          0
tumor                           0
hypopituitary                2773
psych                        2773
TSH_measured                    0
TSH                          1074
T3_measured                     0
T3                           3125
TT4_measured                    0
TT4                           448
T4U_measured                    0
T4U                           835
FTI_measured                    0
FTI                           828
Target                          0
dtype: int64

### For the classification it is important that the dataset has only numerical attributes, so I have to encode the categorical values as numerical values:

In [27]:
data = data.replace({"t":1,"f":0, "y":1, "n":0, "hypothyroid":1, "negative":0,"hyperthyroid":2, "F":1, "M":0})
display(data.dtypes)

age                           object
on_thyroxine                   int64
query_on_thyroxine             int64
on_antithyroid_medication      int64
sick                           int64
pregnant                       int64
thyroid_surgery                int64
I131_treatment               float64
query_hypothyroid              int64
query_hyperthyroid             int64
lithium                        int64
goitre                         int64
tumor                          int64
hypopituitary                float64
psych                        float64
TSH_measured                   int64
TSH                           object
T3_measured                    int64
T3                            object
TT4_measured                   int64
TT4                           object
T4U_measured                   int64
T4U                           object
FTI_measured                   int64
FTI                           object
Target                         int64
dtype: object

In [28]:
cols = data.columns[data.dtypes.eq('object')]
data[cols] = data[cols].apply(pd.to_numeric, errors='coerce')
display(data.dtypes)

age                          float64
on_thyroxine                   int64
query_on_thyroxine             int64
on_antithyroid_medication      int64
sick                           int64
pregnant                       int64
thyroid_surgery                int64
I131_treatment               float64
query_hypothyroid              int64
query_hyperthyroid             int64
lithium                        int64
goitre                         int64
tumor                          int64
hypopituitary                float64
psych                        float64
TSH_measured                   int64
TSH                          float64
T3_measured                    int64
T3                           float64
TT4_measured                   int64
TT4                          float64
T4U_measured                   int64
T4U                          float64
FTI_measured                   int64
FTI                          float64
Target                         int64
dtype: object

In [45]:
data.to_csv('total_dataset.csv')

# Part 3: training of the classifiers
### Before the training starts, I have to find the attributes most related to the target:

In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19538 entries, 0 to 19785
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        19158 non-null  float64
 1   on_thyroxine               19538 non-null  int64  
 2   query_on_thyroxine         19538 non-null  int64  
 3   on_antithyroid_medication  19538 non-null  int64  
 4   sick                       19538 non-null  int64  
 5   pregnant                   19538 non-null  int64  
 6   thyroid_surgery            19538 non-null  int64  
 7   I131_treatment             16765 non-null  float64
 8   query_hypothyroid          19538 non-null  int64  
 9   query_hyperthyroid         19538 non-null  int64  
 10  lithium                    19538 non-null  int64  
 11  goitre                     19538 non-null  int64  
 12  tumor                      19538 non-null  int64  
 13  hypopituitary              16765 non-null  flo

In [30]:
# Absolute correlation of every attribute with the class label, keeping only
# the attributes whose value exceeds 0.04.
corr_values = data.corr()['Target'].abs().drop('Target')
corr_values = corr_values[corr_values > 0.04]
display(corr_values)

on_thyroxine          0.082979
query_hypothyroid     0.050792
query_hyperthyroid    0.077351
psych                 0.050080
TSH                   0.252200
FTI                   0.046015
Name: Target, dtype: float64

### Another thing that I have to do is to divide the dataset into two sets: the training set and the testing set.

In [46]:
corr_values.index

Index(['on_thyroxine', 'query_hypothyroid', 'query_hyperthyroid', 'psych',
       'TSH', 'FTI'],
      dtype='object')

In [47]:
def holdout(dataframe, features=None, test_size=0.30, random_state=42):
    """Split ``dataframe`` into training and testing features/targets.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``Target`` column and every column listed in ``features``.
    features : list of str, optional
        Predictor columns. When omitted, the notebook's original selection is
        used: ``age`` plus the six columns retained by the correlation filter.
    test_size : float, default 0.30
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    if features is None:
        features = ['age', 'on_thyroxine', 'query_hypothyroid',
                    'query_hyperthyroid', 'psych', 'TSH', 'FTI']
    x = dataframe[list(features)]
    y = dataframe['Target']
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

# 70/30 split of the raw `data` frame.
# NOTE(review): `data` still contains NaNs (see data.info()), and the later
# cells visible here train on a re-split of the interpolated `data1` instead -
# confirm whether this split is still needed.
X_train, X_test, y_train, y_test = holdout(data)

### Now I will define the classifiers that I'm going to use. Not all of them are friendly with NaN values, so I first fill the missing values by interpolation:

In [32]:
# Fill the missing values with a spline interpolation of order 3 (cubic),
# presumably so that estimators which reject NaN input can be trained.
# NOTE(review): this also interpolates columns that appear to be 0/1 flags but
# contain NaNs (e.g. I131_treatment, hypopituitary), which may produce
# non-binary values - confirm this is intended.
data1 = data.interpolate(method = 'spline', order = 3)
# Sanity check: remaining NaN count per column.
display(data1.isna().sum())

age                          0
on_thyroxine                 0
query_on_thyroxine           0
on_antithyroid_medication    0
sick                         0
pregnant                     0
thyroid_surgery              0
I131_treatment               0
query_hypothyroid            0
query_hyperthyroid           0
lithium                      0
goitre                       0
tumor                        0
hypopituitary                0
psych                        0
TSH_measured                 0
TSH                          0
T3_measured                  0
T3                           0
TT4_measured                 0
TT4                          0
T4U_measured                 0
T4U                          0
FTI_measured                 0
FTI                          0
Target                       0
dtype: int64

In [33]:
# Candidate models keyed by the display name used in the results table.
# `classification` iterates this dict, so the insertion order below is the
# order in which the models are fitted and reported.
classifiers1 = {
    "XGBClassifier": XGBClassifier(learning_rate=0.01),
    "CatBoostClassifier": CatBoostClassifier(max_depth=4, verbose=0),
    "Nearest Neighbors": KNeighborsClassifier(n_neighbors=4),
    # Seeded like the other scikit-learn tree models: without a fixed
    # random_state the tree can differ between runs, because features are
    # permuted at every split and ties between equally good splits are
    # broken at random.
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=1),
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=1),
    "ExtraTrees": ExtraTreesClassifier(class_weight='balanced', random_state=1),
    "MLPClassifier": MLPClassifier(hidden_layer_sizes=(256, 128, 64, 32),
                                   activation="relu", random_state=1),
}

In [35]:
def classification(classifiers, X_train, X_test, y_train, y_test):
    """Fit each classifier, show its confusion matrix and tabulate its scores.

    Parameters
    ----------
    classifiers : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels used for prediction and scoring.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with the columns ``Classifier``, ``Accuracy``,
        ``Precision`` and ``Recall``, indexed by ``FScore`` and sorted by
        descending ``FScore``. Precision, recall and F-score are
        macro-averaged; every score is rounded to 4 decimals.
    """
    columns = ["Classifier", "Accuracy", "Precision", "Recall", "FScore"]

    # Collect the rows in a plain list and build the frame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    records = []
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        pr, rc, fs, _ = metrics.precision_recall_fscore_support(
            y_test, y_pred, average='macro'
        )
        records.append({
            "Classifier": name,
            "Accuracy": round(metrics.accuracy_score(y_test, y_pred), 4),
            "Precision": round(pr, 4),
            "Recall": round(rc, 4),
            "FScore": round(fs, 4),
        })
        print("Confusion matrix for: ", name)
        display(confusion_matrix(y_test, y_pred))

    # Passing `columns` keeps the expected layout even when `classifiers` is empty.
    res = pd.DataFrame(records, columns=columns)
    return res.set_index("FScore").sort_values(by="FScore", ascending=False)

In [48]:
# Same correlation filter as before, now on the interpolated frame: absolute
# correlation with `Target`, target entry removed, threshold 0.04.
corr_values = (
    data1.corr()['Target']
    .abs()
    .drop('Target')
    .loc[lambda scores: scores > 0.04]
)
display(corr_values)

# Split the interpolated data, then fit and score every candidate model.
X_train1, X_test1, y_train1, y_test1 = holdout(data1)

display(
    classification(classifiers1, X_train1, X_test1, y_train1, y_test1)
)

on_thyroxine          0.082979
query_hypothyroid     0.050792
query_hyperthyroid    0.077351
psych                 0.048771
TSH                   0.219327
FTI                   0.040752
Name: Target, dtype: float64

Confusion matrix for:  XGBClassifier


array([[5167,  101,   11],
       [ 123,  309,    1],
       [  89,    9,   52]])

Confusion matrix for:  CatBoostClassifier


array([[5167,   88,   24],
       [  25,  408,    0],
       [  45,    1,  104]])

Confusion matrix for:  Nearest Neighbors


array([[5176,   97,    6],
       [ 293,  139,    1],
       [ 100,    6,   44]])

Confusion matrix for:  Decision Tree


array([[5160,   76,   43],
       [  63,  368,    2],
       [  42,    1,  107]])

Confusion matrix for:  Random Forest


array([[5170,   85,   24],
       [  42,  391,    0],
       [  47,    2,  101]])

Confusion matrix for:  ExtraTrees


array([[5175,   79,   25],
       [  96,  337,    0],
       [  55,    5,   90]])

Confusion matrix for:  MLPClassifier


array([[5162,  102,   15],
       [ 217,  213,    3],
       [  97,   19,   34]])

Unnamed: 0_level_0,Classifier,Accuracy,Precision,Recall
FScore,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.8694,CatBoostClassifier,0.9688,0.8734,0.8715
0.8581,Random Forest,0.9659,0.8697,0.8519
0.8419,Decision Tree,0.9613,0.837,0.8469
0.8148,ExtraTrees,0.9556,0.8516,0.7862
0.727,XGBClassifier,0.943,0.8369,0.6797
0.6173,MLPClassifier,0.9227,0.7447,0.5655
0.6013,Nearest Neighbors,0.9142,0.7889,0.5316


In [49]:
# Refit a Random Forest, configured like the "Random Forest" entry of
# `classifiers1`, on the interpolated training split; this is the model that
# gets exported below.
# NOTE(review): CatBoost has the highest FScore in the results table above -
# confirm that Random Forest is the intended export.
model=RandomForestClassifier(class_weight = 'balanced',random_state = 1)
model.fit(X_train1,y_train1)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [50]:
# Hand-built single sample for a quick check of the fitted model. Values follow
# the column order selected in `holdout`:
# age, on_thyroxine, query_hypothyroid, query_hyperthyroid, psych, TSH, FTI
fet = [40, 0, 0, 0, 0.0, 0.003, 0.0]
model.predict([np.array(fet)])

array([0])

In [51]:
import pickle

# Serialise the fitted `model` to model.pkl in the working directory.
# The context manager guarantees the file handle is flushed and closed, and the
# public `pickle` module replaces the private `_pickle` accelerator (which
# `pickle` already uses internally when available).
# Only unpickle this file from a trusted location: loading a pickle can run
# arbitrary code.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Part 4: Final Discussion
### The final dataset that I got is very unbalanced, that's true, but this is normal because only a small percentage of the world population suffers from thyroid disease. Nevertheless, thanks to good preprocessing of the data, I got some very accurate classifiers that have a good FScore too. I could handle the NaN values because the results didn't get much worse, and even with the normalization and the discretization they didn't change that much. After having balanced the normalized dataset, we got the best results of the notebook; this means that the work that had been done before was pretty good. In the future, it would be interesting to continue these studies, hoping to use more data.