# Visualizing And Analysing The Data

### Importing Necessary Libraries

In [273]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Loading Dataset

In [274]:
df=pd.read_csv(r'C:\Users\lenovo\Intenship Project\Dataset.csv')

### Reading dataset

In [275]:
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
0,29,F,f,f,f,f,f,f,f,t,...,,f,,f,,f,,other,-,840801013
1,29,F,f,f,f,f,f,f,f,f,...,128.0,f,,f,,f,,other,-,840801014
2,41,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,11.0,other,-,840801042
3,36,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,26.0,other,-,840803046
4,32,F,f,f,f,f,f,f,f,f,...,,f,,f,,t,36.0,other,S,840803047


In [276]:
df.tail()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,target,patient_id
9167,56,M,f,f,f,f,f,f,f,f,...,64.0,t,0.83,t,77.0,f,,SVI,-,870119022
9168,22,M,f,f,f,f,f,f,f,f,...,91.0,t,0.92,t,99.0,f,,SVI,-,870119023
9169,69,M,f,f,f,f,f,f,f,f,...,113.0,t,1.27,t,89.0,f,,SVI,I,870119025
9170,47,F,f,f,f,f,f,f,f,f,...,75.0,t,0.85,t,88.0,f,,other,-,870119027
9171,31,M,f,f,f,f,f,f,f,t,...,66.0,t,1.02,t,65.0,f,,other,-,870119035


In [277]:
df.shape

(9172, 31)

# Data Preprocessing

### Checking for Null Values

In [278]:
df.isnull().sum()

age                       0
sex                     308
on_thyroxine              5
query_on_thyroxine        2
on_antithyroid_meds       4
sick                      0
pregnant                  1
thyroid_surgery           0
I131_treatment            6
query_hypothyroid         1
query_hyperthyroid        2
lithium                   3
goitre                    2
tumor                     3
hypopituitary             2
psych                     1
TSH_measured              1
TSH                     842
T3_measured               0
T3                     2604
TT4_measured              2
TT4                     442
T4U_measured              4
T4U                     809
FTI_measured              0
FTI                     802
TBG_measured              3
TBG                    8823
referral_source           2
target                    1
patient_id                0
dtype: int64

#### Removing the Reduntant attributes from dataset.

In [279]:
df = df.drop(['patient_id','referral_source','TBG_measured','FTI_measured','TT4_measured','T4U_measured','TSH_measured','T3_measured'],axis=1) 

#### Re-mapping the 'target' values to diagnostic Group

In [280]:
df['target'].unique()

array(['-', 'S', 'F', 'AK', 'R', 'I', 'M', 'N', 'G', 'K', 'A', 'KJ', 'L',
       'MK', 'Q', 'J', 'C|I', 'O', 'LJ', 'H|K', 'D', 'GK', 'MI', 'P',
       'FK', 'B', 'GI', 'C', nan, 'GKJ', 'OI', 'D|R', 'E'], dtype=object)

In [281]:
diagnoses = {
             'A': 'hyperthyroid', 
             'B': 'hyperthyroid', 
             'C': 'hyperthyroid', 
             'D': 'hyperthyroid',
             'E': 'hypothyroid', 
             'F': 'hypothyroid', 
             'G': 'hypothyroid', 
             'H': 'hypothyroid',
             'I':'binding protein',
             'J':'binding protein',
             'K': 'general health', 
             'L': 'replacement therapy', 
             'M': 'replacement therapy',
             'N': 'replacement therapy', 
             'O': 'antithyroid treatment', 
             'P': 'antithyroid treatment',
             'Q': 'antithyroid treatment', 
             'R': 'miscellaneous', 
             'S': 'miscellaneous',
             'T': 'miscellaneous'}

df['target'] = df['target'].map(diagnoses)


#### Dropping the 'target'  of nan values

In [282]:
df.dropna(subset=['target'], inplace=True)

In [283]:
df['target'].value_counts()

hypothyroid              593
general health           436
binding protein          376
replacement therapy      336
miscellaneous            281
hyperthyroid             182
antithyroid treatment     33
Name: target, dtype: int64

### Descriptive Analysis

In [284]:
df.describe()

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,TBG
count,2237.0,2087.0,1643.0,2140.0,2059.0,2060.0,98.0
mean,52.792579,14.930791,1.961875,116.390495,1.013439,120.363369,47.717347
std,19.67745,46.204092,1.452238,60.3516,0.280222,70.996728,32.39875
min,1.0,0.005,0.05,2.0,0.17,1.4,9.299999
25%,36.0,0.255,1.0,76.0,0.85,83.0,32.0
50%,56.0,2.0,1.7,109.0,0.96,109.0,36.0
75%,69.0,8.799999,2.5,156.0,1.12,157.0,46.75
max,95.0,530.0,18.0,600.0,2.33,881.0,200.0


In [285]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2237 entries, 4 to 9169
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  2237 non-null   int64  
 1   sex                  2147 non-null   object 
 2   on_thyroxine         2237 non-null   object 
 3   query_on_thyroxine   2236 non-null   object 
 4   on_antithyroid_meds  2236 non-null   object 
 5   sick                 2237 non-null   object 
 6   pregnant             2236 non-null   object 
 7   thyroid_surgery      2237 non-null   object 
 8   I131_treatment       2235 non-null   object 
 9   query_hypothyroid    2237 non-null   object 
 10  query_hyperthyroid   2237 non-null   object 
 11  lithium              2237 non-null   object 
 12  goitre               2236 non-null   object 
 13  tumor                2237 non-null   object 
 14  hypopituitary        2236 non-null   object 
 15  psych                2237 non-null   o

In [286]:
# Checking the 'age' is there any above 100 and we dropping the age>100.
df[df.age>100]

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,TBG,target


In [287]:
df['age']=np.where((df.age>100),np.nan,df.age)

In [288]:
df

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,TBG,target
4,32.0,F,f,f,f,f,f,f,f,f,...,f,f,f,,,,,,36.0,miscellaneous
18,63.0,F,t,f,f,t,f,f,f,f,...,f,f,f,68.000000,,48.0,1.02,47.0,,hypothyroid
32,41.0,M,f,f,f,f,f,f,f,f,...,f,f,f,0.050000,1.6,39.0,1.00,39.0,,miscellaneous
33,71.0,F,t,f,f,f,f,f,f,f,...,f,f,f,0.050000,,126.0,1.38,91.0,,binding protein
39,55.0,F,t,f,f,f,f,f,f,t,...,f,f,f,9.599999,2.4,136.0,1.48,92.0,,replacement therapy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9153,64.0,M,f,f,f,f,f,f,f,f,...,f,f,f,0.810000,,31.0,0.55,56.0,,general health
9157,60.0,M,f,f,t,f,f,f,f,f,...,f,f,f,0.180000,,28.0,0.87,32.0,,general health
9158,64.0,M,f,f,f,f,f,f,f,t,...,f,f,f,,,44.0,0.53,83.0,,binding protein
9162,36.0,F,f,f,f,f,f,f,f,f,...,f,f,f,,,84.0,1.26,67.0,,binding protein


### Splitting The Data X And Y

In [289]:
x=df.iloc[:,0:-1]
y=df.iloc[:,-1]

In [290]:
x

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,TBG
4,32.0,F,f,f,f,f,f,f,f,f,...,f,f,f,f,,,,,,36.0
18,63.0,F,t,f,f,t,f,f,f,f,...,f,f,f,f,68.000000,,48.0,1.02,47.0,
32,41.0,M,f,f,f,f,f,f,f,f,...,f,f,f,f,0.050000,1.6,39.0,1.00,39.0,
33,71.0,F,t,f,f,f,f,f,f,f,...,f,f,f,f,0.050000,,126.0,1.38,91.0,
39,55.0,F,t,f,f,f,f,f,f,t,...,f,f,f,f,9.599999,2.4,136.0,1.48,92.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9153,64.0,M,f,f,f,f,f,f,f,f,...,f,f,f,f,0.810000,,31.0,0.55,56.0,
9157,60.0,M,f,f,t,f,f,f,f,f,...,f,f,f,f,0.180000,,28.0,0.87,32.0,
9158,64.0,M,f,f,f,f,f,f,f,t,...,f,f,f,f,,,44.0,0.53,83.0,
9162,36.0,F,f,f,f,f,f,f,f,f,...,f,f,f,f,,,84.0,1.26,67.0,


In [291]:
y

4             miscellaneous
18              hypothyroid
32            miscellaneous
33          binding protein
39      replacement therapy
               ...         
9153         general health
9157         general health
9158        binding protein
9162        binding protein
9169        binding protein
Name: target, Length: 2237, dtype: object

#### Making 'F' on where ever we have the 'nan' values on data

In [292]:
x['sex'].unique()

array(['F', 'M', nan], dtype=object)

In [293]:
x['sex'].replace(np.nan,'F',inplace=True)

### Converting The Datatype

In [294]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2237 entries, 4 to 9169
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  2237 non-null   float64
 1   sex                  2237 non-null   object 
 2   on_thyroxine         2237 non-null   object 
 3   query_on_thyroxine   2236 non-null   object 
 4   on_antithyroid_meds  2236 non-null   object 
 5   sick                 2237 non-null   object 
 6   pregnant             2236 non-null   object 
 7   thyroid_surgery      2237 non-null   object 
 8   I131_treatment       2235 non-null   object 
 9   query_hypothyroid    2237 non-null   object 
 10  query_hyperthyroid   2237 non-null   object 
 11  lithium              2237 non-null   object 
 12  goitre               2236 non-null   object 
 13  tumor                2237 non-null   object 
 14  hypopituitary        2236 non-null   object 
 15  psych                2237 non-null   o

###  Handling Categorical Values

In [295]:
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder
oe=OrdinalEncoder(dtype='float64')
x.iloc[:,1:16]=oe.fit_transform(x.iloc[:,1:16])

In [296]:
x

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,TBG
4,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,,,,,36.0
18,63.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,68.000000,,48.0,1.02,47.0,
32,41.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.050000,1.6,39.0,1.00,39.0,
33,71.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.050000,,126.0,1.38,91.0,
39,55.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,9.599999,2.4,136.0,1.48,92.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9153,64.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.810000,,31.0,0.55,56.0,
9157,60.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.180000,,28.0,0.87,32.0,
9158,64.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,,,44.0,0.53,83.0,
9162,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,,84.0,1.26,67.0,


#### Replacing the nan values with zero (0) values.

In [297]:
x.replace(np.nan,'0',inplace=True)

In [298]:
x

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,TBG
4,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,36.0
18,63.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,68.0,0,48.0,1.02,47.0,0
32,41.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.05,1.6,39.0,1.0,39.0,0
33,71.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.05,0,126.0,1.38,91.0,0
39,55.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,9.599999,2.4,136.0,1.48,92.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9153,64.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.81,0,31.0,0.55,56.0,0
9157,60.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.18,0,28.0,0.87,32.0,0
9158,64.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0,0,44.0,0.53,83.0,0
9162,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,84.0,1.26,67.0,0


#### Now, applying Label Encoding on y(Independent variable) value.

In [299]:
label_encoder=LabelEncoder()

In [350]:
y_dt=label_encoder.fit_transform(y)

In [351]:
y=pd.DataFrame(y_dt,columns=['target'])

In [352]:
y

Unnamed: 0,target
0,5
1,4
2,5
3,1
4,6
...,...
2232,2
2233,2
2234,1
2235,1


### Checking Correlation

In [None]:
c=x.corr()
f,ax=plt.subplots(figsize=(5,5))
sns.heatmap(c,ax=ax,cmap="YlGnBu",linewidths=0.1

### Splitting Data Into Train And Test

In [308]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=0)

In [309]:
from imblearn.over_sampling import SMOTE
y_train.value_counts()

target
4         471
2         351
1         302
6         265
5         230
3         144
0          26
dtype: int64

In [310]:
x_train

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,TBG
1119,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0,0,0,0,41.0
7566,49.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7.4,1.9,101.0,0.83,122.0,0
7946,74.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.2,0.9,99.0,0.99,100.0,0
8326,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,3.2,117.0,1.11,105.0,0
1733,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.3,8.9,212.0,0.88,240.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4240,67.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,51.0,0.4,12.0,1.26,10.0,0
7123,70.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.2,1.0,85.0,0.59,144.0,0
3203,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,54.0,2.1,50.0,1.1,46.0,0
3477,87.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.79,1.5,145.0,0.88,165.0,0


###  Handling Imbalanced Data

In [311]:
hid=SMOTE(random_state=0,k_neighbors=1)
x_bal,y_bal=hid.fit_resample(x_train,y_train)
x_test_bal,y_test_bal=hid.fit_resample(x_test,y_test)

### Applying StandardScaler

In [312]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x_bal=sc.fit_transform(x_bal)
x_test_bal=sc.fit_transform(x_test_bal)

In [313]:
x_bal

array([[-1.62721505, -0.55176257, -0.44547421, ..., -2.50870684,
        -1.40088079,  3.29445097],
       [-0.11561403, -0.55176257,  2.30310072, ..., -0.26259147,
         0.0720981 , -0.19494049],
       [ 1.1874903 ,  2.0173423 , -0.44547421, ...,  0.17039463,
        -0.19352104, -0.19494049],
       ...,
       [ 1.395987  , -0.55176257,  2.30310072, ...,  0.43615031,
         0.06101022, -0.19494049],
       [ 0.72802783,  1.54146103,  2.30310072, ...,  0.143333  ,
         0.89086631, -0.19494049],
       [ 1.15628145, -0.55176257,  2.30310072, ...,  0.39723515,
        -0.26588659, -0.19494049]])

In [314]:
columns=['age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_meds', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH', 'T3', 'TT4', 'T4U',
       'FTI', 'TBG']

In [315]:
x_test_bal=pd.DataFrame(x_test_bal,columns=columns)

In [316]:
x_bal=pd.DataFrame(x_bal,columns=columns)

In [317]:
x_bal

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_meds,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,TBG
0,-1.627215,-0.551763,-0.445474,-0.141477,-0.257609,-0.177156,-0.174273,-0.267762,-0.224295,-0.304166,...,-0.073367,-0.185723,-0.037741,-0.144603,-0.315458,-1.035358,-1.704935,-2.508707,-1.400881,3.294451
1,-0.115614,-0.551763,2.303101,-0.141477,-0.257609,-0.177156,-0.174273,-0.267762,-0.224295,-0.304166,...,-0.073367,-0.185723,-0.037741,-0.144603,-0.090056,0.155233,-0.197223,-0.262591,0.072098,-0.194940
2,1.187490,2.017342,-0.445474,-0.141477,-0.257609,-0.177156,-0.174273,-0.267762,-0.224295,-0.304166,...,-0.073367,-0.185723,-0.037741,-0.144603,-0.278907,-0.471394,-0.227079,0.170395,-0.193521,-0.194940
3,-1.366594,-0.551763,-0.445474,-0.141477,-0.257609,-0.177156,-0.174273,-0.267762,-0.224295,-0.304166,...,-0.073367,6.169673,-0.037741,-0.144603,-0.284999,0.969848,0.041622,0.495134,-0.133153,-0.194940
4,-0.167738,-0.551763,-0.445474,-0.141477,-0.257609,-0.177156,-0.174273,-0.267762,-0.224295,-0.304166,...,-0.073367,-0.185723,-0.037741,-0.144603,-0.306321,4.541622,1.459767,-0.127283,1.496783,-0.194940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3292,0.546923,1.104362,2.303101,-0.141477,-0.257609,-0.177156,-0.174273,-0.267762,-0.224295,-0.304166,...,-0.073367,-0.185723,-0.037741,-0.144603,-0.114424,0.343221,-0.148122,-0.146517,0.040168,-0.194940
3293,0.383062,-0.551763,2.303101,-0.141477,-0.257609,-0.177156,-0.174273,-0.267762,-0.224295,-0.304166,...,-0.073367,-0.185723,-0.037741,-0.144603,-0.309176,-0.856540,0.565143,-0.513902,1.085434,-0.194940
3294,1.395987,-0.551763,2.303101,-0.141477,-0.257609,-0.177156,-0.174273,-0.267762,-0.224295,-0.304166,...,-0.073367,-0.185723,-0.037741,-0.144603,-0.095452,-0.172405,0.248906,0.436150,0.061010,-0.194940
3295,0.728028,1.541461,2.303101,-0.141477,-0.257609,-0.177156,-0.174273,-0.267762,-0.224295,-0.304166,...,-0.073367,-0.185723,-0.037741,-0.144603,-0.311566,0.087864,1.071643,0.143333,0.890866,-0.194940


# Model Building

### Random Forest Model

In [318]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report
rfm= RandomForestClassifier().fit(x_bal,y_bal)
y_pred=rfm.predict(x_test_bal)
accuracy_score(y_test_bal,y_pred)
x_bal.shape,y_bal.shape,x_test_bal.shape,y_test_bal.shape

((3297, 22), (3297, 1), (854, 22), (854, 1))

In [319]:
test_score=accuracy_score(y_test_bal,y_pred)

In [320]:
test_score

0.870023419203747

In [321]:
train_score=accuracy_score(y_bal,rfm.predict(x_bal))

In [323]:
train_score

1.0

# Performing Feature Importance

In [324]:
from sklearn.inspection import permutation_importance
res=permutation_importance(rfm,x_bal,y_bal,scoring='accuracy')

In [325]:
fi=['age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_meds', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH', 'T3', 'TT4', 'T4U',
       'FTI', 'TBG']
importance=res.importances_mean
importance=np.sort(importance)
for i,v in enumerate(importance):
    i=fi[i]
    print("feature: {:<20} score:{}".format(i,v))

feature: age                  score:0.0
feature: sex                  score:0.0
feature: on_thyroxine         score:0.0
feature: query_on_thyroxine   score:0.0
feature: on_antithyroid_meds  score:0.0
feature: sick                 score:0.0003033060357900963
feature: pregnant             score:0.0005459508644221733
feature: thyroid_surgery      score:0.0006066120715801926
feature: I131_treatment       score:0.0006066120715801926
feature: query_hypothyroid    score:0.0010312405216863717
feature: query_hyperthyroid   score:0.0024871094934789005
feature: lithium              score:0.002608431907794939
feature: goitre               score:0.004124962086745532
feature: tumor                score:0.01686381558993024
feature: hypopituitary        score:0.01686381558993024
feature: psych                score:0.020382165605095537
feature: TSH                  score:0.05253260539884743
feature: T3                   score:0.06229905975128904
feature: TT4                  score:0.10821959356991204
f

# Selecting Output Columns

In [326]:
df.columns

Index(['age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_meds', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH', 'T3', 'TT4', 'T4U',
       'FTI', 'TBG', 'target'],
      dtype='object')

In [327]:
x_bal.drop(['age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_meds', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium'],axis=1)

Unnamed: 0,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,TBG
0,-0.073367,-0.185723,-0.037741,-0.144603,-0.315458,-1.035358,-1.704935,-2.508707,-1.400881,3.294451
1,-0.073367,-0.185723,-0.037741,-0.144603,-0.090056,0.155233,-0.197223,-0.262591,0.072098,-0.194940
2,-0.073367,-0.185723,-0.037741,-0.144603,-0.278907,-0.471394,-0.227079,0.170395,-0.193521,-0.194940
3,-0.073367,6.169673,-0.037741,-0.144603,-0.284999,0.969848,0.041622,0.495134,-0.133153,-0.194940
4,-0.073367,-0.185723,-0.037741,-0.144603,-0.306321,4.541622,1.459767,-0.127283,1.496783,-0.194940
...,...,...,...,...,...,...,...,...,...,...
3292,-0.073367,-0.185723,-0.037741,-0.144603,-0.114424,0.343221,-0.148122,-0.146517,0.040168,-0.194940
3293,-0.073367,-0.185723,-0.037741,-0.144603,-0.309176,-0.856540,0.565143,-0.513902,1.085434,-0.194940
3294,-0.073367,-0.185723,-0.037741,-0.144603,-0.095452,-0.172405,0.248906,0.436150,0.061010,-0.194940
3295,-0.073367,-0.185723,-0.037741,-0.144603,-0.311566,0.087864,1.071643,0.143333,0.890866,-0.194940


In [328]:
x_test_bal.drop(['age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_meds', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium'],axis=1)

Unnamed: 0,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,TBG
0,-0.034239,-0.194225,0.0,-0.14186,-0.298613,0.592727,0.689796,1.105905,0.085458,-0.17734
1,-0.034239,-0.194225,0.0,-0.14186,-0.300032,0.790409,0.382843,1.843955,-0.312280,-0.17734
2,-0.034239,-0.194225,0.0,-0.14186,0.952756,-0.461579,-1.111883,-0.426969,-0.864693,-0.17734
3,-0.034239,-0.194225,0.0,-0.14186,-0.185067,-0.527473,-0.217716,-0.426969,0.085458,-0.17734
4,-0.034239,-0.194225,0.0,-0.14186,-0.232378,-0.395685,-0.284445,-0.881154,0.361664,-0.17734
...,...,...,...,...,...,...,...,...,...,...
849,-0.034239,-0.194225,0.0,-0.14186,-0.121083,-1.120520,-0.110915,0.235509,-0.147503,-0.17734
850,-0.034239,-0.194225,0.0,-0.14186,-0.245187,-0.681320,0.614907,0.156529,0.481352,-0.17734
851,-0.034239,-0.194225,0.0,-0.14186,-0.296002,-1.089693,1.317172,0.848832,0.605656,-0.17734
852,-0.034239,-0.194225,0.0,-0.14186,-0.294929,-0.891968,1.283382,0.850426,0.580340,-0.17734


# Model Building On Selected Columns

### Random Forest Classifier Model

In [329]:
rfm= RandomForestClassifier().fit(x_bal,y_bal)
y_pred=rfm.predict(x_test_bal)
print(classification_report(y_test_bal,y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       122
           1       0.75      0.90      0.82       122
           2       0.80      0.97      0.87       122
           3       0.93      0.53      0.68       122
           4       0.99      0.93      0.96       122
           5       0.74      0.86      0.80       122
           6       1.00      0.87      0.93       122

    accuracy                           0.87       854
   macro avg       0.88      0.87      0.86       854
weighted avg       0.88      0.87      0.86       854



### XGBClassifier Model

In [330]:
import xgboost as xgb
xgb1=xgb.XGBClassifier()
xgb1.fit(x_bal,y_bal)
y_pred=xgb1.predict(x_test_bal)

In [331]:
print(classification_report(y_test_bal,y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       122
           1       0.61      0.93      0.74       122
           2       0.90      0.98      0.94       122
           3       0.92      0.39      0.55       122
           4       0.97      0.91      0.94       122
           5       0.72      0.79      0.75       122
           6       0.99      0.84      0.91       122

    accuracy                           0.83       854
   macro avg       0.86      0.83      0.83       854
weighted avg       0.86      0.83      0.83       854



In [332]:
accuracy_score(y_test_bal,y_pred)

0.8348946135831382

### SVC Model

In [333]:
from sklearn.svm import SVC
svc=SVC()
svc.fit(x_bal,y_bal)
y_pred=svc.predict(x_test_bal)

In [335]:
print(classification_report(y_test_bal,y_pred))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95       122
           1       0.80      0.80      0.80       122
           2       0.75      0.90      0.82       122
           3       0.81      0.67      0.74       122
           4       0.84      0.75      0.79       122
           5       0.83      0.74      0.78       122
           6       0.88      0.98      0.93       122

    accuracy                           0.83       854
   macro avg       0.83      0.83      0.83       854
weighted avg       0.83      0.83      0.83       854



In [336]:
accuracy_score(y_test_bal,y_pred)

0.832552693208431

### GridSearch CV

In [337]:
from sklearn.model_selection import GridSearchCV

In [338]:
params={
    'C':[0.1,1,10,100,1000],
    'gamma':[1,0.1,0.01,0.001,0.0001],
    'kernel':['rbf','sqrt']
    
}

In [339]:
r_Svc=GridSearchCV(svc,params,scoring='accuracy',cv=5,n_jobs=-1)

In [340]:
r_Svc.fit(x_bal,y_bal)

In [341]:
r_Svc.best_params_

{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}

In [342]:
sv1=SVC(C=1000,gamma=0.01,kernel='rbf')

In [346]:
sv1.fit(x_bal,y_bal)

In [347]:
y_predict=sv1.predict(x_test_bal)

In [348]:
print(classification_report(y_test_bal,y_pred))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95       122
           1       0.80      0.80      0.80       122
           2       0.75      0.90      0.82       122
           3       0.81      0.67      0.74       122
           4       0.84      0.75      0.79       122
           5       0.83      0.74      0.78       122
           6       0.88      0.98      0.93       122

    accuracy                           0.83       854
   macro avg       0.83      0.83      0.83       854
weighted avg       0.83      0.83      0.83       854



In [355]:
accuracy_score(y_bal,sv1.predict(x_bal))

0.9824082499241735

In [356]:
import pickle
pickle.dump(sv1,open('thyroid1_model.pkl','wb'))

In [357]:
pickle.dump(label_encoder,open('label_encoder.pkl','wb'))