In [263]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
import warnings
warnings.filterwarnings('ignore')

In [264]:
#from sklearn.decomposition import PCA
#PCA = PCA()


In [265]:
# Column names as per the dataset's documentation
column_names = [
     "class","age", "sex", "on_thyroxine", "query_on_thyroxine", "on_antithyroid_medication",
    "sick", "pregnant", "thyroid_surgery", "I131_treatment", "query_hypothyroid",
    "query_hyperthyroid", "lithium", "goitre", "tumor", "hypopituitary", "psych",
    "TSH_measured", "TSH", "T3_measured", "T3", "TT4_measured", "TT4",
    "T4U_measured", "T4U", "FTI_measured", "FTI", "TBG_measured", "TBG",
    "referral_source"]

# Load the 'hypothyroid.data' file
df = pd.read_csv("hypothyroid.data", names=column_names,na_values='?',skipinitialspace=True)

# View first few rows
print(df.head())

         class   age sex on_thyroxine query_on_thyroxine  \
0  hypothyroid  72.0   M            f                  f   
1  hypothyroid  15.0   F            t                  f   
2  hypothyroid  24.0   M            f                  f   
3  hypothyroid  24.0   F            f                  f   
4  hypothyroid  77.0   M            f                  f   

  on_antithyroid_medication sick pregnant thyroid_surgery I131_treatment  ...  \
0                         f    f        f               f              f  ...   
1                         f    f        f               f              f  ...   
2                         f    f        f               f              f  ...   
3                         f    f        f               f              f  ...   
4                         f    f        f               f              f  ...   

  T3 TT4_measured TT4 T4U_measured T4U  FTI_measured FTI  TBG_measured TBG  \
0  y         1.48   y         10.0   n           NaN NaN           NaN NaN

In [266]:
df.columns

Index(['class', 'age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH_measured', 'TSH',
       'T3_measured', 'T3', 'TT4_measured', 'TT4', 'T4U_measured', 'T4U',
       'FTI_measured', 'FTI', 'TBG_measured', 'TBG', 'referral_source'],
      dtype='object')

In [267]:
df.isnull().sum()

class                           0
age                           446
sex                            73
on_thyroxine                    0
query_on_thyroxine              0
on_antithyroid_medication       0
sick                            0
pregnant                        0
thyroid_surgery                 0
I131_treatment                  0
query_hypothyroid               0
query_hyperthyroid              0
lithium                         0
goitre                          0
tumor                           0
hypopituitary                 468
psych                           0
TSH_measured                  695
TSH                             0
T3_measured                   249
T3                              0
TT4_measured                  248
TT4                             0
T4U_measured                  247
T4U                             0
FTI_measured                 2903
FTI                          3163
TBG_measured                 3163
TBG                          3163
referral_sourc

In [268]:
df.shape

(3163, 30)

In [269]:
df.drop("FTI",axis=1,inplace=True)
df.drop("TBG_measured",axis=1,inplace=True)
df.drop("TBG",axis=1,inplace=True)
df.drop("referral_source",axis=1,inplace=True)

In [270]:
df['class'].values

array(['hypothyroid', 'hypothyroid', 'hypothyroid', ..., 'negative',
       'negative', 'negative'], shape=(3163,), dtype=object)

In [271]:
df.isnull().sum()

class                           0
age                           446
sex                            73
on_thyroxine                    0
query_on_thyroxine              0
on_antithyroid_medication       0
sick                            0
pregnant                        0
thyroid_surgery                 0
I131_treatment                  0
query_hypothyroid               0
query_hyperthyroid              0
lithium                         0
goitre                          0
tumor                           0
hypopituitary                 468
psych                           0
TSH_measured                  695
TSH                             0
T3_measured                   249
T3                              0
TT4_measured                  248
TT4                             0
T4U_measured                  247
T4U                             0
FTI_measured                 2903
dtype: int64

In [272]:
df['age'] = df['age'].fillna(df['age'].mean())
df['sex'] = df['sex'].fillna(df['sex'].mode()[0])
df['hypopituitary'] = df['hypopituitary'].fillna(df['hypopituitary'].mode()[0])
df['TSH_measured'] = df['TSH_measured'].fillna(df['TSH_measured'].mode()[0])
df['T3_measured'] = df['T3_measured'].fillna(df['T3_measured'].mode()[0])
df['TT4_measured'] = df['TT4_measured'].fillna(df['TT4_measured'].mode()[0])
df['T4U_measured'] = df['T4U_measured'].fillna(df['T4U_measured'].mode()[0])
df['FTI_measured'] = df['FTI_measured'].fillna(df['FTI_measured'].mode()[0])

In [273]:
hypo = df.select_dtypes(include=object).columns
hypo

Index(['class', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'psych', 'TSH', 'T3', 'TT4', 'T4U'],
      dtype='object')

In [274]:
lab_enc = LabelEncoder()
hh =['class', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
       'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
       'goitre', 'tumor', 'psych', 'TSH', 'T3', 'TT4', 'T4U']
for i in hh:
    df[i]= lab_enc.fit_transform(df[i])


In [275]:
x= df.iloc[:,:-1]
y = df.iloc[:,-1]

In [276]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)

In [277]:
df

Unnamed: 0,class,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,...,psych,TSH_measured,TSH,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured
0,0,72.0,1,0,0,0,0,0,0,0,...,1,0.6,1,15.0,1,1.48,1,10.0,0,28.0
1,0,15.0,0,1,0,0,0,0,0,0,...,1,1.7,1,19.0,1,1.13,1,17.0,0,28.0
2,0,24.0,1,0,0,0,0,0,0,0,...,1,0.2,1,4.0,1,1.00,1,0.0,0,28.0
3,0,24.0,0,0,0,0,0,0,0,0,...,1,0.4,1,6.0,1,1.04,1,6.0,0,28.0
4,0,77.0,1,0,0,0,0,0,0,0,...,1,1.2,1,57.0,1,1.28,1,44.0,0,28.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3158,1,58.0,0,0,0,0,0,0,0,0,...,1,1.7,1,86.0,1,0.91,1,95.0,0,28.0
3159,1,29.0,0,0,0,0,0,0,0,0,...,1,1.8,1,99.0,1,1.01,1,98.0,0,28.0
3160,1,77.0,1,0,0,0,0,0,0,0,...,1,0.6,1,71.0,1,0.68,1,104.0,0,28.0
3161,1,74.0,0,0,0,0,0,0,0,0,...,1,0.1,1,65.0,1,0.48,1,137.0,0,28.0


In [278]:
#normalisation
s_scale = StandardScaler()
x_train_scaled = s_scale.fit(x_train)
x_test_scaled= s_scale.transform(x_test)

In [279]:
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [280]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression()
log_model.fit(x_train, y_train)

In [281]:
y_pred = log_model.predict(x_test)
y_pred[:10]

array([28, 28, 28, 28, 28, 28, 28, 28, 28, 28])

In [282]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)*100

91.6754478398314

In [283]:
# support vector machine
from sklearn.svm import SVC
svm = SVC(kernel='linear')
svm.fit(x_train,y_train)

In [284]:
svm_pred = svm.predict(x_test)
svm_pred[:10]

array([28, 28, 28, 28, 28, 28, 28, 28, 28, 28])

In [291]:
accuracy_score(y_test,svm_pred)*100

92.09694415173867

In [286]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier()
forest.fit(x_train,y_train)

In [287]:
forest_pred = forest.predict(x_test)

In [288]:
forest_pred[:10]

array([28, 28, 28, 28, 28, 28, 28, 28, 28, 28])

In [289]:
accuracy_score(forest_pred,y_test)*100

92.09694415173867

In [290]:
from sklearn.metrics import classification_report
print(classification_report(y_test,forest_pred))

              precision    recall  f1-score   support

           7       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         2
          17       0.00      0.00      0.00         2
          18       0.00      0.00      0.00         6
          19       0.00      0.00      0.00         2
          20       0.00      0.00      0.00         2
          21       0.00      0.00      0.00         8
          22       0.00      0.00      0.00         2
          23       0.00      0.00      0.00         3
          24       0.00      0.00      0.00         1
          25       0.00      0.00      0.00         2
          26       0.00      0.00      0.00         3
          27       0.17      1.00      0.29         1
          28       0.99      0.99      0.99       875
          29       0.00      0.00      0.00         3
          30       0.33    

In [None]:
#neural network

In [292]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()
dtree.fit(x_train,y_train)

In [293]:
tree = dtree.predict(x_test)

In [294]:
tree[:10]

array([28, 28, 28, 28, 28, 28, 28, 28, 28, 28])

In [295]:
accuracy_score(tree,y_test)*100

92.09694415173867