In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score

In [36]:
# Load the thyroid dataset; the path is resolved relative to the working directory.
csv_path = "Thyroid_Diff.csv"
df = pd.read_csv(csv_path)

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(383, 17)

In [4]:
# Column labels of the loaded frame.
df.columns

Index(['Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy',
       'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology',
       'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred'],
      dtype='object')

In [5]:
# Per-column dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383 entries, 0 to 382
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Age                   383 non-null    int64 
 1   Gender                383 non-null    object
 2   Smoking               383 non-null    object
 3   Hx Smoking            383 non-null    object
 4   Hx Radiothreapy       383 non-null    object
 5   Thyroid Function      383 non-null    object
 6   Physical Examination  383 non-null    object
 7   Adenopathy            383 non-null    object
 8   Pathology             383 non-null    object
 9   Focality              383 non-null    object
 10  Risk                  383 non-null    object
 11  T                     383 non-null    object
 12  N                     383 non-null    object
 13  M                     383 non-null    object
 14  Stage                 383 non-null    object
 15  Response              383 non-null    ob

In [7]:
# df.describe()

In [8]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


In [9]:
# Number of missing values in each column.
df.isna().sum()

Age                     0
Gender                  0
Smoking                 0
Hx Smoking              0
Hx Radiothreapy         0
Thyroid Function        0
Physical Examination    0
Adenopathy              0
Pathology               0
Focality                0
Risk                    0
T                       0
N                       0
M                       0
Stage                   0
Response                0
Recurred                0
dtype: int64

In [10]:
# Frequency of each category in 'Smoking'.
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['Smoking'].value_counts()

No     334
Yes     49
Name: Smoking, dtype: int64

In [11]:
# Frequency of each category in 'Hx Smoking'.
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['Hx Smoking'].value_counts()

No     355
Yes     28
Name: Hx Smoking, dtype: int64

In [12]:
# Frequency of each category in 'Hx Radiothreapy' (column name spelled as in the dataset).
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['Hx Radiothreapy'].value_counts()

No     376
Yes      7
Name: Hx Radiothreapy, dtype: int64

In [13]:
# Frequency of each category in 'Adenopathy'.
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['Adenopathy'].value_counts()

No           277
Right         48
Bilateral     32
Left          17
Extensive      7
Posterior      2
Name: Adenopathy, dtype: int64

In [14]:
# Frequency of each category in 'Focality'.
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['Focality'].value_counts()

Uni-Focal      247
Multi-Focal    136
Name: Focality, dtype: int64

In [15]:
# Frequency of each category in 'Gender'.
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['Gender'].value_counts()

F    312
M     71
Name: Gender, dtype: int64

In [16]:
# Frequency of each category in 'M'.
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['M'].value_counts()

M0    365
M1     18
Name: M, dtype: int64

In [17]:
# Frequency of each category in 'N'.
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['N'].value_counts()

N0     268
N1b     93
N1a     22
Name: N, dtype: int64

In [18]:
# Frequency of each category in 'T'.
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['T'].value_counts()

T2     151
T3a     96
T1a     49
T1b     43
T4a     20
T3b     16
T4b      8
Name: T, dtype: int64

In [19]:
# Frequency of each category in 'Pathology'.
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['Pathology'].value_counts()

Papillary         287
Micropapillary     48
Follicular         28
Hurthel cell       20
Name: Pathology, dtype: int64

In [20]:
# Frequency of each category in 'Physical Examination'.
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['Physical Examination'].value_counts()

Multinodular goiter            140
Single nodular goiter-right    140
Single nodular goiter-left      89
Normal                           7
Diffuse goiter                   7
Name: Physical Examination, dtype: int64

In [21]:
# Frequency of each class in 'Recurred' (shows how balanced the two classes are).
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['Recurred'].value_counts()

No     275
Yes    108
Name: Recurred, dtype: int64

In [22]:
# Frequency of each category in 'Response'.
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['Response'].value_counts()

Excellent                 208
Structural Incomplete      91
Indeterminate              61
Biochemical Incomplete     23
Name: Response, dtype: int64

In [23]:
# Frequency of each category in 'Risk'.
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['Risk'].value_counts()

Low             249
Intermediate    102
High             32
Name: Risk, dtype: int64

In [24]:
# Frequency of each category in 'Stage'.
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['Stage'].value_counts()

I      333
II      32
IVB     11
III      4
IVA      3
Name: Stage, dtype: int64

In [25]:
# Frequency of each category in 'Thyroid Function'.
# Uses the Series method: the top-level pd.value_counts is deprecated (pandas >= 2.1).
df['Thyroid Function'].value_counts()

Euthyroid                      332
Clinical Hyperthyroidism        20
Subclinical Hypothyroidism      14
Clinical Hypothyroidism         12
Subclinical Hyperthyroidism      5
Name: Thyroid Function, dtype: int64

In [39]:
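# NOTE: disabled experiment, kept for reference only. It would label-encode every
# listed column of `df` in place, re-fitting one shared LabelEncoder per column.
# def pre_process(columns):
#     label = LabelEncoder()
#     for col in columns:
#         df[col] = label.fit_transform(df[col])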

In [40]:
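# NOTE: disabled. Would pass every column except the first (`df.columns[1:]`)
# to the commented-out pre_process helper.
# cols = df.columns[1:]
# pre_process(cols)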

In [46]:
# Columns with more than two categories; these are one-hot encoded here and
# listed in `cols` so they can be dropped from the base frame later.
cols = ['Risk', 'Stage', 'Thyroid Function', 'Response', 'Physical Examination', 'Pathology', 'T', 'N', 'Adenopathy']

# One indicator frame per entry of `cols`, unpacked in the same order as the list.
# The dummy columns are named after the raw category values (no prefix).
(
    risk_df,
    stage_df,
    thyroid_df,
    response_df,
    physical_df,
    pathology_df,
    t_df,
    n_df,
    adenopathy,
) = (pd.get_dummies(df[source_column]) for source_column in cols)

In [44]:
# Preview the first rows of df.
df.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


In [45]:
# Integer-encode the two-valued columns of df in place.
# LabelEncoder numbers the observed values in sorted order (e.g. 'No' -> 0, 'Yes' -> 1).
# A single encoder is re-fitted for every column, so after the loop it only
# remembers the classes of the last column ('Recurred').
label = LabelEncoder()
two_valued_columns = (
    'Gender',
    'Smoking',
    'Hx Smoking',
    'Hx Radiothreapy',
    'Focality',
    'M',
    'Recurred',
)
for column_name in two_valued_columns:
    df[column_name] = label.fit_transform(df[column_name])

In [47]:
# Base feature frame: everything in df except the columns listed in `cols`,
# which are represented by their one-hot indicator frames instead.
temp_df = df.drop(cols, axis="columns")

In [48]:
# Columns of temp_df.
temp_df.columns

Index(['Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Focality',
       'M', 'Recurred'],
      dtype='object')

In [49]:
# Preview the first rows of temp_df.
temp_df.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Focality,M,Recurred
0,27,0,0,0,0,1,0,0
1,34,0,0,1,0,1,0,0
2,30,0,0,0,0,1,0,0
3,62,0,0,0,0,1,0,0
4,62,0,0,0,0,0,0,0


In [50]:
# Target vector: the 'Recurred' column of df.
y = df['Recurred']

# Remove the target from the feature frame.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after 'Recurred' is already gone no longer
# raises KeyError.
temp_df = temp_df.drop(columns=['Recurred'], errors='ignore')

In [51]:
# Preview temp_df after the 'Recurred' target column has been removed.
temp_df.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Focality,M
0,27,0,0,0,0,1,0
1,34,0,0,1,0,1,0
2,30,0,0,0,0,1,0
3,62,0,0,0,0,1,0
4,62,0,0,0,0,0,0


In [54]:
# Assemble the feature matrix by placing the base frame and all one-hot
# indicator frames side by side (rows are aligned on the index).
feature_frames = [
    temp_df,
    risk_df,
    stage_df,
    thyroid_df,
    adenopathy,
    t_df,
    n_df,
    response_df,
    pathology_df,
    physical_df,
]
X_df = pd.concat(feature_frames, axis="columns")

In [55]:
# Column labels of the assembled feature matrix.
X_df.columns

Index(['Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Focality',
       'M', 'High', 'Intermediate', 'Low', 'I', 'II', 'III', 'IVA', 'IVB',
       'Clinical Hyperthyroidism', 'Clinical Hypothyroidism', 'Euthyroid',
       'Subclinical Hyperthyroidism', 'Subclinical Hypothyroidism',
       'Bilateral', 'Extensive', 'Left', 'No', 'Posterior', 'Right', 'T1a',
       'T1b', 'T2', 'T3a', 'T3b', 'T4a', 'T4b', 'N0', 'N1a', 'N1b',
       'Biochemical Incomplete', 'Excellent', 'Indeterminate',
       'Structural Incomplete', 'Follicular', 'Hurthel cell', 'Micropapillary',
       'Papillary', 'Diffuse goiter', 'Multinodular goiter', 'Normal',
       'Single nodular goiter-left', 'Single nodular goiter-right'],
      dtype='object')

In [56]:
# Preview the first rows of the assembled feature matrix.
X_df.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Focality,M,High,Intermediate,Low,...,Structural Incomplete,Follicular,Hurthel cell,Micropapillary,Papillary,Diffuse goiter,Multinodular goiter,Normal,Single nodular goiter-left,Single nodular goiter-right
0,27,0,0,0,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
1,34,0,0,1,0,1,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0
2,30,0,0,0,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
3,62,0,0,0,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
4,62,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0
