# Encoding categorical variables


In [23]:
import pandas as pd
import numpy as np

df = pd.read_csv("thyroid.csv")

#replacing '?'
df = df.applymap(lambda x: np.nan if str(x).strip() == '?' else x)

#removing unecessary columns
df.drop(['TSH measured','T3 measured','TT4 measured','T4U measured',
         'FTI measured'],axis = 1,inplace = True)

#replacing categorial variable with a continuous value in a missing value column
df.sex.replace({'F': 1, 'M': 0}, inplace = True)
df.sex.fillna(round(df.sex.mean()),inplace = True)

df.isnull().sum()

age                             0
sex                             0
on thyroxine                    0
query on thyroxine              0
on antithyroid medication       0
sick                            0
pregnant                        0
thyroid surgery                 0
I131 treatment                  0
query hypothyroid               0
query hyperthyroid              0
lithium                         0
goitre                          0
tumor                           0
hypopituitary                   0
psych                           0
TSH                           852
T3                           1755
TT4                           552
T4U                           891
FTI                           885
results                         0
dtype: int64

# Imputing missing lab values (KNN)


In [24]:
from sklearn.impute import KNNImputer


# Columns passed to the imputer (the ones still reporting nulls above).
cols = ['TSH', 'T3', 'TT4', 'T4U', 'FTI']

# Fill the gaps in those columns with a k-nearest-neighbours imputer that
# looks at 25 neighbouring rows per missing entry.
# NOTE(review): the imputer is fitted on the raw, unscaled columns, so
# features with larger magnitudes may dominate the neighbour search --
# consider scaling first.
knn_imputer = KNNImputer(n_neighbors=25)
imputed_values = knn_imputer.fit_transform(df[cols])
df[cols] = imputed_values


# Balancing classes (random oversampling)

In [25]:
from imblearn.over_sampling import RandomOverSampler

# Split the frame into the feature matrix and the target column.
X = df.drop(columns='results')
y = df['results']

# 'not majority' asks the sampler to resample every class except the largest
# one; random_state is fixed so the drawn rows are reproducible.
oversampler = RandomOverSampler(sampling_strategy='not majority', random_state=100)

# Resample the dataset to balance the classes.
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Recombine features and target into a single frame.
augmented_data = pd.concat([X_resampled, y_resampled], axis=1)

# index=False: without it the row index is written as an unnamed first
# column, which a plain pd.read_csv() loads back as an 'Unnamed: 0' feature.
# NOTE(review): after oversampling the row position may correlate with the
# class label, so that column could leak the target -- confirm no downstream
# reader relies on it (e.g. via index_col=0).
augmented_data.to_csv("thyroid_balanced_new.csv", index=False)