In [2]:
import pandas as pd

# Load the dataset
file_path = "C:/Users/rohit/Downloads/project_files/dataset/hypothyroid.csv"
df = pd.read_csv(file_path)

# Display basic info
print(df.info())
print(df.head())

# Check for missing values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3772 entries, 0 to 3771
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   age                        3772 non-null   object
 1   sex                        3772 non-null   object
 2   on thyroxine               3772 non-null   object
 3   query on thyroxine         3772 non-null   object
 4   on antithyroid medication  3772 non-null   object
 5   sick                       3772 non-null   object
 6   pregnant                   3772 non-null   object
 7   thyroid surgery            3772 non-null   object
 8   I131 treatment             3772 non-null   object
 9   query hypothyroid          3772 non-null   object
 10  query hyperthyroid         3772 non-null   object
 11  lithium                    3772 non-null   object
 12  goitre                     3772 non-null   object
 13  tumor                      3772 non-null   object
 14  hypopitu

In [4]:
print(df.columns)

Index(['age', 'sex', 'on thyroxine', 'query on thyroxine',
       'on antithyroid medication', 'sick', 'pregnant', 'thyroid surgery',
       'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium',
       'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH measured', 'TSH',
       'T3 measured', 'T3', 'TT4 measured', 'TT4', 'T4U measured', 'T4U',
       'FTI measured', 'FTI', 'TBG measured', 'TBG', 'referral source',
       'binaryClass'],
      dtype='object')


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Set the target column
target_col = 'binaryClass'  # If 'Type' is the target column

# Handle missing values (drop or fill)
df = df.dropna()  # Drop rows with missing values

# Encode categorical columns
label_encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    if col != target_col:  # Don't encode the target column here
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le  # Save encoder for future use

# Encode the target column
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(df[target_col])  # Encode target labels

# Separate features (X) and target (y)
X = df.drop(columns=[target_col])  

# Normalize numerical data (if needed)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data Preprocessing Complete!")


Data Preprocessing Complete!


In [8]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train Support Vector Machine (SVM)
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

# Train Logistic Regression
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)
log_pred = log_reg.predict(X_test)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Evaluate Models
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print("Logistic Regression Accuracy:", accuracy_score(y_test, log_pred))
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))

# Display Classification Reports
print("\nSVM Classification Report:\n", classification_report(y_test, svm_pred))
print("\nLogistic Regression Classification Report:\n", classification_report(y_test, log_pred))
print("\nRandom Forest Classification Report:\n", classification_report(y_test, rf_pred))


SVM Accuracy: 0.9470198675496688
Logistic Regression Accuracy: 0.9483443708609272
Random Forest Accuracy: 0.9708609271523179

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.47      0.57        58
           1       0.96      0.99      0.97       697

    accuracy                           0.95       755
   macro avg       0.85      0.73      0.77       755
weighted avg       0.94      0.95      0.94       755


Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.48      0.59        58
           1       0.96      0.99      0.97       697

    accuracy                           0.95       755
   macro avg       0.86      0.73      0.78       755
weighted avg       0.94      0.95      0.94       755


Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.66      0.78        58

In [14]:
import pickle
filename = 'hypothyroid.sav'
pickle.dump(log_reg, open(filename, 'wb'))  # Replace log_reg with your trained model
print("Model saved successfully!")

Model saved successfully!
