In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, PolynomialFeatures
from imblearn.over_sampling import ADASYN

# ---------------------------------------------------------------------------
# Data preparation
#   load -> encode categoricals -> add pairwise interaction terms
#        -> stratified train/test split -> standardize -> oversample train
# Names produced for the following cells:
#   X_train, X_test          standardized feature arrays
#   y_train, y_test          'Diagnosis' labels for each split
#   X_train_res, y_train_res ADASYN-resampled training data
# ---------------------------------------------------------------------------

# NOTE(review): absolute, machine-specific path. Point DATA_PATH at your local
# copy of the CSV when running this notebook elsewhere.
DATA_PATH = '/home/srihari/Documents/MidTerm(SujitSir)/thyroid_cancer_risk_data.csv'
data = pd.read_csv(DATA_PATH)

# Columns that are never used as model inputs; 'Diagnosis' is the target.
NON_FEATURE_COLS = ['Patient_ID', 'Thyroid_Cancer_Risk', 'Diagnosis']

# Separate binary and non-binary categorical columns
binary_cols = ['Gender', 'Family_History', 'Radiation_Exposure', 'Iodine_Deficiency', 'Smoking', 'Obesity', 'Diabetes']
non_binary_cols = ['Country', 'Ethnicity']

# Label encoding for binary columns: each column is replaced by integer codes
# (0/1 when the column really has two distinct values). The fitted encoders
# are kept so the codes can be mapped back to the original labels.
label_encoders = {}
for column in binary_cols:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# One-hot encoding for non-binary categorical columns; drop='first' removes
# one level per column so the dummy columns are not perfectly collinear.
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
transformed_cols = one_hot_encoder.fit_transform(data[non_binary_cols])

# Reuse data's index so the concat below aligns rows by construction rather
# than relying on both frames happening to share a default RangeIndex.
one_hot_df = pd.DataFrame(
    transformed_cols,
    columns=one_hot_encoder.get_feature_names_out(non_binary_cols),
    index=data.index,
)

# Replace the raw categorical columns with their one-hot encoded versions
data = pd.concat([data.drop(non_binary_cols, axis=1), one_hot_df], axis=1)

# Create interaction terms. With interaction_only=True and include_bias=False
# the output holds every input column unchanged plus one product column per
# pair of distinct inputs.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
base_features = data.drop(NON_FEATURE_COLS, axis=1)
interaction_features = poly.fit_transform(base_features)

interaction_df = pd.DataFrame(
    interaction_features,
    columns=poly.get_feature_names_out(base_features.columns),
    index=data.index,
)

# interaction_df already contains the original feature columns, so appending
# it to the full table would put every base feature into X twice (under the
# same column name). Rebuild the table from the non-feature columns plus the
# expanded feature set instead.
data = pd.concat([data[NON_FEATURE_COLS], interaction_df], axis=1)
assert data.columns.is_unique, "duplicate column names in the modelling table"

# Split the data into features and target
X = data.drop(NON_FEATURE_COLS, axis=1)
y = data['Diagnosis']

# Split the data into training and testing sets with stratified sampling so
# both splits keep the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to the test split, so no test statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handle class imbalance using ADASYN. Only the training split is resampled;
# the test split keeps its original class distribution.
adasyn = ADASYN(random_state=42)
X_train_res, y_train_res = adasyn.fit_resample(X_train, y_train)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline

# Hyper-parameter search for Logistic Regression, evaluated on the held-out
# test split.
#
# The oversampler is placed INSIDE the cross-validated pipeline. Searching on
# data that was resampled beforehand lets synthetic points interpolated from a
# fold's validation rows end up in that fold's training data, which inflates
# the CV scores. With the pipeline, ADASYN is fitted on the training part of
# each fold only and the validation part stays untouched.

# Define the parameter grid for Logistic Regression
param_grid_lr = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [1000, 2000]
}

# Initialize Logistic Regression model
lr_model = LogisticRegression(random_state=42)

# Resample-then-classify pipeline; the sampler step is applied during fit only.
# ADASYN is re-run for every candidate/fold; the pipeline's `memory` argument
# can cache that step if the search becomes too slow.
lr_pipeline = ImbPipeline([
    ('adasyn', ADASYN(random_state=42)),
    ('clf', lr_model),
])

# Route the grid to the classifier step ('clf__<param>').
lr_search_grid = {f'clf__{name}': values for name, values in param_grid_lr.items()}

# Perform Grid Search with cross-validation on the un-resampled training split
grid_search_lr = GridSearchCV(lr_pipeline, lr_search_grid, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train, y_train)

# Best model: the classifier step of the refitted pipeline. The refit runs
# ADASYN on the full training split and then fits the classifier, so this is
# a plain fitted LogisticRegression that predicts directly on scaled features.
best_lr_model = grid_search_lr.best_estimator_.named_steps['clf']

# Evaluate the best Logistic Regression model
y_pred_lr = best_lr_model.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline

# Hyper-parameter search for Gradient Boosting, evaluated on the held-out
# test split.
#
# The oversampler is placed INSIDE the cross-validated pipeline. Searching on
# data that was resampled beforehand lets synthetic points interpolated from a
# fold's validation rows end up in that fold's training data, which inflates
# the CV scores. With the pipeline, ADASYN is fitted on the training part of
# each fold only and the validation part stays untouched.

# Define the parameter grid for Gradient Boosting
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'subsample': [0.8, 1.0]
}

# Initialize Gradient Boosting model
grad_boost_model = GradientBoostingClassifier(random_state=42)

# Resample-then-classify pipeline; the sampler step is applied during fit only.
# ADASYN is re-run for every candidate/fold; the pipeline's `memory` argument
# can cache that step if the search becomes too slow.
gb_pipeline = ImbPipeline([
    ('adasyn', ADASYN(random_state=42)),
    ('clf', grad_boost_model),
])

# Route the grid to the classifier step ('clf__<param>').
gb_search_grid = {f'clf__{name}': values for name, values in param_grid_gb.items()}

# Perform Grid Search with cross-validation on the un-resampled training split
grid_search_gb = GridSearchCV(gb_pipeline, gb_search_grid, cv=5, scoring='accuracy')
grid_search_gb.fit(X_train, y_train)

# Best model: the classifier step of the refitted pipeline. The refit runs
# ADASYN on the full training split and then fits the classifier, so this is
# a plain fitted GradientBoostingClassifier that predicts directly on scaled
# features.
best_gb_model = grid_search_gb.best_estimator_.named_steps['clf']

# Evaluate the best Gradient Boosting model
y_pred_gb = best_gb_model.predict(X_test)
print("Gradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb))
print("Accuracy:", accuracy_score(y_test, y_pred_gb))

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Stacked ensemble: the tuned Logistic Regression and Gradient Boosting models
# act as level-0 learners and a fresh Logistic Regression combines their
# predictions.
#
# NOTE(review): the stacker is fitted on the ADASYN-resampled training data,
# so its internal cross-validation also runs on resampled rows - confirm this
# is intended.

# Level-1 (meta) learner
meta_model = LogisticRegression(max_iter=1000, random_state=42)

# Level-0 learners, configured with the hyper-parameters found by the searches
base_models = [('lr', best_lr_model), ('gb', best_gb_model)]

# Build and train the stack in one go (fit returns the fitted estimator)
stacking_model = StackingClassifier(
    final_estimator=meta_model,
    estimators=base_models,
).fit(X_train_res, y_train_res)

# Score the stack on the untouched test split
y_pred_stacking = stacking_model.predict(X_test)
stacking_report = classification_report(y_test, y_pred_stacking)
stacking_confusion = confusion_matrix(y_test, y_pred_stacking)
stacking_accuracy = accuracy_score(y_test, y_pred_stacking)

print(
    "Stacking Classifier Report:",
    stacking_report,
    "Confusion Matrix:",
    stacking_confusion,
    sep="\n",
)
print("Accuracy:", stacking_accuracy)