In [None]:
%%capture
!pip install catboost
!pip install ppscore
!pip install pandas === 1.5.3

In [None]:
pip install --upgrade ppscore numpy


### Load Libraries

In [None]:
pip install xgboost

In [None]:
pip install lightgbm

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Stats
import statsmodels.api as sm
from scipy import stats
import ppscore as pps

# Data preprocessing
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve

# tqdm
from tqdm.auto import tqdm

# warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
import os
import sys

# Location of the thyroid dataset. Defaults to the original local path; set the
# THYROID_CSV environment variable to load the file from somewhere else.
csv_file = os.environ.get(
    "THYROID_CSV",
    "D:\\Data_Analytics\\Data_source\\extracted_data\\Healthcare\\Thyroid_Diff.csv",
)

# Load the CSV file. Every later cell needs `data`, so a missing file is reported
# and then re-raised instead of letting the notebook continue without a dataset.
try:
    data = pd.read_csv(csv_file)
except FileNotFoundError:
    print(f"Error: File '{csv_file}' not found.")
    raise
else:
    print("CSV file loaded successfully!")

In [None]:
# Preview the first rows of the loaded dataset.
data.head()

In [None]:
# (number of rows, number of columns) of the dataset.
data.shape

In [None]:
# Per-column overview of the dataset as reported by pandas' info().
data.info()

In [None]:
# dtype of every column.
data.dtypes

In [None]:
# Names of all object/category-typed columns.
categorical_features = list(data.select_dtypes(include=['object', 'category']).columns)

print(f'Total Categorical features = {len(categorical_features)}\n')

# For each categorical column, list its distinct values and how many there are.
for column_name in categorical_features:
    column = data[column_name]
    print(f'* {column_name}: {column.unique()} => {column.nunique()} values\n')

In [None]:
# Summary statistics of the dataset (pandas' default describe() output).
data.describe()



# EDA

In [None]:
# Histogram of patient age (20 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Age'], bins=20, kde=True, color='#ff9f80', ax=ax)
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Age')
plt.show()

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(20, 8))

# One count plot per panel: (column to count, panel title, x-axis label).
count_panels = [
    ('Gender', 'Gender Distribution', 'Gender'),
    ('Smoking', 'Smoking Distribution', 'Smoking'),
    ('Response', 'Treatment Response Distribution', 'Treatment Response'),
]

for panel_index, (column, title, x_label) in enumerate(count_panels):
    panel = axes[panel_index]
    sns.countplot(x=column, data=data, palette='OrRd', ax=panel)
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Count')

# Only the treatment-response panel gets rotated tick labels.
axes[2].tick_params(axis='x', rotation=45)
plt.show()

 1. From the plots above, we conclude that the majority of patients with thyroid cancer are female.

 2. Smoking does not appear to affect thyroid cancer.

 3. The rate of recovery from thyroid cancer is very high.

In [None]:
sns.set_palette("OrRd")
fig, axes = plt.subplots(1, 3, figsize=(20, 8))

# Pie chart for 'Risk'
# `explode` needs one offset per slice: this assumes exactly 3 distinct 'Risk' values.
risky = data['Risk'].value_counts()
risky.plot(kind='pie', autopct='%0.2f%%', explode=[0.05, 0.08, 0.1], labels=risky.index, shadow=True, ax=axes[0])
axes[0].set_title('Percentage of Risk')
axes[0].set_ylabel('')

# Pie chart for 'Stage'
# `explode` needs one offset per slice: this assumes exactly 5 distinct 'Stage' values.
Stage_of_Cancer = data['Stage'].value_counts()
Stage_of_Cancer.plot(kind='pie', autopct='%0.2f%%', labels=Stage_of_Cancer.index, explode=[0.1, 0.2, 0.3, 0.4, 0.9], shadow=True, ax=axes[1])
axes[1].set_title('Percentage of Each Stage')
axes[1].set_ylabel('')

# Pie chart for 'Adenopathy'
# `aden` already holds the per-category counts, so it is plotted directly.
# Calling value_counts() on it again would count the counts and mismatch the labels.
aden = data['Adenopathy'].value_counts()
aden.plot(kind='pie', autopct='%0.2f%%', labels=aden.index, shadow=True, ax=axes[2])
axes[2].set_title('Percentage of Adenopathy')
axes[2].set_ylabel('')


plt.show()


In [None]:
# Age distribution per cancer stage, each violin split by gender.
sns.violinplot(
    data=data,
    x='Stage',
    y='Age',
    hue='Gender',
    split=True,
    palette='Dark2',
);

In [None]:
# Recurrence counts within each thyroid-function category.
sns.countplot(
    data=data,
    x='Thyroid Function',
    hue='Recurred',
    palette='Reds',
)
plt.xticks(rotation=40);

In [None]:
# Age distribution per thyroid-function category, each violin split by gender.
sns.violinplot(
    data=data,
    x='Thyroid Function',
    y='Age',
    hue='Gender',
    split=True,
    palette='Dark2',
)
plt.xticks(rotation=40);

In [None]:
# Recurrence counts within each pathology category.
sns.countplot(
    data=data,
    x='Pathology',
    hue='Recurred',
    palette='winter',
)
plt.xticks(rotation=40);

In [None]:
# Recurrence counts within each focality category.
sns.countplot(
    data=data,
    x='Focality',
    hue='Recurred',
    palette='autumn',
);

# Bivariate Analysis

In [None]:
sns.set_style("dark")

# Reshape the long-format PPS table into a matrix: 'y' values as rows, 'x' values as columns.
pps_long = pps.matrix(data)[['x', 'y', 'ppscore']]
pps_matrix = pps_long.pivot_table(index='y', columns='x', values='ppscore')

# Annotated heatmap of the score matrix.
heatmap_options = dict(
    cmap='Reds',
    annot=True,
    annot_kws={'fontsize': 7},
    fmt='.4f',
    square=True,
    linewidths=1.1,
)
plt.figure(figsize=(20, 10))
sns.heatmap(pps_matrix, **heatmap_options)

plt.title("Predictive Power Score (PPS)", fontsize=20, fontweight='bold', color='black')
plt.show()

# Preprocessing

In [None]:
# Target is the 'Recurred' column; the features are every other column.
y = data['Recurred']
X = data.drop(columns='Recurred')

In [None]:
# Map each class label of the target to a numeric (float32) id.
# Labels are sorted so the ids do not depend on row order: 'No' -> 0.0 and 'Yes' -> 1.0,
# which matches the ['No', 'Yes'] ordering hard-coded in the reporting cells.
# Labels are read from the raw column rather than from `y`, so re-running this cell
# after `y` has been encoded still produces the same mapping.
class_labels = sorted(data['Recurred'].dropna().unique())
label2id = dict(zip(class_labels, np.arange(len(class_labels), dtype=np.float32)))
label2id

In [None]:
# Encode the target from the raw column (not from `y` itself) so this cell is idempotent:
# mapping an already-encoded `y` a second time would turn every value into NaN.
y = data['Recurred'].map(label2id)

In [None]:
from sklearn.model_selection import train_test_split

# Seed shared by the split and by every model, for reproducible results.
SEED = 1234

# 70/30 split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED, stratify=y,
)

In [None]:
# Class balance of the training target.
y_train.value_counts()

In [None]:
# Class balance of the test target.
y_test.value_counts()

In [None]:
# Split the training columns by dtype: int/float columns vs. object/category columns.
numerical_predictor = list(X_train.select_dtypes(include=['int', 'float']).columns)
categorical_predictors = list(X_train.select_dtypes(include=['object', 'category']).columns)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numerical columns: fill gaps with the median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own pipeline.
column_routes = [
    ('num', numeric_transformer, numerical_predictor),
    ('cat', categorical_transformer, categorical_predictors),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Fit on the training split only; the test split is transformed with the fitted state.
X_train_base = preprocessor.fit_transform(X_train)
X_test_base = preprocessor.transform(X_test)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for the candidate models: one-hot encode the categorical columns
# and pass every remaining column through unchanged.
one_hot_route = ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_predictors)
preprocessor_models = ColumnTransformer([one_hot_route], remainder='passthrough')

# Fit on the training split only, then apply the fitted transformer to the test split.
X_train_prep_models = preprocessor_models.fit_transform(X_train)
X_test_prep_models = preprocessor_models.transform(X_test)

# Models

In [None]:
# Baseline model: logistic regression on the scaled / one-hot encoded features.
lr = LogisticRegression(random_state=SEED, n_jobs=-1)
lr.fit(X_train_base, y_train)

y_pred_train_base = lr.predict(X_train_base)
y_pred_test_base = lr.predict(X_test_base)

# Balanced accuracy on both splits.
baseline_train_score = balanced_accuracy_score(y_train, y_pred_train_base)
baseline_test_score = balanced_accuracy_score(y_test, y_pred_test_base)
print(f'Balanced Accuracy train: {baseline_train_score}')
print(f'Balanced Accuracy test: {baseline_test_score}')

In [None]:
# We define the candidate models, we choose the model that best generalizes.
clf1 = SVC(probability = True, class_weight = 'balanced', random_state = SEED)
clf2 = RandomForestClassifier(class_weight = 'balanced', random_state = SEED, n_jobs = -1)
clf3 = ExtraTreesClassifier(bootstrap = True, class_weight = 'balanced', n_jobs = -1, random_state = SEED)
clf4 = XGBClassifier(random_state = SEED)
clf5 = LGBMClassifier(class_weight = 'balanced', random_state = SEED, n_jobs = -1, verbose = -1)
clf6 = CatBoostClassifier(auto_class_weights = 'Balanced',random_state = SEED, verbose = 0)

MODELS = [clf1, clf2, clf3, clf4, clf5, clf6]

In [None]:
# Training!!!
accuracy_train = {}
accuracy_test = {}

for model in tqdm(MODELS):
    name = type(model).__name__
    model.fit(X_train_prep_models, y_train)
    y_pred_train = model.predict(X_train_prep_models)
    y_pred_test = model.predict(X_test_prep_models)
    accuracy_train[name] = balanced_accuracy_score(y_train, y_pred_train)
    accuracy_test[name] = balanced_accuracy_score(y_test, y_pred_test)
    print(f'* {name} finished.')


In [None]:
# One-column frames (model name -> balanced accuracy) for each split.
metric_train = pd.DataFrame.from_dict(accuracy_train, orient='index').rename(columns={0: 'Train'})
metric_test = pd.DataFrame.from_dict(accuracy_test, orient='index').rename(columns={0: 'Test'})

labels = list(metric_train.index)
values_train = metric_train['Train'].to_list()
values_test = metric_test['Test'].to_list()
x = np.arange(len(labels))
width = 0.35


def autolabel(rects):
    """Annotate each bar in `rects` with its height (4 decimals) on the global `ax`."""
    for bar in rects:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            text=f'{bar_height:.4f}',
            xy=(bar_center, bar_height),
            xytext=(0, 3),
            textcoords="offset points",
            ha="center",
            va="bottom",
        )


# Grouped bars: train on the left of each tick, test on the right.
fig, ax = plt.subplots(figsize=(14, 4.5))
rects1 = ax.bar(x=x - width / 2, height=values_train, width=width, label='Train')
rects2 = ax.bar(x=x + width / 2, height=values_test, width=width, label='Test')

for bar_group in (rects1, rects2):
    autolabel(bar_group)

bold_black = dict(fontsize=12, fontweight="bold", color="black")
ax.legend()
ax.set_title("Metric of Performance: Balanced Accuracy", **bold_black)
ax.set_ylabel("score", **bold_black)
ax.set_xlabel("Models", **bold_black)
ax.set_xticks(x)
ax.set_xticklabels(labels)
fig.show()


#### The best model with respect to the evaluation metric is CatBoostClassifier and it outperforms the base model, so we will calculate some additional metrics with this model.

# Metrics

In [None]:
# Predictions!!!
y_pred_train_final = clf6.predict(X_train_prep_models)
y_pred_prob_train_final = clf6.predict_proba(X_train_prep_models)[:,1]
y_pred_test_final = clf6.predict(X_test_prep_models)
y_pred_prob_test_final = clf6.predict_proba(X_test_prep_models)[:,1]

- Accuracy

In [None]:
# Plain accuracy of the selected model on each split.
accuracy_inputs = (
    ('train', y_train, y_pred_train_final),
    ('test', y_test, y_pred_test_final),
)
for split_name, true_labels, predicted_labels in accuracy_inputs:
    print(f'Accuracy {split_name}: {accuracy_score(true_labels, predicted_labels):.4f}')

- Classification report

In [None]:
# Display names for class ids 0 and 1, in that order.
class_names = ['No', 'Yes']

report_train = classification_report(y_train, y_pred_train_final, target_names=class_names)
report_test = classification_report(y_test, y_pred_test_final, target_names=class_names)

# Train report, a blank-ish separator line, then the test report.
print("Classification Report Train", report_train, " ", sep="\n")
print("Classification Report Test", report_test, sep="\n")

- Confusion Matrix

In [None]:
cf_mx_train = confusion_matrix(y_train, y_pred_train_final)
cf_mx_test = confusion_matrix(y_test, y_pred_test_final)

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
axs = axs.flat

# Tick labels for class ids 0 and 1, and the text styles shared by both panels.
class_names = ['No', 'Yes']
axis_label_style = dict(fontsize=12, fontweight="bold", color="black")
title_style = dict(fontsize=14, fontweight="bold", color="black")

# (matrix, colormap, title) for the train panel and the test panel.
panels = [
    (cf_mx_train, 'Reds', 'Confusion Matrix Train'),
    (cf_mx_test, 'Blues', 'Confusion Matrix Test'),
]

for panel_index, (matrix, colormap, title) in enumerate(panels):
    panel = axs[panel_index]
    sns.heatmap(
        matrix,
        cmap=colormap,
        annot=True,
        annot_kws={'fontsize': 11, 'fontweight': 'bold'},
        fmt='',
        xticklabels=class_names,
        yticklabels=class_names,
        cbar=False,
        square=True,
        ax=panel,
    )
    panel.set_xlabel('Predicted', **axis_label_style)
    panel.set_ylabel('True', **axis_label_style)
    panel.set_title(title, **title_style)

fig.tight_layout()
fig.show()

- ROC Curve

In [None]:
plt.style.use('ggplot')

# ROC points and AUC for each split, from the selected model's probabilities.
fpr_train, tpr_train, _ = roc_curve(y_train, y_pred_prob_train_final)
auc_train = roc_auc_score(y_train, y_pred_prob_train_final)

fpr_test, tpr_test, _ = roc_curve(y_test, y_pred_prob_test_final)
auc_test = roc_auc_score(y_test, y_pred_prob_test_final)

fig, ax = plt.subplots(figsize=(8, 5.5))
roc_series = (
    (fpr_train, tpr_train, f'Train AUC = {auc_train:.4f}'),
    (fpr_test, tpr_test, f'Test AUC = {auc_test:.4f}'),
)
for false_positive_rate, true_positive_rate, legend_text in roc_series:
    ax.plot(false_positive_rate, true_positive_rate, label=legend_text)

# Dashed diagonal as a visual reference line.
ax.plot([0, 1], linestyle='--', color='black')

axis_label_style = dict(fontsize=10, fontweight='bold', color='black')
ax.set_xlabel("FPR", **axis_label_style)
ax.set_ylabel("TPR", **axis_label_style)
ax.legend()
ax.set_title("ROC AUC", fontsize=12, fontweight="bold", color="black")
fig.show()

- Precision Recall Curve

In [None]:
# Precision/recall pairs for each split, from the selected model's probabilities.
precision_train, recall_train, _ = precision_recall_curve(y_train, y_pred_prob_train_final)
precision_test, recall_test, _ = precision_recall_curve(y_test, y_pred_prob_test_final)

fig, ax = plt.subplots(figsize=(8, 5.5))

# One curve per split: recall on the x-axis, precision on the y-axis.
pr_series = (
    (recall_train, precision_train, 'Train'),
    (recall_test, precision_test, 'Test'),
)
for recall_values, precision_values, legend_text in pr_series:
    ax.plot(recall_values, precision_values, label=legend_text)

axis_label_style = dict(fontsize=10, fontweight='bold', color='black')
ax.set_xlabel("Recall", **axis_label_style)
ax.set_ylabel("Precision", **axis_label_style)
ax.legend()
ax.set_title("Precision Recall Curve", fontsize=12, fontweight="bold", color="black")
fig.show()