In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
data=pd.read_csv("DATA.csv")
# CHECKING  NO OF MISSING VALUES IN EACH COLUMN
data.isnull().sum()


Patient_ID                           0
Systemic Illness                  6216
Rectal Pain                          0
Sore Throat                          0
Penile Oedema                        0
Oral Lesions                         0
Solitary Lesion                      0
Swollen Tonsils                      0
HIV Infection                        0
Sexually Transmitted Infection       0
MonkeyPox                            0
dtype: int64

In [3]:
# DISTRIBUTION  OF Positive cases and negative cases 
print(data['MonkeyPox'].value_counts())
print()

print('Percentage of No Positive cases: {}%'.format(round(data.MonkeyPox.value_counts()['Positive']/len(data) * 100.0,2)))
print('Percentage of Negative cases: {}%'.format(round(data.MonkeyPox.value_counts()['Negative']/len(data) * 100.0,2)))

MonkeyPox
Positive    15909
Negative     9091
Name: count, dtype: int64

Percentage of No Positive cases: 63.64%
Percentage of Negative cases: 36.36%


In [4]:
#Unique values present in every column  
for col in data.columns:
    unique_values=data[col].unique()
    print(f"UNIQUE VALUES IN A COLUMN'{col}':{unique_values}'")
    print()

#Converting the text into numerical of all the columns 
features=["Rectal Pain","Sore Throat","Penile Oedema","Oral Lesions","Solitary Lesion","Swollen Tonsils","HIV Infection","Sexually Transmitted Infection"]
for col in features:
    data[col]=data[col].replace({True:1,False:0})
    
data['MonkeyPox']=data['MonkeyPox'].replace({'Negative':0,'Positive':1})

# None =0  fever =1   Swollen Lymph Nodes=2     Muscle Aches and Pain=3
data['Systemic Illness']=data['Systemic Illness'].replace({'None':0,'Fever':1,'Swollen Lymph Nodes':2,'Muscle Aches and Pain':3})

data['Patient_ID'] = data['Patient_ID'].str.replace('P','', regex=False).astype(int)
data

UNIQUE VALUES IN A COLUMN'Patient_ID':['P0' 'P1' 'P2' ... 'P24997' 'P24998' 'P24999']'

UNIQUE VALUES IN A COLUMN'Systemic Illness':[nan 'Fever' 'Swollen Lymph Nodes' 'Muscle Aches and Pain']'

UNIQUE VALUES IN A COLUMN'Rectal Pain':[False  True]'

UNIQUE VALUES IN A COLUMN'Sore Throat':[ True False]'

UNIQUE VALUES IN A COLUMN'Penile Oedema':[ True False]'

UNIQUE VALUES IN A COLUMN'Oral Lesions':[ True False]'

UNIQUE VALUES IN A COLUMN'Solitary Lesion':[False  True]'

UNIQUE VALUES IN A COLUMN'Swollen Tonsils':[ True False]'

UNIQUE VALUES IN A COLUMN'HIV Infection':[False  True]'

UNIQUE VALUES IN A COLUMN'Sexually Transmitted Infection':[False  True]'

UNIQUE VALUES IN A COLUMN'MonkeyPox':['Negative' 'Positive']'



  data[col]=data[col].replace({True:1,False:0})
  data['MonkeyPox']=data['MonkeyPox'].replace({'Negative':0,'Positive':1})
  data['Systemic Illness']=data['Systemic Illness'].replace({'None':0,'Fever':1,'Swollen Lymph Nodes':2,'Muscle Aches and Pain':3})


Unnamed: 0,Patient_ID,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
0,0,,0,1,1,1,0,1,0,0,0
1,1,1.0,1,0,1,1,0,0,1,0,1
2,2,1.0,0,1,1,0,0,0,1,0,1
3,3,,1,0,0,0,1,1,1,0,1
4,4,2.0,1,1,1,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
24995,24995,,1,1,0,1,1,0,0,1,1
24996,24996,1.0,0,1,1,0,1,1,1,1,1
24997,24997,,1,1,0,0,1,1,0,0,1
24998,24998,2.0,0,1,0,1,1,1,0,0,0


# FEATURE SELECTION PROCESS

# Filter Methods (Logistic)

In [5]:
# MUTUAL INFORMATION = measures how much a given feature can explain another

# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


X = data.drop('MonkeyPox', axis=1)  # Replace 'target_column' with your target variable
y = data['MonkeyPox']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SelectKBest with mutual_info_classif
k =5 # Change k to select the desired number of features
fs = SelectKBest(score_func=mutual_info_classif, k=k)
fs.fit(X_train, y_train)

# Transform the training and testing data
X_train_fs = fs.transform(X_train)
X_test_fs = fs.transform(X_test)

# Get the feature scores
feature_scores = fs.scores_

# Get the names of the selected features
selected_features = X.columns[fs.get_support()]

# Fit a model to evaluate accuracy
model = LogisticRegression()
model.fit(X_train_fs, y_train)

# Make predictions and calculate accuracy
y_pred = model.predict(X_test_fs)
accuracy = accuracy_score(y_test, y_pred)

# Print results
print("Number of Selected Features:", len(selected_features))
print("Selected Features:", selected_features.tolist())
print("Model Accuracy:", accuracy)

'''# Plotting feature scores
plt.bar(range(len(feature_scores)), feature_scores)
plt.title('Feature Scores using Mutual Information')
plt.xlabel('Feature Index')
plt.ylabel('Score')
plt.show() '''

Number of Selected Features: 5
Selected Features: ['Systemic Illness', 'Rectal Pain', 'Swollen Tonsils', 'HIV Infection', 'Sexually Transmitted Infection']
Model Accuracy: 0.6704


"# Plotting feature scores\nplt.bar(range(len(feature_scores)), feature_scores)\nplt.title('Feature Scores using Mutual Information')\nplt.xlabel('Feature Index')\nplt.ylabel('Score')\nplt.show() "

In [6]:
#It can be used to select the best categorical features for a 
#classification model. ANOVA (Analysis of Variance) is a
#statistical test that is used to compare the means of two or more groups. 

# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, roc_auc_score,classification_report, roc_curve,f1_score)



X = data.drop('MonkeyPox', axis=1)  # Replace 'target_column' with your target variable
y = data['MonkeyPox']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SelectKBest with mutual_info_classif
k =5 # Change k to select the desired number of features
fs = SelectKBest(score_func=f_classif, k=k)
fs.fit(X_train, y_train)

# Transform the training and testing data
X_train_fs = fs.transform(X_train)
X_test_fs = fs.transform(X_test)

# Get the feature scores
feature_scores = fs.scores_

# Get the names of the selected features
selected_features = X.columns[fs.get_support()]

# Fit a model to evaluate accuracy
model = LogisticRegression()
model.fit(X_train_fs, y_train)

# Make predictions and calculate accuracy
y_pred = model.predict(X_test_fs)
accuracy = accuracy_score(y_test, y_pred)

# Print results
# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
f1= f1_score(y_test, y_pred)

# Calculate specificity
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)

# Print the results
print("Number of Selected Features:", len(selected_features))
print("Selected Features:", selected_features.tolist())

print("Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC/AUC: {roc_auc:.4f}")
print(f"Specificity: {specificity:.4f}")
print("Confusion Matrix:")
print(conf_matrix)



Number of Selected Features: 5
Selected Features: ['Rectal Pain', 'Sore Throat', 'Penile Oedema', 'HIV Infection', 'Sexually Transmitted Infection']
Performance Metrics:
Accuracy: 0.6690
Precision: 0.6886
Recall: 0.8949
F1 Score: 0.7783
ROC/AUC: 0.5729
Specificity: 0.2509
Confusion Matrix:
[[ 440 1314]
 [ 341 2905]]


In [7]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


X = data.drop('MonkeyPox', axis=1)  # Replace 'target_column' with your target variable
y = data['MonkeyPox']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SelectKBest with mutual_info_classif
k =7 # Change k to select the desired number of features
fs = SelectKBest(score_func=chi2, k=k)
fs.fit(X_train, y_train)

# Transform the training and testing data
X_train_fs = fs.transform(X_train)
X_test_fs = fs.transform(X_test)

# Get the feature scores
feature_scores = fs.scores_

# Get the names of the selected features
selected_features = X.columns[fs.get_support()]

# Fit a model to evaluate accuracy
model = LogisticRegression()
model.fit(X_train_fs, y_train)

# Make predictions and calculate accuracy
y_pred = model.predict(X_test_fs)
accuracy = accuracy_score(y_test, y_pred)

# Print results
print("Number of Selected Features:", len(selected_features))
print("Selected Features:", selected_features.tolist())
print("Model Accuracy:", accuracy)

Number of Selected Features: 7
Selected Features: ['Patient_ID', 'Rectal Pain', 'Sore Throat', 'Penile Oedema', 'Oral Lesions', 'HIV Infection', 'Sexually Transmitted Infection']
Model Accuracy: 0.6592


# Filter Methods (SVM)

In [8]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


X = data.drop('MonkeyPox', axis=1)  # Replace 'target_column' with your target variable
y = data['MonkeyPox']


# Apply SelectKBest with chi-square as the scoring function
selector = SelectKBest(f_classif, k=8)  # Select the top 10 features
X_new = selector.fit_transform(X, y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

# Train SVM on the selected features
clf = SVC()
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print performance metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("ROC AUC:", roc_auc)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.6746
Precision: 0.6921433657726086
Recall: 0.8983364140480592
F1-score: 0.7818742458774636
ROC AUC: 0.5794418672292747
Confusion Matrix:
 [[ 457 1297]
 [ 330 2916]]


# Filter Methods (Decision  Tree)

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)

X = data.drop('MonkeyPox', axis=1)  # Replace 'target_column' with your target variable
y = data['MonkeyPox']


# Apply SelectKBest with chi-square as the scoring function
selector = SelectKBest(mutual_info_classif, k=5)  # Select the top 10 features based on chi-square test
X_new = selector.fit_transform(X, y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

# Train Decision Tree Classifier on the selected features
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print performance metrics
print("Performance Metrics:")
selected_features = X.columns[selector.get_support()]
print("Number of Selected Features:", len(selected_features))

# Print selected feature names

print("Selected Features:", selected_features.tolist())
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("Confusion Matrix:\n", conf_matrix)


Performance Metrics:
Number of Selected Features: 5
Selected Features: ['Systemic Illness', 'Rectal Pain', 'Penile Oedema', 'HIV Infection', 'Sexually Transmitted Infection']
Accuracy: 0.7000
Precision: 0.7150
Recall: 0.8943
F1 Score: 0.7947
ROC AUC: 0.6173
Confusion Matrix:
 [[ 597 1157]
 [ 343 2903]]


# Filter Method (Gradient Boosting)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)

X = data.drop('MonkeyPox', axis=1)  # Replace 'target_column' with your target variable
y = data['MonkeyPox']

# Apply SelectKBest with chi-square as the scoring function
selector = SelectKBest(mutual_info_classif, k=7)  # Select the top 10 features based on chi-square test
X_new = selector.fit_transform(X, y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

# Train Gradient Boosting Classifier on the selected features
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

# Calculate confusion matrix for specificity calculation
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print performance metrics
# Print selected feature names
selected_features = X.columns[selector.get_support()]
print("Number of Selected Features:", len(selected_features))
print("Selected Features:", selected_features.tolist())
print("Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("Confusion Matrix:\n", conf_matrix)



NameError: name 'data' is not defined

# Filter Method (Adaboost)

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)


# Define features and target variable

X = data.drop('MonkeyPox', axis=1)  # Replace 'target_column' with your target variable
y = data['MonkeyPox']

# Apply SelectKBest with chi-square as the scoring function
selector = SelectKBest(mutual_info_classif, k=6)  # Select the top 10 features based on chi-square test
X_new = selector.fit_transform(X, y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

# Train AdaBoost Classifier on the selected features
base_estimator = DecisionTreeClassifier(max_depth=1)  # Using a decision stump as base estimator
clf = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=50, random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print performance metrics
# Print selected feature names
selected_features = X.columns[selector.get_support()]
print("Number of Selected Features:", len(selected_features))
print("Selected Features:", selected_features.tolist())
print("Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("Confusion Matrix:\n", conf_matrix)





Number of Selected Features: 6
Selected Features: ['Systemic Illness', 'Rectal Pain', 'Sore Throat', 'Penile Oedema', 'HIV Infection', 'Sexually Transmitted Infection']
Performance Metrics:
Accuracy: 0.7008
Precision: 0.7257
Recall: 0.8666
F1 Score: 0.7899
ROC AUC: 0.7042
Confusion Matrix:
 [[ 691 1063]
 [ 433 2813]]


# using varience threshold

In [30]:
from sklearn.feature_selection import VarianceThreshold

# Apply variance threshold
vt = VarianceThreshold(threshold=0.2)
X_train_new = vt.fit_transform(X_train)
X_test_new = vt.transform(X_test)

# Get the selected feature names
selected_features = X.columns[vt.get_support()]

# Fit a model using the selected features
model = LogisticRegression()
model.fit(X_train_new, y_train)

# Evaluate the model
accuracy = model.score(X_test_new, y_test)
print("Model Accuracy:", accuracy)

Model Accuracy: 0.6674


# Wrapped Method (RFE) 

In [30]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, roc_auc_score,classification_report, roc_curve,f1_score)


X = data.drop('MonkeyPox', axis=1)  # Replace 'target_column' with your target variable
y = data['MonkeyPox']


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a logistic regression model
model = LogisticRegression()

# Create the RFE model and select the top 5 features
rfe = RFE(estimator=model, n_features_to_select=5)
rfe.fit(X_train, y_train)

# Transform the training and testing data
X_train_selected = rfe.transform(X_train)
X_test_selected = rfe.transform(X_test)

# Get the selected feature names
selected_features = X.columns[rfe.support_]

# Fit the model on the selected features
model.fit(X_train_selected, y_train)

# Make predictions and calculate accuracy
y_pred = model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)



# Print results
# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
f1= f1_score(y_test, y_pred)

# Calculate specificity
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)


# Print the results
# Print results
print("Number of Selected Features:", len(selected_features))
print("Selected Features:", selected_features.tolist())
print("Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC/AUC: {roc_auc:.4f}")
print(f"Specificity: {specificity:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

Number of Selected Features: 5
Selected Features: ['Rectal Pain', 'Sore Throat', 'Penile Oedema', 'HIV Infection', 'Sexually Transmitted Infection']
Performance Metrics:
Accuracy: 0.6690
Precision: 0.6886
Recall: 0.8949
F1 Score: 0.7783
ROC/AUC: 0.5729
Specificity: 0.2509
Confusion Matrix:
[[ 440 1314]
 [ 341 2905]]


# RFE (Recursive Feature Selection)

# RFE with Decision Tree 

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, roc_auc_score,classification_report, roc_curve,f1_score)

# Define features and target variable
X = data.drop('MonkeyPox', axis=1)  # Features
y = data['MonkeyPox']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier
model = DecisionTreeClassifier(random_state=42)

# Apply RFE
rfe = RFE(estimator=model, n_features_to_select=8)  # Select the top 10 features
rfe.fit(X_train, y_train)

# Get the selected features
selected_features = X.columns[rfe.support_]
print("Number of Selected Features:", len(selected_features))
print("Selected Features:", selected_features)

# Transform the training and testing sets to include only the selected features
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Train the model using the selected features
model.fit(X_train_rfe, y_train)

# Make predictions
y_pred = model.predict(X_test_rfe)
y_pred_proba = model.predict_proba(X_test_rfe)[:, 1]  # Probability estimates for ROC/AUC

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
f1= f1_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Calculate specificity
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)

# Print the results
print("Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC/AUC: {roc_auc:.4f}")
print(f"Specificity: {specificity:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Number of Selected Features: 8
Selected Features: Index(['Patient_ID', 'Systemic Illness', 'Sore Throat', 'Penile Oedema',
       'Oral Lesions', 'Solitary Lesion', 'Swollen Tonsils', 'HIV Infection'],
      dtype='object')
Performance Metrics:
Accuracy: 0.5780
Precision: 0.6772
Recall: 0.6688
F1 Score: 0.6730
ROC/AUC: 0.5394
Specificity: 0.4099
Confusion Matrix:
[[ 719 1035]
 [1075 2171]]
Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.41      0.41      1754
           1       0.68      0.67      0.67      3246

    accuracy                           0.58      5000
   macro avg       0.54      0.54      0.54      5000
weighted avg       0.58      0.58      0.58      5000



# RFE with Logistic Regression   (NO)

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, roc_auc_score, f1_score)


# Define features and target variable
X = data.drop('MonkeyPox', axis=1)  # Features
y = data['MonkeyPox']  # Target variable

# Initialize the k-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store metrics for each fold
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
roc_auc_list = []

# K-Fold Cross-Validation
for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Create a Logistic Regression model
    model = LogisticRegression(max_iter=1000, random_state=42)

    # Apply RFE
    rfe = RFE(estimator=model, n_features_to_select=10)  # Select the top 10 features
    rfe.fit(X_train, y_train)

    # Transform the training and testing sets to include only the selected features
    X_train_rfe = rfe.transform(X_train)
    X_test_rfe = rfe.transform(X_test)

    # Train the model using the selected features
    model.fit(X_train_rfe, y_train)

    # Make predictions
    y_pred = model.predict(X_test_rfe)
    y_pred_proba = model.predict_proba(X_test_rfe)[:, 1]  # Probability estimates for ROC/AUC

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Append metrics to the lists
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)
    roc_auc_list.append(roc_auc)

# Calculate average metrics across all folds
avg_accuracy = sum(accuracy_list) / len(accuracy_list)
avg_precision = sum(precision_list) / len(precision_list)
avg_recall = sum(recall_list) / len(recall_list)
avg_f1 = sum(f1_list) / len(f1_list)
avg_roc_auc = sum(roc_auc_list) / len(roc_auc_list)

# Print the average results
print("Average Performance Metrics across K-Folds:")
print(f"Accuracy: {avg_accuracy:.4f}")
print(f"Precision: {avg_precision:.4f}")
print(f"Recall: {avg_recall:.4f}")
print(f"F1 Score: {avg_f1:.4f}")
print(f"ROC/AUC: {avg_roc_auc:.4f}")

# RFE With Gradient Bossting 

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, roc_auc_score, f1_score)

# Define features and target variable
X = data.drop('MonkeyPox', axis=1)  # Features
y = data['MonkeyPox']  # Target variable

# Initialize the k-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store metrics for each fold
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
roc_auc_list = []
specificity_list = []

# Initialize the Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Initialize RFE with the Gradient Boosting model
rfe = RFE(estimator=gb_model, n_features_to_select=4)  # Change n_features_to_select as needed

# K-Fold Cross-Validation with Feature Selection
for train_index, test_index in kfold.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit RFE on the training data
    rfe.fit(X_train, y_train)

    # Transform the training and testing sets to include only the selected features
    X_train_rfe = rfe.transform(X_train)
    X_test_rfe = rfe.transform(X_test)

    # Train the Gradient Boosting model on the selected features
    gb_model.fit(X_train_rfe, y_train)

    # Make predictions
    y_pred = gb_model.predict(X_test_rfe)
    y_pred_proba = gb_model.predict_proba(X_test_rfe)[:, 1]  # Probability estimates for ROC/AUC

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    # Calculate specificity
    conf_matrix = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = conf_matrix.ravel()
    specificity = tn / (tn + fp)

    # Append metrics to the lists
    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)
    roc_auc_list.append(roc_auc)
    specificity_list.append(specificity)

# Calculate average metrics across all folds
avg_accuracy = sum(accuracy_list) / len(accuracy_list)
avg_precision = sum(precision_list) / len(precision_list)
avg_recall = sum(recall_list) / len(recall_list)
avg_f1 = sum(f1_list) / len(f1_list)
avg_roc_auc = sum(roc_auc_list) / len(roc_auc_list)
avg_specificity = sum(specificity_list) / len(specificity_list)

# Print the average results
print("Average Performance Metrics across K-Folds with RFE:")
print(f"Accuracy: {avg_accuracy:.4f}")
print(f"Precision: {avg_precision:.4f}")
print(f"Recall: {avg_recall:.4f}")
print(f"F1 Score: {avg_f1:.4f}")
print(f"ROC/AUC: {avg_roc_auc:.4f}")
print(f"Specificity: {avg_specificity:.4f}")

# Print selected features
selected_features = X.columns[rfe.support_]
print("Selected Features:", selected_features.tolist())

# RFE with Gradient Boosting 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, roc_auc_score, f1_score)

# Define features and target variable
X = data.drop('MonkeyPox', axis=1)  # Features
y = data['MonkeyPox']  # Target variable

# Split the dataset into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Initialize RFE with the Gradient Boosting model
rfe = RFE(estimator=gb_model, n_features_to_select=8)  # Change n_features_to_select as needed

# Fit RFE on the training data
rfe.fit(X_train, y_train)

# Transform the training and testing sets to include only the selected features
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Train the Gradient Boosting model on the selected features
gb_model.fit(X_train_rfe, y_train)

# Make predictions
y_pred = gb_model.predict(X_test_rfe)
y_pred_proba = gb_model.predict_proba(X_test_rfe)[:, 1]  # Probability estimates for ROC/AUC

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Calculate specificity
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)

# Print results
print("Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC/AUC: {roc_auc:.4f}")
print(f"Specificity: {specificity:.4f}")

# Print selected features
selected_features = X.columns[rfe.support_]
print("Selected Features:", selected_features.tolist())

# RFE with Adaboost (NO k-fold)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, roc_auc_score, f1_score)


# Define features and target variable
X = data.drop('MonkeyPox', axis=1)  # Features
y = data['MonkeyPox']  # Target variable

# Split the dataset into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an AdaBoost model using a Decision Tree as the base estimator
base_estimator = DecisionTreeClassifier(max_depth=1)  # Decision stump
ada_model = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=50, random_state=42)

# Initialize RFE with the AdaBoost model
rfe = RFE(estimator=ada_model, n_features_to_select=7)  # Change n_features_to_select as needed

# Fit RFE on the training data
rfe.fit(X_train, y_train)

# Transform the training and testing sets to include only the selected features
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Train the AdaBoost model on the selected features
ada_model.fit(X_train_rfe, y_train)

# Make predictions
y_pred = ada_model.predict(X_test_rfe)
y_pred_proba = ada_model.predict_proba(X_test_rfe)[:, 1]  # Probability estimates for ROC/AUC

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Calculate specificity
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)

# Print results
print("Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC/AUC: {roc_auc:.4f}")
print(f"Specificity: {specificity:.4f}")

# Print selected features
selected_features = X.columns[rfe.support_]
print("Selected Features:", selected_features.tolist())

# RFE with XGBoost Without K-Fold

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, roc_auc_score, f1_score)


# Define features and target variable
X = data.drop('MonkeyPox', axis=1)  # Features
y = data['MonkeyPox']  # Target variable

# Split the dataset into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an XGBoost model
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Initialize RFE with the XGBoost model
rfe = RFE(estimator=model, n_features_to_select=7)  # Change n_features_to_select as needed

# Fit RFE on the training data
rfe.fit(X_train, y_train)

# Transform the training and testing sets to include only the selected features
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Train the XGBoost model on the selected features
model.fit(X_train_rfe, y_train)

# Make predictions
y_pred = model.predict(X_test_rfe)
y_pred_proba = model.predict_proba(X_test_rfe)[:, 1]  # Probability estimates for ROC/AUC

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Calculate specificity
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)

# Print results
print("Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC/AUC: {roc_auc:.4f}")
print(f"Specificity: {specificity:.4f}")

# Print selected features
selected_features = X.columns[rfe.support_]
print("Selected Features:", selected_features.tolist())

# Random Forest Feature Importance 

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)

# Define features and target variable
X = data.drop('MonkeyPox', axis=1)  # Features
y = data['MonkeyPox']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Get feature importances
importances = rf_model.feature_importances_

# Create a DataFrame to hold feature names and their importance scores
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})

# Sort the DataFrame by importance scores in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print the feature importance
print("Feature Importances:")
print(feature_importance_df)

# Select features based on importance (e.g., top 10)
top_features = feature_importance_df.head(6)['Feature'].tolist()
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]

# Train Random Forest on the selected features
rf_model_selected = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_selected.fit(X_train_selected, y_train)

# Make predictions
y_pred = rf_model_selected.predict(X_test_selected)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, rf_model_selected.predict_proba(X_test_selected)[:, 1])

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print performance metrics
# Print selected feature names
print("Selected Features:", top_features)
print("\nPerformance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("Confusion Matrix:\n", conf_matrix)



Feature Importances:
                          Feature  Importance
0                      Patient_ID    0.727945
1                Systemic Illness    0.082671
8                   HIV Infection    0.028580
7                 Swollen Tonsils    0.025432
2                     Rectal Pain    0.025329
9  Sexually Transmitted Infection    0.024122
6                 Solitary Lesion    0.023763
5                    Oral Lesions    0.022080
3                     Sore Throat    0.020502
4                   Penile Oedema    0.019576
Selected Features: ['Patient_ID', 'Systemic Illness', 'HIV Infection', 'Swollen Tonsils', 'Rectal Pain', 'Sexually Transmitted Infection']

Performance Metrics:
Accuracy: 0.6058
Precision: 0.7002
Recall: 0.6870
F1 Score: 0.6935
ROC AUC: 0.6147
Confusion Matrix:
 [[ 799  955]
 [1016 2230]]


# Lasso Regression Feature Importance 

In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)
from sklearn.svm import SVC  # Example model to train with selected features

# Define features and target variable
X = data.drop('MonkeyPox', axis=1)  # Features
y = data['MonkeyPox']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the Lasso Regression model
lasso = Lasso(alpha=0.1)  # Set the regularization parameter
lasso.fit(X_train, y_train)

# Get feature importances (coefficients)
feature_importances = lasso.coef_

# Select features based on non-zero coefficients
selected_features =X.columns[feature_importances != 0]
print("No of selected features:",len(selected_features))
print("Selected Features:", selected_features.tolist())

# Transform the training and testing sets to include only the selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Train a model (e.g., SVM) on the selected features
final_model = SVC()  # You can replace this with any other model you prefer
final_model.fit(X_train_selected, y_train)

# Make predictions
y_pred = final_model.predict(X_test_selected)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, final_model.predict_proba(X_test_selected)[:, 1])

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Print performance metrics
print("\nPerformance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("Confusion Matrix:\n", conf_matrix)

No of selected features: 1
Selected Features: ['Patient_ID']


AttributeError: predict_proba is not available when  probability=False

# PCA with Gradient Boosting

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, roc_auc_score, f1_score)


# Define features and target variable
X = data.drop('MonkeyPox', axis=1)  # Features
y = data['MonkeyPox']  # Target variable

# Split the dataset into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (important for PCA)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA to reduce dimensions
pca = PCA(n_components=8)  # Change n_components as needed
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Create a Gradient Boosting model
model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Train the model on the PCA-transformed data
model.fit(X_train_pca, y_train)

# Make predictions
y_pred = model.predict(X_test_pca)
y_pred_proba = model.predict_proba(X_test_pca)[:, 1]  # Probability estimates for ROC/AUC

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Calculate specificity
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)

# Print results
print("Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC/AUC: {roc_auc:.4f}")
print(f"Specificity: {specificity:.4f}")

# Print explained variance ratio of the components
print("Explained Variance Ratio of PCA Components:", pca.explained_variance_ratio_)
print("Selected Features:", selected_features.tolist())

# PCA  with Ada Boost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, roc_auc_score, f1_score)

# Define features and target variable
X = data.drop('MonkeyPox', axis=1)  # Features
y = data['MonkeyPox']  # Target variable

# Split the dataset into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (important for PCA)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA to reduce dimensions
pca = PCA(n_components=8)  # Change n_components as needed
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Create an AdaBoost model using a Decision Tree as the base estimator
base_estimator = DecisionTreeClassifier(max_depth=1)  # Decision stump
ada_model = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=50, random_state=42)

# Train the model on the PCA-transformed data
ada_model.fit(X_train_pca, y_train)

# Make predictions
y_pred = ada_model.predict(X_test_pca)
y_pred_proba = ada_model.predict_proba(X_test_pca)[:, 1]  # Probability estimates for ROC/AUC

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Calculate specificity
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)

# Print results
print("Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC/AUC: {roc_auc:.4f}")
print(f"Specificity: {specificity:.4f}")

# Print explained variance ratio of the components and selected features names from PCA
print("Explained Variance Ratio of PCA Components:", pca.explained_variance_ratio_)

# Print component loadings for understanding feature contributions
loadings = pd.DataFrame(pca.components_.T, index=X.columns, columns=[f'PC{i+1}' for i in range(pca.n_components_)])
print("PCA Component Loadings:")
print(loadings)

# PCA  with  XGBoost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             confusion_matrix, roc_auc_score, f1_score)
from sklearn.preprocessing import StandardScaler


# Define features and target variable
X = data.drop('MonkeyPox', axis=1)  # Features
y = data['MonkeyPox']  # Target variable

# Split the dataset into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (important for PCA)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA to reduce dimensions
pca = PCA(n_components=8)  # Change n_components as needed
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Create an XGBoost model
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Train the model on the PCA-transformed data
model.fit(X_train_pca, y_train)

# Make predictions
y_pred = model.predict(X_test_pca)
y_pred_proba = model.predict_proba(X_test_pca)[:, 1]  # Probability estimates for ROC/AUC

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Calculate specificity
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()
specificity = tn / (tn + fp)

# Print results
print("Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC/AUC: {roc_auc:.4f}")
print(f"Specificity: {specificity:.4f}")

# Print explained variance ratio of the components and selected features names from PCA
print("Explained Variance Ratio of PCA Components:", pca.explained_variance_ratio_)