In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

**Load the dataset**

In [2]:
# Path of the MonkeyPox CSV inside the Kaggle input directory.
file_path = "/kaggle/input/dataset-monkeypox/MonkeyPox.csv"
# Read the whole file into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=file_path)

**Display basic information**

In [3]:
# Rich-render the first five rows as a quick sanity check of the loaded data.
display(df.head(n=5))

Unnamed: 0,Systemic Illness,Rectal Pain,Sore Throat,Penile Oedema,Oral Lesions,Solitary Lesion,Swollen Tonsils,HIV Infection,Sexually Transmitted Infection,MonkeyPox
0,1,0,1,1,1,0,1,0,0,0
1,2,1,0,1,1,0,0,1,0,1
2,2,0,1,1,0,0,0,1,0,1
3,1,1,0,0,0,1,1,1,0,1
4,3,1,1,1,0,0,1,1,0,1


In [4]:
# Summarise column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype
---  ------                          --------------  -----
 0   Systemic Illness                25000 non-null  int64
 1   Rectal Pain                     25000 non-null  int64
 2   Sore Throat                     25000 non-null  int64
 3   Penile Oedema                   25000 non-null  int64
 4   Oral Lesions                    25000 non-null  int64
 5   Solitary Lesion                 25000 non-null  int64
 6   Swollen Tonsils                 25000 non-null  int64
 7   HIV Infection                   25000 non-null  int64
 8   Sexually Transmitted Infection  25000 non-null  int64
 9   MonkeyPox                       25000 non-null  int64
dtypes: int64(10)
memory usage: 1.9 MB
None


In [5]:
# Count missing values per column (isna is the same method as isnull).
print(df.isna().sum())

Systemic Illness                  0
Rectal Pain                       0
Sore Throat                       0
Penile Oedema                     0
Oral Lesions                      0
Solitary Lesion                   0
Swollen Tonsils                   0
HIV Infection                     0
Sexually Transmitted Infection    0
MonkeyPox                         0
dtype: int64


In [6]:
# Class balance of the target column: how many rows fall in each label.
print(df["MonkeyPox"].value_counts())

MonkeyPox
1    15909
0     9091
Name: count, dtype: int64


**Handle missing values**

In [7]:
# Discard every row that has at least one missing value (the defaults, spelled out).
# NOTE: this rebinds `df`; the cells below read the cleaned frame under the same name.
df = df.dropna(axis=0, how="any")

**Encode categorical columns (if any)**

In [8]:
# A single LabelEncoder is reused and refit for each column, so after the loop
# it only remembers the classes of the last column it processed.
encoder = LabelEncoder()
# Only object-dtype columns are treated as categorical.
categorical_columns = df.select_dtypes(include="object").columns
for col in categorical_columns:
    # Replace the column's values with integer codes.
    df[col] = encoder.fit_transform(df[col])

**Define features and target**

In [9]:
# Target vector: the MonkeyPox label column.
y = df["MonkeyPox"]
# Feature matrix: every remaining column.
X = df.drop(labels="MonkeyPox", axis="columns")

**Split data (90% train, 10% test)**

In [10]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified -- consider stratify=y if the class
# ratio of the test fold matters.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

**Feature Scaling**

In [11]:
# Learn the scaling statistics from the training fold only, then apply the same
# transformation to both folds so no test-set information leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

**Train RandomForestClassifier**

In [12]:
# Baseline model: a seeded random forest with 100 trees.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

**Predict and evaluate**

In [13]:
# Score the baseline model on the held-out fold.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# Report accuracy as a percentage with two decimals.
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 68.76%


**Hyperparameter Tuning**

In [14]:
# Search space for the random forest: 3 x 4 x 3 = 36 parameter combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
# With cv=5 this is 180 forest fits plus the final refit of the best setting.
# n_jobs=-1 runs those fits on all available cores instead of one at a time;
# every forest is seeded, so the selected parameters should be unchanged.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

**Best Model Evaluation**

In [15]:
# Take the estimator refit with the best parameter combination found by the search.
best_model = grid_search.best_estimator_
# Score it on the same held-out fold used for the baseline (rebinds y_pred).
y_pred = best_model.predict(X_test)
optimized_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Optimized Test Accuracy: {optimized_accuracy * 100:.2f}%")

Optimized Test Accuracy: 69.20%


In [16]:
# Construct (but do not fit) a random forest with hand-picked hyperparameters.
# NOTE(review): this rebinds `model`, and the next cell rebinds it again before
# this estimator is fitted or used -- confirm whether this cell is still needed.
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=30,
    min_samples_split=5,
    random_state=42,
)

In [17]:
# Construct (but do not fit) a seeded gradient-boosting classifier and bind it
# to `model`, replacing whatever estimator that name held before.
model = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)