In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("datasets/aids.csv")

In [None]:
# Quick structural overview: shape, row index, column labels and column count.
for summary in (data.shape, data.index, data.columns, len(data.columns)):
    print(summary)

# Show every column when a frame is displayed (no horizontal truncation).
pd.options.display.max_columns = None
data.head()

In [None]:
# Remove the identifier columns so they are not used as model features.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# columns are already gone.
data = data.drop(columns=['patient_id', 'name'], errors='ignore')

In [None]:
# Count of missing values per column.
print(data.isna().sum())

In [None]:
# Column means used as fill values for missing entries:
# mean1 -> "age", mean2 -> "weight".
mean1, mean2 = (data[column].mean() for column in ("age", "weight"))

In [None]:
# Replace missing "age" / "weight" values with the column means.
# Assign the result back to the column: calling fillna(..., inplace=True) on
# data["col"] operates on an intermediate Series, which emits a FutureWarning
# on pandas 2.x and leaves `data` unchanged under Copy-on-Write.
data["age"] = data["age"].fillna(mean1)
data["weight"] = data["weight"].fillna(mean2)

In [None]:
# One shared LabelEncoder instance; it is re-fit on each categorical column in
# the following cell.
label_encoder = LabelEncoder()

In [None]:
# Integer-encode each categorical column in turn with the shared encoder.
# Every fit_transform call re-fits the encoder, so once the loop finishes it
# only holds the label mapping of the last column ("infected").
for column in ("drugs", "previous_surgeries", "gender", "infected"):
    data[column] = label_encoder.fit_transform(data[column])

In [None]:
y = data["infected"]  # Target vector (encoded class labels)
X = data.drop("infected",axis=1)  # Feature matrix (every column except the target)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Min-max scaler for the feature matrix (StandardScaler kept as an alternative).
scaler = MinMaxScaler()
# scaler = StandardScaler()

# NOTE(review): the scaler is fit on ALL rows here, before the train/test split
# performed later in the notebook, so test rows contribute to the learned
# min/max. Consider fitting on the training split only.
scaler.fit(X)

# Scale all columns
X = scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of every partition.
for part_name, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print("Shape of {}:".format(part_name), part.shape)

># Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Build and fit the classifier: a random forest is used; the linear SVC below
# is kept as a commented-out alternative.
# model = SVC(kernel='linear',probability=True)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1 summary.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import numpy as np
import matplotlib.pyplot as plt

def plot_roc_curves_for_classes(model, X, y):
    """Plot one-vs-rest ROC curves from 5-fold cross-validated probabilities.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``predict_proba``; it is handed to
        ``cross_val_predict`` together with ``X`` and ``y``.
    X : array-like
        Feature matrix.
    y : array-like
        Class labels aligned with the rows of ``X``.

    Returns
    -------
    None
        The figure is drawn with pyplot and shown.
    """
    y_true = np.asarray(y)
    classes = np.unique(y_true)

    # Probability columns follow the sorted class labels, i.e. the order of
    # `classes`.
    y_scores_cv = cross_val_predict(model, X, y, cv=5, method='predict_proba')

    for i, cls in enumerate(classes):
        # Build the one-vs-rest indicator per class explicitly. label_binarize
        # collapses a two-class target to a single column marking the SECOND
        # class, which would then be paired with the FIRST class's scores and
        # produce an inverted curve/AUC.
        is_cls = (y_true == cls).astype(int)
        fpr, tpr, _ = roc_curve(is_cls, y_scores_cv[:, i])
        roc_auc = auc(fpr, tpr)
        # x = false positive rate, y = true positive rate (matches axis labels).
        plt.plot(fpr, tpr, lw=2, label='Class {} (AUC = {:.2f})'.format(cls, roc_auc))

    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curves for each class')
    # Without this call the per-class labels set above are never displayed.
    plt.legend(loc='lower right')
    plt.show()

# Draw the cross-validated ROC curves using the training split only.
plot_roc_curves_for_classes(model, X_train, y_train)