# Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from IPython.display import display
from ydata_profiling import ProfileReport
import os
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

# Loading Dataset

In [4]:
dataset_path = '/Users/praveesha/Desktop/IS733/HW/HW2/red_wine.csv'
df = pd.read_csv(dataset_path)

In [5]:
df.head()

Unnamed: 0,citric acid,sulphates,alcohol,type
0,0.49,0.63,8.0,low
1,0.66,0.57,8.3,low
2,0.23,0.44,8.5,high
3,0.44,0.84,8.6,low
4,0.08,0.5,8.7,low


# Ydata Profiling

In [7]:
# Generate a profile report
profile = ProfileReport(df, title="Dataset Profiling Report")

# Save the report as an HTML file in the same directory as the dataset
report_path = os.path.join(os.path.dirname(dataset_path), "profiling_report.html")
profile.to_file(report_path)

print(f"Profiling report saved as {report_path}")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling report saved as /Users/praveesha/Desktop/IS733/HW/HW2/profiling_report.html


# Comparing AUC and Accuracy of models

In [9]:
# Load the dataset (replace 'your_dataset.csv' with the actual file path)
data = pd.read_csv('/Users/praveesha/Desktop/IS733/HW/HW2/red_wine.csv')

# Prepare features (X) and target (y)
X = data.iloc[:, :-1]  # All columns except the last as features
y = data.iloc[:, -1]   # The last column as the target

# Initialize Stratified K-Fold cross-validator
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# Define models, including baseline model
models = {
    "Baseline": DummyClassifier(strategy="most_frequent"),
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM-Linear": SVC(kernel="linear", probability=True),
    "SVM-RBF": SVC(kernel="rbf", probability=True),
    "Random Forest": RandomForestClassifier()
}

# Create empty lists to store results
auc_scores = []
accuracy_scores = []

# Calculate AUC and accuracy for each model using cross-validation
for model_name, model in models.items():
    try:
        auc = cross_val_score(model, X, y, cv=cv, scoring="roc_auc").mean()
        accuracy = cross_val_score(model, X, y, cv=cv, scoring="accuracy").mean()
    except Exception as e:
        print(f"Error evaluating model {model_name}: {e}")
        auc, accuracy = None, None  # Use None as placeholders for failed models
    
    auc_scores.append(auc)
    accuracy_scores.append(accuracy)

# Ensure all lists are the same length
print("Lengths of Model Names, AUC Scores, and Accuracy Scores:", len(models), len(auc_scores), len(accuracy_scores))

# Create results table
results = pd.DataFrame({
    "Model": list(models.keys()),
    "AUC": auc_scores,
    "Accuracy": accuracy_scores
}).set_index("Model")

# Display results table
print(results)


Lengths of Model Names, AUC Scores, and Accuracy Scores: 7 7 7
                          AUC  Accuracy
Model                                  
Baseline             0.500000  0.528887
Logistic Regression  0.877178  0.791500
Naive Bayes          0.893577  0.824894
Decision Tree        0.813379  0.814459
SVM-Linear           0.879401  0.788082
SVM-RBF              0.856889  0.535904
Random Forest        0.927452  0.859982


In [10]:
%matplotlib inline

# ROC Curve of the Random Forest Classifier - (red-wine dataset)

In [12]:
# Prepare features (X) and target (y)
X = data.iloc[:, :-1]  # all columns except the last as features
y = data.iloc[:, -1]   # the last column as the target

# Convert categorical labels to binary (e.g., 'high' -> 1, 'low' -> 0)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)  # Encode labels in entire dataset

# Re-split the data with encoded y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Initialize and train the Random Forest model
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

# Predict probabilities for the test set
y_probs = rf_classifier.predict_proba(X_test)[:, 1]  # Use probability of the positive class

# Compute ROC curve and AUC, specify pos_label if needed
fpr, tpr, thresholds = roc_curve(y_test, y_probs, pos_label=1)  # 1 is the positive class
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
# Labels and title
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"Random Forest (AUC = {roc_auc:.2f});")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--", label="Random Classifier (AUC = 0.5);")
plt.legend(loc="lower right")
plt.show()


TypeError: 'numpy.float64' object is not callable

## AUC of Random Forest Classifier - (white-wine dataset)

In [None]:
# Load the dataset (replace 'your_new_dataset.csv' with the actual file path)
new_data = pd.read_csv('/Users/praveesha/Desktop/IS733/HW/HW2/white_wine.csv')

In [None]:
# Prepare features (X) and target (y)
X = new_data.iloc[:, :-1]  # all columns except the last as features
y = new_data.iloc[:, -1]   # the last column as the target

# Encode target if it's categorical
if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)  # Convert labels to 0 and 1

# Initialize Random Forest classifier with limited max_depth
rf_classifier = RandomForestClassifier(max_depth=5, random_state=1)

# Initialize Stratified K-Fold cross-validator
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# Arrays to store TPR, FPR, and AUC for each fold
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

# Cross-validation loop
for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]  # Use integer-encoded labels directly
    
    # Fit the model and predict probabilities
    rf_classifier.fit(X_train, y_train)
    y_probs = rf_classifier.predict_proba(X_test)[:, 1]  # Probability of the positive class
    
    # Compute ROC curve and AUC for this fold
    fpr, tpr, _ = roc_curve(y_test, y_probs, pos_label=1)  # Use 1 as the positive class label
    auc_score = auc(fpr, tpr)
    aucs.append(auc_score)
    print(f"Fold {fold_idx + 1} AUC: {auc_score:.4f}")  # Print AUC for each fold
    
    # Interpolate the TPR values at mean FPR points
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0  # Ensure the curve starts at 0
    tprs.append(interp_tpr)

# Calculate the mean and standard deviation of TPRs across folds
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # Ensure the curve ends at 1
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)

# Print the mean AUC and standard deviation
print(f"Mean AUC: {mean_auc:.4f} ± {std_auc:.4f}")

# Plot the mean ROC curve with a confidence band
plt.figure(figsize=(8, 6))
plt.plot(mean_fpr, mean_tpr, color="b", label=f"Mean ROC (AUC = {mean_auc:.2f} ± {std_auc:.2f})", lw=2)
plt.fill_between(mean_fpr, np.maximum(mean_tpr - np.std(tprs, axis=0), 0), 
                 np.minimum(mean_tpr + np.std(tprs, axis=0), 1), color="lightblue", alpha=0.2, 
                 label="± 1 std. dev.")

# Plot the reference line
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--", label="Random Classifier (AUC = 0.5)")

# Labels and title
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Mean ROC Curve with Cross-Validation (Random Forest)")
plt.legend(loc="lower right")
plt.show()
