In [1]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
print('✓ Libraries imported successfully')

✓ Libraries imported successfully


In [3]:
#Load the dataset
data = pd.read_csv("/Users/Eswara Chaitanya/OneDrive/Documents/DNN_assignment/oral_cancer_balanced_900.csv")
primary_metric = ['Accuracy', 'AU Score','Precision', 'Recall', 'F1-Score']

In [4]:
# 1. Separate features (X) and target (y)
X = data.drop(['Oral Cancer (Diagnosis)'], axis=1)
y = data['Oral Cancer (Diagnosis)'].map({'Yes': 1, 'No': 0})  # Convert target to binary
#print(X.head)
print(type(y))

# 2. Handle missing values if any
X.isnull().sum()
X = X.fillna(X.select_dtypes(include=[np.number]).mean())

# 3. Encode categorical variables
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.to_list()
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
X=X_encoded.astype(float)
print(f"Encode categorical variables completed. New shape of X: {X.shape}")

# Train-test split (stratify when possible)
stratify_arg = y if y.nunique() > 1 else None
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=stratify_arg)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Fill these after preprocessing
train_samples = X_train_scaled.shape[0]      # Number of training samples
test_samples = X_test_scaled.shape[0]        # Number of test samples
train_test_ratio = train_samples/(train_samples+test_samples)  # e.g., 0.8 for 80-20 split

print(f"Train samples: {train_samples}")
print(f"Test samples: {test_samples}")
print(f"Split ratio: {train_test_ratio:.1%}")
print(f"Y values after encoding and scaling: {y_train.shape}")

<class 'pandas.core.series.Series'>
Encode categorical variables completed. New shape of X: (900, 43)
Train samples: 720
Test samples: 180
Split ratio: 80.0%
Y values after encoding and scaling: (720,)


In [5]:
class LogisticRegressionModel:
    def __init__(self, lr=0.01, n_iters=1000): 
        self.lr = lr 
        self.n_iters = n_iters 
        self.weights = None 
        self.bias = None 
    def _sigmoid(self, z): 
        return 1 / (1 + np.exp(-z)) 
    def fit(self, X, y): 
        n_samples, n_features = X.shape 
        self.weights = np.zeros(n_features) 
        self.bias = 0 
        for _ in range(self.n_iters):
         linear_model = np.dot(X, self.weights) + self.bias 
         y_predicted = self._sigmoid(linear_model) 
         # gradients 
         dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y)) 
         db = (1 / n_samples) * np.sum(y_predicted - y)
         # update 
         self.weights -= self.lr * dw 
         self.bias -= self.lr * db
    def predict(self, X): 
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self._sigmoid(linear_model) 
        return [1 if i > 0.5 else 0 for i in y_predicted]
    

In [6]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
model = LogisticRegressionModel(lr=0.1, n_iters=1000) 
model.fit(X_train_scaled, y_train) 
preds = model.predict(X_test_scaled) 
print("Predictions:", preds)
# Accuracy 
accuracy = accuracy_score(y_test, preds)
# AUC Score (needs probabilities, not just class labels) 
y_probs = model._sigmoid(np.dot(X_test_scaled, model.weights) + model.bias) 
auc = roc_auc_score(y_test, y_probs)
# Precision 
precision = precision_score(y_test, preds)
# Recall
recall = recall_score(y_test, preds)
# F1 Score 
f1 = f1_score(y_test, preds)
print("Accuracy:", accuracy) 
print("AUC Score:", auc) 
print("Precision:", precision) 
print("Recall:", recall) 
print("F1 Score:", f1)

Predictions: [0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1]
Accuracy: 1.0
AUC Score: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [7]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score 
from sklearn.model_selection import train_test_split 
from sklearn.datasets import load_breast_cancer
# Initialize Decision Tree
dt_clf = DecisionTreeClassifier( criterion="gini", # or "entropy" 
max_depth=5, # limit depth to avoid overfitting
random_state=42 
)
# Train 
dt_clf.fit(X_train_scaled, y_train)

# Predictions 
y_pred = dt_clf.predict(X_test) 
y_probs = dt_clf.predict_proba(X_test)[:, 1] # for AUC 
# Evaluation Metrics 
accuracy = accuracy_score(y_test, y_pred) 
precision = precision_score(y_test, y_pred) 
recall = recall_score(y_test, y_pred) 
f1 = f1_score(y_test, y_pred) 
auc = roc_auc_score(y_test, y_probs) 
print("Accuracy:", accuracy) 
print("Precision:", precision) 
print("Recall:", recall) 
print("F1 Score:", f1) 
print("AUC Score:", auc)

Accuracy: 0.5
Precision: 0.5
Recall: 1.0
F1 Score: 0.6666666666666666
AUC Score: 0.5


In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Initialize K-Nearest Neighbors Classifier
knn_clf = KNeighborsClassifier(
    n_neighbors=5,      # Number of neighbors to consider
    metric='euclidean'  # Distance metric
)

# Train
knn_clf.fit(X_train_scaled, y_train)

# Predictions
y_pred = knn_clf.predict(X_test_scaled)
y_probs = knn_clf.predict_proba(X_test_scaled)[:, 1]  # for AUC

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_probs)

print("KNN Classifier Results:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC Score:", auc)

KNN Classifier Results:
Accuracy: 0.9444444444444444
Precision: 1.0
Recall: 0.8888888888888888
F1 Score: 0.9411764705882353
AUC Score: 0.9979012345679013


In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Initialize Gaussian Naive Bayes Classifier
nb_clf = GaussianNB()

# Train
nb_clf.fit(X_train_scaled, y_train)

# Predictions
y_pred = nb_clf.predict(X_test_scaled)
y_probs = nb_clf.predict_proba(X_test_scaled)[:, 1]  # for AUC

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_probs)

print("Naive Bayes Classifier Results:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC Score:", auc)

Naive Bayes Classifier Results:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
AUC Score: 1.0


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Initialize Random Forest Classifier
rf_clf = RandomForestClassifier(
    n_estimators=100,      # Number of trees in the forest
    max_depth=15,          # Maximum depth of trees
    min_samples_split=5,   # Minimum samples required to split
    min_samples_leaf=2,    # Minimum samples required at leaf node
    random_state=42,
    n_jobs=-1              # Use all processors
)

# Train
rf_clf.fit(X_train_scaled, y_train)

# Predictions
y_pred = rf_clf.predict(X_test_scaled)
y_probs = rf_clf.predict_proba(X_test_scaled)[:, 1]  # for AUC

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_probs)

print("Random Forest Classifier Results:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC Score:", auc)

# Feature Importance
feature_importance = rf_clf.feature_importances_
print("\nTop 10 Important Features:")
top_indices = np.argsort(feature_importance)[-10:][::-1]
for idx in top_indices:
    print(f"Feature {idx}: {feature_importance[idx]:.4f}")

Random Forest Classifier Results:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
AUC Score: 1.0

Top 10 Important Features:
Feature 6: 0.3047
Feature 3: 0.1835
Feature 4: 0.1670
Feature 5: 0.1551
Feature 2: 0.0909
Feature 38: 0.0685
Feature 41: 0.0098
Feature 40: 0.0096
Feature 39: 0.0073
Feature 0: 0.0008


In [11]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Initialize XGBoost Classifier
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,           # Number of boosting rounds
    max_depth=6,                # Maximum tree depth
    learning_rate=0.1,          # Shrinkage (eta)
    subsample=0.8,              # Fraction of samples for fitting trees
    colsample_bytree=0.8,       # Fraction of features for fitting trees
    random_state=42,
    eval_metric='logloss',
    verbosity=0                 # Silent mode
)

# Train
xgb_clf.fit(X_train_scaled, y_train)

# Predictions
y_pred = xgb_clf.predict(X_test_scaled)
y_probs = xgb_clf.predict_proba(X_test_scaled)[:, 1]  # for AUC

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_probs)

print("XGBoost Classifier Results:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC Score:", auc)

# Feature Importance
feature_importance_xgb = xgb_clf.feature_importances_
print("\nTop 10 Important Features:")
top_indices_xgb = np.argsort(feature_importance_xgb)[-10:][::-1]
for idx in top_indices_xgb:
    print(f"Feature {idx}: {feature_importance_xgb[idx]:.4f}")

XGBoost Classifier Results:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
AUC Score: 1.0

Top 10 Important Features:
Feature 3: 0.3930
Feature 2: 0.3025
Feature 6: 0.2063
Feature 4: 0.0656
Feature 5: 0.0325
Feature 37: 0.0000
Feature 38: 0.0000
Feature 39: 0.0000
Feature 42: 0.0000
Feature 41: 0.0000
