In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import seaborn as sns

# Read the survey CSV into a DataFrame
data = pd.read_csv('E:/Research_Work/LungCancer/survey lung cancer.csv')

# Quick sanity check: preview the leading rows and list the column labels
print(data.head())
print(data.columns)

# Header of the label column; adjust if the CSV names it differently
target_variable = 'LUNG_CANCER'

# Stop early with a descriptive error when the label column is absent
if not (target_variable in data.columns):
    raise ValueError(f"Target variable '{target_variable}' not found in the dataset columns: {data.columns}")

# Separate the label column from the predictor columns
y = data[target_variable]  # Target variable
X = data.drop(columns=target_variable)  # Features

# Identify categorical columns (those stored with the generic object dtype)
categorical_cols = X.select_dtypes(include=['object']).columns

# Preprocessing for categorical data: replace missing entries with the
# placeholder category 'missing', then one-hot encode. With
# handle_unknown='ignore', a category not seen during fit is encoded as all
# zeros instead of raising at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Run the categorical pipeline on the categorical columns and pass every
# other column through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='passthrough',
)

# Split the RAW features first, so the held-out rows cannot influence what
# the preprocessor learns (fitting on the full dataset leaks test information
# into training). test_size and random_state are unchanged.
# NOTE(review): if the target classes are imbalanced, consider stratify=y.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessing on the training rows only, then reuse the fitted
# transformer on the test rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full feature matrix encoded with the train-fitted transformer; retained so
# any later code that refers to X_preprocessed keeps working.
X_preprocessed = preprocessor.transform(X)

# Candidate models; the two ensemble models receive a fixed random_state
rf_classifier = RandomForestClassifier(random_state=42)
knn_classifier = KNeighborsClassifier()
gbm_classifier = GradientBoostingClassifier(random_state=42)

# (display name, estimator) pairs, in the order they are evaluated
classifiers = list(zip(
    ('Random Forest', 'K-Nearest Neighbors', 'Gradient Boosting'),
    (rf_classifier, knn_classifier, gbm_classifier),
))

# One empty score list per reported metric; each evaluated model appends one
# value to every list
metrics = {
    metric_name: []
    for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1-score')
}

# Train and evaluate each classifier.
#
# For every (name, estimator) pair this fits the model on the training split,
# scores it on the test split, appends the scores to `metrics`, and writes two
# figures to the working directory: '<name>_ROC.png' and
# '<name>_Confusion_Matrix.png'.

# Target labels as used in the scoring calls below. POSITIVE_LABEL is the
# class that precision, recall, F1 and the ROC curve are computed for.
POSITIVE_LABEL = 'YES'
NEGATIVE_LABEL = 'NO'
label_order = [POSITIVE_LABEL, NEGATIVE_LABEL]

# 0/1 ground truth with the positive class made explicit, so the AUC is
# computed for the same class as the ROC curve instead of relying on how the
# string labels happen to sort.
y_test_is_positive = (y_test == POSITIVE_LABEL).astype(int)

for name, classifier in classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # Calculate and store evaluation metrics
    metrics['Accuracy'].append(accuracy_score(y_test, y_pred))
    metrics['Precision'].append(
        precision_score(y_test, y_pred, pos_label=POSITIVE_LABEL))
    metrics['Recall'].append(
        recall_score(y_test, y_pred, pos_label=POSITIVE_LABEL))
    metrics['F1-score'].append(
        f1_score(y_test, y_pred, pos_label=POSITIVE_LABEL))

    # Columns of predict_proba follow classifier.classes_, so look up the
    # positive class's column rather than assuming it is column 1. The
    # probabilities are computed once and reused for both curve and AUC.
    positive_col = list(classifier.classes_).index(POSITIVE_LABEL)
    positive_scores = classifier.predict_proba(X_test)[:, positive_col]

    # Plot ROC curve and save the figure
    fpr, tpr, _ = roc_curve(y_test, positive_scores, pos_label=POSITIVE_LABEL)
    roc_auc = roc_auc_score(y_test_is_positive, positive_scores)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2,
             label=f'ROC curve (area = {roc_auc:0.2f})')
    # Diagonal reference line drawn from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve - ' + name)
    plt.legend(loc="lower right")
    plt.savefig(name + '_ROC.png')
    plt.close()

    # Compute the confusion matrix (rows = actual, columns = predicted, both
    # ordered positive-then-negative) and save it as a heatmap
    cm = confusion_matrix(y_test, y_pred, labels=label_order)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_order, yticklabels=label_order)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(name + '_Confusion_Matrix.png')
    plt.close()

# Print evaluation metrics as a table with one row per classifier
print("Evaluation Metrics:")
print(pd.DataFrame(metrics, index=[name for name, _ in classifiers]))


  GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0      M   69        1               2        2              1   
1      M   74        2               1        1              1   
2      F   59        1               1        1              2   
3      M   63        2               2        2              1   
4      F   63        1               2        1              1   

   CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  COUGHING  \
0                1         2         1         2                  2         2   
1                2         2         2         1                  1         1   
2                1         2         1         2                  1         2   
3                1         1         1         1                  2         1   
4                1         1         1         2                  1         2   

   SHORTNESS OF BREATH  SWALLOWING DIFFICULTY  CHEST PAIN LUNG_CANCER  
0                    2                      