In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_auc_score, roc_curve
)
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline


In [None]:
# Location of the tumor dataset that is read into `df` below.
# This must be a raw string literal (r"..."): in a regular literal the "\U" in
# "C:\Users" begins a \UXXXXXXXX unicode escape and the cell fails with a
# SyntaxError, and "\a" in "\ass" would silently turn into a BEL character.
# NOTE(review): absolute, machine-specific path with no ".csv" suffix —
# confirm the file name and consider a path relative to the notebook.
DATA_PATH = r"C:\Users\HP\Desktop\ass\Tumor detection"

# Load the dataset and show its dimensions plus the first rows.
df = pd.read_csv(DATA_PATH)
print("Shape:", df.shape)
df.head()


In [None]:

# Column dtypes and non-null counts.
df.info()

# Descriptive statistics, transposed so that every feature is one row.
summary_stats = df.describe().T
display(summary_stats)

# Number of NaN entries in each column.
missing_per_column = df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)


In [None]:


# --- Identifier column -------------------------------------------------------
# Any column whose lower-cased name equals 'id' is removed (covers 'ID', 'Id', ...).
if 'id' in df.columns.str.lower():
    id_cols = [name for name in df.columns if name.lower() == 'id']
    df = df.drop(id_cols, axis=1, errors='ignore')

# --- "Unnamed..." columns ----------------------------------------------------
# Columns whose name starts with "Unnamed" are treated as auto-index leftovers.
unnamed_cols = [name for name in df.columns if name.startswith("Unnamed")]
if len(unnamed_cols) > 0:
    df = df.drop(unnamed_cols, axis=1, errors='ignore')

# Report what is left after the column cleanup.
remaining_columns = df.columns.tolist()
print("Columns after cleanup:", remaining_columns)

# --- Duplicate rows ----------------------------------------------------------
# Count fully duplicated rows and remove them only when at least one exists.
dupes = df.duplicated().sum()
print("Duplicate rows:", dupes)
if dupes > 0:
    df = df.drop_duplicates()
    print("Dropped duplicates. New shape:", df.shape)

# --- Missing values ----------------------------------------------------------
# Total number of NaN cells remaining across the whole frame.
total_missing = df.isna().sum().sum()
print("Missing values now:", total_missing)


In [None]:

# The rest of the notebook needs the target column, so fail fast without it.
if 'diagnosis' not in df.columns:
    raise ValueError("Column 'diagnosis' not found in dataset.")

# Tabulate the class frequencies of the target.
print("\nDiagnosis value counts:")
diagnosis_counts = df['diagnosis'].value_counts()
display(diagnosis_counts)

# Bar chart of the same class frequencies.
plt.figure(figsize=(6,4))
sns.countplot(data=df, x='diagnosis')
plt.title("Diagnosis Distribution (B = Benign, M = Malignant)")
plt.show()


In [None]:
# Collect the names of every numeric column and preview the first ten.
numeric_frame = df.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns.tolist()
print("Numeric columns sample:", numeric_cols[:10])

# Histogram + KDE for the first six numeric features on a 2x3 grid.
# Panels are created one by one, so fewer than six features leaves no empty axes.
fig = plt.figure(figsize=(14,8))
for position, column in enumerate(numeric_cols[:6], start=1):
    ax = fig.add_subplot(2, 3, position)
    sns.histplot(df[column], kde=True, ax=ax)
    ax.set_title(column)
fig.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between all numeric columns.
numeric_only = df.select_dtypes(include=[np.number])
corr = numeric_only.corr()

# Diverging colour map centred on zero, so the sign of a correlation is
# readable from the colour alone.
fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(corr, cmap='coolwarm', center=0, fmt=".2f", ax=ax)
ax.set_title("Correlation matrix (numeric features)")
plt.show()


In [None]:

# Encode the string diagnosis labels as integers and keep them as a new column.
# For the labels named in the distribution plot this gives B->0, M->1.
le = LabelEncoder()
encoded_labels = le.fit_transform(df['diagnosis'])
df['diagnosis_enc'] = encoded_labels

# Absolute correlation of every numeric column with the encoded target,
# strongest first. The target itself is included in this ranking.
numeric_corr = df.select_dtypes(include=[np.number]).corr()
corr_with_target = (
    numeric_corr['diagnosis_enc']
    .abs()
    .sort_values(ascending=False)
)
display(corr_with_target.head(15))


In [None]:


# Target vector: the integer-encoded diagnosis.
y = df['diagnosis_enc']

# Feature matrix: everything except the raw and encoded target, restricted to
# numeric columns (any non-numeric column is discarded).
feature_frame = df.drop(['diagnosis', 'diagnosis_enc'], axis=1, errors='ignore')
X = feature_frame.select_dtypes(include=[np.number])

# Sanity output: matrix dimensions and number of samples per class.
print("Feature shape:", X.shape)
class_counts = np.bincount(y)
print("Target distribution:", class_counts)


In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes it repeatable.
split_parts = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_test, y_train, y_test = split_parts

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# Standardise the features. The scaler is fitted on the training rows only and
# then applied to both parts.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:

# Random forest with 200 trees, a fixed seed and balanced class weights.
rf = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=200,
    random_state=42,
)

# 5-fold cross-validated accuracy on the training portion as a baseline.
cv_scores = cross_val_score(
    rf, X_train_scaled, y_train, scoring='accuracy', cv=5
)
print("CV accuracy scores:", np.round(cv_scores,4))
mean_cv_accuracy = cv_scores.mean()
print("CV mean accuracy:", np.round(mean_cv_accuracy,4))

# Train the final model on the complete training portion.
rf.fit(X_train_scaled, y_train)

# Hard predictions and the probability of class 1 for the held-out rows.
y_pred = rf.predict(X_test_scaled)
class_probabilities = rf.predict_proba(X_test_scaled)
y_proba = class_probabilities[:, 1]


In [None]:

# --- Headline metric ---------------------------------------------------------
acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", round(acc,4))

# --- Per-class precision / recall / F1 ---------------------------------------
report_text = classification_report(y_test, y_pred, target_names=le.classes_)
print("\nClassification Report:")
print(report_text)

# --- Confusion matrix --------------------------------------------------------
# Rows are the actual classes, columns the predicted ones; cells hold counts.
cm = confusion_matrix(y_test, y_pred)
fig_cm, ax_cm = plt.subplots(figsize=(5,4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax_cm,
)
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("Actual")
ax_cm.set_title("Confusion Matrix")
plt.show()

# --- ROC curve and AUC -------------------------------------------------------
# Both are computed from the predicted probability of class 1.
auc = roc_auc_score(y_test, y_proba)
fpr, tpr, _ = roc_curve(y_test, y_proba)
fig_roc, ax_roc = plt.subplots(figsize=(6,5))
ax_roc.plot(fpr, tpr, label=f'ROC curve (AUC = {auc:.3f})')
ax_roc.plot([0,1],[0,1],'--')  # diagonal reference line
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("ROC Curve")
ax_roc.legend()
plt.show()


In [None]:

# Pair each importance score of the fitted forest with its feature name and
# rank the features from most to least important.
importances = rf.feature_importances_
feat_imp = pd.Series(importances, index=X.columns)
feat_imp = feat_imp.sort_values(ascending=False)
display(feat_imp.head(20))

# Horizontal bar chart of the twenty highest-ranked features.
top_features = feat_imp.iloc[:20]
fig, ax = plt.subplots(figsize=(8,10))
sns.barplot(x=top_features, y=top_features.index, ax=ax)
ax.set_title("Top 20 Feature Importances (Random Forest)")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
fig.tight_layout()
plt.show()


In [None]:

# Recap of the two headline metrics computed in the evaluation cell.
print("Summary:")
for summary_line in (f"- Test Accuracy: {acc:.4f}", f"- ROC AUC: {auc:.4f}"):
    print(summary_line)

# Free-text list of follow-up ideas, printed verbatim.
next_steps = """
Next steps / possible improvements:
1. Try additional models: Logistic Regression, SVM, XGBoost.
2. Perform hyperparameter tuning (GridSearchCV / RandomizedSearchCV).
3. Use feature selection (e.g., SelectFromModel) to reduce dimensionality.
4. Address class imbalance carefully (if present) using SMOTE or class weights (we used class_weight='balanced').
5. Consider cross-validation with stratified folds for robust estimates.
"""
print(next_steps)
