# Breast Cancer Detection Project: Full Pipeline

This notebook combines all tasks:
- **Task 1**: Data Preprocessing & EDA
- **Task 2**: Baseline Models (SVM & MLP)
- **Task 3**: Hyperparameter Tuning & Cross-Validation


## Task 1: Data Preprocessing & EDA

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Read the raw CSV into the working DataFrame.
df = pd.read_csv('data.csv')

# First look at the data, printed one item per line in this order:
# dimensions, first rows, per-column missing-value counts, column dtypes.
print(df.shape, df.head(), df.isnull().sum(), df.dtypes, sep='\n')

# Drop unnecessary columns
# errors='ignore' keeps the cell runnable when the CSV lacks one of these
# columns (presumably 'Unnamed: 32' is an export artifact that is not always
# present -- TODO confirm against the data source). Re-assigning instead of
# mutating in place leaves `df` in the same final state.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# Encode labels
# Series.map turns every value missing from the mapping into NaN, so verify
# the result instead of letting NaN labels surface later as an obscure error.
diagnosis_codes = df['diagnosis'].map({'M': 1, 'B': 0})
if diagnosis_codes.isnull().any():
    unexpected_labels = df.loc[diagnosis_codes.isnull(), 'diagnosis'].unique()
    raise ValueError(f"Unexpected diagnosis labels (expected 'M' or 'B'): {unexpected_labels}")
df['diagnosis'] = diagnosis_codes

# Split features and target
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Standardize features
# Note: this scaler is fit on the full dataset, so X_scaled is computed from
# all rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# EDA
# The newline inside the label must be written as the escape '\n'; a raw line
# break inside a single-quoted string is a SyntaxError.
print('Class distribution:\n', y.value_counts())

# Pairwise correlations of every column left in df (including the encoded
# diagnosis column).
sns.heatmap(df.corr(), cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# One histogram per column of df.
df.hist(figsize=(20, 15))
plt.show()

# Verification
# The newline inside the label must be written as the escape '\n'; a raw line
# break inside a single-quoted string is a SyntaxError.
print('Missing values:\n', df.isnull().sum())
print('Feature shape:', X_scaled.shape)
# Mean and std are taken over the whole scaled array (all features pooled).
print('Scaled mean:', X_scaled.mean(), 'std:', X_scaled.std())

## Task 2: Baseline Models (SVM & MLP)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc

# Train/test split
# Split the *unscaled* features and fit the scaler on the training rows only.
# Splitting the already-scaled X_scaled would let test-set means/variances
# influence preprocessing (data leakage) and make test metrics optimistic.
# The stratification target, test_size and random_state are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# SVM Model
# Linear-kernel baseline. probability=True is needed because predict_proba is
# used for the ROC curve below.
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print('SVM Accuracy:', accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

# ROC Curve for SVM
# Column 1 of predict_proba is the score for class 1, used as the positive
# class by roc_curve.
y_prob_svm = svm_model.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_prob_svm)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label=f'SVM (AUC={roc_auc:.2f})')
plt.plot([0,1],[0,1],'--')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('SVM ROC Curve')
plt.legend()
plt.show()

# MLP Model
# Baseline network: three hidden layers of 500 ReLU units, Adam optimizer,
# at most 300 training iterations, fixed seed for repeatable weights.
mlp_model = MLPClassifier(hidden_layer_sizes=(500,500,500), activation='relu', solver='adam', max_iter=300, random_state=42)
mlp_model.fit(X_train, y_train)
y_pred_mlp = mlp_model.predict(X_test)
print('MLP Accuracy:', accuracy_score(y_test, y_pred_mlp))
print(classification_report(y_test, y_pred_mlp))

# ROC Curve for MLP
# Column 1 of predict_proba is the score for class 1, used as the positive
# class by roc_curve.
y_prob_mlp = mlp_model.predict_proba(X_test)[:,1]
fpr_mlp, tpr_mlp, _ = roc_curve(y_test, y_prob_mlp)
roc_auc_mlp = auc(fpr_mlp, tpr_mlp)
plt.plot(fpr_mlp, tpr_mlp, label=f'MLP (AUC={roc_auc_mlp:.2f})')
plt.plot([0,1],[0,1],'--')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('MLP ROC Curve')
plt.legend()
plt.show()

## Task 3: Hyperparameter Tuning & Cross-Validation

In [None]:
from sklearn.model_selection import GridSearchCV

# SVM Grid Search
# 5-fold cross-validated search over kernel, C and gamma, scored by accuracy.
# NOTE: gamma is not used by the linear kernel, so each linear candidate is
# evaluated once per gamma value with identical results.
svm_params = {'kernel': ['linear','rbf'], 'C':[0.1,1,10], 'gamma':['scale','auto']}
# random_state is fixed (matching the baseline SVC and the MLP search) so the
# probability estimates enabled by probability=True are reproducible.
svm_grid = GridSearchCV(SVC(probability=True, random_state=42), svm_params, cv=5, scoring='accuracy')
svm_grid.fit(X_train, y_train)
print('Best SVM Params:', svm_grid.best_params_)
print('Best SVM CV Score:', svm_grid.best_score_)

# MLP Grid Search
# Candidate settings: three layer layouts x two activations; solver and
# iteration cap are held fixed.
mlp_params = {
    'hidden_layer_sizes': [(100,), (500,), (500,500)],
    'activation': ['relu','tanh'],
    'solver': ['adam'],
    'max_iter': [300],
}

# 5-fold cross-validation, selecting the candidate with the best accuracy.
mlp_grid = GridSearchCV(
    estimator=MLPClassifier(random_state=42),
    param_grid=mlp_params,
    cv=5,
    scoring='accuracy',
)
mlp_grid.fit(X_train, y_train)

print('Best MLP Params:', mlp_grid.best_params_)
print('Best MLP CV Score:', mlp_grid.best_score_)

In [None]:
# Score the best estimator from each grid search on the held-out test split.
best_svm, best_mlp = svm_grid.best_estimator_, mlp_grid.best_estimator_

for heading, model in (('Tuned SVM Accuracy:', best_svm), ('Tuned MLP Accuracy:', best_mlp)):
    print(heading, accuracy_score(y_test, model.predict(X_test)))