Preprocesses the Wisconsin Breast Cancer dataset: inspection, label encoding, train/test split, and feature scaling

## 1. Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import resample
import warnings
# Silence every warning category so library notices do not clutter cell output
warnings.filterwarnings(action='ignore')

# Seed NumPy's global random number generator so repeated runs are reproducible
np.random.seed(seed=42)

ModuleNotFoundError: No module named 'pandas'

## 2. Load Data

In [None]:
# Column layout of the header-less file: an identifier, the diagnosis label, and
# then ten measurements, each present in a `mean`, `se` and `worst` variant.
base_measurements = [
    "radius", "texture", "perimeter", "area", "smoothness",
    "compactness", "concavity", "concave_points", "symmetry",
    "fractal_dimension",
]
column_names = ["id", "diagnosis"] + [
    f"{measurement}_{statistic}"
    for statistic in ("mean", "se", "worst")
    for measurement in base_measurements
]

# Read the raw file (no header row) and attach the column names afterwards
data = pd.read_csv("wdbc.data", header=None)
data.columns = column_names

print("Dataset shape:", data.shape)
print("\nFirst few rows:")
data.head()

## 3. Data Inspection and Cleaning

In [None]:
# Count nulls once per column, then reuse that result for the overall total
missing_per_column = data.isnull().sum()

print("Missing values per column:")
print(missing_per_column)
print("\nTotal missing values:", missing_per_column.sum())

In [None]:
# List the dtype of every column, then display summary statistics as the cell result
print("Data types:", data.dtypes, sep="\n")
print("\nBasic statistics:")
data.describe()

In [None]:
# Fully duplicated rows
duplicate_count = data.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Target distribution, first as absolute counts and then as percentages
diagnosis_labels = data['diagnosis']
print("\nDiagnosis distribution:")
print(diagnosis_labels.value_counts())
print("\nDiagnosis percentages:")
print(diagnosis_labels.value_counts(normalize=True) * 100)

In [None]:
# Bar chart showing how many samples carry each diagnosis label
plt.figure(figsize=(8, 5))
count_ax = sns.countplot(data=data, x='diagnosis', palette='Set2')
count_ax.set_title('Distribution of Diagnosis (M=Malignant, B=Benign)', fontsize=14, fontweight='bold')
count_ax.set_xlabel('Diagnosis')
count_ax.set_ylabel('Count')
plt.show()

# Size of the larger class relative to the smaller one
class_counts = data['diagnosis'].value_counts()
imbalance_ratio = class_counts.max() / class_counts.min()
print(f"\nClass imbalance ratio: {imbalance_ratio:.2f}:1")

# Ratios above 1.5 are reported as a moderate imbalance
imbalance_message = (
    "Moderate class imbalance detected - consider using class weights or resampling"
    if imbalance_ratio > 1.5
    else "Classes are relatively balanced"
)
print(imbalance_message)

## 4. Feature Engineering and Selection

In [None]:
# The identifier column is not useful for prediction, so leave it out
data_clean = data.drop(columns='id')

# Turn the diagnosis letters into integer codes.
# Intended mapping: B (Benign) -> 0, M (Malignant) -> 1; the loop below prints
# the mapping the encoder actually produced so it can be checked.
label_encoder = LabelEncoder()
data_clean['diagnosis_encoded'] = label_encoder.fit_transform(data_clean['diagnosis'])

print("Encoding mapping:")
for code, label in enumerate(label_encoder.classes_):
    print(f"  {label} -> {code}")

# Feature matrix: everything except the two diagnosis columns; target: the codes
target_columns = ['diagnosis', 'diagnosis_encoded']
X = data_clean.drop(columns=target_columns)
y = data_clean['diagnosis_encoded']

print(f"\nFeatures shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature names: {list(X.columns)}")

In [None]:
# Pairwise correlations between all features (multicollinearity check)
correlation_matrix = X.corr()

# Walk the upper triangle only, so each unordered pair is examined exactly once,
# and keep the pairs whose absolute correlation exceeds 0.9.
corr_columns = correlation_matrix.columns
n_features = len(corr_columns)
high_corr_pairs = [
    (corr_columns[a], corr_columns[b], correlation_matrix.iloc[a, b])
    for a in range(n_features)
    for b in range(a + 1, n_features)
    if abs(correlation_matrix.iloc[a, b]) > 0.9
]

print(f"Number of highly correlated feature pairs (|r| > 0.9): {len(high_corr_pairs)}")
if high_corr_pairs:
    # Strongest relationships first, regardless of sign
    ranked_pairs = sorted(high_corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)
    print("\nTop 10 highly correlated pairs:")
    for left_feature, right_feature, r_value in ranked_pairs[:10]:
        print(f"  {left_feature} <-> {right_feature}: {r_value:.3f}")

In [None]:
# Correlation of every feature with the encoded diagnosis, largest value first
correlations_with_target = X.corrwith(y).sort_values(ascending=False)

# Horizontal bar chart with one bar per feature
plt.figure(figsize=(10, 12))
corr_ax = correlations_with_target.plot(kind='barh', color='steelblue')
corr_ax.set_title('Feature Correlation with Diagnosis', fontsize=14, fontweight='bold')
corr_ax.set_xlabel('Correlation Coefficient')
corr_ax.set_ylabel('Features')
plt.tight_layout()
plt.show()

print("\nTop 10 features most correlated with malignancy:")
print(correlations_with_target.head(10))

## 5. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for heading, features in (("Training set size:", X_train), ("Testing set size:", X_test)):
    print(heading, features.shape)

for heading, target in (
    ("\nTraining set class distribution:", y_train),
    ("\nTesting set class distribution:", y_test),
):
    print(heading)
    print(target.value_counts())

## 6. Feature Scaling/Normalization

In [None]:
# Standardise the features. The scaler is fitted on the training rows only and
# then applied unchanged to the test rows, so no test information leaks in.
scaler = StandardScaler()
train_scaled_values = scaler.fit_transform(X_train)
test_scaled_values = scaler.transform(X_test)

# Wrap the resulting arrays in DataFrames that keep the original column names
# and row indices for easier handling later on.
X_train_scaled = pd.DataFrame(train_scaled_values, columns=X.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(test_scaled_values, columns=X.columns, index=X_test.index)

print("Scaled training data shape:", X_train_scaled.shape)
print("Scaled testing data shape:", X_test_scaled.shape)
print("\nSample of scaled training data:")
print(X_train_scaled.head())
print("\nScaled data statistics:")
print(X_train_scaled.describe())

In [None]:
# Visualize the effect of scaling on a few features.
#
# Original and scaled values live on very different ranges, so drawing both
# histograms on one shared x-axis squeezes one of them into a sliver. Each
# feature therefore gets its own row with two panels (original on the left,
# scaled on the right), each panel with an independent x-axis.
features_to_plot = ['radius_mean', 'texture_mean', 'perimeter_mean',
                    'area_mean', 'smoothness_mean', 'compactness_mean']

# squeeze=False keeps `axes` two-dimensional even if only one feature is listed
fig, axes = plt.subplots(
    len(features_to_plot), 2,
    figsize=(12, 3 * len(features_to_plot)),
    squeeze=False,
)

for (original_ax, scaled_ax), feature in zip(axes, features_to_plot):
    # Before scaling
    original_ax.hist(X_train[feature], bins=30, alpha=0.5, label='Original', color='blue')
    original_ax.set_title(f'{feature} (Original)')
    original_ax.set_ylabel('Frequency')
    original_ax.legend()

    # After scaling
    scaled_ax.hist(X_train_scaled[feature], bins=30, alpha=0.5, label='Scaled', color='red')
    scaled_ax.set_title(f'{feature} (Scaled)')
    scaled_ax.set_ylabel('Frequency')
    scaled_ax.legend()

plt.suptitle('Before and After Feature Scaling', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

## 7. Save Preprocessed Data

In [None]:
# Save preprocessed data for use in other notebooks
import pickle

# Bundle the unscaled and scaled splits together with the fitted scaler and
# label encoder, so downstream notebooks reuse exactly the same split and
# transformations instead of recomputing them.
preprocessed_data = {
    'X_train': X_train,
    'X_test': X_test,
    'X_train_scaled': X_train_scaled,
    'X_test_scaled': X_test_scaled,
    'y_train': y_train,
    'y_test': y_test,
    'scaler': scaler,
    'label_encoder': label_encoder,
    'feature_names': list(X.columns)
}

# Save to pickle file.
# NOTE: unpickling can execute arbitrary code, so only load
# 'preprocessed_data.pkl' when it comes from a source you trust.
with open('preprocessed_data.pkl', 'wb') as f:
    pickle.dump(preprocessed_data, f)

# The check mark was previously stored as mis-decoded bytes; it is restored here.
print("✓ Preprocessed data saved to 'preprocessed_data.pkl'")
print("\nYou can load this data in other notebooks using:")
print("""\nimport pickle
with open('preprocessed_data.pkl', 'rb') as f:
    data = pickle.load(f)
X_train_scaled = data['X_train_scaled']
y_train = data['y_train']
# etc...""")

## 8. Data Summary Report

In [None]:
# Values that appear more than once in the report are computed up front
banner = "=" * 60
n_targets = len(y)
n_benign = (y == 0).sum()
n_malignant = (y == 1).sum()
n_rows = len(X)
n_train = len(X_train)
n_test = len(X_test)
n_features = X.shape[1]

print(banner)
print("DATA PREPROCESSING SUMMARY")
print(banner)

# Raw dataset facts
print("\n Original Dataset:")
print(f"   - Total samples: {len(data)}")
print(f"   - Number of features: {n_features}")
print(f"   - Missing values: {data.isnull().sum().sum()}")
print(f"   - Duplicate rows: {data.duplicated().sum()}")

# Class balance of the encoded target
print("\n Target Variable:")
print(f"   - Benign (B/0): {n_benign} ({n_benign/n_targets*100:.1f}%)")
print(f"   - Malignant (M/1): {n_malignant} ({n_malignant/n_targets*100:.1f}%)")
print(f"   - Class imbalance ratio: {imbalance_ratio:.2f}:1")

# Sizes of the two splits relative to the full feature matrix
print("\n Train-Test Split:")
print(f"   - Training samples: {n_train} ({n_train/n_rows*100:.1f}%)")
print(f"   - Testing samples: {n_test} ({n_test/n_rows*100:.1f}%)")
print("   - Stratified: Yes")

# Transformations applied in this notebook
print("\n Feature Engineering:")
print("   - Scaling method: StandardScaler (z-score normalization)")
print(f"   - Highly correlated pairs (|r|>0.9): {len(high_corr_pairs)}")
print("   - Features removed: 1 (id)")
print(f"   - Final feature count: {n_features}")

print("\n Output Files:")
print("   - preprocessed_data.pkl")

print("\n Data is ready for modeling!")
print(banner)