# Data Science Environment Setup & Library Introduction

This notebook guides you through setting up a Python virtual environment and introduces key data science libraries.

**Note:** Run these commands in your terminal (not in Jupyter):

```bash
python -m venv .venv

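# Activate the environment -- run ONE of the following, depending on your OS.
# macOS / Linux:
source .venv/bin/activate

# Windows (Command Prompt):
.venv\Scripts\activate.bat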

pip install numpy pandas matplotlib seaborn scikit-learn jupyter kaggle

jupyter notebook
```

If you install the Jupyter extension in VS Code, you don't need that last command (`jupyter notebook`).

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.decomposition import PCA

# Reproducibility: fix NumPy's global random seed
np.random.seed(42)

# Display: never truncate the column list when showing a DataFrame
pd.options.display.max_columns = None

# Plot look-and-feel (the style is applied first, then the seaborn palette)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Silence every warning from this point on
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

In [None]:
# Read the raw file; header=None means no row is treated as a header,
# so the columns start out numbered and are renamed below.
data = pd.read_csv('wdbc.data', header=None)

# Ten base measurements; each one is reported with three suffixes.
feature_names = [
    'radius',
    'texture',
    'perimeter',
    'area',
    'smoothness',
    'compactness',
    'concavity',
    'concave_points',
    'symmetry',
    'fractal_dimension',
]

# Column layout: id, diagnosis, then every *_mean, every *_se, every *_worst.
column_names = ['id', 'diagnosis']
for suffix in ('mean', 'se', 'worst'):
    column_names.extend(f'{feat}_{suffix}' for feat in feature_names)

data.columns = column_names

print(f"Dataset shape: {data.shape}")
print("\nFirst few rows:")
data.head()

In [None]:
# Keep the identifier / label pair available as its own frame.
diagnosis_df = data.loc[:, ['id', 'diagnosis']]

# Encode the diagnosis letter as an integer label: 0 = Malignant, 1 = Benign
label_codes = {"M": 0, "B": 1}
y = data['diagnosis'].map(label_codes).to_numpy()

# Everything after the first two columns (id, diagnosis) is a feature.
X = data.iloc[:, 2:].to_numpy()

print(f"Feature matrix shape: {X.shape}")
print(f"Labels shape: {y.shape}")
print("\nLabel distribution:")
for class_name, code in (("Malignant", 0), ("Benign", 1)):
    class_count = np.sum(y == code)
    print(f"{class_name} ({code}): {class_count} ({class_count / len(y) * 100:.2f}%)")

In [None]:
# Work on a copy so the loaded frame `data` stays untouched.
df = data.copy()

# Column dtypes and non-null counts
print("Dataset Information:")
df.info()

# Total number of missing cells across the whole frame
print("Missing values:")
total_missing = df.isnull().sum().sum()
print(total_missing)

# Summary statistics, displayed as the cell's output
print("\nDescriptive Statistics:")
df.describe()

In [None]:
# this is where we clean the data


In [None]:
# Visualize class distribution
#
# value_counts() sorts by frequency (largest first), not by label. The bar and
# pie labels below are hard-coded as Malignant-then-Benign, so the counts must
# be put in that same order explicitly; otherwise the majority class is drawn
# under the 'Malignant' label whenever benign cases outnumber malignant ones.
class_codes = ['M', 'B']
class_labels = ['Malignant', 'Benign']
colors = ['#ff6b6b', '#4ecdc4']

# Reindex so counts, labels and colors all follow the same M-then-B order;
# fill_value=0 keeps a class that is absent from the data as an explicit zero.
diagnosis_counts = df['diagnosis'].value_counts().reindex(class_codes, fill_value=0)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
axes[0].bar(class_labels, diagnosis_counts.values, color=colors)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].set_title('Distribution of Diagnosis', fontsize=14, fontweight='bold')
for i, v in enumerate(diagnosis_counts.values):
    # Write each count just above its bar.
    axes[0].text(i, v + 10, str(v), ha='center', fontsize=11, fontweight='bold')

# Pie chart (same class order and colors as the bar chart)
axes[1].pie(diagnosis_counts.values, labels=class_labels,
            autopct='%1.1f%%', colors=colors, startangle=90,
            textprops={'fontsize': 12, 'fontweight': 'bold'})
axes[1].set_title('Diagnosis Proportion', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Analyze mean features
mean_features = [name for name in df.columns if '_mean' in name]

# Distribution plots for mean features: a 5x2 grid, flattened so panels can
# be addressed by a single index.
fig, axes = plt.subplots(5, 2, figsize=(15, 18))
axes = axes.ravel()

# (diagnosis code, legend label, color) for each class, drawn in this order
class_styles = (
    ('M', 'Malignant', '#ff6b6b'),
    ('B', 'Benign', '#4ecdc4'),
)

for idx, col in enumerate(mean_features):
    ax = axes[idx]

    # Overlay one histogram per diagnosis on the same panel
    for code, class_name, shade in class_styles:
        class_values = df.loc[df['diagnosis'] == code, col]
        ax.hist(class_values, bins=30, alpha=0.6, label=class_name, color=shade)

    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.legend()
    ax.set_title(f'Distribution of {col}', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Box plots for mean features
# One color per diagnosis code, shared by every panel
diagnosis_palette = {'M': '#ff6b6b', 'B': '#4ecdc4'}

fig, axes = plt.subplots(5, 2, figsize=(15, 18))
axes = axes.ravel()

for idx, col in enumerate(mean_features):
    ax = axes[idx]
    # One box per diagnosis for the current feature
    sns.boxplot(x='diagnosis', y=col, data=df, palette=diagnosis_palette, ax=ax)
    ax.set_title(f'Box Plot: {col}', fontsize=11, fontweight='bold')
    ax.set_xlabel('Diagnosis', fontsize=10)
    ax.set_ylabel(col, fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix for mean features
mean_df = df.loc[:, mean_features]
correlation_matrix = mean_df.corr()

# Visualize correlation matrix
heatmap_options = {
    'annot': True,          # write the coefficient inside each cell
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'square': True,
    'linewidths': 0.5,
    'cbar_kws': {"shrink": 0.8},
}
title_options = {'fontsize': 16, 'fontweight': 'bold', 'pad': 20}

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix - Mean Features', **title_options)
plt.tight_layout()
plt.show()

In [None]:
# Find highly correlated feature pairs
threshold = 0.85

corr_columns = correlation_matrix.columns
n_columns = len(corr_columns)

# Walk the strict upper triangle (row < col) so each unordered pair is
# visited once and the diagonal is skipped.
high_corr_pairs = [
    (corr_columns[row], corr_columns[col], correlation_matrix.iloc[row, col])
    for row in range(n_columns)
    for col in range(row + 1, n_columns)
    if abs(correlation_matrix.iloc[row, col]) > threshold
]

print(f"\nHighly correlated feature pairs (|correlation| > {threshold}):")
for feat1, feat2, corr in high_corr_pairs:
    print(f"{feat1} <-> {feat2}: {corr:.3f}")

In [None]:
# Select key features for pair plot
key_features = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
]
# Keep the diagnosis column alongside the features so it can drive the hue.
pair_df = df[[*key_features, 'diagnosis']].copy()

# Create pair plot: scatter panels off the diagonal, KDE curves on it
pairplot_palette = {'M': '#ff6b6b', 'B': '#4ecdc4'}
sns.pairplot(
    pair_df,
    hue='diagnosis',
    palette=pairplot_palette,
    diag_kind='kde',
    plot_kws={'alpha': 0.6},
    height=2.5,
)
plt.suptitle('Pair Plot - Key Features', y=1.02, fontsize=16, fontweight='bold')
plt.show()

In [None]:
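# Scale features
# TODO: this cell is still empty. The next cell reads `variance_per_feature`
# and `variance_df` (with 'Feature' and 'Variance' columns), so they need to
# be defined here; otherwise Restart & Run All stops with a NameError.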


In [None]:
# Visualize variance distribution
# NOTE(review): `variance_per_feature` and `variance_df` are not defined in any
# cell shown above this one -- confirm they are created before this cell runs.
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
all_features_ax, top_features_ax = axes

# Bar plot: one bar per feature, positioned by feature index
feature_positions = range(len(variance_per_feature))
all_features_ax.bar(feature_positions, variance_per_feature, color='steelblue')
all_features_ax.set_xlabel('Feature Index', fontsize=12)
all_features_ax.set_ylabel('Variance', fontsize=12)
all_features_ax.set_title('Variance per Feature (Scaled & Centered)', fontsize=14, fontweight='bold')
all_features_ax.grid(axis='y', alpha=0.3)

# Top 15 features
# Takes the first 15 rows of variance_df; presumably it is already sorted by
# variance, largest first -- verify where variance_df is built.
top_15 = variance_df.head(15)
row_positions = range(len(top_15))
top_features_ax.barh(row_positions, top_15['Variance'].values, color='coral')
top_features_ax.set_yticks(row_positions)
top_features_ax.set_yticklabels(top_15['Feature'].values)
top_features_ax.set_xlabel('Variance', fontsize=12)
top_features_ax.set_title('Top 15 Features by Variance', fontsize=14, fontweight='bold')
top_features_ax.invert_yaxis()  # first row of top_15 ends up at the top
top_features_ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
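# Principal Component Analysis (PCA)
# TODO: this cell is still empty. The cells below read `explained_variance_ratio`,
# `cumulative_variance` and `X_pca`, so they need to be computed here;
# otherwise Restart & Run All stops with a NameError.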

In [None]:
# Visualize PCA results
# NOTE(review): `explained_variance_ratio` and `cumulative_variance` are not
# defined in any cell shown above this one -- confirm they exist before running.
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
scree_ax, cumulative_ax = axes

# Options shared by both line plots
line_options = {'linewidth': 2, 'markersize': 8}

# Scree plot (components numbered from 1)
scree_x = range(1, len(explained_variance_ratio) + 1)
scree_ax.plot(scree_x, explained_variance_ratio, 'bo-', **line_options)
scree_ax.set_xlabel('Principal Component', fontsize=12)
scree_ax.set_ylabel('Explained Variance Ratio', fontsize=12)
scree_ax.set_title('Scree Plot', fontsize=14, fontweight='bold')
scree_ax.grid(alpha=0.3)

# Cumulative variance, with a horizontal reference line at 0.95
cumulative_x = range(1, len(cumulative_variance) + 1)
cumulative_ax.plot(cumulative_x, cumulative_variance, 'ro-', **line_options)
cumulative_ax.axhline(y=0.95, color='g', linestyle='--', label='95% variance')
cumulative_ax.set_xlabel('Number of Components', fontsize=12)
cumulative_ax.set_ylabel('Cumulative Explained Variance', fontsize=12)
cumulative_ax.set_title('Cumulative Explained Variance', fontsize=14, fontweight='bold')
cumulative_ax.legend()
cumulative_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Visualize first 2 principal components
# NOTE(review): `X_pca` is not defined in any cell shown above this one --
# confirm it exists before running.
from matplotlib.patches import Patch

malignant_color = '#ff6b6b'
benign_color = '#4ecdc4'

plt.figure(figsize=(10, 8))

# One color per sample: label 0 gets the malignant color, anything else the benign one
colors = [malignant_color if label == 0 else benign_color for label in y]
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=colors, alpha=0.6, s=50)
plt.xlabel('First Principal Component', fontsize=12)
plt.ylabel('Second Principal Component', fontsize=12)
plt.title('PCA: First Two Principal Components', fontsize=14, fontweight='bold')

# Create custom legend: the points are colored individually, so the legend
# entries are built by hand from colored patches.
legend_elements = [
    Patch(facecolor=shade, label=class_name)
    for shade, class_name in ((malignant_color, 'Malignant'), (benign_color, 'Benign'))
]
plt.legend(handles=legend_elements, fontsize=11)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()