# EDA - Breast Cancer Wisconsin (Diagnostic)

Notebook de trabajo para TB1: Comprensión de datos.


In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset from its CSV file into a DataFrame
csv_path = r"/mnt/data/breast_cancer_project/data/breast_cancer_wisconsin.csv"
df = pd.read_csv(csv_path)
# Preview the first rows (shown as the cell output in a notebook)
df.head()


## Vista rápida del dataset

In [None]:
# Report the (rows, columns) dimensions of the DataFrame
dataset_shape = df.shape
print('Shape:', dataset_shape)

# Report the dtype of every column, under its own header line
column_dtypes = df.dtypes
print('\nDtypes:')
print(column_dtypes)

## Estadísticas descriptivas

In [None]:
# Descriptive statistics, transposed so each described column becomes a row
summary_stats = df.describe()
summary_stats.T

### Distribución de la variable objetivo (Diagnosis; columna `target`: 0 = maligno, 1 = benigno)

In [None]:
# Number of rows for each distinct value of the 'target' column
target_column = df['target']
target_column.value_counts()

## Valores faltantes y duplicados

In [None]:
# Count of missing values in each column
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Count of rows flagged by DataFrame.duplicated()
duplicate_rows = df.duplicated().sum()
print('Duplicados:', duplicate_rows)

## Detección de outliers (IQR) para: 'mean radius', 'mean area', 'mean concavity'

In [None]:
def detect_outliers_iqr(series, factor=1.5):
    """Count the values of *series* lying outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``IQR = Q3 - Q1``. Values exactly on a fence are not counted.

    Args:
        series: pandas Series of numeric values.
        factor: Multiplier applied to the IQR to place the fences. The
            default of 1.5 reproduces the previously hard-coded rule.

    Returns:
        A ``(count, lower, upper)`` tuple: the number of values strictly
        outside the fences, the lower fence and the upper fence.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    # Distance from each quartile to its fence
    spread = factor * (q3 - q1)
    lower = q1 - spread
    upper = q3 + spread
    # Strict comparisons: a value equal to a fence is not an outlier
    outside = (series < lower) | (series > upper)
    return outside.sum(), lower, upper

# Columns checked for outliers with the IQR rule
iqr_columns = ('mean radius', 'mean area', 'mean concavity')

for col in iqr_columns:
    # The helper returns (outlier count, lower fence, upper fence)
    result = detect_outliers_iqr(df[col])
    count, lower, upper = result
    print(f"{col}: outliers={count}, lower={lower:.3f}, upper={upper:.3f}")


## Visualizaciones principales

In [None]:
# 1) Histogram of mean radius, one overlaid series per target value
plt.figure()
# Plot target 0 first, then target 1, so the legend entries below line up
for target_value in (0, 1):
    is_target = df['target'] == target_value
    df.loc[is_target, 'mean radius'].hist(bins=20, alpha=0.7)
plt.xlabel('mean radius')
plt.ylabel('Frecuencia')
plt.title('Histograma de mean radius por diagnóstico (0=maligno,1=benigno)')
plt.legend(['0 (maligno)','1 (benigno)'])
plt.tight_layout()
# Save the figure to disk before showing it
hist_path = r"/mnt/data/breast_cancer_project/figures/hist_mean_radius.png"
plt.savefig(hist_path)
plt.show()

# 2) Scatter of mean perimeter vs mean area, one series per target value
plt.figure()
target_values = df['target'].unique()
for target_value in target_values:
    rows = df.loc[df['target'] == target_value]
    plt.scatter(
        rows['mean perimeter'],
        rows['mean area'],
        s=10,
        label=str(target_value),
    )
plt.title('mean perimeter vs mean area por diagnóstico')
plt.xlabel('mean perimeter')
plt.ylabel('mean area')
plt.legend(title='target')
plt.tight_layout()
# Save the figure to disk before showing it
scatter_path = r"/mnt/data/breast_cancer_project/figures/scatter_perimeter_area.png"
plt.savefig(scatter_path)
plt.show()

# 3) Boxplot of mean concavity, one box per target value (0 first, then 1)
plt.figure()
concavity_groups = [
    df.loc[df['target'] == target_value, 'mean concavity']
    for target_value in (0, 1)
]
# NOTE(review): newer Matplotlib releases rename `labels` to `tick_labels`;
# confirm against the installed version before changing the keyword.
plt.boxplot(concavity_groups, labels=['0 (maligno)','1 (benigno)'])
plt.title('Boxplot de mean concavity por diagnóstico')
plt.tight_layout()
# Save the figure to disk before showing it
box_path = r"/mnt/data/breast_cancer_project/figures/box_mean_concavity.png"
plt.savefig(box_path)
plt.show()


## Comentarios y decisiones sugeridas
- Imputación por mediana para faltantes (si existieran).
- Duplicados: eliminar si son réplicas.
- Outliers: revisar mediciones y considerar winsorización o modelos robustos.
