#### 1. Install and Import Dependencies

In [None]:
# Make sure the kagglehub package is importable before any other cell needs it.
import subprocess
import sys

try:
    import kagglehub
except ImportError:
    # Install with the interpreter that backs this kernel (sys.executable),
    # then retry the import.
    print("Installing kagglehub...")
    pip_install_cmd = [sys.executable, "-m", "pip", "install", "kagglehub"]
    subprocess.check_call(pip_install_cmd)
    import kagglehub

print("Kagglehub installed and imported successfully")

In [None]:
# Standard library
from pathlib import Path

# Third-party
import kagglehub
import numpy as np
import pandas as pd
from kagglehub import KaggleDatasetAdapter

print("All libraries imported successfully")

#### 2. Load Dataset Using KaggleHub

In [None]:
# Download the Breast Cancer Wisconsin dataset from Kaggle into a pandas
# DataFrame named `df`, which every later cell in this notebook reads.
print("Downloading Breast Cancer Wisconsin Dataset from Kaggle...")
print("This may take a moment on first download...\n")

# NOTE(review): recent kagglehub releases expose `dataset_load`, with
# `load_dataset` kept as a deprecated alias -- confirm against the installed
# version. Fall back to the original name so older releases still work.
load_kaggle_dataset = getattr(kagglehub, "dataset_load", None) or kagglehub.load_dataset

try:
    # Load the dataset using kagglehub with CSV file specification
    df = load_kaggle_dataset(
        KaggleDatasetAdapter.PANDAS,
        "uciml/breast-cancer-wisconsin-data",
        "data.csv"  # Specify the CSV file from the dataset
    )
except Exception as e:
    print(f"✗ Error: {e}")
    print("\nTroubleshooting:")
    print("1. Ensure kagglehub is installed: pip install kagglehub")
    print("2. Set up Kaggle API credentials:")
    print("   - Go to https://www.kaggle.com/settings/account")
    print("   - Click 'Create New API Token'")
    print("   - Save the kaggle.json file to ~/.kaggle/kaggle.json")
    print("3. Verify the dataset is accessible on Kaggle")
    # Re-raise so the cell fails visibly. Swallowing the error would leave `df`
    # undefined and make later cells fail with an unrelated NameError.
    raise
else:
    print("✓ Dataset downloaded successfully!")
    print(f"✓ Dataset shape: {df.shape}")
    print(f"✓ Number of features: {df.shape[1]}")
    print(f"✓ Number of samples: {df.shape[0]}\n")

#### 3. Explore Dataset Structure

In [5]:
# Print a numbered list of every column in the loaded frame, framed by rules.
rule = "=" * 60
numbered_columns = (
    f"{position:2d}. {name}" for position, name in enumerate(df.columns, 1)
)

print("DATASET COLUMNS:")
print(rule)
for entry in numbered_columns:
    print(entry)
print(rule)

DATASET COLUMNS:
 1. id
 2. diagnosis
 3. radius_mean
 4. texture_mean
 5. perimeter_mean
 6. area_mean
 7. smoothness_mean
 8. compactness_mean
 9. concavity_mean
10. concave points_mean
11. symmetry_mean
12. fractal_dimension_mean
13. radius_se
14. texture_se
15. perimeter_se
16. area_se
17. smoothness_se
18. compactness_se
19. concavity_se
20. concave points_se
21. symmetry_se
22. fractal_dimension_se
23. radius_worst
24. texture_worst
25. perimeter_worst
26. area_worst
27. smoothness_worst
28. compactness_worst
29. concavity_worst
30. concave points_worst
31. symmetry_worst
32. fractal_dimension_worst
33. Unnamed: 32


In [6]:
# Plain-text preview of the first ten records.
first_rows = df.head(10)
print("\nFIRST FEW ROWS:")
print(first_rows)


FIRST FEW ROWS:
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   
5    843786         M        12.45         15.70           82.57      477.1   
6    844359         M        18.25         19.98          119.60     1040.0   
7  84458202         M        13.71         20.83           90.20      577.9   
8    844981         M        13.00         21.82           87.50      519.8   
9  84501001         M        12.46         24.04           83.97      475.9   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760

In [7]:
# Header and rule, then the dtype / non-null summary that DataFrame.info()
# writes straight to stdout.
print("\nDATASET INFO:", "=" * 60, sep="\n")
df.info()


DATASET INFO:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_s

#### 4. Check for Missing Values and Duplicate Rows

In [8]:
# Report per-column missing-value counts (with percentages) and the number of
# fully duplicated rows. Nothing is modified; this cell only inspects `df`.
print("MISSING VALUES ANALYSIS:")
print("=" * 60)
missing_counts = df.isnull().sum()
missing_percentage = (missing_counts / len(df)) * 100

# Keep only the columns that actually have gaps, in original column order.
columns_with_gaps = missing_counts[missing_counts > 0]

if columns_with_gaps.empty:
    print("✓ No missing values found in the dataset")
else:
    for col, n_missing in columns_with_gaps.items():
        print(f"{col}: {n_missing} missing ({missing_percentage[col]:.2f}%)")

print("\nDUPLICATE ROWS:")
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

MISSING VALUES ANALYSIS:
Unnamed: 32: 569 missing (100.00%)

DUPLICATE ROWS:
Number of duplicate rows: 0


#### 5. Generate Summary Statistics

In [9]:
# Summary statistics as produced by DataFrame.describe().
summary_stats = df.describe()
print("DESCRIPTIVE STATISTICS:", "=" * 60, sep="\n")
print(summary_stats)

DESCRIPTIVE STATISTICS:
                 id  radius_mean  texture_mean  perimeter_mean    area_mean  \
count  5.690000e+02   569.000000    569.000000      569.000000   569.000000   
mean   3.037183e+07    14.127292     19.289649       91.969033   654.889104   
std    1.250206e+08     3.524049      4.301036       24.298981   351.914129   
min    8.670000e+03     6.981000      9.710000       43.790000   143.500000   
25%    8.692180e+05    11.700000     16.170000       75.170000   420.300000   
50%    9.060240e+05    13.370000     18.840000       86.240000   551.100000   
75%    8.813129e+06    15.780000     21.800000      104.100000   782.700000   
max    9.113205e+08    28.110000     39.280000      188.500000  2501.000000   

       smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.01406

In [10]:
# Show absolute and percentage class counts for the target column.
print("\nTARGET VARIABLE DISTRIBUTION:")
print("=" * 60)

# Use 'diagnosis' when present; otherwise fall back to the second column.
if 'diagnosis' in df.columns:
    target_col = 'diagnosis'
else:
    target_col = df.columns[1]

class_counts = df[target_col].value_counts()
print(class_counts)

print("\nTarget distribution (%):")
class_share = df[target_col].value_counts(normalize=True) * 100
print(class_share)


TARGET VARIABLE DISTRIBUTION:
diagnosis
B    357
M    212
Name: count, dtype: int64

Target distribution (%):
diagnosis
B    62.741652
M    37.258348
Name: proportion, dtype: float64


#### 6. Save Dataset for Next Phases

Save the dataset to the data/raw folder for use in subsequent analysis phases.

In [11]:
# Write the loaded frame to ../data/raw (relative to the notebook's working
# directory), creating the folder first if it does not exist.
data_path = Path('..', 'data', 'raw', 'breast_cancer_data.csv')
raw_dir = data_path.parent
raw_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(data_path, index=False)
print(f"Dataset saved to: {data_path.resolve()}")

Dataset saved to: C:\Repos\breast-cancer-wisconsin\data\raw\breast_cancer_data.csv
