In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [5]:
# Column names from breast-cancer-wisconsin.names
col_names = [
    "Sample_code_number", "Clump_Thickness", "Uniformity_Cell_Size",
    "Uniformity_Cell_Shape", "Marginal_Adhesion", "Single_Epi_Cell_Size",
    "Bare_Nuclei", "Bland_Chromatin", "Normal_Nucleoli", "Mitoses", "Class"
]

# Load dataset
data = pd.read_csv("wisconsin.data", names=col_names)

# Replace missing values ("?") with NaN then fill with median
data.replace("?", np.nan, inplace=True)
data["Bare_Nuclei"] = pd.to_numeric(data["Bare_Nuclei"])
data["Bare_Nuclei"].fillna(data["Bare_Nuclei"].median(), inplace=True)

# Drop the ID column
data.drop("Sample_code_number", axis=1, inplace=True)

# Map classes: Benign=2 → 0, Malignant=4 → 1
data["Class"] = data["Class"].map({2:0, 4:1})

print("Class distribution:\n", data["Class"].value_counts(normalize=True))

Class distribution:
 Class
0    0.655222
1    0.344778
Name: proportion, dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Bare_Nuclei"].fillna(data["Bare_Nuclei"].median(), inplace=True)
