In [1]:
import pandas as pd

df = pd.read_csv("breast.csv", low_memory=False)


In [2]:
df.shape

(712319, 149)

In [3]:
df_sample = df.sample(60000, random_state=42)

In [4]:
missing_values = df_sample.isnull().sum()
missing_percent = (df_sample.isnull().sum() / len(df_sample)) * 100


In [15]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [6]:
missing_percent

CASENUM                   0.000000
REG                       0.000000
MAR_STAT                  3.893333
RACE                      0.286667
ORIGIN                    0.493333
NHIA                      0.000000
SEX                       0.000000
AGE_DX                    0.003333
YR_BRTH                   0.003333
SEQ_NUM                   0.003333
DATE_mo                   0.000000
DATE_yr                   0.000000
SITEO2V                   0.000000
LATERAL                   0.000000
HISTO2V                   0.000000
BEHO2V                    0.000000
HISTO3V                   0.000000
BEHO3V                    0.000000
GRADE                     0.000000
DX_CONF                   0.595000
REPT_SRC                  0.000000
EOD10_SZ                 59.795000
EOD10_EX                 53.540000
EOD10_PE                100.000000
EOD10_ND                 53.540000
EOD10_PN                 24.656667
EOD10_NE                 25.380000
EOD13                    87.685000
EOD2                

In [7]:
df_sample.to_csv('df_sample.csv', index=False)


In [2]:
import pandas as pd
df_sample = pd.read_csv('df_sample.csv')

In [3]:
df_sample.shape

(60000, 149)

In [4]:
df_sample = df_sample.dropna(thresh=len(df_sample) * 0.3, axis=1)

In [5]:
df_sample.shape

(60000, 84)

In [6]:
df = df_sample

In [7]:
(df.isnull().sum() / len(df)) * 100

CASENUM           0.000000
REG               0.000000
MAR_STAT          3.893333
RACE              0.286667
ORIGIN            0.493333
                    ...   
HER2              0.000000
BRST_SUB          0.000000
PLC_BRTH_CNTRY    0.000000
PLC_BRTH_STATE    0.000000
ANNARBOR          0.000000
Length: 84, dtype: float64

In [8]:
# 1. Categorical columns
categorical_cols = ['MAR_STAT', 'RACE', 'ORIGIN', 'DX_CONF', 'RAC_RECA', 'RAC_RECY', 'IHS']
for col in categorical_cols:
    if col in df.columns and df[col].isnull().any():
        df[col].fillna(df[col].mode()[0], inplace=True)

# 2. Numeric columns
numeric_cols = ['AGE_DX', 'YR_BRTH', 'AGE_REC', 'ADJTM_6VALUE', 'ADJNM_6VALUE', 'ADJM_6VALUE', 'ADJAJCCSTG']
for col in numeric_cols:
    if col in df.columns and df[col].isnull().any():
        df[col].fillna(df[col].median(), inplace=True)

# 3. Binary flags
binary_cols = ['INTPRIM', 'ERSTATUS', 'PRSTATUS']
for col in binary_cols:
    if col in df.columns and df[col].isnull().any():
        df[col].fillna(df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

In [9]:
(df.isnull().sum() / len(df)) * 100

CASENUM           0.0
REG               0.0
MAR_STAT          0.0
RACE              0.0
ORIGIN            0.0
                 ... 
HER2              0.0
BRST_SUB          0.0
PLC_BRTH_CNTRY    0.0
PLC_BRTH_STATE    0.0
ANNARBOR          0.0
Length: 84, dtype: float64

In [10]:
df = df.dropna(thresh=len(df) * 0.5, axis=1)

In [11]:
(df.isnull().sum() / len(df)) * 100

CASENUM           0.0
REG               0.0
MAR_STAT          0.0
RACE              0.0
ORIGIN            0.0
                 ... 
HER2              0.0
BRST_SUB          0.0
PLC_BRTH_CNTRY    0.0
PLC_BRTH_STATE    0.0
ANNARBOR          0.0
Length: 73, dtype: float64

In [12]:
df.drop(columns=['EOD10_PN', 'EOD10_NE', 'EODCODE', 'SS_SURG', 'ICCC3WHO', 'ICCC3XWHO'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['EOD10_PN', 'EOD10_NE', 'EODCODE', 'SS_SURG', 'ICCC3WHO', 'ICCC3XWHO'], inplace=True)


In [13]:
# Impute very low-missing fields
df['SEQ_NUM'].fillna(df['SEQ_NUM'].median(), inplace=True)
df['RAD_SURG'].fillna(df['RAD_SURG'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['SEQ_NUM'].fillna(df['SEQ_NUM'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SEQ_NUM'].fillna(df['SEQ_NUM'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] 

In [16]:
(df.isnull().sum() / len(df)) * 100

CASENUM                 0.0
REG                     0.0
MAR_STAT                0.0
RACE                    0.0
ORIGIN                  0.0
NHIA                    0.0
SEX                     0.0
AGE_DX                  0.0
YR_BRTH                 0.0
SEQ_NUM                 0.0
DATE_mo                 0.0
DATE_yr                 0.0
SITEO2V                 0.0
LATERAL                 0.0
HISTO2V                 0.0
BEHO2V                  0.0
HISTO3V                 0.0
BEHO3V                  0.0
GRADE                   0.0
DX_CONF                 0.0
REPT_SRC                0.0
NO_SURG                 0.0
RADIATN                 0.0
RAD_BRN                 0.0
RAD_SURG                0.0
REC_NO                  0.0
TYPEFUP                 0.0
AGE_REC                 0.0
SITERWHO                0.0
ICDOTO9V                0.0
ICDOT10V                0.0
BEHANAL                 0.0
HISTREC                 0.0
BRAINREC                0.0
CS0204SCHEMA            0.0
RAC_RECA            

In [17]:
df['is_malignant'] = (df['BEHO3V'] == 3).astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_malignant'] = (df['BEHO3V'] == 3).astype(int)


In [20]:
X = df.drop(columns=['is_malignant'])
y = df['is_malignant']


In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [24]:
# Identify non-numeric (object) columns
non_numeric_cols = df.select_dtypes(include=['object']).columns.tolist()

# Drop ones that shouldn't be encoded (e.g., IDs if still present)
non_numeric_cols = [col for col in non_numeric_cols if col not in ['CASENUM', 'REC_NO']]

# Apply one-hot encoding
df = pd.get_dummies(df, columns=non_numeric_cols, drop_first=True)


In [25]:
model = LogisticRegression(max_iter=1000, solver='lbfgs')
model.fit(X_train, y_train)

ValueError: could not convert string to float: 'C509'

In [27]:
result = df.isin(['C509'])

In [28]:
locations = result[result].stack().index.tolist()
print("Locations of 'C509':", locations)

Locations of 'C509': []


In [31]:
# 1. Identify object (string) columns
object_cols = df.select_dtypes(include=['object']).columns.tolist()

# 2. One-hot encode them
df = pd.get_dummies(df, columns=object_cols, drop_first=True)


In [35]:
cols_to_encode = ['SITEO2V', 'ICDOT10V', 'PLC_BRTH_CNTRY', 'PLC_BRTH_STATE']

# Apply one-hot encoding
df = pd.get_dummies(df, columns=cols_to_encode, drop_first=True)


KeyError: "None of [Index(['SITEO2V', 'ICDOT10V', 'PLC_BRTH_CNTRY', 'PLC_BRTH_STATE'], dtype='object')] are in the [columns]"

In [37]:
# Automatically find all remaining non-numeric (object or category) columns
non_numeric_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Then one-hot encode them safely
if non_numeric_cols:
    df = pd.get_dummies(df, columns=non_numeric_cols, drop_first=True)


In [38]:
X = df.drop(columns=['is_malignant'])
y = df['is_malignant']


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1755
           1       1.00      1.00      1.00     10245

    accuracy                           1.00     12000
   macro avg       1.00      1.00      1.00     12000
weighted avg       1.00      1.00      1.00     12000



In [40]:
X = df.drop(columns=['STAT_REC'])
y = df['STAT_REC']


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       1.00      1.00      1.00      6464
           4       1.00      1.00      1.00      5536

    accuracy                           1.00     12000
   macro avg       1.00      1.00      1.00     12000
weighted avg       1.00      1.00      1.00     12000

