In [1]:
import pandas as pd

df = pd.read_csv("breast.csv", low_memory=False)

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
missing_values = df.isnull().sum()
missing_percent = (df.isnull().sum() / len(df)) * 100

In [4]:
df = df.dropna(thresh=len(df) * 0.8, axis=1)

In [5]:
object_cols = df.select_dtypes(include=['object']).columns.tolist()
object_cols

['SITEO2V', 'ICDOT10V', 'PLC_BRTH_CNTRY', 'PLC_BRTH_STATE']

In [6]:
df = pd.get_dummies(df, columns=object_cols, drop_first=True)

In [7]:
missing_percent = (df.isnull().sum() / len(df)) * 100
missing_percent

CASENUM                  0.000000
REG                      0.000000
MAR_STAT                 3.934894
RACE                     0.307868
ORIGIN                   0.510165
NHIA                     0.000000
SEX                      0.000000
AGE_DX                   0.005475
YR_BRTH                  0.005475
SEQ_NUM                  0.002948
DATE_mo                  0.000000
DATE_yr                  0.000000
LATERAL                  0.000000
HISTO2V                  0.000000
BEHO2V                   0.000000
HISTO3V                  0.000000
BEHO3V                   0.000000
GRADE                    0.000000
DX_CONF                  0.602258
REPT_SRC                 0.000000
NO_SURG                  0.000000
RADIATN                  0.000000
RAD_BRN                  0.000000
RAD_SURG                 0.090549
REC_NO                   0.000000
TYPEFUP                  0.000000
AGE_REC                  0.005475
SITERWHO                 0.000000
ICDOTO9V                 0.000000
ICCC3WHO      

In [8]:
df.drop(columns=['ICCC3WHO', 'ICCC3XWHO', 'IHS'], inplace=True)

In [9]:
df = df.dropna()

# Feature Engineering

In [10]:
df['is_alive'] = (df['STAT_REC'] == 1).astype(int)
df.drop(columns=['STAT_REC'], inplace=True)

In [11]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop('is_alive', axis=1))
df_scaled = pd.DataFrame(scaled_features, columns=df.drop('is_alive', axis=1).columns)
df_scaled['is_alive'] = df['is_alive'].values
df = df_scaled

In [12]:
df.shape

(676128, 322)

# Feature Selection

In [13]:
# Anova
from sklearn.feature_selection import SelectKBest, f_classif

# Separate features and target
X = df.drop('is_alive', axis=1)
y = df['is_alive']

# Apply ANOVA F-test
selector = SelectKBest(score_func=f_classif, k='all') 
X_new = selector.fit_transform(X, y)

anova_scores = pd.DataFrame({
    'Feature': X.columns,
    'F-Score': selector.scores_,
    'p-Value': selector.pvalues_
}).sort_values(by='F-Score', ascending=False)

top_features = anova_scores.nlargest(20, 'F-Score')['Feature']
df_anova = df[top_features.tolist() + ['is_alive']]


  f = msb / msw


In [14]:
df_anova.shape

(676128, 21)

In [15]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
rfe = RFE(model, n_features_to_select=20)
X_rfe = rfe.fit_transform(X, y)

selected_rfe = X.columns[rfe.support_]
df_rfe = df[selected_rfe.tolist() + ['is_alive']]

In [16]:
df_rfe.shape

(676128, 21)

In [17]:
# Common features

# Drop target column temporarily for comparison
features_rfe = set(df_rfe.columns) - {'is_alive'}
features_anova = set(df_anova.columns) - {'is_alive'}

# Find common features
common_features = features_rfe & features_anova

# Rebuild DataFrame with common features + target
df_common = df_rfe[list(common_features) + ['is_alive']]

In [18]:
print(list(common_features))

['HST_STGA', 'PLC_BRTH_STATE_ZZ', 'PRSTATUS', 'HER2', 'ERSTATUS', 'YR_BRTH', 'CODKM', 'AGE_REC', 'PLC_BRTH_CNTRY_ZZU', 'ICD_5DIG', 'BRST_SUB', 'AGE_DX']


In [19]:
df_common.shape

(676128, 13)

In [20]:
from sklearn.decomposition import PCA

X = df_common.drop('is_alive', axis=1)
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)
df_pca = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(10)])
df_pca['is_alive'] = df['is_alive'].values
df = df_pca

In [21]:
df.shape

(676128, 11)

# Splitting

In [22]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
X = df.drop('is_alive', axis=1)
y = df['is_alive']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [23]:
lda = LDA(n_components=1)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

In [25]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# Drop target column
X = df.drop('is_alive', axis=1)

# Create DataFrame for VIF values
vif_data = pd.DataFrame()
vif_data['Feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Sort by VIF descending
vif_data = vif_data.sort_values(by='VIF', ascending=False)
vif_data


Unnamed: 0,Feature,VIF
2,PC3,1.0
6,PC7,1.0
1,PC2,1.0
4,PC5,1.0
0,PC1,1.0
7,PC8,1.0
5,PC6,1.0
9,PC10,1.0
3,PC4,1.0
8,PC9,1.0


In [29]:
X.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,0.561436,0.144743,0.372717,-1.526777,-0.549392,-1.065026,-0.426302,0.024982,-0.121724,-0.008233
1,-0.274628,1.391179,0.854924,-1.269682,-0.628344,-1.598578,-0.27358,0.029698,-0.071158,-0.002754
2,-2.382361,0.54384,0.022711,-0.744513,-0.347237,1.198008,0.166592,0.022052,-0.102899,0.0013
3,-2.956471,-1.224302,-0.913975,0.600605,0.145172,0.858826,-0.238546,-0.002932,0.08226,-0.003139
4,1.732231,2.235434,-2.688616,-0.897702,-1.402325,-0.621727,0.180728,0.037623,-0.089897,-0.118949


In [30]:
def correlation(dataset, threshold):
    col_corr = set()
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: 
                colname = corr_matrix.columns[i]
                col_corr.add(colname)
    return col_corr

# Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [27]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     62139
           1       1.00      1.00      1.00     73087

    accuracy                           1.00    135226
   macro avg       1.00      1.00      1.00    135226
weighted avg       1.00      1.00      1.00    135226



In [32]:
print(df['is_alive'].value_counts(normalize=True))  # Are 99% of labels the same?


is_alive
1    0.540481
0    0.459519
Name: proportion, dtype: float64


# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=10)
model.fit(X_train, y_train)

0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,5
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [None]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     62139
           1       1.00      1.00      1.00     73087

    accuracy                           1.00    135226
   macro avg       1.00      1.00      1.00    135226
weighted avg       1.00      1.00      1.00    135226



# AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
base_estimator = DecisionTreeClassifier(max_depth=1)
adb_clf = AdaBoostClassifier(
    estimator=base_estimator,
    n_estimators=50,
    learning_rate=1.0,
    random_state=42
)

In [None]:
adb_clf.fit(X_train, y_train)

0,1,2
,estimator,DecisionTreeC...r(max_depth=1)
,n_estimators,50
,learning_rate,1.0
,algorithm,'deprecated'
,random_state,42

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [None]:
y_pred = adb_clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     62139
           1       1.00      1.00      1.00     73087

    accuracy                           1.00    135226
   macro avg       1.00      1.00      1.00    135226
weighted avg       1.00      1.00      1.00    135226



# Random Forrest

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,8
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     62139
           1       0.99      1.00      1.00     73087

    accuracy                           1.00    135226
   macro avg       1.00      1.00      1.00    135226
weighted avg       1.00      1.00      1.00    135226



# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(
    n_neighbors=2,
    weights='distance',
    metric='euclidean'
)
model.fit(X_train, y_train)

0,1,2
,n_neighbors,2
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'euclidean'
,metric_params,
,n_jobs,


In [None]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# SVC

In [None]:
from sklearn.svm import LinearSVC

model = LinearSVC(C=1.0, max_iter=10000, random_state=42)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [None]:
y_pred = gnb.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Gaussian Naive Bayes Accuracy: {accuracy:.2f}")

In [None]:
print(classification_report(y_test, y_pred))

# XGBoost

In [None]:
import xgboost as xgb
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)
xgb_clf.fit(X_train, y_train)

In [None]:
y_pred = xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Ensemble 1

In [None]:
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('linear-svc', LinearSVC(C=1.0, max_iter=10000, random_state=42))
]
stacking_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stacking_model.fit(X_train, y_train)

In [None]:
y_pred = stacking_model.predict(X_test)
print(classification_report(y_test, y_pred))

# Ensemble 2

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Bagging with shallower trees and fewer estimators
bagging = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=3),  # shallower tree
    n_estimators=5,                                 # fewer trees
    random_state=42
)

# Gradient Boosting with fewer estimators and lower learning rate
boosting = GradientBoostingClassifier(
    n_estimators=50,       # fewer estimators
    learning_rate=0.05,    # more stable training
    max_depth=2,           # shallower boosting trees
    random_state=42
)

# Inner stack with lighter models
stack_inner = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=5, max_depth=3, random_state=42)),
        ('nb', GaussianNB())  # Replaces SVC with a very light model
    ],
    final_estimator=LogisticRegression(max_iter=100)
)

# Final ensemble stacking the reduced models
final_ensemble = StackingClassifier(
    estimators=[
        ('bag', bagging),
        ('boost', boosting),
        ('stack', stack_inner)
    ],
    final_estimator=LogisticRegression(max_iter=100)
)

# Fit on training data
final_ensemble.fit(X_train, y_train)

# Evaluate
from sklearn.metrics import accuracy_score
y_pred = final_ensemble.predict(X_test)
print(f"Final Ensemble Accuracy: {accuracy_score(y_test, y_pred):.4f}")


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))