In [None]:
import pandas as pd

df = pd.read_csv("breast.csv", low_memory=False)

In [3]:
df.shape

(712319, 149)

In [4]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [5]:
missing_values = df.isnull().sum()
missing_percent = (df.isnull().sum() / len(df)) * 100


In [6]:
missing_percent

CASENUM                   0.000000
REG                       0.000000
MAR_STAT                  3.934894
RACE                      0.307868
ORIGIN                    0.510165
NHIA                      0.000000
SEX                       0.000000
AGE_DX                    0.005475
YR_BRTH                   0.005475
SEQ_NUM                   0.002948
DATE_mo                   0.000000
DATE_yr                   0.000000
SITEO2V                   0.000000
LATERAL                   0.000000
HISTO2V                   0.000000
BEHO2V                    0.000000
HISTO3V                   0.000000
BEHO3V                    0.000000
GRADE                     0.000000
DX_CONF                   0.602258
REPT_SRC                  0.000000
EOD10_SZ                 59.656137
EOD10_EX                 53.376928
EOD10_PE                100.000000
EOD10_ND                 53.376928
EOD10_PN                 24.518650
EOD10_NE                 25.300603
EOD13                    87.602465
EOD2                

In [7]:
df = df.dropna(thresh=len(df) * 0.5, axis=1)

In [8]:
df.shape

(712319, 73)

In [9]:
# df = df.dropna()

In [10]:
df.shape

(712319, 73)

In [11]:
# 1. Identify object (string) columns
object_cols = df.select_dtypes(include=['object']).columns.tolist()
object_cols

['SITEO2V', 'ICDOT10V', 'PLC_BRTH_CNTRY', 'PLC_BRTH_STATE']

In [12]:
# 2. One-hot encode them
df = pd.get_dummies(df, columns=object_cols, drop_first=True)

In [13]:
# 1. Categorical columns
categorical_cols = ['MAR_STAT', 'RACE', 'ORIGIN', 'DX_CONF', 'RAC_RECA', 'RAC_RECY', 'IHS']
for col in categorical_cols:
    if col in df.columns and df[col].isnull().any():
        df[col].fillna(df[col].mode()[0], inplace=True)

# 2. Numeric columns
numeric_cols = ['AGE_DX', 'YR_BRTH', 'AGE_REC', 'ADJTM_6VALUE', 'ADJNM_6VALUE', 'ADJM_6VALUE', 'ADJAJCCSTG']
for col in numeric_cols:
    if col in df.columns and df[col].isnull().any():
        df[col].fillna(df[col].median(), inplace=True)

# 3. Binary flags
binary_cols = ['INTPRIM', 'ERSTATUS', 'PRSTATUS']
for col in binary_cols:
    if col in df.columns and df[col].isnull().any():
        df[col].fillna(df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

In [14]:
missing_percent = (df.isnull().sum() / len(df)) * 100
missing_percent

CASENUM                  0.000000
REG                      0.000000
MAR_STAT                 0.000000
RACE                     0.000000
ORIGIN                   0.000000
NHIA                     0.000000
SEX                      0.000000
AGE_DX                   0.000000
YR_BRTH                  0.000000
SEQ_NUM                  0.002948
DATE_mo                  0.000000
DATE_yr                  0.000000
LATERAL                  0.000000
HISTO2V                  0.000000
BEHO2V                   0.000000
HISTO3V                  0.000000
BEHO3V                   0.000000
GRADE                    0.000000
DX_CONF                  0.000000
REPT_SRC                 0.000000
EOD10_PN                24.518650
EOD10_NE                25.300603
EODCODE                 28.858278
NO_SURG                  0.000000
RADIATN                  0.000000
RAD_BRN                  0.000000
RAD_SURG                 0.090549
SS_SURG                 48.954050
REC_NO                   0.000000
TYPEFUP       

In [15]:
df.drop(columns=['EOD10_PN', 'EOD10_NE', 'EODCODE', 'SS_SURG', 'ICCC3WHO', 'ICCC3XWHO'], inplace=True)

In [16]:
missing_percent = (df.isnull().sum() / len(df)) * 100
missing_percent

CASENUM                 0.000000
REG                     0.000000
MAR_STAT                0.000000
RACE                    0.000000
ORIGIN                  0.000000
NHIA                    0.000000
SEX                     0.000000
AGE_DX                  0.000000
YR_BRTH                 0.000000
SEQ_NUM                 0.002948
DATE_mo                 0.000000
DATE_yr                 0.000000
LATERAL                 0.000000
HISTO2V                 0.000000
BEHO2V                  0.000000
HISTO3V                 0.000000
BEHO3V                  0.000000
GRADE                   0.000000
DX_CONF                 0.000000
REPT_SRC                0.000000
NO_SURG                 0.000000
RADIATN                 0.000000
RAD_BRN                 0.000000
RAD_SURG                0.090549
REC_NO                  0.000000
TYPEFUP                 0.000000
AGE_REC                 0.000000
SITERWHO                0.000000
ICDOTO9V                0.000000
BEHANAL                 0.000000
HISTREC   

In [17]:
df = df.dropna()

In [18]:
df['is_malignant'] = (df['BEHO3V'] == 3).astype(int)


In [19]:
df.drop(columns=['BEHO3V'], inplace=True)

In [21]:
X = df.drop(columns=['is_malignant'])
y = df['is_malignant']

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Feature Engineering**

In [72]:
X_train.shape,X_test.shape

((569322, 326), (142331, 326))

In [73]:
def correlation(dataset, threshold):
    col_corr = set()
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: 
                colname = corr_matrix.columns[i]
                col_corr.add(colname)
    return col_corr

In [74]:
## threshold--Domain expertise
corr_features=correlation(X_train,0.85)

In [75]:
## drop features when correlation is more than 0.85 
X_train.drop(corr_features,axis=1,inplace=True)
X_test.drop(corr_features,axis=1,inplace=True)
X_train.shape,X_test.shape

((569322, 286), (142331, 286))

# Logistic Regression

In [76]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [77]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

In [78]:
model = LogisticRegression(max_iter=1000, solver='lbfgs')
model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [79]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.90      0.92     21450
           1       0.98      0.99      0.99    120881

    accuracy                           0.98    142331
   macro avg       0.97      0.95      0.96    142331
weighted avg       0.98      0.98      0.98    142331



In [80]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
mae=mean_absolute_error(y_test,y_pred)
score=r2_score(y_test,y_pred)
print("Mean absolute error", mae)
print("R2 Score", score)

Mean absolute error 0.022138536228931152
R2 Score 0.8270332746865866


# SVM

In [81]:
from sklearn.svm import SVC
model = SVC(kernel='linear', max_iter=1000)  # You can change kernel to 'linear', 'poly', etc.
model.fit(X_train, y_train)




0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [82]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.08      0.02      0.04     21450
           1       0.85      0.96      0.90    120881

    accuracy                           0.81    142331
   macro avg       0.46      0.49      0.47    142331
weighted avg       0.73      0.81      0.77    142331



In [83]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
mae=mean_absolute_error(y_test,y_pred)
score=r2_score(y_test,y_pred)
print("Mean absolute error", mae)
print("R2 Score", score)

Mean absolute error 0.18541990149721424
R2 Score -0.44867180188711275


In [84]:
# from sklearn.model_selection import cross_val_score

# for k in ['linear', 'rbf', 'poly']:
#     model = SVC(kernel=k)
#     scores = cross_val_score(model, X_train, y_train, cv=5)
#     print(f"{k} kernel accuracy: {scores.mean():.4f}")


# Decision Tree

In [85]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=10)
model.fit(X_train, y_train)


0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,5
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [86]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     21450
           1       1.00      1.00      1.00    120881

    accuracy                           1.00    142331
   macro avg       1.00      1.00      1.00    142331
weighted avg       1.00      1.00      1.00    142331



In [87]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
mae=mean_absolute_error(y_test,y_pred)
score=r2_score(y_test,y_pred)
print("Mean absolute error", mae)
print("R2 Score", score)

Mean absolute error 0.0
R2 Score 1.0


# Random Forrest Classifier

In [88]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,8
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [89]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     21450
           1       1.00      1.00      1.00    120881

    accuracy                           1.00    142331
   macro avg       1.00      1.00      1.00    142331
weighted avg       1.00      1.00      1.00    142331



In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
mae=mean_absolute_error(y_test,y_pred)
score=r2_score(y_test,y_pred)
print("Mean absolute error", mae)
print("R2 Score", score)

Mean absolute error 0.0
R2 Score 1.0


: 

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(
    n_neighbors=2,
    weights='distance',
    metric='euclidean'
)
model.fit(X_train, y_train)


In [None]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.37      0.18      0.24     21450
           1       0.87      0.94      0.90    120881

    accuracy                           0.83    142331
   macro avg       0.62      0.56      0.57    142331
weighted avg       0.79      0.83      0.80    142331



In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
mae=mean_absolute_error(y_test,y_pred)
score=r2_score(y_test,y_pred)
print("Mean absolute error", mae)
print("R2 Score", score)

Mean absolute error 0.1704688367256606
R2 Score -0.33186025270687036


# Neural Networks

In [25]:
import torch
torch.cuda.is_available()

True

In [26]:
import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers=1):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        out = self.fc(hn[-1])
        return self.sigmoid(out)


In [23]:
X_train.shape, X_test.shape

((569322, 326), (142331, 326))

In [39]:
import numpy as np

In [41]:
X = X.astype(np.float32)

In [43]:
X = np.expand_dims(X, axis=1)

In [33]:
y = y.values.astype(np.float32)


In [44]:
X_tensor = torch.tensor(X)
y_tensor = torch.tensor(y).unsqueeze(1)

In [45]:
from torch.utils.data import DataLoader, TensorDataset, random_split

dataset = TensorDataset(X_tensor, y_tensor)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [46]:
input_dim = X.shape[2]
model = LSTMClassifier(input_dim=input_dim, hidden_dim=64)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [47]:
for epoch in range(10):  # you can increase epochs
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        optimizer.zero_grad()
        output = model(xb)
        loss = criterion(output, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

ValueError: LSTM: Expected input to be 2D or 3D, got 4D instead