In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the cell-samples CSV from the Kaggle input directory.
# Alternative source (disabled): build a direct-download URL from a Google Drive
# share link -- presumably pointing at the same dataset; verify before enabling.
# file_link = 'https://drive.google.com/file/d/1uLtuUvWGOSSoyGrelVKH1WBbq8NzRmSo/view?usp=sharing'  # the file access must be Public
# file_id = file_link.split("/")[-2]
# new_link = f'https://drive.google.com/uc?id={file_id}'
# print(new_link)
df = pd.read_csv(filepath_or_buffer="/kaggle/input/cell-samples/cell_samples.csv")

In [None]:
# Explore the dataset (optional)
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()

        ID  Clump  UnifSize  UnifShape  MargAdh  SingEpiSize BareNuc  \
0  1000025      5         1          1        1            2       1   
1  1002945      5         4          4        5            7      10   
2  1015425      3         1          1        1            2       2   
3  1016277      6         8          8        1            3       4   
4  1017023      4         1          1        3            2       1   

   BlandChrom  NormNucl  Mit  Class  
0           3         1    1      2  
1           3         2    1      2  
2           3         1    1      2  
3           3         7    1      2  
4           3         1    1      2  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           699 non-null    int64 
 1   Clump        699 non-null    int64 
 2   UnifSize     699 non-null    int64 
 3   UnifShape    699 non-nul

In [None]:
# Keep only the rows whose 'BareNuc' entry parses as a number (non-numeric
# entries are coerced to NaN and filtered out), then cast every column to int.
df = df.loc[
    lambda frame: pd.to_numeric(frame['BareNuc'], errors='coerce').notna()
].astype('int')
df.dtypes

ID             int64
Clump          int64
UnifSize       int64
UnifShape      int64
MargAdh        int64
SingEpiSize    int64
BareNuc        int64
BlandChrom     int64
NormNucl       int64
Mit            int64
Class          int64
dtype: object

In [None]:
# 'ID' is a sample identifier rather than a measurement, so it is excluded from
# the feature matrix; 'Class' is the prediction target.
X = df.drop(columns=['ID', 'Class'])
y = df['Class']

# Split into train and test sets (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Check accuracy on the held-out test split
print("Accuracy:", accuracy_score(y_test, y_pred))
# classification_report can also be used for per-class precision/recall details

Accuracy: 0.9416058394160584


# **Implementing Logistic Regression from scratch**





In [None]:
# Build the feature matrix. 'ID' is only a sample identifier, so it is dropped
# together with the target column 'Class'.
X = df.drop(columns=['ID', 'Class']).values
y = df['Class'].values
y = np.where(y == 2, 0, 1)  # Convert labels to 0 and 1 (2=benign -> 0, 4=malignant -> 1)

# Normalize features column by column (axis=0 aggregates over the rows of each column)
feature_std = X.std(axis=0)
feature_std = np.where(feature_std == 0, 1.0, feature_std)  # a constant column would otherwise divide by zero
X = (X - X.mean(axis=0)) / feature_std

# Add bias term: a constant column of ones whose weight acts as the intercept,
# so the model can still produce a non-zero logit when every feature is 0.
X = np.c_[np.ones(X.shape[0]), X]  # Add bias column (x0 = 1)

# Initialize weights (one per column of X, bias included)
weights = np.zeros(X.shape[1])

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    The naive formula overflows in ``np.exp(-z)`` when ``z`` is a large
    negative number. Here the exponential is only ever taken of a
    non-positive value, so it stays within [0, 1] and cannot overflow.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s) (logits).

    Returns
    -------
    Scalar for scalar input, otherwise an ndarray with the shape of ``z``;
    every value lies in [0, 1].
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # exp of a non-positive number: always in (0, 1]
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same function, rewritten
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return result[()]  # unwrap 0-d results so scalar input still yields a scalar

# Loss function (binary cross-entropy)
def compute_loss(y, y_pred):
    """Mean binary cross-entropy between labels ``y`` and probabilities ``y_pred``.

    A small constant is added inside each logarithm so that a predicted
    probability of exactly 0 or 1 does not produce log(0).
    Reference: https://towardsdatascience.com/understanding-binary-cross-entropy-log-loss-a-visual-explanation-a3ac6025181a/
    """
    eps = 1e-10
    positive_term = y * np.log(y_pred + eps)            # contribution where y == 1
    negative_term = (1 - y) * np.log(1 - y_pred + eps)  # contribution where y == 0
    return -np.mean(positive_term + negative_term)

# Gradient descent
def train(X, y, weights, lr=0.1, epochs=1000, log_every=100):
    """Fit logistic-regression weights with full-batch gradient descent.

    Parameters
    ----------
    X : ndarray
        Design matrix; it is multiplied with ``weights`` via ``np.dot(X, weights)``.
    y : ndarray
        Target labels, subtracted from the predicted probabilities.
    weights : array-like
        Initial weights. They are copied, so the caller's array is not modified.
    lr : float
        Learning rate (step size) applied to the gradient.
    epochs : int
        Number of gradient-descent iterations.
    log_every : int or None
        Print the loss every ``log_every`` epochs; a falsy value disables logging.

    Returns
    -------
    ndarray
        The updated weights as a new float array.
    """
    # Work on a float copy: the in-place update below must not leak into the
    # caller's array, and it would fail on an integer-dtype array.
    weights = np.array(weights, dtype=float)
    n_samples = len(y)

    for i in range(epochs):
        z = np.dot(X, weights)
        # Predicted probabilities for the current weights
        y_pred = sigmoid(z)

        # Difference between predictions and labels
        error = y_pred - y

        # Gradient of the mean cross-entropy loss with respect to the weights
        grad = np.dot(X.T, error) / n_samples
        # Step against the gradient, scaled by the learning rate
        weights -= lr * grad

        if log_every and i % log_every == 0:
            # Loss is reported for the predictions made before this epoch's update
            loss = compute_loss(y, y_pred)
            print(f"Epoch {i}: Loss = {loss:.4f}")

    return weights

# Fit the weights with gradient descent using train()'s default settings
weights = train(X, y, weights)

# Classify at the 0.5 probability threshold.
# Note: accuracy is measured on the same rows the model was trained on.
y_pred = sigmoid(X @ weights) >= 0.5
accuracy = (y_pred == y).mean()
print(f"Final Accuracy: {accuracy:.4f}")


Epoch 0: Loss = 0.6931
Epoch 100: Loss = 0.0987
Epoch 200: Loss = 0.0868
Epoch 300: Loss = 0.0829
Epoch 400: Loss = 0.0809
Epoch 500: Loss = 0.0796
Epoch 600: Loss = 0.0788
Epoch 700: Loss = 0.0781
Epoch 800: Loss = 0.0777
Epoch 900: Loss = 0.0773
Final Accuracy: 0.9707
