In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
file_path = "C:/Users/shara/OneDrive/Desktop/ML ACTS/projects ml/Logistic Regression/diabetes (2).csv"
df = pd.read_csv(file_path)
print(df.describe())
for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    df[col] = df[col].replace(0, np.nan)
    df[col].fillna(df[col].median(), inplace=True) #median not drop as very less values are missing and considered as 0 in dataset
print("\nMissing values:\n", df.isnull().sum())
print("\nOutcome distribution:\n", df['Outcome'].value_counts())

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True) #median not drop as very less values are missing and considered as 0 in dataset


In [None]:
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True) #random_state for reproducibility , means if you run it again you will get same output even in other systems
train = df_shuffled.iloc[:300]
test = df_shuffled.iloc[300:767]
print(f"Train set shape: {train.shape}")
print(f"Test set shape: {test.shape}")

X_train = train.iloc[:, :-1].values # numpy array
y_train = train.iloc[:, -1].values  
X_test = test.iloc[:, :-1].values  
y_test = test.iloc[:, -1].values 

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")


Train set shape: (300, 9)
Test set shape: (467, 9)
X_train shape: (300, 8)
y_train shape: (300,)
X_test shape: (467, 8)
y_test shape: (467,)


In [9]:
#feature scaling (standardization) #this is z score normalization mean becomes 0 std dev becomes 1 
#Features often have different units and magnitudes. For example, age might range 0â€“100, income could be in thousands or lakhs.
#Models that use gradient descent, distance calculations, or regularization (e.g., linear regression(multiple features), logistic regression, SVMs, k-NN) perform better and train faster if the input features are on a similar scale.
#Without scaling, features with larger ranges can dominate the learning process or cause numerical instability.
train_mean = np.mean(X_train, axis=0)
train_std = np.std(X_train, axis=0)#axis=0 means for each feature =x column
X_train_scaled = (X_train - train_mean) / train_std
X_test_scaled = (X_test - train_mean) / train_std

In [None]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None
        self.loss_curve = []

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        self.m, self.n = X.shape      # m = training samples(rows), n = features (columns)
        self.weights = np.zeros(self.n)
        self.bias = 0
        self.X = X
        self.y = y
        for i in range(self.n_iterations):
            self.update_weights()

    def update_weights(self):
        y_hat = self.sigmoid(np.dot(self.X, self.weights) + self.bias)  #y_hat is sigmoid function formula value (z)
        dw = (1 / self.m) * np.dot(self.X.T, (y_hat - self.y)) # self.X.T is now(n,m) so that dot product gives (n,) weights which is 1 dimension 
        db = (1 / self.m) * np.sum(y_hat - self.y)   # 1/m to avg over all samples / total no. of rows 
        self.weights -= self.learning_rate * dw
        self.bias -= self.learning_rate * db

    def predict(self, X):
        y_pred = self.sigmoid(np.dot(X, self.weights) + self.bias)
        y_pred = np.where(y_pred >= 0.5, 1, 0) #will make the y value either be 1 or 0 seperator 0.5
        return y_pred

    

In [11]:
model = LogisticRegression(learning_rate=0.01, n_iterations=1000)
model.fit(X_train_scaled, y_train)
y_pred_train = model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)

In [None]:
train_acc = np.mean(y_pred_train == y_train)
test_acc = np.mean(y_pred_test == y_test)
#means if prediction is 0 and same as actual (0 or same for 1) then its correct prediction
print(f"Train Accuracy: {train_acc * 100:.2f}%")
print(f"Test Accuracy: {test_acc * 100:.2f}%")

Train Accuracy: 78.33%
Test Accuracy: 75.16%
