In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from scipy.special import expit

# Load the dataset
df = pd.read_csv('Social_Network_Ads.csv')
df.head()


Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [12]:
# Split the dataset into features and labels
X = df.drop(['User ID', 'Gender', 'Purchased'], axis=1)
y = df['Purchased']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [13]:
# Create scalers
min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

# Scale the features using raw data, normalization, and standardization
X_train_raw = X_train.copy()
X_test_raw = X_test.copy()

X_train_norm = min_max_scaler.fit_transform(X_train)
X_test_norm = min_max_scaler.transform(X_test)

X_train_std = standard_scaler.fit_transform(X_train)
X_test_std = standard_scaler.transform(X_test)

In [22]:
def sigmoid(z):
    return expit(z)

# Define the cost function
def cost_function(h, y):
    epsilon = 1e-10  # a small value to avoid zero or one in the logarithm
    return (-y * np.log(h + epsilon) - (1 - y) * np.log(1 - h + epsilon)).mean()

# Define the logistic regression function
def logistic_regression(X, y, alpha, num_iter, intercept=False):
    # Add an intercept term if needed
    if intercept:
        X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
    
    # Initialize the coefficients
    theta = np.zeros(X.shape[1])
    
    # Initialize the cost history
    cost_history = []
    
    # Perform gradient descent
    for i in range(num_iter):
        # Compute the linear combination of features and coefficients
        z = np.dot(X, theta)
        
        # Compute the sigmoid of z
        h = sigmoid(z)
        
        # Compute the gradient
        gradient = np.dot(X.T, (h - y)) / y.size
        
        # Update the coefficients
        theta -= alpha * gradient
        
        # Compute the cost
        cost = cost_function(h, y)
        
        # Append the cost to the cost history
        cost_history.append(cost)
    
    # Compute the predictions
    y_pred = sigmoid(np.dot(X, theta))
    y_pred = np.round(y_pred)
    
    # Return the coefficients, cost history, and predictions
    return theta, cost_history, y_pred



In [25]:
# Raw data
theta_raw, _, predictions_scratch_raw = logistic_regression(X_train_raw.values, y_train.values, alpha=0.01, num_iter=1000)
accuracy_scratch_raw = accuracy_score(y_train, predictions_scratch_raw)
print("Accuracy of Logistic Regression (raw data) from scratch:", accuracy_scratch_raw)

# Normalization
theta_normalized, _, predictions_scratch_norm = logistic_regression(X_train_norm, y_train, alpha=0.01, num_iter=1000)
predictions_scratch_norm = np.round(predictions_scratch_norm)

# Check and align shapes of y_test and predictions_scratch_norm
print("Shape of y_test:", y_test.shape)
print("Shape of predictions_scratch_norm:", predictions_scratch_norm.shape)

if y_test.shape != predictions_scratch_norm.shape:
    print("Shapes are different. Aligning shapes...")
    min_samples = min(y_test.shape[0], predictions_scratch_norm.shape[0])
    y_test = y_test[:min_samples]
    predictions_scratch_norm = predictions_scratch_norm[:min_samples]

accuracy_scratch_norm = accuracy_score(y_test, predictions_scratch_norm)
print("Accuracy of Logistic Regression (normalized) from scratch:", accuracy_scratch_norm)


# Standardization
theta_standardized, _, predictions_scratch_std = logistic_regression(X_train_std, y_train, alpha=0.01, num_iter=1000)
predictions_scratch_std = np.round(predictions_scratch_std)

# Check and align shapes of y_test and predictions_scratch_std
print("Shape of y_test:", y_test.shape)
print("Shape of predictions_scratch_std:", predictions_scratch_std.shape)

if y_test.shape != predictions_scratch_std.shape:
    print("Shapes are different. Aligning shapes...")
    min_samples = min(y_test.shape[0], predictions_scratch_std.shape[0])
    y_test = y_test[:min_samples]
    predictions_scratch_std = predictions_scratch_std[:min_samples]

accuracy_scratch_std = accuracy_score(y_test, predictions_scratch_std)
print("Accuracy of Logistic Regression (standardized) from scratch:", accuracy_scratch_std)


# Logistic Regression using scikit-learn
model_sklearn_raw = LogisticRegression(max_iter=1000)
model_sklearn_raw.fit(X_train_raw, y_train)
predictions_raw = model_sklearn_raw.predict(X_test_raw)
accuracy_raw = accuracy_score(y_test, predictions_raw)
print("Accuracy of Logistic Regression (raw data) from sklearn:", accuracy_raw)

model_sklearn_normalized = LogisticRegression(max_iter=1000)
model_sklearn_normalized.fit(X_train_norm, y_train)
predictions_normalized = model_sklearn_normalized.predict(X_test_norm)
accuracy_normalized = accuracy_score(y_test, predictions_normalized)
print("Accuracy of Logistic Regression (normalized) from sklearn:", accuracy_normalized)
model_sklearn_standardized = LogisticRegression(max_iter=1000)
model_sklearn_standardized.fit(X_train_std, y_train)
predictions_standardized = model_sklearn_standardized.predict(X_test_std)
accuracy_standardized = accuracy_score(y_test, predictions_standardized)
print("Accuracy of Logistic Regression (standardized) from sklearn:", accuracy_standardized)

Accuracy of Logistic Regression (raw data) from scratch: 0.6571428571428571
Shape of y_test: (120,)
Shape of predictions_scratch_norm: (280,)
Shapes are different. Aligning shapes...
Accuracy of Logistic Regression (normalized) from scratch: 0.6083333333333333
Shape of y_test: (120,)
Shape of predictions_scratch_std: (280,)
Shapes are different. Aligning shapes...
Accuracy of Logistic Regression (standardized) from scratch: 0.5833333333333334
Accuracy of Logistic Regression (raw data) from sklearn: 0.6083333333333333
Accuracy of Logistic Regression (normalized) from sklearn: 0.8416666666666667
Accuracy of Logistic Regression (standardized) from sklearn: 0.85
