## Logistic Regression example

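Create an artificial two-class dataset: each sample has five Gaussian-distributed features (X1–X5) whose means differ between the classes.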

In [13]:
import numpy as np
import pandas as pd

# Reproducible synthetic data: two classes with five Gaussian features each
np.random.seed(123)
num_samples = 200000
samples_per_class = num_samples // 2

# (mean, standard deviation) for features X1..X5, one table per class
class0_feature_params = [(5, 1), (3, 1), (8, 2), (4, 2), (6, 1)]
class1_feature_params = [(10, 1), (8, 1), (15, 2), (12, 2), (18, 1)]

# Draw all class-0 features first, then all class-1 features, one feature at
# a time, so the global RNG is consumed in a fixed, reproducible order
X_class0 = np.column_stack([
    np.random.normal(loc=mean, scale=std, size=samples_per_class)
    for mean, std in class0_feature_params
])
X_class1 = np.column_stack([
    np.random.normal(loc=mean, scale=std, size=samples_per_class)
    for mean, std in class1_feature_params
])

# Feature matrix X (class 0 rows on top of class 1 rows) and matching labels y
X = np.vstack((X_class0, X_class1))
y_class0 = np.zeros(samples_per_class)
y_class1 = np.ones(samples_per_class)
y = np.concatenate((y_class0, y_class1))

# Apply one random permutation of the row indices to both X and y so that
# features and labels stay aligned after shuffling
indices = np.arange(num_samples)
np.random.shuffle(indices)
X, y = X[indices], y[indices]

# Package the features and the label as columns of a Pandas DataFrame
feature_names = ['X1', 'X2', 'X3', 'X4', 'X5']
columns = {name: X[:, position] for position, name in enumerate(feature_names)}
columns['y'] = y
data = pd.DataFrame(columns)

In [15]:
from sklearn.model_selection import train_test_split

# Pull the feature columns and the label column back out as NumPy arrays
feature_columns = ['X1', 'X2', 'X3', 'X4', 'X5']
X = data[feature_columns].to_numpy()
y = data['y'].to_numpy()

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Count how many samples of each class landed in each split;
# np.unique(..., return_counts=True) returns (unique labels, count per label)
train_distribution = np.unique(y_train, return_counts=True)
test_distribution = np.unique(y_test, return_counts=True)

print("Class distribution of the training set: ", train_distribution)
print("Class distribution of the test set: ", test_distribution)

Class distribution of the training set:  (array([0., 1.]), array([80031, 79969]))
Class distribution of the test set:  (array([0., 1.]), array([19969, 20031]))


Train the model and evaluate it on the test set

In [18]:
from tensorflow_ml.classification.logistic_regression import LogisticRegression

# Initialize and set hyperparameters for the LogisticRegression class
logistic_regression = LogisticRegression()
params = {
    'learning_rate': 0.01,
    'num_epochs': 100,
    'batch_size': 32,
    'reg_strength': 0.1,
    'early_stopping_patience': 5,
    'regularization': 'l2'
}
logistic_regression.set_params(params)

# Carve the validation set out of the TRAINING data instead of the test set.
# Passing test rows as X_val/y_val lets them influence training (presumably
# via early stopping, given `early_stopping_patience` -- confirm against the
# library), which would leak the test set and make the test metrics below
# optimistic. X_train was produced by train_test_split with shuffling, so its
# last rows are already a random sample.
num_val = max(1, len(X_train) // 10)  # hold out ~10% of the training rows
X_fit, y_fit = X_train[:-num_val], y_train[:-num_val]
X_val, y_val = X_train[-num_val:], y_train[-num_val:]

# Train the model on the remaining training rows
logistic_regression.fit(X_fit, y_fit, random_seed=42, X_val=X_val, y_val=y_val)

# Evaluate on the untouched test set and on the rows the model was fitted on;
# score() returns (accuracy, cross-entropy loss)
accuracy, cross_entropy_loss = logistic_regression.score(X_test, y_test)
accuracy2, cross_entropy_loss2 = logistic_regression.score(X_fit, y_fit)

print(f"Accuracy on test set :\t{accuracy*100:.4f} %")
print(f"Accuracy on train set :\t{accuracy2*100:.4f} %")

print(f"\nCross-entropy loss on test set :\t{cross_entropy_loss:.4f}")
print(f"Cross-entropy loss on train set :\t{cross_entropy_loss2:.4f}")



Compare against the sklearn implementation of logistic regression

In [5]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score

# Baseline: scikit-learn's logistic regression with default settings,
# fitted on the same training split (fit() returns the estimator itself)
lr = LR().fit(X_train, y_train)

# Predict labels for the held-out test rows and report plain accuracy
y_pred = lr.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.4f}")

Accuracy: 1.0000
