In [1]:
import time
import jax
import pandas as pd
from tqdm import tqdm
import jax.numpy as jnp
from jax import jit, grad, hessian
from jax import grad, jit, vmap


In [2]:
def sigmoid(z):
    return 1.0 / (1.0 + jnp.exp(-z))

def binary_cross_entropy_loss(y_true, y_pred):
    """Binary cross-entropy loss."""
    epsilon = 1e-7  # small value to avoid division by zero
    y_pred = jnp.clip(y_pred, epsilon, 1 - epsilon)  # clip y_pred to avoid log(0)
    loss = -(y_true * jnp.log(y_pred) + (1 - y_true) * jnp.log(1 - y_pred))
    return jnp.mean(loss)

def normalize(X):
    means = jnp.mean(X, axis=0)
    stds = jnp.std(X, axis=0)

    X_normalized = jnp.divide((X - means), stds)

    return X_normalized


@jit
def gradients(w, b, xb, yb):
    y_hat = jnp.dot(xb, w) + b
    diff = sigmoid(y_hat) - yb
    dw = jnp.dot(xb.T, diff)
    db = jnp.sum(diff)
    return dw, db

def train(X, y, bs, epochs, lr):
    m, n = X.shape
    
    # Initializing weights and bias to zeros.
    w = jnp.zeros((n,1))
    b = jnp.zeros(())
    
    # Normalize inputs
    x = normalize(X)
    
    # Store losses
    losses = []
    
    # Train
    for epoch in tqdm(range(epochs)):
        for i in range((m-1)//bs + 1):
            
            # Defining batches for SGD (this can be changed)
            start_i = i*bs
            end_i = start_i + bs
            xb = x[start_i:end_i]
            yb = y[start_i:end_i]
            
            # Calculate gradients
            dw, db = gradients(w, b, xb, yb)
            
            # Update params
            w -= lr*dw
            b -= lr*db
        
        # Calc loss
        y_hat = jnp.dot(x, w) + b
        l = jnp.mean(binary_cross_entropy_loss(y_hat, y))
        losses.append(l)
    
    return w, b, losses

In [3]:
@jit
def predict(X, w, b):
    # X --> Input.
    # Normalizing the inputs.
    x = normalize(X)
    
    # Calculating predictions/y_hat.
    preds = sigmoid(jnp.dot(X, w) + b)
    
    # if y_hat >= 0.5 --> round up to 1
    # if y_hat < 0.5 --> round up to 1
    pred_class = jnp.round(preds).astype(int)
    return pred_class


@jit
def accuracy(y_true, y_pred):
    return jnp.mean(y_true == jnp.round(y_pred))

In [4]:
def compare(X, y):
    # Training 
    start1 = time.time()
    w, b, l = train(X, y, bs=100, epochs=1000, lr=0.001)
    pred = predict(X, w, b)
    acc = accuracy(y, pred)
    end1 = time.time()
    print(f'Time to run our logistic regression: {end1 - start1} s')
    print(f'Accuracy of our logistic regression: {acc}')
    return w

In [5]:
start = time.time()
train_raw = pd.read_parquet('train_data.parquet')
end = time.time() 
print(f"Reading the parquet file took {end - start:0.4f} seconds")

Reading the parquet file took 8.0623 seconds


In [6]:
def clean_and_getSampledata(train_raw):
    raw_sample = train_raw.iloc[:100_000] #Change sample size here
    raw_sample = raw_sample.drop('B_31', axis='columns')
    sample = raw_sample.select_dtypes(include=['float32', 'int64'], exclude=['object', 'category']).fillna(0)
    categorical_features = ['target']
    sample[categorical_features] = sample[categorical_features].astype("float32")
    X_train = sample.iloc[:,:-1].values
    y_train = sample[['target']].values
    return X_train, y_train

In [7]:
X_train, y_train = clean_and_getSampledata(train_raw)

In [8]:
w2 = compare(X_train, y_train)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [05:28<00:00,  3.05it/s]

Time to run our logistic regression: 328.62737798690796 s
Accuracy of our logistic regression: 0.7498399615287781





In [9]:
# 5.477 minutes