In [1]:
import numpy as np
import pandas as pd
import random
import math
import time
from tqdm import tqdm

In [2]:
from numba import jit, float64, float32, int32, prange, njit

In [3]:
@njit(fastmath=True, nogil=True)
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of ``z``.

    ``np.exp`` is applied to ``-z``, so ``z`` may be anything that call
    accepts; the result follows the shape of ``z``.
    """
    denominator = 1 + np.exp(-z)
    return 1.0 / denominator


@njit(nogil=True)
def loss(w, X, y):
    """Summed binary cross-entropy of the linear scores ``X @ w`` against ``y``.

    The margin is ``np.dot(X, w)`` only; no bias/intercept term is added here.
    Each term is written with ``np.logaddexp`` (``log(1 + exp(.))``) instead of
    taking the log of a sigmoid directly.

    Parameters
    ----------
    w : weight array, multiplied as ``np.dot(X, w)``.
    X : feature matrix.
    y : targets, used as ``y`` and ``1 - y`` (presumably 0/1 labels).

    Returns
    -------
    The sum (not the mean) of the per-element losses.
    """
    scores = np.dot(X, w)

    # -log(sigmoid(s)) == logaddexp(0, -s), weighted by y
    positive_part = np.logaddexp(0, -scores) * y
    # -log(1 - sigmoid(s)) == logaddexp(0, s), weighted by (1 - y)
    negative_part = np.logaddexp(0, scores) * (1 - y)

    return np.sum(positive_part + negative_part)


@njit("(float32[:, :],float32[:, :],float64[:, :])", fastmath=True, nogil=True)
def gradients(X, y, y_hat):
    """Average gradients of the loss with respect to weights and bias.

    Parameters
    ----------
    X : float32 2-D array of inputs; its first dimension is the row count.
    y : float32 2-D array of targets.
    y_hat : float64 2-D array of predictions.

    Returns
    -------
    (dw, db) : ``dw`` is ``X.T @ (y_hat - y)`` divided by the row count,
        ``db`` is the sum of ``y_hat - y`` divided by the row count.
    """
    n_rows = X.shape[0]
    inv_rows = np.divide(1, n_rows)
    residual = y_hat - y

    # Weights: X is promoted to float64 so both operands of the product match.
    grad_w = inv_rows * np.dot(X.T.astype(np.float64), residual)

    # Bias: mean of the residuals.
    grad_b = inv_rows * np.sum(residual)

    return grad_w, grad_b


@njit("float32[:, :](float32[:, :])", nogil=True, parallel=True)
def normalize(X):
    """Standardize every column of ``X`` to zero mean and unit variance.

    Parameters
    ----------
    X : float32 2-D array, one column per feature. It is only read; the
        standardized values are written to a newly allocated array.

    Returns
    -------
    float32 2-D array with the same shape as ``X`` holding
    ``(X[i, j] - mean_j) / std_j``. A column whose standard deviation is 0
    (a constant feature) is divided by 1 instead, so it comes out as all
    zeros rather than NaN/inf.
    """
    n, m = X.shape
    means = np.zeros(m)
    stds = np.zeros(m)

    # Column statistics are independent of each other, so this loop is safe
    # to run in parallel.
    for j in prange(m):
        col = X[:, j]
        means[j] = np.mean(col)
        sd = np.std(col)
        # Guard against constant columns: dividing by 0 would poison the
        # whole training run with NaN/inf.
        if sd > 0.0:
            stds[j] = sd
        else:
            stds[j] = 1.0

    # Write into a fresh buffer so the caller's array is left untouched.
    out = np.empty_like(X)

    # Only the outer loop is parallel; the inner column loop is a plain range.
    for i in prange(n):
        for j in range(m):
            out[i, j] = (X[i, j] - means[j]) / stds[j]

    return out


@njit("(float32[:, :],float32[:, :],int32,int32,float32)", nogil=True, parallel=True, fastmath=True)
def train(X, y, bs, epochs, lr):
    """Fit logistic-regression weights with mini-batch gradient descent.

    Parameters
    ----------
    X : float32 2-D array of inputs, one row per sample. It is passed through
        ``normalize`` before training.
    y : float32 2-D array of targets, sliced row-wise in step with ``X``.
    bs : int32 batch size (number of rows per update).
    epochs : int32 number of full passes over the data.
    lr : float32 learning rate.

    Returns
    -------
    (w, b, np_losses) : ``w`` is a float32 array of shape ``(n_features, 1)``,
        ``b`` is the scalar bias, and ``np_losses`` is a float32 array with one
        entry per epoch.
    """
    m = X.shape[0]
    n = X.shape[1]
    n_epochs = max(epochs, 0)

    # Initializing weights and bias to zeros.
    w = np.zeros((n, 1), dtype=np.float32)
    b = 0.0

    # Normalize inputs
    x = normalize(X)

    # Store losses
    np_losses = np.empty(n_epochs, dtype=np.float32)

    # Ceiling division: the last batch may be shorter than bs.
    n_batches = (m - 1) // bs + 1

    # Train.
    # Every update reads the w and b produced by the previous update, so the
    # iterations carry a dependency and must run in order. Use range here,
    # never prange: a parallel loop would race on w and b.
    for epoch in range(n_epochs):
        for i in range(n_batches):

            # Defining batches for SGD
            start_i = i * bs
            end_i = start_i + bs
            xb = x[start_i:end_i]
            yb = y[start_i:end_i]

            # Predict
            y_hat = 1.0 / (1 + np.exp(-(np.dot(xb, w) + b)))

            # Calculate gradients
            dw, db = gradients(xb, yb, y_hat)

            # Update params
            w -= lr * dw
            b -= lr * db

        # Calc loss once per epoch on the full data set.
        # NOTE: loss() takes no bias argument, so the recorded value is
        # computed from x @ w only.
        np_losses[epoch] = loss(w, x, y)

    return w, b, np_losses

@njit
def predict(X, w, b):
    """Classify the rows of ``X`` with weights ``w`` and bias ``b``.

    ``X`` is passed through ``normalize`` first, the linear scores are squashed
    by ``sigmoid``, and each probability is thresholded at 0.5.

    Returns
    -------
    Array of 1 where the probability is >= 0.5 and 0 elsewhere.
    """
    # Normalize the inputs.
    x = normalize(X)

    # Predicted probabilities (y_hat).
    scores = np.dot(x, w) + b
    probabilities = sigmoid(scores)

    # Convert the probabilities to binary classes.
    return np.where(probabilities >= 0.5, 1, 0)

@njit
def accuracy(y, pred):
    """Fraction of positions at which ``y`` and ``pred`` are equal.

    Both inputs are converted to float32 and flattened before the
    element-wise comparison; the match count is divided by the number of
    elements in ``y``.
    """
    truth = np.asarray(y, dtype=np.float32).ravel()
    guess = np.asarray(pred, dtype=np.float32).ravel()

    n_matches = np.sum(truth == guess)
    return n_matches / len(truth)

@njit("(float32[:, :],float32[:, :])", fastmath=True, nogil=True)
def compare(X, y):
    """Train on ``(X, y)`` and score the fitted model on that same data.

    Training uses a batch size of 100, 1000 epochs and a learning rate of
    0.001.

    Returns
    -------
    (w, acc) : the learned weights and the accuracy of the predictions
        made on ``X`` against ``y``.
    """
    # Fix the seed of the random module for this run.
    random.seed(1)

    # Fit the model; the per-epoch loss history is not used here.
    weights, bias, _loss_history = train(X, y, bs=100, epochs=1000, lr=0.001)

    # Evaluate on the training inputs.
    predicted = predict(X, weights, bias)
    score = accuracy(y, predicted)

    return weights, score

  y_hat = 1.0 / (1 + np.exp(-(np.dot(xb, w) + b)))
  y_hat = 1.0 / (1 + np.exp(-(np.dot(xb, w) + b)))
  l = loss(w, x, y)
  l = loss(w, x, y)
  y_hat = 1.0 / (1 + np.exp(-(np.dot(xb, w) + b)))
  y_hat = 1.0 / (1 + np.exp(-(np.dot(xb, w) + b)))
  pred = predict(X, w, b)
  pred = predict(X, w, b)


In [4]:
# Load the raw training data and report how long the read took.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments (unlike time.time()).
start = time.perf_counter()
train_raw = pd.read_parquet('./data/train_data.parquet')
end = time.perf_counter()
print(f"Reading the parquet file took {end - start:0.4f} seconds")

Reading the parquet file took 4.9624 seconds


In [5]:
def clean_and_getSampledata(train_raw, n_samples=100_000):
    """Build a float32 feature matrix and label column from the raw frame.

    Steps: keep the first ``n_samples`` rows, drop the ``'B_31'`` column, keep
    only ``float32``/``int64`` columns, replace missing values with 0, and
    split off the ``'target'`` column as the label.

    Parameters
    ----------
    train_raw : pandas.DataFrame
        Must contain a ``'B_31'`` column and a ``'target'`` column of dtype
        ``float32`` or ``int64``.
    n_samples : int or None, default 100_000
        Number of leading rows to keep; ``None`` keeps every row.

    Returns
    -------
    (X_train, y_train) : ``X_train`` is a C-contiguous float32 array of shape
        ``(rows, n_features)`` holding every selected column except
        ``'target'``; ``y_train`` is a float32 array of shape ``(rows, 1)``.
    """
    raw_sample = train_raw.iloc[:n_samples].drop('B_31', axis='columns')
    sample = raw_sample.select_dtypes(include=['float32', 'int64']).fillna(0)

    # Split by column name rather than by position, so the result stays
    # correct even when 'target' is not the last column.
    features = sample.drop(columns='target')

    # Force float32 (any remaining int64 column would otherwise upcast the
    # whole matrix) and a C-contiguous layout for the downstream np.dot calls.
    X_train = np.ascontiguousarray(features.to_numpy(dtype=np.float32))
    y_train = sample[['target']].to_numpy(dtype=np.float32)
    return X_train, y_train

In [6]:
# Build the feature matrix and the label column from the loaded raw frame.
X_train, y_train = clean_and_getSampledata(train_raw)

In [12]:
# Train on the sample and score on that same data: w2 receives the learned
# weights and acc the accuracy that compare() measured on X_train / y_train.
w2, acc = compare(X_train, y_train)

In [8]:
# %%timeit
# w2, acc = compare(X_train, y_train)

In [14]:
# %%time
# w2, acc = compare(X_train, y_train)

In [10]:
# print(f'Accuracy of our logistic regression: {acc}')