In [1]:
import numpy as np
import pandas as pd
import random
import math
import time
from tqdm import tqdm

from numba import jit, float64, float32, int32, prange, njit

In [8]:
@njit("(float32[:, :],float32[:, :],int32,int32,float32)", nogil=True, parallel=True, fastmath=True)
def train(X, y, bs, epochs, lr):
    m = X.shape[0]
    n = X.shape[1]
    
    # Initializing weights and bias to zeros.
    w = np.zeros((n,1), dtype=np.float32)
    b = 0.0
    
    # Normalize inputs
    means = np.zeros(n)
    stds = np.zeros(n)
    
    # Compute column-wise means and standard deviations
    for j in prange(n):
        col = X[:, j]
        means[j] = np.mean(col)
        stds[j] = np.std(col)
    
    # Normalize X based on means and standard deviations
    for i in prange(m):
        for j in prange(n):
            X[i, j] = (X[i, j] - means[j]) / stds[j]
    
    # Train
    for epoch in prange(epochs):
        for i in prange((m-1)//bs + 1):
            
            # Defining batches for SGD
            start_i = i*bs
            end_i = start_i + bs
            xb = X[start_i:end_i]
            yb = y[start_i:end_i]
            
            # Predict
            y_hat = 1.0 / (1 + np.exp(-(np.dot(xb, w) + b)))
            
            # Calculate gradients
            diff = np.subtract(y_hat, yb)
            scale_factor = np.divide(1,xb.shape[0])

            #Gradient of loss wrt weights
            dot = np.dot(np.transpose(xb).astype(np.float64), diff)
            dw = np.multiply(scale_factor, dot)

            #Gradient of loss wrt bias
            db = np.multiply(scale_factor, np.sum(diff))
            
            # Update params
            w -= lr*dw
            b -= lr*db
        
    return w, b

@njit("Tuple((float32[:, :], float32))(float32[:, :],float32[:, :])", nogil=True, fastmath=True)
def compare(X, y): 
    random.seed(1)
    w, b = train(X, y, bs=100, epochs=1000, lr=0.001)

    # Normalizing the inputs.
    n, m = X.shape
    means = np.zeros(m)
    stds = np.zeros(m)
    
    # Compute column-wise means and standard deviations
    for j in prange(m):
        col = X[:, j]
        means[j] = np.mean(col)
        stds[j] = np.std(col)
    
    # Normalize X based on means and standard deviations
    for i in prange(n):
        for j in prange(m):
            X[i, j] = (X[i, j] - means[j]) / stds[j]
    
    # Calculating predictions/y_hat.
    preds = 1.0 / (1 + np.exp(-(np.dot(X, w) + b)))
    
    # Converting predicted probabilities to binary classes.
    pred = np.where(preds >= 0.5, 1, 0)
    
    y_rav = np.asarray(y, dtype=np.float32).ravel()
    pred_rav = np.asarray(pred, dtype=np.float32).ravel()
    acc = np.sum(y_rav == pred_rav) / len(y_rav)
    return w, acc

  preds = 1.0 / (1 + np.exp(-(np.dot(X, w) + b)))
  preds = 1.0 / (1 + np.exp(-(np.dot(X, w) + b)))


In [3]:
start = time.time()
train_raw = pd.read_parquet('./data/train_data.parquet')
end = time.time() 
print(f"Reading the parquet file took {end - start:0.4f} seconds")

Reading the parquet file took 5.4225 seconds


In [9]:
def clean_and_getSampledata(train_raw):
    raw_sample = train_raw.iloc[:100_000] #Change sample size here
    raw_sample = raw_sample.drop('B_31', axis='columns')
    sample = raw_sample.select_dtypes(include=['float32', 'int64'], exclude=['object', 'category']).fillna(0)
    categorical_features = ['target']
    sample[categorical_features] = sample[categorical_features].astype("float32")
    X_train = sample.iloc[:,:-1].values
    y_train = sample[['target']].values
    return X_train, y_train

In [10]:
X_train, y_train = clean_and_getSampledata(train_raw)

In [12]:
w2, acc = compare(X_train, y_train)

In [None]:
%%timeit
w2, acc = compare(X_train, y_train)

27.2 s ± 603 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
#  %%time
#  w2, acc = compare(X_train, y_train)

In [9]:
# print(f'Accuracy of our logistic regression: {acc}')