In [3]:
import numpy as np
import pandas as pd
import random
import math
import time
from tqdm import tqdm
from numba import jit, float64, float32, int32, prange, njit

In [6]:
# Load the Cython IPython extension; it provides the %%cython cell magic
# that the next cell uses to compile `train` and `compare`.
%load_ext Cython

In [104]:
%%cython
import numpy as np
import pandas as pd
import random
import math
import time
from numba import jit, float64, float32, int32, prange, njit
cimport numpy as np

def train(np.ndarray[float, ndim=2] X, np.ndarray[float,ndim=2] y, int bs, int epochs, float lr):
    """Fit logistic-regression parameters with mini-batch gradient descent.

    The features are standardised column-wise (zero mean, unit variance) on a
    private copy, so the caller's ``X`` is never written to.  Weights and bias
    start at zero and batches are taken in row order without shuffling, so the
    result is deterministic for a given input.

    Parameters
    ----------
    X : float32 ndarray, shape (m, n)
        Raw (un-normalised) feature matrix.
    y : float32 ndarray, shape (m, 1)
        One label per row of ``X``.  Assumed to hold 0/1 class labels --
        TODO confirm against the caller.
    bs : int
        Mini-batch size; must be positive.  The last batch may be shorter.
    epochs : int
        Number of full passes over the data.
    lr : float
        Learning rate applied to both the weight and the bias update.

    Returns
    -------
    (w, b) : tuple
        ``w`` is a float32 ndarray of shape (n, 1); ``b`` is a Python float.

    Raises
    ------
    ValueError
        If ``bs`` is not positive or ``y`` has a different row count than ``X``.
    """
    cdef int m = X.shape[0]
    cdef int n = X.shape[1]

    # Initializing weights and bias to zeros.
    cdef np.ndarray[float, ndim=2] w = np.zeros((n, 1), dtype=np.float32)
    cdef float b = 0.0

    if bs <= 0:
        raise ValueError("bs must be a positive integer")
    if y.shape[0] != m:
        raise ValueError("X and y must have the same number of rows")
    if m == 0:
        # Nothing to fit; return the zero-initialised parameters.
        return w, b

    # Column statistics, accumulated in float64 for accuracy.
    means = np.mean(X, axis=0, dtype=np.float64)
    stds = np.std(X, axis=0, dtype=np.float64)
    # A constant column has std == 0; dividing by it would turn the column
    # (and, through the gradient, every weight) into NaN.  Dividing by 1
    # instead maps such a column to all zeros, so it simply contributes nothing.
    stds[stds == 0.0] = 1.0

    # Vectorised standardisation into a new float32 array: the arithmetic is
    # done in float64 and rounded once, and the caller's X stays untouched.
    X_norm = ((X - means) / stds).astype(np.float32)

    # Train.  Plain `range` is used on purpose: every update depends on the
    # previous one, so these loops are inherently sequential.
    for epoch in range(epochs):
        for start_i in range(0, m, bs):
            # Defining batches for SGD (the last one may be shorter than bs).
            xb = X_norm[start_i:start_i + bs]
            yb = y[start_i:start_i + bs]

            # Predict: sigmoid of the linear score.
            y_hat = 1.0 / (1.0 + np.exp(-(np.dot(xb, w) + b)))

            # Gradients of the mean log-loss w.r.t. weights and bias.
            diff = y_hat - yb
            scale_factor = 1.0 / xb.shape[0]
            # The product is accumulated in float64; w itself stays float32.
            dw = scale_factor * np.dot(np.transpose(xb).astype(np.float64), diff)
            db = scale_factor * np.sum(diff)

            # Update params
            w -= lr * dw
            b -= lr * db

    return w, b

def compare(np.ndarray[float, ndim=2] X, np.ndarray[float, ndim=2] y,
            int bs=100, int epochs=1000, float lr=0.001):
    """Train a logistic-regression model on (X, y) and score it on the same rows.

    Parameters
    ----------
    X : float32 ndarray, shape (rows, features)
        Raw feature matrix.  This function standardises a copy and does not
        write to ``X`` itself (whether ``train`` does is up to ``train``).
    y : float32 ndarray, shape (rows, 1)
        Labels compared for exact equality against the 0/1 predictions.
    bs, epochs, lr :
        Forwarded to ``train``.  The defaults are the values that used to be
        hard-coded here.

    Returns
    -------
    (w, acc) : tuple
        ``w`` is the weight array returned by ``train``; ``acc`` is the
        fraction of rows whose thresholded prediction equals the label.  It is
        measured on the training rows, so it is not a held-out estimate.
    """
    # Stored as a C float so the returned accuracy keeps its float32 rounding.
    cdef float acc

    # Kept for backward compatibility: reseeds Python's global `random`
    # generator.  Nothing in this function draws random numbers.
    random.seed(1)
    w, b = train(X, y, bs=bs, epochs=epochs, lr=lr)

    # Standardise the inputs column-wise on a copy, using float64 statistics.
    means = np.mean(X, axis=0, dtype=np.float64)
    stds = np.std(X, axis=0, dtype=np.float64)
    # Constant columns have std == 0; divide by 1 instead so they become
    # all-zero rather than NaN.
    stds[stds == 0.0] = 1.0
    X_norm = ((X - means) / stds).astype(np.float32)

    # Calculating predictions / y_hat.
    probs = 1.0 / (1.0 + np.exp(-(np.dot(X_norm, w) + b)))
    # Converting predicted probabilities to binary classes.  Casting the
    # boolean mask straight to float32 avoids a platform-dependent integer
    # intermediate.
    pred_rav = (probs >= 0.5).astype(np.float32).ravel()
    y_rav = np.asarray(y, dtype=np.float32).ravel()
    acc = np.mean(y_rav == pred_rav)
    return w, acc



In [10]:
# Time the parquet load with a monotonic, high-resolution clock:
# time.perf_counter() cannot jump if the system clock is adjusted mid-read,
# which time.time() can.
start = time.perf_counter()
train_raw = pd.read_parquet('./train_data.parquet')
end = time.perf_counter()
print(f"Reading the parquet file took {end - start:0.4f} seconds")

Reading the parquet file took 24.9092 seconds


In [105]:
def clean_and_getSampledata(train_raw, n_rows=100_000):
    """Build float32 feature and target arrays from the first rows of a frame.

    Steps: take the first ``n_rows`` rows, drop the ``B_31`` column, keep only
    ``float32`` and ``int64`` columns, replace missing values with 0, then
    split the ``target`` column from the remaining feature columns.

    Parameters
    ----------
    train_raw : pandas.DataFrame
        Must contain a ``B_31`` column and a ``target`` column whose dtype is
        ``float32`` or ``int64`` (other dtypes are filtered out).
    n_rows : int or None, default 100_000
        Number of leading rows to keep; ``None`` keeps every row.

    Returns
    -------
    (X_train, y_train) : tuple of float32 ndarrays
        ``X_train`` has one column per kept feature; ``y_train`` has shape
        (rows, 1).  Any kept ``int64`` column is converted to float32.

    Raises
    ------
    KeyError
        If ``B_31`` or ``target`` is missing.
    """
    raw_sample = train_raw.iloc[:n_rows].drop('B_31', axis='columns')
    sample = raw_sample.select_dtypes(
        include=['float32', 'int64'], exclude=['object', 'category']
    ).fillna(0)

    # Select features by name rather than by position: `iloc[:, :-1]` only
    # works when `target` happens to be the last column, and otherwise leaks
    # the label into X while dropping a real feature.
    features = sample.drop(columns='target')

    # Request float32 explicitly so the result does not depend on which
    # dtypes survived the filter above.
    X_train = features.to_numpy(dtype=np.float32)
    y_train = sample[['target']].to_numpy(dtype=np.float32)
    return X_train, y_train

In [106]:
# Build the feature matrix and the single-column target array from the
# leading rows of `train_raw` (the sample size is set by clean_and_getSampledata).
X_train, y_train = clean_and_getSampledata(train_raw)

In [None]:
# %%timeit
# w2, acc = compare(X_train, y_train)

In [107]:
%%time
# Train on (X_train, y_train) and score on the same rows; compare() returns
# the fitted weights and the resulting accuracy.
w2, acc = compare(X_train, y_train)

CPU times: user 7min 15s, sys: 15min 22s, total: 22min 37s
Wall time: 4min 42s


In [108]:
# `acc` comes from compare(), which evaluates on the rows the model was
# trained on, so this is training accuracy rather than a held-out estimate.
print(f'Accuracy of our logistic regression: {acc}')

Accuracy of our logistic regression: 0.874530017375946
