In [1]:
# Register the line_profiler IPython extension, which provides the %lprun magic.
%load_ext line_profiler

In [1]:
import numpy as np
import pandas as pd
import random
import math
import time
from tqdm import tqdm

In [2]:
##### NUMPY VERSION
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, applied element-wise.

    Evaluated as ``exp(-log(1 + exp(-z)))`` through ``np.logaddexp`` so that
    very negative inputs underflow to 0.0 instead of overflowing inside
    ``np.exp`` (which triggers a RuntimeWarning in the naive formula).

    Parameters
    ----------
    z : scalar or np.ndarray
        Logits.

    Returns
    -------
    numpy scalar or np.ndarray
        Values in [0, 1] with the same shape as ``z``.
    """
    # logaddexp(0, -z) == log(1 + exp(-z)), computed without forming exp(-z).
    return np.exp(-np.logaddexp(0, -z))


def loss(w, X, y, b=0.0):
    """Summed binary cross-entropy of a logistic model, computed from logits.

    The per-sample loss is ``y * log(1 + exp(-z)) + (1 - y) * log(1 + exp(z))``
    with ``z = X @ w + b``; ``np.logaddexp`` evaluates the log terms without
    overflowing for large ``|z|``.

    Parameters
    ----------
    w : np.ndarray
        Weight vector, multiplied as ``X @ w``.
    X : np.ndarray
        Design matrix, one row per sample.
    y : np.ndarray
        Labels in {0, 1}; must have the same shape as ``X @ w`` so the
        element-wise products do not broadcast.
    b : float, optional
        Bias added to every logit. Defaults to 0.0, which reproduces the
        bias-free behaviour of the three-argument call.

    Returns
    -------
    numpy scalar
        The loss summed (not averaged) over all samples.
    """
    margin = np.matmul(X, w) + b
    per_sample = y * np.logaddexp(0, -margin) + (1 - y) * np.logaddexp(0, margin)
    return np.sum(per_sample)


def gradients(X, y, y_hat):
    """Mean gradients of the loss with respect to the weights and the bias.

    Parameters
    ----------
    X : np.ndarray
        Batch of inputs, one row per sample.
    y : np.ndarray
        True labels for the batch.
    y_hat : np.ndarray
        Predicted probabilities for the batch.

    Returns
    -------
    tuple
        ``(dw, db)`` where ``dw = X.T @ (y_hat - y) / m`` and
        ``db = sum(y_hat - y) / m``, with ``m`` the number of rows in ``X``.
    """
    n_samples = X.shape[0]
    inv_n = 1 / n_samples

    # Prediction error, shared by both gradients.
    residual = y_hat - y

    grad_w = inv_n * np.dot(X.T, residual)
    grad_b = inv_n * np.sum(residual)
    return grad_w, grad_b


def normalize(X):
    """Standardize each column of ``X`` to zero mean and unit standard deviation.

    Parameters
    ----------
    X : np.ndarray
        Data with samples along axis 0.

    Returns
    -------
    np.ndarray
        A new array ``(X - mean) / std`` computed column-wise; ``X`` itself is
        not modified. Columns whose standard deviation is 0 (constant columns)
        are only centred, so they come out as all zeros rather than NaN/inf.
    """
    # Column-wise statistics.
    means = np.mean(X, axis=0)
    stds = np.std(X, axis=0)

    # Dividing a centred constant column by its std would be 0/0; use 1 there.
    safe_stds = np.where(stds == 0, 1.0, stds)

    return (X - means) / safe_stds


def train(X, y, bs, epochs, lr):
    """Fit a logistic regression with mini-batch gradient descent.

    Parameters
    ----------
    X : np.ndarray
        2-D feature matrix, one row per sample. It is standardized with
        ``normalize`` before training, so the returned parameters apply to
        standardized features.
    y : array-like
        One label per row of ``X``; reshaped internally to an ``(m, 1)`` column.
    bs : int
        Mini-batch size.
    epochs : int
        Number of full passes over the data.
    lr : float
        Learning rate.

    Returns
    -------
    tuple
        ``(w, b, losses)``: the ``(n, 1)`` weight array, the bias, and a list
        with the summed cross-entropy over the whole data set after each epoch.
    """
    m, n = X.shape

    # Keep labels as an (m, 1) column so they align element-wise with the
    # (bs, 1) predictions instead of broadcasting to a square matrix.
    y = np.asarray(y).reshape(m, 1)

    # Parameters start at zero.
    w = np.zeros((n, 1))
    b = 0.0

    x = normalize(X)
    losses = []

    for epoch in tqdm(range(epochs)):
        # Sequential (unshuffled) mini-batches; the last one may be shorter.
        for start_i in range(0, m, bs):
            end_i = start_i + bs
            xb = x[start_i:end_i]
            yb = y[start_i:end_i]

            # Forward pass, then one gradient step on this batch.
            y_hat = sigmoid(np.dot(xb, w) + b)
            dw, db = gradients(xb, yb, y_hat)
            w -= lr * dw
            b -= lr * db

        # Summed cross-entropy on the full data, evaluated on the same logits
        # the updates use (x @ w + b) so the bias is reflected in the curve.
        # log(1 + exp(z)) - y*z is the overflow-safe form of the per-sample loss.
        logits = np.dot(x, w) + b
        losses.append(np.sum(np.logaddexp(0, logits) - y * logits))

    return w, b, losses



def predict(X, w, b):
    """Predict binary class labels for the rows of ``X``.

    ``X`` is passed through ``normalize`` first, i.e. it is standardized with
    statistics computed from ``X`` itself, and then scored as
    ``sigmoid(x @ w + b)``.

    Parameters
    ----------
    X : np.ndarray
        Raw (un-normalized) feature matrix.
    w : np.ndarray
        Weight array.
    b : float
        Bias.

    Returns
    -------
    np.ndarray
        Integer array of 0/1 labels; 1 where the predicted probability is
        at least 0.5.
    """
    scores = np.dot(normalize(X), w) + b
    probabilities = sigmoid(scores)

    # Threshold the probabilities at 0.5 to obtain hard class labels.
    return np.where(probabilities >= 0.5, 1, 0)


def accuracy(y, y_hat):
    """Fraction of positions where ``y`` and ``y_hat`` agree.

    Both inputs are flattened first, so column vectors and flat arrays can be
    mixed freely.
    """
    truth = np.asarray(y).ravel()
    guess = np.asarray(y_hat).ravel()
    n_correct = np.sum(truth == guess)
    return n_correct / truth.size


def compare(X, y, bs=100, epochs=1000, lr=0.001):
    """Train the logistic regression, score it on the same data, and report.

    Prints the elapsed time for training + prediction + scoring and the
    accuracy of the predictions on ``X`` itself.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix passed to ``train`` and ``predict``.
    y : np.ndarray
        Labels passed to ``train`` and ``accuracy``.
    bs : int, optional
        Mini-batch size forwarded to ``train`` (default 100).
    epochs : int, optional
        Number of epochs forwarded to ``train`` (default 1000).
    lr : float, optional
        Learning rate forwarded to ``train`` (default 0.001).

    Returns
    -------
    np.ndarray
        The learned weights ``w`` (the bias and loss history are discarded).
    """
    # perf_counter is the clock intended for measuring short intervals.
    start1 = time.perf_counter()
    w, b, _ = train(X, y, bs=bs, epochs=epochs, lr=lr)
    pred = predict(X, w, b)
    acc = accuracy(y, pred)
    elapsed = time.perf_counter() - start1

    print(f'Time to run our logistic regression: {elapsed} s')
    print(f'Accuracy of our logistic regression: {acc}')
    return w

In [3]:
# Load the raw training table from parquet and report how long the read took.
start = time.time()
train_raw = pd.read_parquet('./data/train_data.parquet')
end = time.time() 
print(f"Reading the parquet file took {end - start:0.4f} seconds")

Reading the parquet file took 8.5037 seconds


In [4]:
def clean_and_getSampledata(train_raw, n_rows=100_000):
    """Build a numeric feature matrix and label column from the raw frame.

    Steps: take the first ``n_rows`` rows, drop the ``B_31`` column, keep only
    ``float32`` / ``int64`` columns, and replace missing values with 0.

    Parameters
    ----------
    train_raw : pd.DataFrame
        Raw data; must contain the columns ``B_31`` and ``target``, and
        ``target`` must be ``float32`` or ``int64`` to survive the dtype filter.
    n_rows : int or None, optional
        Number of leading rows to use (default 100_000). ``None`` uses all rows.

    Returns
    -------
    tuple
        ``(X_train, y_train)``: a 2-D array of every kept column except
        ``target``, and an ``(n, 1)`` float32 array holding ``target``.
    """
    raw_sample = train_raw.iloc[:n_rows].drop(columns='B_31')
    sample = raw_sample.select_dtypes(include=['float32', 'int64']).fillna(0)

    y_train = sample[['target']].astype('float32').values
    # Drop the label by name rather than by position, so it can never end up
    # among the features if it is not the last column.
    X_train = sample.drop(columns='target').values
    return X_train, y_train

In [5]:
# Build the feature matrix and label column from the loaded frame.
X_train, y_train = clean_and_getSampledata(train_raw)

In [6]:
# Train, time and score the NumPy logistic regression; keep the learned weights.
w2 = compare(X_train, y_train)

100%|██████████| 1000/1000 [02:06<00:00,  7.90it/s]


Time to run our logistic regression: 126.9586329460144 s
Accuracy of our logistic regression: 0.87453


In [None]:
# %%timeit
# w2 = compare(X_train, y_train)

In [None]:
# %%time
# w2 = compare(X_train, y_train)

In [16]:
# %lprun -f train w, b, losses = train(X_train, y_train, bs=100, epochs=1000, lr=0.001)

## Numpy version: 10_000 rows

NUMPY version with 10_000 rows

Timer unit: 1e-09 s

Total time: 8.83284 s
File: /var/folders/9c/9y0zmgk55297pv9nj1zpn02c0000gn/T/ipykernel_7388/2755537311.py
Function: train at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def train(X, y, bs, epochs, lr):
     2         1       5000.0   5000.0      0.0      m, n = X.shape
     3                                               
     4                                               # Initializing weights and bias to zeros.
     5         1      19000.0  19000.0      0.0      w = np.zeros((n,1))
     6         1       1000.0   1000.0      0.0      b = 0
     7                                               
     8                                               # Normalize inputs
     9         1   32351000.0 32351000.0      0.4      x = normalize(X)
    10                                               
    11                                               # Store losses
    12         1       3000.0   3000.0      0.0      losses = []
    13                                               
    14                                               # Train
    15                                               # for epoch in tqdm(range(epochs)):
    16      1000     672000.0    672.0      0.0      for epoch in range(epochs):
    17    100000   39397000.0    394.0      0.4          for i in range((m-1)//bs + 1):
    18                                                       
    19                                                       # Defining batches for SGD (this can be changed)
    20    100000   35944000.0    359.4      0.4              start_i = i*bs
    21    100000   36371000.0    363.7      0.4              end_i = start_i + bs
    22    100000   80850000.0    808.5      0.9              xb = x[start_i:end_i]
    23    100000   51996000.0    520.0      0.6              yb = y[start_i:end_i]
    24                                                       
    25                                                       # Predict
    26    100000 3408959000.0  34089.6     38.6              y_hat = sigmoid(np.dot(xb, w) + b)
    27                                                       
    28                                                       # Calculate gradients
    29    100000 3699699000.0  36997.0     41.9              dw, db = gradients(xb, yb, y_hat)
    30                                                       
    31                                                       # Update params
    32    100000  279678000.0   2796.8      3.2              w -= lr*dw
    33    100000   53445000.0    534.5      0.6              b -= lr*db
    34                                                   
    35                                                   # Calc loss
    36      1000 1111383000.0 1111383.0     12.6          l = loss(w, x, y)
    37      1000    2070000.0   2070.0      0.0          losses.append(l)
    38                                                   
    39         1       1000.0   1000.0      0.0      return w, b, losses, dw, db, xb, yb, y_hat, x, y

## Slowest Version: 10_000 rows

Slower Loss Function with 10_000 rows

Timer unit: 1e-09 s

Total time: 1245.67 s
File: /var/folders/9c/9y0zmgk55297pv9nj1zpn02c0000gn/T/ipykernel_6936/2755537311.py
Function: train at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def train(X, y, bs, epochs, lr):
     2         1       3000.0   3000.0      0.0      m, n = X.shape
     3                                               
     4                                               # Initializing weights and bias to zeros.
     5         1      12000.0  12000.0      0.0      w = np.zeros((n,1))
     6         1          0.0      0.0      0.0      b = 0
     7                                               
     8                                               # Normalize inputs
     9         1 2920546000.0 2920546000.0      0.2      x = normalize(X)
    10                                               
    11                                               # Store losses
    12         1          0.0      0.0      0.0      losses = []
    13                                               
    14                                               # Train
    15                                               # for epoch in tqdm(range(epochs)):
    16      1000     604000.0    604.0      0.0      for epoch in range(epochs):
    17    100000   46172000.0    461.7      0.0          for i in range((m-1)//bs + 1):
    18                                                       
    19                                                       # Defining batches for SGD (this can be changed)
    20    100000   49060000.0    490.6      0.0              start_i = i*bs
    21    100000   48828000.0    488.3      0.0              end_i = start_i + bs
    22    100000  104667000.0   1046.7      0.0              xb = x[start_i:end_i]
    23    100000   59746000.0    597.5      0.0              yb = y[start_i:end_i]
    24                                                       
    25                                                       # Predict
    26    100000 16590260000.0 165902.6      1.3              y_hat = sigmoid(np.dot(xb, w) + b)
    27                                                       
    28                                                       # Calculate gradients
    29    100000 68910900000.0 689109.0      5.5              dw, db = gradients(xb, yb, y_hat)
    30                                                       
    31                                                       # Update params
    32    100000  477482000.0   4774.8      0.0              w -= lr*dw
    33    100000   55244000.0    552.4      0.0              b -= lr*db
    34                                                   
    35                                                   # Calc loss
    36      1000 1156398994000.0 1156398994.0     92.8          l = loss(w, x, y)
    37      1000    2751000.0   2751.0      0.0          losses.append(l)
    38                                                   
    39         1       1000.0   1000.0      0.0      return w, b, losses, dw, db, xb, yb, y_hat, x, y

Slower gradients function over 10_000 rows

Timer unit: 1e-09 s

Total time: 97.2686 s
File: /var/folders/9c/9y0zmgk55297pv9nj1zpn02c0000gn/T/ipykernel_6380/2755537311.py
Function: train at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def train(X, y, bs, epochs, lr):
     2         1       4000.0   4000.0      0.0      m, n = X.shape
     3                                               
     4                                               # Initializing weights and bias to zeros.
     5         1      15000.0  15000.0      0.0      w = np.zeros((n,1))
     6         1          0.0      0.0      0.0      b = 0
     7                                               
     8                                               # Normalize inputs
     9         1 3182344000.0 3182344000.0      3.3      x = normalize(X)
    10                                               
    11                                               # Store losses
    12         1       1000.0   1000.0      0.0      losses = []
    13                                               
    14                                               # Train
    15                                               # for epoch in tqdm(range(epochs)):
    16      1000     762000.0    762.0      0.0      for epoch in range(epochs):
    17    100000   49147000.0    491.5      0.1          for i in range((m-1)//bs + 1):
    18                                                       
    19                                                       # Defining batches for SGD (this can be changed)
    20    100000   51853000.0    518.5      0.1              start_i = i*bs
    21    100000   56232000.0    562.3      0.1              end_i = start_i + bs
    22    100000  120018000.0   1200.2      0.1              xb = x[start_i:end_i]
    23    100000   67372000.0    673.7      0.1              yb = y[start_i:end_i]
    24                                                       
    25                                                       # Predict
    26    100000 16504809000.0 165048.1     17.0              y_hat = sigmoid(np.dot(xb, w) + b)
    27                                                       
    28                                                       # Calculate gradients
    29    100000 75450204000.0 754502.0     77.6              dw, db = gradients(xb, yb, y_hat)
    30                                                       
    31                                                       # Update params
    32    100000  515040000.0   5150.4      0.5              w -= lr*dw
    33    100000   67845000.0    678.5      0.1              b -= lr*db
    34                                                   
    35                                                   # Calc loss
    36      1000 1201491000.0 1201491.0      1.2          l = loss(w, x, y)
    37      1000    1483000.0   1483.0      0.0          losses.append(l)
    38                                                   
    39         1       1000.0   1000.0      0.0      return w, b, losses, dw, db, xb, yb, y_hat, x, y

Extremely slow gradients function, on 200 rows only

Timer unit: 1e-09 s

Total time: 20.9818 s
File: /var/folders/9c/9y0zmgk55297pv9nj1zpn02c0000gn/T/ipykernel_5342/2755537311.py
Function: train at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def train(X, y, bs, epochs, lr):
     2         1       2000.0   2000.0      0.0      m, n = X.shape
     3                                               
     4                                               # Initializing weights and bias to zeros.
     5         1       6000.0   6000.0      0.0      w = np.zeros((n,1))
     6         1          0.0      0.0      0.0      b = 0
     7                                               
     8                                               # Normalize inputs
     9         1   12975000.0 12975000.0      0.1      x = normalize(X)
    10                                               
    11                                               # Store losses
    12         1       1000.0   1000.0      0.0      losses = []
    13                                               
    14                                               # Train
    15                                               # for epoch in tqdm(range(epochs)):
    16      1000     600000.0    600.0      0.0      for epoch in range(epochs):
    17      2000    2656000.0   1328.0      0.0          for i in range((m-1)//bs + 1):
    18                                                       
    19                                                       # Defining batches for SGD (this can be changed)
    20      2000    1196000.0    598.0      0.0              start_i = i*bs
    21      2000     903000.0    451.5      0.0              end_i = start_i + bs
    22      2000    3723000.0   1861.5      0.0              xb = x[start_i:end_i]
    23      2000    1261000.0    630.5      0.0              yb = y[start_i:end_i]
    24                                                       
    25                                                       # Predict
    26      2000  279952000.0 139976.0      1.3              y_hat = sigmoid(np.dot(xb, w) + b)
    27                                                       
    28                                                       # Calculate gradients
    29      2000 20562835000.0 10281417.5     98.0              dw, db = gradients(xb, yb, y_hat)
    30                                                       
    31                                                       # Update params
    32      2000   12389000.0   6194.5      0.1              w -= lr*dw
    33      2000    2388000.0   1194.0      0.0              b -= lr*db
    34                                                   
    35                                                   # Calc loss
    36      1000   99422000.0  99422.0      0.5          l = loss(w, x, y)
    37      1000    1456000.0   1456.0      0.0          losses.append(l)
    38                                                   
    39         1       1000.0   1000.0      0.0      return w, b, losses, dw, db, xb, yb, y_hat, x, y

ITERATIVE SIGMOID function

Timer unit: 1e-09 s

Total time: 22.8768 s
File: /var/folders/9c/9y0zmgk55297pv9nj1zpn02c0000gn/T/ipykernel_3505/2337235970.py
Function: train at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def train(X, y, bs, epochs, lr):
     2         1       3000.0   3000.0      0.0      m, n = X.shape
     3                                               
     4                                               # Initializing weights and bias to zeros.
     5         1      12000.0  12000.0      0.0      w = np.zeros((n,1))
     6         1       1000.0   1000.0      0.0      b = 0
     7                                               
     8                                               # Normalize inputs
     9         1 3408265000.0 3408265000.0     14.9      x = normalize(X)
    10                                               
    11                                               # Store losses
    12         1       1000.0   1000.0      0.0      losses = []
    13                                               
    14                                               # Train
    15      1000     876000.0    876.0      0.0      for epoch in range(epochs):
    16    100000   42802000.0    428.0      0.2          for i in range((m-1)//bs + 1):
    17                                                       
    18                                                       # Defining batches for SGD (this can be changed)
    19    100000   37798000.0    378.0      0.2              start_i = i*bs
    20    100000   40305000.0    403.1      0.2              end_i = start_i + bs
    21    100000   92731000.0    927.3      0.4              xb = x[start_i:end_i]
    22    100000   51316000.0    513.2      0.2              yb = y[start_i:end_i]
    23                                                       
    24                                                       # Predict
    25    100000 13611616000.0 136116.2     59.5              y_hat = sigmoid(np.dot(xb, w) + b)
    26                                                       
    27                                                       # Calculate gradients
    28    100000 4155308000.0  41553.1     18.2              dw, db = gradients(xb, yb, y_hat)
    29                                                       
    30                                                       # Update params
    31    100000  290947000.0   2909.5      1.3              w -= lr*dw
    32    100000   60531000.0    605.3      0.3              b -= lr*db
    33                                                   
    34                                                   # Calc loss
    35      1000 1082679000.0 1082679.0      4.7          l = loss(w, x, y)
    36      1000    1577000.0   1577.0      0.0          losses.append(l)
    37                                                   
    38         1          0.0      0.0      0.0      return w, b, losses

Timer unit: 1e-09 s

Total time: 12.0944 s
File: /var/folders/9c/9y0zmgk55297pv9nj1zpn02c0000gn/T/ipykernel_2969/2337235970.py
Function: train at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def train(X, y, bs, epochs, lr):
     2         1       3000.0   3000.0      0.0      m, n = X.shape
     3                                               
     4                                               # Initializing weights and bias to zeros.
     5         1      32000.0  32000.0      0.0      w = np.zeros((n,1))
     6         1       1000.0   1000.0      0.0      b = 0
     7                                               
     8                                               # Normalize inputs
     9         1 3052914000.0 3052914000.0     25.2      x = normalize(X)
    10                                               
    11                                               # Store losses
    12         1          0.0      0.0      0.0      losses = []
    13                                               
    14                                               # Train
    15      1000     826000.0    826.0      0.0      for epoch in range(epochs):
    16    100000   42030000.0    420.3      0.3          for i in range((m-1)//bs + 1):
    17                                                       
    18                                                       # Defining batches for SGD (this can be changed)
    19    100000   36225000.0    362.2      0.3              start_i = i*bs
    20    100000   36684000.0    366.8      0.3              end_i = start_i + bs
    21    100000   83706000.0    837.1      0.7              xb = x[start_i:end_i]
    22    100000   53319000.0    533.2      0.4              yb = y[start_i:end_i]
    23                                                       
    24                                                       # Predict
    25    100000 3582322000.0  35823.2     29.6              y_hat = sigmoid(np.dot(xb, w) + b)
    26                                                       
    27                                                       # Calculate gradients
    28    100000 3859054000.0  38590.5     31.9              dw, db = gradients(xb, yb, y_hat)
    29                                                       
    30                                                       # Update params
    31    100000  287882000.0   2878.8      2.4              w -= lr*dw
    32    100000   52495000.0    525.0      0.4              b -= lr*db
    33                                                   
    34                                                   # Calc loss
    35      1000 1005112000.0 1005112.0      8.3          l = loss(w, x, y)
    36      1000    1817000.0   1817.0      0.0          losses.append(l)
    37                                                   
    38         1       1000.0   1000.0      0.0      return w, b, losses