# Code Logistic Regression from scratch in Python

Equation of logistic regression is: y = 1/(1 + exp(-z))
where z = θ0 + θ1\*x1 + θ2\*x2 + θ3\*x3 + θ4\*x4 + ...

Lets say in our case z = 5 + (-2)\*x1 + 3\*x2 + (-4)\*x3 + 2.5\*x4

Our goal is to estimate θ0, θ1, θ2, θ3 and θ4 from the data

In [1]:
import copy
import numpy as np
import pandas as pd

NUM_FEATURES = 4
NUM_ROWS = 10000
THETAS = np.array([5, -2, 3, -4, 2.5])
EPOCHS = 10000
LEARNING_RATE = 0.1

### Create dummy data

In [2]:
def sigmoid(x):
    return 1/(1 + np.exp((-1)*x))

In [3]:
def create_dummy_data(num_features, num_rows):
    columns = []
    for i in range(1, num_features+1):
        column_name = f'x{i}'
        columns.append(column_name)
        
    x_columns = copy.deepcopy(columns)
    columns.append('y')
    
    df = pd.DataFrame(columns=columns)
    for i in range(num_rows):
        X = np.random.randint(1, 100, num_features)
        X = np.insert(X, 0, 1)
        products = X*THETAS
        # Add random jitter of 10%
        jitter = np.random.uniform(-0.1, 0.1, num_features+1)
        products = products+(jitter*products)
        z = sum(products)
        z_sigmoid = sigmoid(z)
        y = 1 if z_sigmoid >= 0.5 else 0
        df.loc[len(df)] = list(X[1:]) + [y]
        
    return df

In [4]:
df = create_dummy_data(num_features=NUM_FEATURES, num_rows=NUM_ROWS)

In [5]:
df.shape

(10000, 5)

In [6]:
df.head()

Unnamed: 0,x1,x2,x3,x4,y
0,26,37,95,99,0
1,51,3,58,81,0
2,26,49,61,69,1
3,25,33,93,41,0
4,92,12,82,45,0


In [7]:
df['y'].value_counts()

0    5462
1    4538
Name: y, dtype: int64

### Estimate parameters from data

In [8]:
# Start with random estimates of parameters
params = np.zeros((NUM_FEATURES+1, 1))
X = df.iloc[:, :NUM_FEATURES]
X['x0'] = 1
columns_order = sorted(X.columns)
X = X[columns_order]
y = df['y'].values.reshape((NUM_ROWS, 1))

# Update parameters in each epoch using Gradient Descent
for i in range(EPOCHS):
    y_hat = sigmoid(np.array(np.dot(X, params), dtype=np.float32))
    d_y_hat = y_hat - y
    d_params = np.dot(d_y_hat.T, X)/NUM_ROWS
    params = params - (LEARNING_RATE * d_params.T)

  


In [9]:
params

array([[1.3310168926529196],
       [-2.2472836416330457],
       [3.7876212192331646],
       [-4.67323900760257],
       [3.1673963195294292]], dtype=object)

In [10]:
THETAS.reshape((5,1))/params

array([[3.7565263278020744],
       [0.8899633152433972],
       [0.7920538581752309],
       [0.8559373902110883],
       [0.7892918181995671]], dtype=object)

### Our estimated parameters are not quite close to the original parameters, but the ratio of actual/computed is roughly same for all (~0.8) which shows they will converge if the number of epochs are increased.