In [18]:
import numpy as np 
from helpers import * 
from implementations import logistic_regression

In [147]:
def normalize(data):
    """
    Normalize the input data to have mean 0 and standard deviation 1.

    Parameters:
    - data: numpy array of shape (m, n) where m is the number of samples and n is the number of features.

    Returns:
    - normalized_data: numpy array of shape (m, n) with normalized values.
    """
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    normalized_data = (data - mean) / (std + 10e-300)

    return normalized_data



In [181]:
MAX_ROWS = 100000

x_data = np.genfromtxt('data/dataset/x_train.csv', delimiter=",", skip_header=1,  max_rows=MAX_ROWS)
y_data = np.genfromtxt('data/dataset/y_train.csv', delimiter=",", skip_header=1,  max_rows=MAX_ROWS)

x_data[np.isnan(x_data)] = 0
x_data = normalize(x_data)

In [178]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]



def split_data(x, y, ratio, seed=1):
    """
    split the dataset based on the split ratio. If ratio is 0.8
    you will have 80% of your data set dedicated to training
    and the rest dedicated to testing. If ratio times the number of samples is not round
    you can use np.floor. Also check the documentation for np.random.permutation,
    it could be useful.

    Args:
        x: numpy array of shape (N,), N is the number of samples.
        y: numpy array of shape (N,).
        ratio: scalar in [0,1]
        seed: integer.

    Returns:
        x_tr: numpy array containing the train data.
        x_te: numpy array containing the test data.
        y_tr: numpy array containing the train labels.
        y_te: numpy array containing the test labels.

    >>> split_data(np.arange(13), np.arange(13), 0.8, 1)
    (array([ 2,  3,  4, 10,  1,  6,  0,  7, 12,  9]), array([ 8, 11,  5]), array([ 2,  3,  4, 10,  1,  6,  0,  7, 12,  9]), array([ 8, 11,  5]))
    """
    # set seed
    np.random.seed(seed)
    
    x_shuffle, y_shuffle = unison_shuffled_copies(x, y)
    split_pos = round(len(x_shuffle)*ratio)
    
    return x_shuffle[: split_pos], x_shuffle[split_pos:], y_shuffle[:split_pos], y_shuffle[split_pos:],


In [182]:
x_train, x_test, y_train, y_test = split_data(x_data, y_data, 0.8)

In [166]:
def sigmoid(t):
    """Apply the logistic function."""
    return 1.0 / (1 + np.exp(-t))

    
def compute_logistic_loss(y, tx, w): 

    pred = tx@w
    # sigmoids = 1.0 / (1 + np.clip(np.exp(-pred), 10e-10, np.inf))
    
    sigmoids = 1 / (1 + np.exp(-pred))
    # sigmoids = pred #1.0 / (1 + np.exp(-pred))
    sigmoids_clipped = np.clip(sigmoids , 10e-15, 1 - 10e-15)

    
    loss = -np.mean(y * np.log(sigmoids_clipped) + (1 - y) * np.log(1 - sigmoids_clipped)) 
    # loss = -np.mean(pred) 
    return sigmoids, loss


def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """
    Perform logistic regression using gradient descent.
    
    Parameters:
    y: np.array
        The target values
    tx: np.array
        The data matrix (each row is a data point)
    initial_w: np.array
        Initial weights
    max_iters: int
        Maximum number of iterations for gradient descent
    gamma: float
        Learning rate

    Returns:
    w: np.array
        Optimized weights after training
    """
    
    w = initial_w

    sigmoids, loss = compute_logistic_loss(y, tx, w )
    
    for iter in range(max_iters):
        # compute the gradient

        grad =  tx.T.dot(sigmoids - y)/len(tx)

        # update w through the negative gradient direction
        w = w - gamma * grad

        sigmoids, loss = compute_logistic_loss(y, tx, w)
        
    return w, loss

In [183]:
N, D = x_train.shape
initial_w = np.random.rand(D)

w, loss = logistic_regression(y_train.reshape(-1, 1), x_train, initial_w.reshape(-1, 1), 100, 0.01)

In [184]:
pred = np.zeros(len(y_test))
pred[(x_test @ w >= 0.5).T[0]] = 1
np.sum(pred == y_test) / len(y_test)

0.585

array([False,  True, False,  True,  True,  True, False,  True, False,
       False,  True, False, False,  True,  True,  True,  True,  True,
       False, False, False, False, False,  True, False, False, False,
        True,  True,  True, False,  True,  True,  True, False, False,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True,  True, False, False, False,
       False,  True,  True,  True,  True, False,  True,  True, False,
       False, False,  True,  True, False,  True, False,  True,  True,
       False,  True,  True, False, False, False,  True, False,  True,
       False, False,  True,  True, False, False,  True,  True, False,
       False, False,  True,  True, False, False, False, False,  True,
       False, False, False,  True, False, False, False,  True,  True,
       False, False,  True, False, False, False, False,  True, False,
        True, False,