In [1]:
import numpy as np

In [None]:
def euclidean_dist(p, q):
    """Return the Euclidean distance between two points.

    Given
        p, q: points with the same number of coordinates (np.array)
    Returns
        the euclidean distance between them (float)

    Only the first len(p) coordinates of q are read.
    """
    squared_sum = 0
    for axis, p_coord in enumerate(p):
        delta = p_coord - q[axis]
        squared_sum += delta ** 2
    return squared_sum ** (1 / 2)

In [None]:
def standardization(X):
    """Standardize X feature-wise (along axis 0) to zero mean and unit std.

    Given
        X: data to be standardized (np.array)
    Returns
        X_std: standardized data - mean = 0, std = 1 (np.array)

    Used to standardize the data during pre-processing, for both the
    Universe and the sample. A constant feature (std == 0) is mapped to
    all zeros instead of NaN.
    """
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    # Dividing by a zero std gives 0/0 = NaN for the whole feature; use a
    # unit scale there so the centred (all-zero) column is kept as zeros.
    safe_std = np.where(std == 0, 1.0, std)
    X_std = (X - mean) / safe_std
    return X_std

In [7]:
def calc_error(Y, Y_hat):
    """Return the normalized classification error between labels and predictions.

    Given
        Y: labels (np.array or list)
        Y_hat: predictions (np.array or list)
    Returns
        error: normalized error (float)

    Used to compute both Ein and Eout. Positions are compared pairwise over
    the shorter of the two sequences; every position missing from the
    shorter one counts as one extra error. The total is divided by the
    length of the shorter sequence.
    """
    n_labels, n_preds = len(Y), len(Y_hat)
    overlap = min(n_labels, n_preds)
    mismatches = sum(1 for k in range(overlap) if Y[k] != Y_hat[k])
    length_penalty = abs(n_labels - n_preds)
    return (mismatches + length_penalty) / overlap

In [None]:
def sampling(N, X, Y, random_state=42):
    """Draw N random rows (with replacement) from the Universe.

    Given
        N: number of samples (integer)
        X: Universe (np.array)
        Y: Universe labels (np.array)
        random_state: random seed (integer)
    Returns
        X (np.array), Y (np.array): N random samples of X and the matching
        entries of Y

    Used to take a sample of the Universe X. The same random_state always
    selects the same indices.
    """
    # A private RandomState produces the same draws as seeding the global
    # generator with random_state, but leaves np.random's global state
    # untouched for the caller.
    rng = np.random.RandomState(random_state)
    idx = rng.randint(0, len(X), N)
    return X[idx], Y[idx]

In [2]:
def diagonais(X, M, b):
    """Classify every point of X with M diagonal-line hypotheses.

    Diagonals: 45-degree lines (slopes +1 and -1). A line is
        x0 * w[0] + x1 * w[1] + bias = 0
    with w = [1, 1] for the negative-slope family and w = [1, -1] for the
    positive-slope family. Each family starts at bias -M//4*b and moves by
    b from one hypothesis to the next.

    Given
        X: Universe (np.array)
        M: hypothesis number (integer, M%4 == 0)
        b: bias step (float)
    Returns:
        predict: one y_hat per hypothesis/line, shape = (M, len(X)).
        Even rows hold the w = [1, 1] lines, odd rows the w = [1, -1]
        lines. A point is labelled +1 when the line expression is >= 0
        and -1 otherwise.

    Do not use np.linalg.norm in this function
    """
    n_points = len(X)
    predict = np.zeros((M, n_points))

    # (first row, weights): the two families are interleaved row by row.
    families = ((0, (1, 1)), (1, (1, -1)))
    for first_row, (w0, w1) in families:
        bias = -M//4*b
        for row in range(first_row, M, 2):
            for col in range(n_points):
                score = X[col][0] * w0 + X[col][1] * w1 + bias
                predict[row][col] = 1 if score >= 0 else -1
            bias += b

    return predict


In [None]:
def egocentric(X, C, r):
    """Classify every point of X with one circle hypothesis per centre in C.

    Egocentric: circumferences of radius r centred on each given point.

    Given
        X: Universe (np.array)
        C: centres (np.array)
        r: radius (float)
    Returns
        predicts: list with one np.array of predictions per point in C

    For each point c in C, points of X whose euclidean distance to c is
    <= r are labelled 1 and all others -1.
    """
    predicts = []
    for center in C:
        distances = np.array([euclidean_dist(point, center) for point in X])
        inside = distances <= r
        predicts.append(np.where(inside, 1, -1))
    return predicts

In [8]:
def calc_freq(N, H_set, eps, X, Y, M=100, b=0.05, r=1, random_state=42):
    """Compare the Hoeffding bound with the observed frequency of bad hypotheses.

    Given
        N: number of samples (integer)
        H_set: name of hypothesis set (string - 'diagonais' or 'egocentric')
        eps: epsilon, abs(error_in-error_out) desired (float)
        X: Universe data, complete dataset (np.array)
        Y: Universe labels (np.array)
        M: number of hypothesis, if 'diagonais' is chosen (integer)
        b: bias step, if 'diagonais' is chosen (float)
        r: radius, if 'egocentric' is chosen (float)
        random_state: random seed (integer)

    Returns:
        bound: theoretical bound for Pr[abs(error_in-error_out) > eps]
            (float), 2 * n_hyp * exp(-2 * eps**2 * N) where n_hyp is M for
            'diagonais' and N for 'egocentric'
        probs: approximated probability of Pr[abs(error_in-error_out) > eps]
            by the frequency - number of occurrences divided by number of
            hypothesis (float)

    Raises:
        ValueError: if H_set is neither 'diagonais' nor 'egocentric'

    Do not use np.sum or np.exp in this function
    """
    # Local import keeps the block self-contained; math.exp is used because
    # np.exp is not allowed here.
    from math import exp

    if H_set not in ('diagonais', 'egocentric'):
        raise ValueError('H_set must be "diagonais" or "egocentric"')

    X_std = standardization(X)
    X_sample, Y_sample = sampling(N, X_std, Y, random_state)

    # Each hypothesis must be evaluated twice: on the sample (error_in) and
    # on the whole Universe (error_out). Comparing the N sample labels with
    # Universe-sized predictions would make calc_error add the length gap
    # to the error and render error_in meaningless.
    if H_set == 'diagonais':
        predict_out = diagonais(X_std, M, b)
        predict_in = diagonais(X_sample, M, b)
        n_hyp = M
    else:
        # Circle centres are the re-standardized sample points.
        centers = standardization(X_sample)
        predict_out = egocentric(X_std, centers, r)
        # NOTE(review): the sample is scored in the Universe's coordinates
        # (X_sample) so each circle is the same hypothesis for error_in and
        # error_out - confirm this matches the intended experiment.
        predict_in = egocentric(X_sample, centers, r)
        n_hyp = N

    bad = 0
    for h in range(n_hyp):
        error_in = calc_error(Y_sample, predict_in[h])
        error_out = calc_error(Y, predict_out[h])
        if abs(error_in - error_out) > eps:
            bad += 1
    probs = bad / n_hyp

    bound = 2 * n_hyp * exp(-2 * eps**2 * N)

    return bound, probs

    