In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# What version of Python do you have?
import torch
import sklearn as sk
from sklearn.model_selection import StratifiedKFold
import copy
print(f"PyTorch Version: {torch.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
print("GPU is", "available" if torch.cuda.is_available() else "NOT AVAILABLE")

PyTorch Version: 1.7.0

Python 3.7.9 (default, Aug 31 2020, 17:10:11) [MSC v.1916 64 bit (AMD64)]
Pandas 1.1.3
Scikit-Learn 0.23.2
GPU is available


# Normalizar Datos

In [2]:
def normalize(vector):
    media_vector = vector.mean(axis=0)
    std_vector = vector.std(axis=0)
    return (vector - media_vector)/std_vector, media_vector, std_vector

# Sigmoidal

In [3]:
# implemento la función sigmoidal
def sigma(x):
    return 1.0 / (1 + torch.exp(-x)) 


A = torch.tensor([[1.,0.],[0.,1.]]).to('cuda')
b = torch.tensor([[0.],[0.]]).to('cuda')
ans = torch.matmul(A,b)
sigma(ans)

tensor([[0.5000],
        [0.5000]], device='cuda:0')

# Calcular Función de Costo
En teoría la función costo es la norma del vector resultante de la diferencia del valor resultante de la entrada X y su valor resultante definido.  
$$
Cost(X, y) = \sum_{i=0}^{m}(y^i- \hat{y}^i)^{2} = \|h_{\theta}(X) - y\|
$$
En el caso de nuestra implementación como todos los datos de entrada serán insertados en una sola pasada, necesito el promedio de la diferencia de cada valor de salida.  

In [4]:
# recibo los datos
def Calcular_Funcion_Costo(y):
    return torch.linalg.norm(y)
    #return torch.linalg.norm(y) / y.nelement()
    #return torch.sqrt(torch.sum(y**2)) / y.nelement()
    #return torch.mean(torch.linalg.norm(vector, dim=0))
A = torch.tensor([[1.,0.],[0.,1.]]).to('cuda')
b = torch.tensor([[0.],[0.]]).to('cuda')
ans = torch.matmul(A,b)
Calcular_Funcion_Costo(sigma(ans))

tensor(0.7071, device='cuda:0')

# dS
Recibe los datos $D$ y calcula para dichos datos la derivada de la función sigmoidal.  
Recibiré un vector $a$ como en la teoría está establecido, luego retornaré la derivada de la función sigmoidal, dada por la ecuación:  
$$
\frac{d\sigma(x)}{dx} = \sigma(x)(1 - \sigma(x))
$$
Todo esto en notación vectorial.

In [5]:
def dS(D):
    # recordemos que la variable sigma es la función sigmoidal vectorizada
    return sigma(D)*(1 - sigma(D))

a = torch.tensor([[1.],[2.]]).to('cuda')
print(dS(a))

tensor([[0.1966],
        [0.1050]], device='cuda:0')


# Forward
Recibe los datos $X$ y un diccionario de parámetros $W$ (i.e. los pesos de cada capa), donde cada elemento del diccionario es una matriz de pesos; y realiza la etapa de propagación. Devuelve un diccionario de activaciones $A$, donde cada elemento del diccionario son las activaciones de cada capa.

In [91]:
# El primer paso es crear todas las matrices de pesos basándonos en las dimensiones de la entrada X
# y el tamaño del batch que ingresaremos
def generar_pesos(parameters):
    layers = {}
    n = len(parameters)
    # i -> (0,1,...,n-1)
    for i in range(0, n-1):
        layers[i] = {"W": torch.randn(parameters[i+1], parameters[i], dtype=torch.double).to('cuda'),
                     "b": torch.randn(parameters[i+1], 1, dtype=torch.double).to('cuda')}
    return layers

def Forward(X, W):
    A = {0: {"a": X, "z": X}} # el diccionario a retornar
               # como primer elemento de la lista esta X, para mantener el orden
    n = len(W) # primero extraigo la cantidad de pesos para iterar sobre eso
    for i in range(1, n + 1):
        activation = torch.matmul(W[i-1]["W"], A[i-1]["a"]) + W[i-1]["b"] # se hace broadcasting al sumar el bias
        A[i] = {"a": sigma(activation), "z": activation}
    return A


Wout = generar_pesos([3,2,3])
print(Wout)
Xin = torch.tensor([[100, 10],[50, 70],[74, 20]], dtype=torch.double).to('cuda')
print(Forward(Xin, Wout))

{0: {'W': tensor([[ 1.0610, -0.3707, -0.4381],
        [-1.3938, -0.9740,  0.3350]], device='cuda:0', dtype=torch.float64), 'b': tensor([[-0.5897],
        [-0.6444]], device='cuda:0', dtype=torch.float64)}, 1: {'W': tensor([[ 0.5771, -0.7015],
        [ 0.4396, -0.6355],
        [ 0.5541,  0.9023]], device='cuda:0', dtype=torch.float64), 'b': tensor([[0.5018],
        [0.2165],
        [0.1532]], device='cuda:0', dtype=torch.float64)}}
{0: {'a': tensor([[100.,  10.],
        [ 50.,  70.],
        [ 74.,  20.]], device='cuda:0', dtype=torch.float64), 'z': tensor([[100.,  10.],
        [ 50.,  70.],
        [ 74.,  20.]], device='cuda:0', dtype=torch.float64)}, 1: {'a': tensor([[1.0000e+00, 1.8913e-11],
        [6.4034e-72, 9.2453e-34]], device='cuda:0', dtype=torch.float64), 'z': tensor([[  54.5629,  -24.6912],
        [-163.9293,  -76.0638]], device='cuda:0', dtype=torch.float64)}, 2: {'a': tensor([[0.7463, 0.6229],
        [0.6584, 0.5539],
        [0.6698, 0.5382]], device='cuda:0',

# Backward

In [136]:
def Backward(X, A, W, y, lr):
    """
    X: valores de entrada
    A: diccionario con los resultados de cada capa
    W: diccionario con los pesos de cada capa
    y: salida de la última capa
    lr: tasa de aprendizaje (learning rate en inglés)
    """
    r = len(A)-1
    deltas = {r: -(y - A[r]["a"]) * dS(A[r]["z"])}
    for i in range(r-1,-1,-1):
        deltas[i] = torch.matmul(torch.t(W[i]["W"]), deltas[i+1]) * dS(A[i]["z"])
        # ahora viene la etapa del gradiente descendiente, para los pesos
        W[i]["W"] = W[i]["W"] - lr * torch.matmul(deltas[i+1], torch.t(A[i]["a"]))
        W[i]["b"] = W[i]["b"] - lr * deltas[i+1]
        W[i]["b"] = torch.mean(W[i]["b"], dim=1, keepdim=True)
    return W

Wout = generar_pesos([3,2,3])
print(Wout)
Xin = torch.tensor([[100, 10],[50, 70],[74, 20]], dtype=torch.double).to('cuda')
A = Forward(Xin, Wout)
yout = torch.tensor([[1, 0],[0, 1],[0, 0]], dtype=torch.double).to('cuda')
print(Backward(Xin, A, Wout, yout, 0.5))

{0: {'W': tensor([[-1.4010,  0.8687,  1.1399],
        [ 0.7904, -1.9768,  1.4166]], device='cuda:0', dtype=torch.float64), 'b': tensor([[1.0430],
        [0.4766]], device='cuda:0', dtype=torch.float64)}, 1: {'W': tensor([[ 0.5027, -0.3178],
        [-0.5467, -1.3417],
        [ 0.1107,  0.3005]], device='cuda:0', dtype=torch.float64), 'b': tensor([[-0.5026],
        [ 1.5596],
        [-0.3113]], device='cuda:0', dtype=torch.float64)}}
{0: {'W': tensor([[-1.4010,  0.8687,  1.1399],
        [ 0.7904, -1.9768,  1.4166]], device='cuda:0', dtype=torch.float64), 'b': tensor([[1.0430],
        [0.4766]], device='cuda:0', dtype=torch.float64)}, 1: {'W': tensor([[ 0.4402, -0.2441],
        [-0.5207, -1.4101],
        [ 0.0550,  0.2383]], device='cuda:0', dtype=torch.float64), 'b': tensor([[-0.4970],
        [ 1.5384],
        [-0.3703]], device='cuda:0', dtype=torch.float64)}}


# Gradiente Descendiente

In [139]:
def Gradiente_Descendiente(X, y, W, epochs, lr):
    result = {"costs": []}
    Wout = copy.deepcopy(W)
    for i in range(epochs):
        Aout = Forward(X, Wout)
        #result["weigths"].append(W)
        result["costs"].append(Calcular_Funcion_Costo(y - Aout[len(Aout)-1]["a"]))
        Backward(X, Aout, Wout, y, lr)
    
    #for idx in Wout:
        #Wout[idx]['b'] = Wout[idx]['b'][:,0:1]
    result['W'] = Wout
    return result

Wout = generar_pesos([3,4,3])
Xin = torch.tensor([[100, 10],[50, 100],[74, 200]], dtype=torch.double).to('cuda')
yout = torch.tensor([[1, 0],[0, 1],[0, 0]], dtype=torch.double).to('cuda')

ans = Gradiente_Descendiente(Xin, yout, Wout, 2000, 0.3)
print(Xin)
print(yout)
print(Forward(Xin, ans['W'])[2]["a"])
print(ans['W'])
ans["costs"][-4:-1]

tensor([[100.,  10.],
        [ 50., 100.],
        [ 74., 200.]], device='cuda:0', dtype=torch.float64)
tensor([[1., 0.],
        [0., 1.],
        [0., 0.]], device='cuda:0', dtype=torch.float64)
tensor([[0.9565, 0.0477],
        [0.0448, 0.9507],
        [0.0125, 0.0120]], device='cuda:0', dtype=torch.float64)
{0: {'W': tensor([[ 1.0870,  0.1922,  2.1272],
        [-0.0183, -0.0778,  0.6326],
        [-0.7345, -1.4435, -0.0217],
        [ 0.8280,  0.8204, -0.6601]], device='cuda:0', dtype=torch.float64), 'b': tensor([[ 2.2541],
        [ 0.1484],
        [-0.0714],
        [-0.8803]], device='cuda:0', dtype=torch.float64)}, 1: {'W': tensor([[-0.7086, -2.7267, -0.1962,  6.0847],
        [-0.6228,  2.2853,  0.9127, -6.0189],
        [-1.9426, -2.5066,  1.3835,  0.0470]], device='cuda:0',
       dtype=torch.float64), 'b': tensor([[0.4419],
        [1.2977],
        [0.0361]], device='cuda:0', dtype=torch.float64)}}


[tensor(0.0945, device='cuda:0', dtype=torch.float64),
 tensor(0.0944, device='cuda:0', dtype=torch.float64),
 tensor(0.0944, device='cuda:0', dtype=torch.float64)]

In [33]:
# estimaremos el resultado de nuestros vectores considerando la posición del vector 
#con la máxima puntuación como 1 y el resto como 0, p.ej.
# si tueviésemos un vector a = [0.5, 0.2, 0.7], su estimado será [0, 0, 1]
def estimate_result(x): # receives a vector of n*1
    Xout = torch.zeros(x.shape[0], x.shape[1], dtype=torch.int).to('cuda')
    maxidx = torch.argmax(x, dim=0)
    for i in range(maxidx.shape[0]):
        Xout[maxidx[i], i] = 1
    return Xout

# una pequeña prueba
vectest = torch.tensor([[0.5],[0.2],[0.7]], dtype=torch.double).to('cuda')
vectest = estimate_result(vectest)
vectest

tensor([[0],
        [0],
        [1]], device='cuda:0', dtype=torch.int32)

# Calcular Accuracy

In [34]:
def Calcular_Accuracy(YMLP, Y):
    set_size = YMLP.shape[1]
    correct = 0.0
    for i in range(set_size):
        if torch.all(torch.eq(YMLP[:,i], Y[:,i])):
            correct += 1.0
    return correct / set_size

YMLP = torch.tensor([[1,0],[0,0],[0,1]], dtype=torch.double).to('cuda')
Y = torch.tensor([[1,0],[1,0],[0,1]], dtype=torch.double).to('cuda')
Calcular_Accuracy(YMLP, Y)

0.5

# K-fold Cross Validation

In [160]:
# myset será una lista con los valores de los 
def KfoldsCrossValidation(X, y, Y, params, k=3, shuff=True):
    """
    X: el conjunto de datos reescalados
    y: la columna de datos categóricos
    Y: la columna de los datos categóricos con la configuración one hot
    params: un diccionario de datos con los siguiente parámetros
            -layers: una lista con el número de neuronas por cada capa
            -learning: una lista con las tasas de aprendizaje a evaluar
            -epochs: una lista con el número de épocas(iteraciones)
    k: el k para realizar el k-folds (3 por defecto)
    shuff: la variable booleana para decidir si los datos se barajan o no
    """
    skf = StratifiedKFold(n_splits=k, shuffle=shuff)
    learning_rates = params['learning']
    epochs = params['epochs']
    layers = params['layers']
    bestAcc = 0
    bestLr = -1
    bestEpoch = -1
    Weights = generar_pesos(layers)
    for lr in learning_rates:
        for epoch in epochs:
            print(f'Tasa de aprendizaje: {lr}, épocas: {epoch} ', end="")
            avgAcc = 0
            for train_index, test_index in skf.split(X, y):
                #print("TRAIN:", train_index[0:10], "TEST:", test_index[0:10])
                X_train, X_test = torch.from_numpy(np.array(X[train_index], dtype='double').T).to('cuda'), torch.from_numpy(np.array(X[test_index], dtype='double').T).to('cuda')
                y_train, y_test = torch.from_numpy(np.array(Y[train_index], dtype='double').T).to('cuda'), torch.from_numpy(np.array(Y[test_index], dtype='int').T).to('cuda')
                ans = Gradiente_Descendiente(X_train, y_train, Weights, epoch, lr)
                acc = estimate_result(Forward(X_test, ans['W'])[len(layers)-1]["a"]) # primero estimamos el valor de la última capa
                acc = Calcular_Accuracy(acc, y_test)
                avgAcc += acc
            avgAcc /= k
            if avgAcc > bestAcc:
                bestAcc = avgAcc
                bestLr = lr
                bestEpoch = epoch
            print(f'Average acc: {avgAcc}')
            
                
    print(f'Results\nBest learning rate: {bestLr}')
    print(f'Best Nro epochs: {bestEpoch}')
    print(f'Best Average Accuracy: {bestAcc}')
        

# Clasificación del género de música

In [43]:
# leo los datos
Data = pd.read_csv('music_genre.csv')
Data = Data.drop(['filename'], axis=1)
# hacemos un shuffle de los datos
# Data = Data.sample(frac=1).reset_index(drop=True)
print("Music genres:\n", Data['label'].value_counts())
Data.head()

Music genres:
 classical    100
disco        100
reggae       100
hiphop       100
rock         100
blues        100
country      100
pop          100
jazz         100
metal        100
Name: label, dtype: int64


Unnamed: 0,tempo,beats,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,103.359375,50,0.38026,0.248262,2116.942959,1956.611056,4196.10796,0.127272,-26.929785,107.334008,...,14.336612,-13.821769,7.562789,-6.181372,0.330165,-6.829571,0.965922,-7.570825,2.918987,blues
1,95.703125,44,0.306451,0.113475,1156.070496,1497.668176,2170.053545,0.058613,-233.860772,136.170239,...,-2.250578,3.959198,5.322555,0.812028,-1.107202,-4.556555,-2.43649,3.316913,-0.608485,blues
2,151.999081,75,0.253487,0.151571,1331.07397,1973.643437,2900.17413,0.042967,-221.802549,110.84307,...,-13.037723,-12.652228,-1.821905,-7.260097,-6.660252,-14.682694,-11.719264,-11.025216,-13.38726,blues
3,184.570312,91,0.26932,0.119072,1361.045467,1567.804596,2739.625101,0.069124,-207.20808,132.799175,...,-0.613248,0.384877,2.605128,-5.188924,-9.527455,-9.244394,-2.848274,-1.418707,-5.932607,blues
4,161.499023,74,0.391059,0.137728,1811.076084,2052.332563,3927.809582,0.07548,-145.434568,102.829023,...,7.457218,-10.470444,-2.360483,-6.783623,2.671134,-4.760879,-0.949005,0.024832,-2.005315,blues


## One hot encoding para los datos

In [44]:
genres = pd.get_dummies(Data[['label']])
Xinput = Data.drop(['label'], axis=1)

In [45]:
# aplicando la normalización de los datos
Xinput, mediaXinput, stdXinput = normalize(Xinput)
Data.head()

Unnamed: 0,tempo,beats,chroma_stft,rmse,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,mfcc1,mfcc2,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,103.359375,50,0.38026,0.248262,2116.942959,1956.611056,4196.10796,0.127272,-26.929785,107.334008,...,14.336612,-13.821769,7.562789,-6.181372,0.330165,-6.829571,0.965922,-7.570825,2.918987,blues
1,95.703125,44,0.306451,0.113475,1156.070496,1497.668176,2170.053545,0.058613,-233.860772,136.170239,...,-2.250578,3.959198,5.322555,0.812028,-1.107202,-4.556555,-2.43649,3.316913,-0.608485,blues
2,151.999081,75,0.253487,0.151571,1331.07397,1973.643437,2900.17413,0.042967,-221.802549,110.84307,...,-13.037723,-12.652228,-1.821905,-7.260097,-6.660252,-14.682694,-11.719264,-11.025216,-13.38726,blues
3,184.570312,91,0.26932,0.119072,1361.045467,1567.804596,2739.625101,0.069124,-207.20808,132.799175,...,-0.613248,0.384877,2.605128,-5.188924,-9.527455,-9.244394,-2.848274,-1.418707,-5.932607,blues
4,161.499023,74,0.391059,0.137728,1811.076084,2052.332563,3927.809582,0.07548,-145.434568,102.829023,...,7.457218,-10.470444,-2.360483,-6.783623,2.671134,-4.760879,-0.949005,0.024832,-2.005315,blues


In [46]:
print(Xinput.shape)
print(genres.shape)

(1000, 28)
(1000, 10)


## K-folds cross validation
### 2 capas intermedias

In [162]:
myparams = {
            'layers': [Xinput.shape[1],10,10,genres.shape[1]],
            'epochs': [1000, 1500, 2000, 2500, 3000],
            'learning': [0.2, 0.1, 0.07, 0.05]
           }
KfoldsCrossValidation(np.array(Xinput), np.array(Data['label']), np.array(genres), myparams, 3, True)

Tasa de aprendizaje: 0.2, épocas: 1000 Average acc: 0.26702750654846463
Tasa de aprendizaje: 0.2, épocas: 1500 Average acc: 0.24694454933975893
Tasa de aprendizaje: 0.2, épocas: 2000 Average acc: 0.3059976143808479
Tasa de aprendizaje: 0.2, épocas: 2500 Average acc: 0.2560164955374536
Tasa de aprendizaje: 0.2, épocas: 3000 Average acc: 0.25301648954343564
Tasa de aprendizaje: 0.1, épocas: 1000 Average acc: 0.3280016543489597
Tasa de aprendizaje: 0.1, épocas: 1500 Average acc: 0.34204264144383906
Tasa de aprendizaje: 0.1, épocas: 2000 Average acc: 0.34893576210941485
Tasa de aprendizaje: 0.1, épocas: 2500 Average acc: 0.36602171033308756
Tasa de aprendizaje: 0.1, épocas: 3000 Average acc: 0.32398865931800064
Tasa de aprendizaje: 0.07, épocas: 1000 Average acc: 0.3710087332841823
Tasa de aprendizaje: 0.07, épocas: 1500 Average acc: 0.3819717921514329
Tasa de aprendizaje: 0.07, épocas: 2000 Average acc: 0.384986783190376
Tasa de aprendizaje: 0.07, épocas: 2500 Average acc: 0.3690277103450

### 3 capas intermedias

In [163]:
myparams['layers'] = [Xinput.shape[1],10, 10, 10, genres.shape[1]]
KfoldsCrossValidation(np.array(Xinput), np.array(Data['label']), np.array(genres), myparams, 3, True)

Tasa de aprendizaje: 0.2, épocas: 1000 Average acc: 0.1289463115810421
Tasa de aprendizaje: 0.2, épocas: 1500 Average acc: 0.10800620980261699
Tasa de aprendizaje: 0.2, épocas: 2000 Average acc: 0.10099920279560998
Tasa de aprendizaje: 0.2, épocas: 2500 Average acc: 0.14003824183464902
Tasa de aprendizaje: 0.2, épocas: 3000 Average acc: 0.130936325547104
Tasa de aprendizaje: 0.1, épocas: 1000 Average acc: 0.4450228671785558
Tasa de aprendizaje: 0.1, épocas: 1500 Average acc: 0.4609789430148712
Tasa de aprendizaje: 0.1, épocas: 2000 Average acc: 0.46699094303884725
Tasa de aprendizaje: 0.1, épocas: 2500 Average acc: 0.44404284524045007
Tasa de aprendizaje: 0.1, épocas: 3000 Average acc: 0.45800291309273344
Tasa de aprendizaje: 0.07, épocas: 1000 Average acc: 0.43901086715457965
Tasa de aprendizaje: 0.07, épocas: 1500 Average acc: 0.45898892904880934
Tasa de aprendizaje: 0.07, épocas: 2000 Average acc: 0.4519789250328172
Tasa de aprendizaje: 0.07, épocas: 2500 Average acc: 0.457996919074

### 4 capas intermedias

In [None]:
myparams['layers'] = [Xinput.shape[1], 10, 10, 10, 10, genres.shape[1]]
KfoldsCrossValidation(np.array(Xinput), np.array(Data['label']), np.array(genres), myparams, 3, True)

# Clasificación Titanic

In [None]:
# leo los datos
TitanicData = pd.read_csv('titani.csv')
TitanicData = TitanicData[['filename']], axis=1)
# hacemos un shuffle de los datos
# Data = Data.sample(frac=1).reset_index(drop=True)
print("Music genres:\n", Data['label'].value_counts())
Data.head()