# Cómo entrenar una red en Google Colab y no morir en el intento

Entrenar una red neuronal si no se tienen los recursos computacionales suficientes es una tarea imposible.

Por suerte, Google nos viene a ayudar con Google Colab, que nos facilita el uso de GPUs para entrenar redes neuronales. Pero esto tiene ciertas limitaciones en el tiempo de entrenamiento, por lo que cada cierto tiempo se cae el entorno de ejecución y se detiene el entrenamiento.

En el presente documento se desarrolla una metodología para evitar perder los datos de entrenamiento y reanudar posteriormente desde donde habíamos quedado.



In [1]:
# Mount Google Drive under /content/drive; the checkpoint folder used later in
# this notebook lives beneath that mount point.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
# Project folder inside the mounted Drive; checkpoints are written to its
# 'weights' subfolder (save_dir is read as a global by the training function).
root = '/content/drive/My Drive/CNN'
save_dir = os.path.join(root, 'weights')
print(save_dir)

/content/drive/My Drive/CNN/weights


In [3]:
from __future__ import print_function
%tensorflow_version 1.x
import tensorflow as tf
from keras.datasets import cifar10 
import keras
import numpy as np

TensorFlow 1.x selected.


Using TensorFlow backend.


In [4]:
# Input geometry and training hyper-parameters.
n_classes = 10
height = 32
width = 32
channels = 3
batch_size = 128
# Alias used by the one-hot encoding below. It is derived from n_classes (which
# sizes the network output and the label placeholder) so the two can never
# drift apart if the number of classes is changed.
num_classes = n_classes


# The data, split between train and test sets:
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# Convert class vectors to binary class matrices.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# Cast to float32 and rescale pixel values by 1/255 (in place).
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

# Modelo

def conv2d(x, W, padding):
    """Convolve `x` with filter `W` using a stride of 1 in every dimension.

    `padding` is forwarded unchanged to `tf.nn.conv2d`.
    """
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding=padding)

def maxpool2d(x, padding = 'VALID'):
    """Down-sample `x` with max pooling over a 2x2 window moved in steps of 2.

    `padding` is forwarded unchanged to `tf.nn.max_pool` (default 'VALID').
    """
    pool_window = [1, 2, 2, 1]
    pool_strides = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=pool_window, strides=pool_strides, padding=padding)

# red ejemplo de https://keras.io/examples/cifar10_cnn/


# Graph input: image batches of shape (batch, height, width, channels); the
# batch dimension is left open.
x = tf.placeholder(tf.float32, shape=(None, height, width, channels))
# Graph target: one row of n_classes values per sample (fed with the one-hot
# encoded labels during training/evaluation).
y = tf.placeholder(tf.float32, [None, n_classes])
# dropout 






def cnn(x):
    """Build the convolutional classifier on top of `x` and return its logits.

    Layout: two pairs of 3x3 convolutions (32 filters, then 64). Within each
    pair the first convolution uses 'SAME' padding and the second uses 'VALID'
    padding followed by 2x2 max pooling. The result is flattened to 6*6*64
    features, fed through a 512-unit ReLU layer and finally through a linear
    layer with `n_classes` outputs. No softmax is applied here.

    The flatten size is hard-coded, so the spatial size of `x` is effectively
    fixed by it. Weights use Glorot-uniform initialisation, biases start at
    zero, and every variable is created with an explicit name.
    """
    glorot = tf.keras.initializers.glorot_uniform

    # (dict key, shape, variable name), listed in creation order.
    # Conv kernels read as [3, 3, input channels, output channels].
    kernel_specs = [
        ('W_conv1', [3, 3, 3, 32], 'W_conv1'),
        ('W_conv2', [3, 3, 32, 32], 'W_conv2'),
        ('W_conv3', [3, 3, 32, 64], 'W_conv3'),
        ('W_conv4', [3, 3, 64, 64], 'W_conv4'),
        # classifier: 6*6*64 inputs -> 512 hidden units -> n_classes outputs
        ('W_fc', [6*6*64, 512], 'W_fc'),
        ('out', [512, n_classes], 'W_out'),
    ]
    # (dict key, length, variable name), listed in creation order.
    bias_specs = [
        ('b_conv1', 32, 'b_conv1'),
        ('b_conv2', 32, 'b_conv2'),
        ('b_conv3', 64, 'b_conv3'),
        ('b_conv4', 64, 'b_conv4'),
        ('b_fc', 512, 'b_fc'),
        ('out', n_classes, 'b_out'),
    ]

    weights = {
        key: tf.Variable(glorot()(shape), name=var_name)
        for key, shape, var_name in kernel_specs
    }
    biases = {
        key: tf.Variable(tf.zeros([size]), name=var_name)
        for key, size, var_name in bias_specs
    }

    def conv_relu(inputs, w_key, b_key, padding):
        # Convolution + bias followed by ReLU.
        return tf.nn.relu(conv2d(inputs, weights[w_key], padding=padding) + biases[b_key])

    # First convolution pair (32 filters).
    with tf.name_scope('conv1'):
        act1 = conv_relu(x, 'W_conv1', 'b_conv1', 'SAME')
    with tf.name_scope('conv2'):
        act2 = conv_relu(act1, 'W_conv2', 'b_conv2', 'VALID')
        # Max pooling (down-sampling)
        act2 = maxpool2d(act2)

    # Second convolution pair (64 filters).
    with tf.name_scope('conv3'):
        act3 = conv_relu(act2, 'W_conv3', 'b_conv3', 'SAME')
    with tf.name_scope('conv4'):
        act4 = conv_relu(act3, 'W_conv4', 'b_conv4', 'VALID')
        # Max pooling (down-sampling)
        act4 = maxpool2d(act4)

    # Classifier head.
    with tf.name_scope('capa_oculta'):
        flat = tf.reshape(act4, [-1, 6*6*64], name = 'flatten')
        hidden = tf.nn.relu(tf.matmul(flat, weights['W_fc']) + biases['b_fc'])
    with tf.name_scope('capa_salida'):
        logits = tf.matmul(hidden, weights['out']) + biases['out']

    return logits
    


def crear_batch(x_train, y_train, i, batch_size):
    """Return the i-th consecutive mini-batch of inputs and labels.

    Batch `i` covers positions [i*batch_size, (i+1)*batch_size) of both
    sequences. As with any slice, the result is shorter (or empty) when that
    range runs past the end of the data.
    """
    start = i * batch_size
    window = slice(start, start + batch_size)
    return x_train[window], y_train[window]

def train_cnn(x,y,epochs,x_train,y_train,batch_size, checkpoint=None, epoch_i=0):
    """Train the CNN with mini-batch gradient descent, checkpointing as it goes.

    Builds the network on `x`, adds a softmax cross-entropy loss and an
    accuracy metric, and runs epochs `epoch_i` .. `epochs - 1`. After every
    epoch the loss and accuracy on the test set are evaluated and printed.

    Two checkpoints are written under the module-level `save_dir`:
      * 'last_checkpoint.ckpt' every 10th epoch and after the final epoch;
      * 'best_checkpoint.ckpt' whenever the training loss of an epoch is the
        lowest seen so far in this call.

    Args:
        x: input placeholder the network is built on.
        y: label placeholder (fed with one-hot labels).
        epochs: total number of epochs; also the length of the returned vectors.
        x_train: training inputs, consumed in consecutive slices of
            `batch_size`; a trailing partial batch is dropped.
        y_train: training labels aligned with `x_train`.
        batch_size: number of samples per gradient step.
        checkpoint: path of a checkpoint to restore before training; when
            None the variables are freshly initialised.
        epoch_i: index of the first epoch to run (use it when resuming).

    Returns:
        Tuple `(vector_costos_train, vector_costos_test, vector_acc_test,
        vector_acc_train)` of NumPy arrays of length `epochs`. Entries for
        epochs before `epoch_i` are left at zero.

    Note:
        Reads the module-level globals `x_test`, `y_test` and `save_dir`.
        Training loss and training accuracy of a batch come from the same
        forward pass, i.e. both are measured before that batch's update.
    """
    output = cnn(x)
    y_ = tf.nn.softmax(output, name='prediction')
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=output, labels=y), name='loss')
    optimizer = tf.train.GradientDescentOptimizer(learning_rate = 0.001).minimize(cost)
    correct = tf.equal(tf.argmax(y_, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name= 'accuracy')
    saver = tf.train.Saver()

    # setup the initialisation operator
    init_op = tf.global_variables_initializer()

    vector_costos_train = np.zeros([epochs])
    vector_costos_test = np.zeros([epochs])
    vector_acc_test = np.zeros([epochs])
    vector_acc_train = np.zeros([epochs])

    # The saver cannot write into a folder that does not exist yet.
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    last_checkpoint_dir = os.path.join(save_dir, 'last_checkpoint.ckpt')
    best_checkpoint_dir = os.path.join(save_dir, 'best_checkpoint.ckpt')

    # Whole batches only: the remainder of the training set is not used.
    n_batches = len(x_train) // batch_size
    test_feed = {x: x_test, y: y_test}

    with tf.Session() as sess:

        if checkpoint is None:
            sess.run(init_op)  # random initialisation
        else:
            saver.restore(sess, checkpoint)  # resume from checkpoint

        # NOTE: the "best" bookkeeping is local to this call, so a resumed run
        # overwrites best_checkpoint.ckpt on its first epoch.
        # inf (rather than a fixed number) because the value compared below is
        # the SUM of the batch losses, which grows with the number of batches.
        best_epoch_loss = float('inf')
        best_accuracy = None

        for epoch in range(epoch_i, epochs):
            epoch_loss = 0
            acc_train = 0
            for i in range(n_batches):
                x_batch, y_batch = crear_batch(x_train, y_train, i, batch_size)
                # One run call: update step plus loss and accuracy of this batch.
                _, c, acc_batch = sess.run([optimizer, cost, accuracy],
                                           feed_dict={x: x_batch, y: y_batch})
                epoch_loss += c
                acc_train += acc_batch

            mean_loss_train = epoch_loss / n_batches
            mean_acc_train = acc_train / n_batches
            # Evaluate the test set once per epoch and reuse the values below.
            loss_test, acc_test = sess.run([cost, accuracy], feed_dict=test_feed)

            print('Epoch', epoch, 'completed out of',epochs,'loss:',mean_loss_train,
                  'Loss test:',loss_test,
                  'Accuracy train:', mean_acc_train,
                  'Accuracy test:',acc_test
                  )

            vector_costos_train[epoch] = mean_loss_train
            vector_costos_test[epoch] = loss_test
            vector_acc_train[epoch] = mean_acc_train
            vector_acc_test[epoch] = acc_test

            # Periodic checkpoint, plus one at the very end so the final state
            # is not lost when the last epoch index is not a multiple of 10.
            if epoch % 10 == 0 or epoch == epochs - 1:
                saver.save(sess, last_checkpoint_dir)

            if epoch_loss < best_epoch_loss:
                best_epoch_loss = epoch_loss
                best_accuracy = acc_test
                saver.save(sess, best_checkpoint_dir)

        # best_accuracy stays None when no epoch was run or none qualified.
        print('El mejor accuracy en test es:', best_accuracy)

        return vector_costos_train, vector_costos_test, vector_acc_test, vector_acc_train
        



x_train shape: (50000, 32, 32, 3)
50000 train samples
10000 test samples


In [5]:
# Fresh run: no checkpoint is passed, so train_cnn initialises the variables.
# To resume an interrupted run, pass checkpoint=<checkpoint path> and
# epoch_i=<first epoch to run> as extra arguments.
epochs = 20
vector_costos_train, vector_costos_test, vector_acc_test, vector_acc_train = train_cnn(x,y,epochs,x_train,y_train,batch_size)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

Epoch 0 completed out of 20 loss: 2.300763751298953 Loss test: 2.297692 Accuracy train: 0.10989583333333333 Accuracy test: 0.1235
Epoch 1 completed out of 20 loss: 2.2947549135257037 Loss test: 2.2917926 Accuracy train: 0.14783653846153846 Accuracy test: 0.1626
Epoch 2 completed out of 20 loss: 2.2881936843578634 Loss test: 2.2841907 Accuracy train: 0.16316105769230768 Accuracy test: 0.1604
Epoch 3 completed out of 20 loss: 2.2787468451720017 Loss test: 2.2724652 Accuracy train: 0.17091346153846154 Accuracy test: 0.1715
Epoch 4 completed out of 20 loss: 2.2633693285477468 Loss test: 2.252627 Accuracy train: 0.18387419871794872 Accuracy test: 0.1875
Epoch 5 completed out of 20 loss: 2.236096830245776 Loss test: 2.2175226 Accuracy train: 0.19763621794871794 Accuracy test: 0.1977
Epoch 6 completed out