# Environement setup


---



In [None]:
import tensorflow as tf
import numpy as np
from numpy.random import multivariate_normal
from sklearn.model_selection import train_test_split
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder 


# Practice questions

### Question 1: Gradient descient

In [None]:
def f(x):
  return x*x - 1

def f2(x1, x2):
  return 1*(x1**2) + 2*x2 - 1

In [None]:
def gradient_descent_one_variable(f, initial_x, max_iters=100, lr=0.01, precision = 0.000001, round_to=0.001):
  cur_x = initial_x 
  previous_step_size = 1
  iters = 0

  for iters in range(max_iters):
    print("iteration",iters)
    prev_x = cur_x 
    cur_x = cur_x - lr * f(prev_x) 
    previous_step_size = abs(cur_x - prev_x)
    
    if previous_step_size <= precision :
      print("Converged with", x.numpy())
      return cur_x

  print("Exited with", cur_x)
  return cur_x
      
      
def tf_gradient_descent_one_variable(f, initial_x, max_iters=1000, lr=0.01, precision = 0.001, optimizer=tf.keras.optimizers.SGD):
  x = tf.Variable(1.0*initial_x)
  opt = optimizer(learning_rate=lr)
  prev_x = x.numpy()
  
  for i in range(max_iters):
    
    # https://medium.com/analytics-vidhya/3-different-ways-to-perform-gradient-descent-in-tensorflow-2-0-and-ms-excel-ffc3791a160a
    # method 2
    with tf.GradientTape() as tape:
      y = f(x)
    grads = tape.gradient(y, [x])
    processed_grads = [g for g in grads]
    grads_and_vars = zip(processed_grads, [x])
    opt.apply_gradients(grads_and_vars)
    previous_step_size = abs(x.numpy() - prev_x)
    
    prev_x = x.numpy()
    if previous_step_size <= precision :
      print("Converged with", x.numpy())
      return x.numpy()

  print("Exited with", x.numpy())
  return x.numpy()

def tf_gradient_descent_n_variables(f, initial_variables, max_iters=1000, lr=0.01, precision = 0.001, optimizer=tf.keras.optimizers.SGD):
  X = [ tf.Variable(1.0* variable) for variable in initial_variables ]
  opt = optimizer(learning_rate=lr)
  
  for i in range(max_iters):
    with tf.GradientTape() as tape:
      y = f(*X)  # <=> X.unpack()
    grads = tape.gradient(y, X)
    processed_grads = [g for g in grads]
    grads_and_vars = zip(processed_grads, X)
    opt.apply_gradients(grads_and_vars)
    
  variables = [v.numpy() for v in X]
  print("Converged with: ", variables)
  return variables


print("\nMinima of y= x + 2*z - 1")
minimum = tf_gradient_descent_n_variables(f2, [5, 0], lr=0.1, precision=0.001)

print("\nMinima of y=x²")
minimum = tf_gradient_descent_one_variable(f, 5, lr=0.1, precision=10**-8)


Minima of y= x + 2*z - 1
Converged with:  [0.0, -199.9981]

Minima of y=x²
Converged with 3.6185043e-08


In [None]:
class GradientDescent():
  def __init__(self, X, y):
    self.X = np.array(X)
    self.y = np.array(y)
    self.weights = [1.0] + [1.0 for _ in X[0] ]

  def __call__(self, X):
    return self.predict(X)

  def predict(self, X):
    assert len(X) == len(self.weights) - 1
    X = X
    return sum(i[0] * i[1] for i in zip(X, self.weights))

  def loss(self, weights):
    predictions = []
    i = 0
    for x in self.X:
      x = [1.0] + x
      predictions.append(sum(i[0] * i[1] for i in zip(x, weights)) )
    
    MAE = 0
    for i in range(len(predictions)):
      MAE += abs(predictions[i] - self.y[i] )
    return MAE
    

  def train(self, max_iters=100, lr=0.01, epochs=2, precision = 0.001, optimizer=tf.keras.optimizers.SGD):
    opt = optimizer(learning_rate=lr)
    X = [tf.Variable(1.0* variable) for variable in self.weights ]
    # Compute the gradients for a list of variables.
    for i in range(max_iters):
      with tf.GradientTape() as tape:
        y = self.loss(X) # has to be a one line loss function...
      grads = tape.gradient(y, X)
      # Process the gradients, for example cap them, etc.
      # capped_grads = [MyCapper(g) for g in grads]
      processed_grads = [g for g in grads]
      # Ask the optimizer to apply the processed gradients.
      opt.apply_gradients(
          (grad, var) 
          for (grad, var) in zip(processed_grads, X)
          if grad is not None
          )
      variables = [ v.numpy() for v in X ]
      self.weights = variables
    return variables



## Question 2: optimizers

---



In [None]:
print("Adam")
minimum = tf_gradient_descent_one_variable(f, 5, lr=0.1, precision=10**-8, optimizer=tf.keras.optimizers.Adam)
print("RMSprop")
minimum = tf_gradient_descent_one_variable(f, 5, lr=0.1, precision=10**-8, optimizer=tf.keras.optimizers.RMSprop)
print("Momentum")
minimum = tf_gradient_descent_one_variable(f, 5, lr=0.1, precision=10**-8)


Adam
Converged with -2.0924533e-06
RMSprop
Converged with 4.97395e-10
Momentum
Converged with 3.6185043e-08


## 2. I implement in order to understand

In [None]:
# perceptron from tp1
def perceptron(x, y, max_iterations=3000):
  w = np.zeros((1,2))
  p = np.zeros((1,2))
  for it in range(max_iterations):
        for X, Y in zip(x, y) :
            if  Y * np.inner(X, w) <= 0 : 
                p = w
                w = w + np.multiply(X, Y)
        if (w == p).all() :
            return (w.tolist(), True)
  return (w.tolist(), it>=max_iterations)

### 2.1 Toy data set

In [None]:
def generate_dataset(sigma1, sigma2, test_size=0.2):
  mu1 = [-1, 0]
  mu2 = [1, 0]
  
  cov1 = [ [sigma1 , 0 ], 
           [0 , sigma1 ] 
          ]

  cov2 = [  [ sigma2 , 0 ], 
            [ 0 , sigma2 ] 
          ] 
  x1 = multivariate_normal(mu1, cov1, 125)
  x2 = multivariate_normal(mu2, cov2, 125)
  dataset = pd.DataFrame({
      "x": x1.tolist() + x2.tolist(),
      "y": [-1 for _ in range(125)] + [1 for _ in range(125)]
  })

  return train_test_split(dataset.get("x"), dataset.get("y"), test_size=test_size, random_state=42)

# use:
train_x, test_x, train_y, test_y = generate_dataset(sigma1=0.1, sigma2=0.01)
dataset_x = (train_x, test_x)


learning_rate = 0.01
max_iterations = 1000

In [None]:
grad = GradientDescent(train_x.tolist(), train_y.tolist())
%time w = grad.train(lr=learning_rate, max_iters=max_iterations)

print("Gradient descent converges with: ", w)

CPU times: user 2min 36s, sys: 2.24 s, total: 2min 38s
Wall time: 2min 38s
Gradient descent converges with:  [1.6695287, 0.036065035, 1.0]


In [None]:
%time w, converges = perceptron(train_x, train_y, max_iterations=max_iterations)

if converges :
  print("Perceptron converged: ")
print("minima found: ", w)

CPU times: user 1.35 s, sys: 143 ms, total: 1.5 s
Wall time: 1.35 s
minima found:  [[0.8343986857525884, 0.02353063923702465]]


## 2.2  Real data: IRIS dataset

In [None]:
# import some data to play with
iris = datasets.load_iris()

In [None]:
iris_df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])

iris_df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [None]:
encoder =  LabelEncoder()

iris_X = iris_df.iloc[:,0:3].values
iris_y = encoder.fit_transform(iris_df.iloc[:,4].values)

print("input: {}\noutput shape:{}".format(iris_X.shape, iris_y.shape))

input: (150, 3)
output shape:(150,)


In [None]:
print(set(iris_y))

{0, 1, 2}


### This is a multi class classification problem.

In [None]:
y_dummies = pd.get_dummies(iris_y).values # encoding in ternary: 0 -> 100, 1 -> 010 and 2 -> 001

X_train, X_test, y_train, y_test = train_test_split(iris_X, y_dummies, test_size=0.2, random_state=42)

### Model 1: Simple, plain, unimpressive neural net (Linear correlation)



In [None]:
stupid_NN = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(3)
  ])

### Model 2: Neural net with hidden layer



In [None]:
long_NN = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(3)
  ])

### Model 3: One layer neural net with more neurons

In [None]:
thicc_NN = tf.keras.Sequential([
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(3)
  ])

### Model 4: One layer neural net with softmax instead of relu

In [None]:
softmax_NN = tf.keras.Sequential([
    tf.keras.layers.Dense(512, activation='softmax'),
    tf.keras.layers.Dense(3)
  ])

# Model 5: Everything combined

In [None]:
chad_NN = tf.keras.Sequential([
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='softmax'),
    tf.keras.layers.Dense(3)
  ])

## Experiments

In [None]:
# Evaluation method for the models:
def evaluate(model, lr=0.01, opt='rmsprop', loss='categorical_crossentropy'):
  model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
  
  model.fit(X_train, y_train, batch_size=50, epochs=100, verbose=0)
  loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
  print('Test loss:', loss)
  print('Test accuracy:', accuracy)

#### Experiment 1: 
The learning rate.

In [None]:
for learning_rate in [0.1, 0.01, 0.001]:
  print("Learning rate: ",learning_rate)
  evaluate(stupid_NN, lr=learning_rate)
  print("")


Learning rate:  0.1
Test loss: 8.05904769897461
Test accuracy: 0.20000000298023224

Learning rate:  0.01
Test loss: 8.05904769897461
Test accuracy: 0.20000000298023224

Learning rate:  0.001
Test loss: 8.05904769897461
Test accuracy: 0.20000000298023224



#### Experiment 2: 
The number of layers.

In [None]:
print("1 layer:")
evaluate(stupid_NN)
print("\n2 layers:")
evaluate(long_NN)

1 layer:
Test loss: 8.05904769897461
Test accuracy: 0.20000000298023224

2 layers:
Test loss: 10.20812702178955
Test accuracy: 0.36666667461395264


#### Experiment 3: 
The number of neurons in each layer.

In [None]:
print("1 layer - 256 neurons:")
evaluate(stupid_NN)
print("\n1 layers - 512 neurons:")
evaluate(thicc_NN)
print("\n2 layers - 512 and 256 neurons:")
evaluate(chad_NN)


1 layer - 256 neurons:
Test loss: 8.05904769897461
Test accuracy: 0.20000000298023224

1 layers - 512 neurons:
Test loss: 6.984508037567139
Test accuracy: 0.9666666388511658

2 layers - 512 and 256 neurons:
Test loss: 5.90996789932251
Test accuracy: 0.36666667461395264


#### Experiment 4: 
The error function

In [None]:
print("\n1 layer - 512 neurons - Categorical crossentropy:")
evaluate(thicc_NN, loss='cateogorical_crossentropy')
print("\n1 layer - 512 neurons - Categorical cringe:")
evaluate(thicc_NN, loss='cateogorical_hinge')
print("\n1 layer - 512 neurons - Cosine similarity:")
evaluate(thicc_NN, loss='cosine_similarity')


1 layer - 512 neurons - Categorical crossentropy:
Test loss: 6.984508037567139
Test accuracy: 0.9666666388511658

1 layer - 512 neurons - Categorical cringe:
Test loss: 6.984508037567139
Test accuracy: 0.9666666388511658

1 layer - 512 neurons - Cosine similarity:
Test loss: 6.984508037567139
Test accuracy: 0.9666666388511658


# Experiment 5: the activation function

In [None]:
print("\n1 layer - 512 neurons - Relu:")
evaluate(thicc_NN)
print("\n1 layer - 512 neurons - Softmax:")
evaluate(softmax_NN)


1 layer - 512 neurons - Relu:
Test loss: 6.984508037567139
Test accuracy: 0.9666666388511658

1 layer - 512 neurons - Softmax:
Test loss: 12.894476890563965
Test accuracy: 0.4333333373069763
