In [22]:
# Import libraries
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import itertools
import argparse
import sys
import time
from sklearn import preprocessing
import pandas as pd
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='3'
import random
from math import floor

In [2]:
## Preprocessing of data
# Function to load data

def get_power_data():
    """
    Read the Individual household electric power consumption dataset
    """
    
    # Assume that the dataset is located on folder "data"
    data = pd.read_csv('data/household_power_consumption.txt',
                       sep=';', low_memory=False)

    # Drop some non-predictive variables
    data = data.drop(columns=['Date', 'Time'], axis=1)

    #print(data.head())

    # Replace missing values
    data = data.replace('?', np.nan)

    # Drop NA
    data = data.dropna(axis=0)

    # Normalize
    standard_scaler = preprocessing.StandardScaler()
    np_scaled = standard_scaler.fit_transform(data)
    data = pd.DataFrame(np_scaled)

    # Goal variable assumed to be the first
    X = data.values[:, 1:].astype('float32')
    y = data.values[:, 0].astype('float32')

    # Create categorical y for binary classification with balanced classes
    y = np.sign(y+0.46)

    # Split train and test data here: (X_train, Y_train, X_test, Y_test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    no_class = 2                 #binary classification

    return X_train.T, X_test.T, y_train, y_test, no_class

In [3]:
X_train, X_test, y_train, y_test, no_class = get_power_data()
print("X,y types: {} {}".format(type(X_train), type(y_train)))
print("X size {}".format(X_train.shape))
print("Y size {}".format(y_train.shape))

# Create a binary variable from one of the columns.
# You can use this OR not

idx = y_train >= 0
notidx = y_train < 0
y_train[idx] = 1
y_train[notidx] = -1

X,y types: <class 'numpy.ndarray'> <class 'numpy.ndarray'>
X size (6, 1536960)
Y size (1536960,)


Let
\begin{align}
    E=\min_{w_3, W_2,W_1} \frac{1}{N}\sum_i || w_3 s(W_2 s(W_1 x_i)-y_i)||^2   
\end{align}

## Layer 3
Define 
\begin{align}
a_3(x)& :=w_3 s(W_2 s(W_1 x))\\
a_2(x)& :=s(W_2 s(W_1 x)),\\
a_1(x)& :=s(W_1 x).
\end{align}

Then
\begin{align}
\frac{\partial E}{\partial w_3} &= \frac{2}{N}(a_3-t)\frac{\partial a_3}{\partial w_3} \\
&=\frac{2}{N}(x_3-t)\frac{\partial w_3a_2}{\partial w_3}\\
&=\frac{2}{N}(x_3-t)a_2^T\\
\end{align}

So defining 
$$\delta_3 := \frac{2}{N}(a_3-t),$$
then
$$\frac{\partial E}{\partial w_3} =\delta_3\,a_2^T.$$

## Layer 2

\begin{align*}
\frac{\partial E}{\partial W_2} &= \frac{2}{N}(a_3-t)\frac{\partial a_3}{\partial W_2} \\
&=\frac{2}{N}(a_3-t)\frac{\partial (W_3 a_2)}{\partial W_2}\\
&=\delta_3\frac{\partial (W_3 a_2)}{\partial W_2}\\
&=W_3^T\delta_3\frac{\partial a_2}{\partial W_2}\\
&=[W_3^T\delta_3 \circ s'(W_2 a_1)]\frac{\partial W_2 a_1}{\partial W_2}\\
\end{align*}

So defining $$\delta_2 :=W_3^T\delta_3 \circ s'(W_2 a_1),$$
we have

$$\frac{\partial E}{\partial W_2}=\delta_2 a_1^T$$

## Layer 1

Define 
$$\delta_1 :=W_2^T\delta_2 \circ s'(W_1x),$$
similar to layer_2:
\begin{align*}
\frac{\partial E}{\partial W_1} &=\delta_1x^T
\end{align*}

In [38]:
# Sigmoid function
def sigmoid(x, derivative=False):
    sigm = 1. / (1. + np.exp(-x))
    if derivative:
        return sigm * (1. - sigm)
    return sigm

# Define weights initialization
def initialize_w(N, d):
    return 2*np.random.random((N,d)) - 1

# Fill in feed forward propagation
def feed_forward_propagation(X, y, w_1, w_2, w_3, lmbda):
    # Fill in
    #X is q x n
    # w_1 is p x q
    # w_2 is p x p
    # w_3 is 1 x p
    layer_0=X # q x n
    layer_1=sigmoid(np.matmul(w_1 , X)) # p x n 
    layer_2=sigmoid(np.matmul(w_2 , layer_1)) # p x n 
    layer_3=np.matmul(w_3 , layer_2) # p x n
    return layer_0, layer_1, layer_2, layer_3


# Fill in backpropagation    
def back_propagation(y, w_1, w_2, w_3, layer_0, layer_1, layer_2, layer_3):
    # Calculate the gradient here
    N = y.shape[0]    
        
    delta3=2/N*(layer_3 - y)
    delta2=np.multiply(np.matmul(w_3.T,delta3),sigmoid(np.matmul(w_2,layer_1),derivative=True))
    delta1=np.multiply(np.matmul(w_2.T,delta2),sigmoid(np.matmul(w_1,layer_0),derivative=True))
    
    layer_3_delta=np.matmul(delta3,layer_2.T)
    layer_2_delta=np.matmul(delta2,layer_1.T)
    layer_1_delta=np.matmul(delta1,layer_0.T)

    return layer_1_delta, layer_2_delta, layer_3_delta


# Cost function
def cost(X, y, w_1, w_2, w_3, lmbda):
    N, d = X.shape
    a1,a2,a3,a4 = feed_forward_propagation(X,y,w_1,w_2,w_3,lmbda)

    return np.linalg.norm(a4[:,0] - y,2) ** 2 / N

# Funtion to get mini batch sgd
def miniBatch(x,y,batchSize):
    D,N = x.shape
    X_mini = np.zeros((D,batchSize))
    Y_mini = np.zeros((batchSize,))
    indexArray = random.sample(range(N), batchSize)
    for i in range(batchSize):
        X_mini[:,i] = x[:,indexArray[i]]
        Y_mini[i,] = y[indexArray[i],]
    return X_mini,Y_mini

# Define SGD
def SGD(X, y, w_1, w_2, w_3, lmbda, learning_rate, batch_size, iterations):
    
    for i in range(iterations):

        X_mini,Y_mini = miniBatch(X,y,batch_size)
        L0,L1,L2,L3 = feed_forward_propagation(X_mini,Y_mini,w_1,w_2,w_3,lmbda)
        D1,D2,D3 = back_propagation(Y_mini,w_1,w_2,w_3,L0,L1,L2,L3)

        #cost1 = cost(X_mini, Y_mini, w_1, w_2, w_3, lmbda)
        
        a = w_1-(learning_rate*D1).reshape(w_1.shape)
        b = w_2-(learning_rate*D2).reshape(w_2.shape)
        c = w_3-(learning_rate*D3).reshape(w_3.shape)
        
        #cost2 = cost(X_mini, Y_mini, a, b, c, lmbda)
    
        #if ((cost2-cost1)/cost1>0.5):
        #    break
        
        w_1 = a
        w_2 = b
        w_3 = c
        print(i,': ', cost(X,y,w_1,w_2,w_3,lmbda=lmbda))        
    return w_1, w_2, w_3

# Define SVRG here:
def SVRG(X, y, w_1, w_2, w_3, lmbda, learning_rate, T,iterations):
    
    for i in range(iterations):
        
        K  = floor(iterations/T)
        N  = X.shape[1]
        wk_1= w_1
        wk_2= w_2
        wk_3= w_3
        
        for k in range(K):
            L0,L1,L2,L3 = feed_forward_propagation(X,y,w_1,w_2,w_3,lmbda)
            ga_1, ga_2, ga_3 = back_propagation(y,wk_1,wk_2,wk_3,L0,L1,L2,L3) #the average
            
            for t in range(T):
                index = np.random.randint(N, size=1)
                
                L0,L1,L2,L3 = feed_forward_propagation(X[:,index],y[index,],w_1,w_2,w_3,lmbda)
                g1_1,g1_2,g1_3 = back_propagation(y[index,], w_1,w_2,w_3,L0,L1,L2,L3)
                
                Lk0,Lk1,Lk2,Lk3 = feed_forward_propagation(X[:,index],y[index,],wk_1,wk_2,wk_3,lmbda)
                g2_1,g2_2,g2_3 = back_propagation(y[index,], wk_1,wk_2,wk_3,Lk0,Lk1,Lk2,Lk3)
                
                g1  = g1_1 - g2_1 + ga_1
                g2  = g1_2 - g2_2 + ga_2
                g3  = g1_3 - g2_3 + ga_3

                #cost1 = cost(X, y, w_1, w_2, w_3, lmbda)
            
                a = w_1-(learning_rate*g1).reshape(w_1.shape)
                b = w_2-(learning_rate*g2).reshape(w_2.shape)
                c = w_3-(learning_rate*g3).reshape(w_3.shape)
            
                #cost2 = cost(X_mini, Y_mini, a, b, c, lmbda)
    
                #if ((cost2-cost1)/cost1>0.5):
                #    break
                  
            
                w_1 = a
                w_2 = b
                w_3 = c

            wk_1 = w_1
            wk_2 = w_2
            wk_3 = w_3
        print(i,': ', cost(X,y,w_1,w_2,w_3,lmbda=lmbda))        
    return w_1, w_2, w_3

# Define GD here:
def GD(X, y, w_1,w_2,w_3, learning_rate, lmbda, iterations):

    for i in range(iterations):    
        L0,L1,L2,L3 = feed_forward_propagation(X,y,w_1,w_2,w_3,lmbda)
        D1,D2,D3 = back_propagation(y,w_1,w_2,w_3,L0,L1,L2,L3)
    
        #cost1 = cost(X, y, w_1, w_2, w_3, lmbda)
        
        a = w_1-(learning_rate*D1).reshape(w_1.shape)
        b = w_2-(learning_rate*D2).reshape(w_2.shape)
        c = w_3-(learning_rate*D3).reshape(w_3.shape)
        
        #cost2 = cost(X, y, a, b, c, lmbda)
    
        #if ((cost2-cost1)/cost1>0.5):
        #    break
    
        w_1 = a
        w_2 = b
        w_3 = c
        print(i,': ', cost(X,y,w_1,w_2,w_3,lmbda=lmbda))        
    
    return w_1, w_2, w_3

# Define projected GD here:
def PGD(X, y, w_1,w_2,w_3, learning_rate, lmbda, iterations, noise):
    # Complete here:
    
    return w_1, w_2, w_3

# Define BCD here:
def BCD(X, y, w_1,w_2,w_3, learning_rate, lmbda, iterations):
    # Complete here:
    
    return w_1, w_2, w_3







In [5]:
# Should be a hyperparameter that you tune, not an argument - Fill in the values
parser = argparse.ArgumentParser()
parser.add_argument('--lambda', type=float, default=0.1, dest='lmbda') 
parser.add_argument('--w_size', type=int, default=3, dest='w_size')
parser.add_argument('--lr', type=float, default=0.1)
parser.add_argument('--iterations', type=int, default=100)

#args = parser.parse_args()




_StoreAction(option_strings=['--iterations'], dest='iterations', nargs=None, const=None, default=100, type=<class 'int'>, choices=None, help=None, metavar=None)

In [12]:
w_1 = initialize_w(3,X_train.shape[0])
w_2 = initialize_w(3,3)
w_3 = initialize_w(1,3)
lmbda=0.1
print(w_1.shape)
print(w_2.shape)
print(w_3.shape)

print(X_train.shape)
print(y_train.shape)

initialCost=cost(X_train,y_train,w_1,w_2,w_3,lmbda)
layer_0,layer_1,layer_2,layer_3 = feed_forward_propagation(X_train,y_train,w_1,w_2,w_3,lmbda)
#print('cost: ',costx)

w_1_s,w_2_s,w_3_s = GD(X_train, y_train, w_1,w_2,w_3, learning_rate = 0.1, lmbda=lmbda, iterations=10)
finalCost=cost(X_train,y_train,w_1_s,w_2_s,w_3_s,lmbda)
print('Initial Cost:',initialCost)
print('Initial Cost:',finalCost)
#d1,d2,d3 = back_propagation(y_train, w_1, w_2, w_3, layer_0,layer_1,layer_2,layer_3)

(3, 6)
(3, 3)
(1, 3)
(6, 1536960)
(1536960,)
0 :  276354.43197827035
1 :  269124.48233849666
2 :  264370.10250831017
3 :  261261.74667032715
4 :  259248.19936452515
5 :  257961.62821370878
6 :  257156.08122641404
7 :  256667.06076916543
8 :  256384.68365730604
9 :  256235.7422634473
Initial Cost: 287317.49150864885
Initial Cost: 256235.7422634473


In [15]:
w_1 = initialize_w(3,X_train.shape[0])
w_2 = initialize_w(3,3)
w_3 = initialize_w(1,3)
lmbda=0.1
#SGD(X, y, w_1, w_2, w_3, lmbda, learning_rate, batch_size, iterations):
initialCost = cost(X_train,y_train,w_1,w_2,w_3,lmbda=0.1)
s1,s2,s3 = SGD(X_train, y_train, w_1, w_2, w_3, lmbda=0.1, learning_rate=0.05,batch_size=100,iterations=20)
finalCost = cost(X_train,y_train,s1,s2,s3,0.1)
print('Initial cost:',initialCost)
print('Final cost:',finalCost)


0 :  341668.22407190833
1 :  323609.2344836599
2 :  309696.54967288714
3 :  299274.5250727535
4 :  292964.09983598336
5 :  284671.29434684635
6 :  276202.4298221254
7 :  270683.14151518454
8 :  267165.6341329409
9 :  265010.5291433944
10 :  263077.8888569198
11 :  262175.14883577405
12 :  260724.10405093024
13 :  258904.3950195528
14 :  258483.0741386785
15 :  258140.32664012132
16 :  257853.27577987572
17 :  257594.1189225933
18 :  256669.22904976993
19 :  256447.64151588836
Initial cost: 370940.29838883615
Final cost: 256447.64151588836


In [41]:
w_1 = initialize_w(3,X_train.shape[0])
w_2 = initialize_w(3,3)
w_3 = initialize_w(1,3)
initialCost = cost(X_train,y_train,w_1,w_2,w_3,lmbda=0.1)
s1,s2,s3 = SVRG(X_train, y_train, w_1, w_2, w_3, lmbda=0.1, learning_rate=0.05,T=100,iterations=200)
finalCost = cost(X_train,y_train,s1,s2,s3,lmbda=0.1)
print('Initial cost:',initialCost)
print('Final cost:',finalCost)

0 :  329393.9112000158
1 :  480578.4085409427
2 :  469788.73202624946


KeyboardInterrupt: 

In [105]:
# Should be a hyperparameter that you tune, not an argument - Fill in the values
parser = argparse.ArgumentParser()
parser.add_argument('--lambda', type=float, default=, dest='lmbda') 
parser.add_argument('--w_size', type=int, default=, dest='w_size')
parser.add_argument('--lr', type=float, default=)
parser.add_argument('--iterations', type=int, default=)

args = parser.parse_args()

# Initialize weights
w_1 = initialize_w(X_train.shape[0], args.w_size)

w_2 = initialize_w(args.w_size,args.w_size)

w_3 = initialize_w(args.w_size, 1)

# Get iterations
iterations = args.iterations
# Define plotting variables
fig, ax = plt.subplots(2, 1, figsize=(16, 8))

# Define the optimizers for the loop
optimizers = [
        {# Fill in the hyperparameters
            "opt": SGD(X_train, y_train, w_1, w_2, w_3, args.lmbda, args.lr, batch_size),
            "name": "SGD",
            "inner": # Fill in
        },
        {# Fill in the hyperparameters
            "opt": SVRG(X_train, y_train, w_1, w_2, w_3, args.lmbda, args.lr),
            "name": "SVRG",
            "inner": # Fill in
        },
        {# Fill in the hyperparameters
            "opt": GD(
                X_train, y_train, w_1, w_2, w_3, learning_rate=args.lr,
                lmbda=args.lmbda, iterations=iterations),
            "name": "GD",
            "inner": # Fill in
        },
        {# Fill in the hyperparameters
            "opt": PGD(
                X_train, y_train, w_1, w_2, w_3, learning_rate=args.lr,
                lmbda=args.lmbda, iterations=iterations, noise=),
            "name": "PGD",
            "inner": # Fill in
        },
        {# Fill in the hyperparameters
            "opt": BCD(
                X_train, y_train, w_1, w_2, w_3, learning_rate=args.lr,
                lmbda=args.lmbda, iterations=iterations),
            "name": "BCD",
            "inner": # Fill in
        }
    ]

SyntaxError: invalid syntax (<ipython-input-105-39a1d18e33d7>, line 3)

In [None]:
# Run the iterates over the algorithms above

for opt in optimizers:
    #
    # Fill in

In [None]:
# Plot results
ax[0].legend(loc="upper right")
ax[0].set_xlabel(r"Iteration", fontsize=16)
ax[0].set_ylabel("Loss", fontsize=16)
ax[0].set_title("CA3 - Training a deep neural network for the power consumption Dataset")
ax[0].set_ylim(ymin=0)

ax[1].legend(loc="upper right")
ax[1].set_xlabel(r"Time [s]", fontsize=16)
ax[1].set_ylabel("Loss", fontsize=16)
ax[1].set_ylim(ymin=0)

plt.savefig("power.png")