# Module 6: Gradient Descent

In [None]:
# Setup the matplotlib styling
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap, ListedColormap
import pandas as pd
import numpy as np

# Brand colours used by every plot in this notebook. They are defined outside
# the ``try`` so they are available even when the style sheet cannot be loaded.
line1 = (0/256, 224/256, 170/256)
line2 = (96/256, 126/256, 229/256)
line3 = (136/256, 76/256, 255/256)

try:
    # Try to use the BI style sheet for plots
    plt.style.use('matplotlibrc')
    plt.rcParams['axes.prop_cycle'] = plt.cycler(color=[(136/256, 76/256, 255/256), (60/256, 170/256, 207/256), (12/256, 229/256, 177/256)])

    # Colour stops of the purple-to-green gradient used for contour plots
    gradient_stops = [(0.53125, 0.296875, 0.99609375), (0.453125, 0.3984375, 0.9453125), (0.375, 0.4921875, 0.89453125), (0.3046875, 0.578125, 0.8515625), (0.234375, 0.6640625, 0.80859375), (0.16015625, 0.75390625, 0.76171875), (0.09375, 0.8359375, 0.72265625), (0.046875, 0.89453125, 0.69140625), (0.0, 0.875, 0.6640625)]
    bicmap = LinearSegmentedColormap.from_list(name='BIcmp',
                                                colors=gradient_stops,
                                                N=len(gradient_stops))
    cm_bright = ListedColormap([(0.53125, 0.296875, 0.99609375), (12/256, 229/256, 177/256)])
    # Kept as an array so it can be indexed with an array of class labels
    colors = np.array([line1, line2, line3])
except Exception as exc:
    # Best effort: fall back to stock matplotlib styling, but say so instead of
    # hiding the problem. ``Exception`` (not a bare ``except``) lets
    # KeyboardInterrupt / SystemExit propagate.
    print(f'Could not apply the BI plot style ({exc!r}); using matplotlib defaults.')
    bicmap = plt.cm.BuGn
    cm_bright = ListedColormap(['r', 'g'])
    # Must be an array, not a list: later cells index it with an array of labels
    colors = np.array(['r', 'g', 'b'])

## **Exercise 6.1**

The gradient descent method is an algorithm that minimizes a function f. It works like this:
- Initialize x to some value  
- While x is still changing:  
    x = x - f'(x)  

### **Exercise 6.1.1**

We want to implement the gradient descent method in Python for the function f(x)=x^4. Use x=5 as the starting point.

In [None]:
# This is the function we want to minimize
def func(x):
    """Target function of the exercise: x raised to the fourth power."""
    return pow(x, 4)

In [None]:
# Create the derivative of f(x)
def gradient_f(x):
    """Return the derivative f'(x) of the target function at ``x``.

    Exercise stub: the body is still a TODO, so the function currently
    returns None.
    """
    # TODO
    pass

In [None]:
# Our starting point will be x=5
x = 5
# Append all tried positions to this list
xpos = []

# Run five update steps.
# NOTE: until the two TODOs below are filled in, x never changes, so xpos
# ends up holding five copies of the starting value.
for i in range(5):
    # Use the gradient function to calculate the gradient
    # TODO
    # Use the update rule from above to change the next x based on the gradient
    # TODO
    # Log the tried x to xpos
    xpos.append(x)

In [None]:
# Plot the results

# Sample the target function on a fixed grid for the background curve
x_lin = np.linspace(-10, 10, 20)
f_lin = func(x_lin)

# Function value at every position the descent visited
ypos = list(map(func, xpos))

# Curve first, visited points on top
plt.plot(x_lin, f_lin, label='Original function', color=line1)
plt.scatter(xpos, ypos, label='Checked points', color=line2)
plt.legend()
plt.show()

**Does this strategy lead you to the minimal point 0?**

**What is the problem and what do you need to change?**

### **Exercise 6.1.2**

In [None]:
# Use the code from above, but this time include a learning rate

# Our starting point will be x=5
x = 5
# Append all tried positions to this list
xpos = []

# Run five update steps.
# NOTE: until the two TODOs below are filled in, x never changes, so xpos
# ends up holding five copies of the starting value.
for i in range(5):
    # Use the gradient function to calculate the gradient
    # TODO
    # Use the update rule from above to change the next x based on the gradient this time with a learning rate
    # TODO
    # Log the tried x to xpos
    xpos.append(x)

# Plot the results
# Sample the target function on a fixed grid for the background curve
x_lin = np.linspace(-10, 10, 20)
f_lin = func(x_lin)
# Show the raw sequence of visited positions
print(xpos)
# Function value at every position the descent visited
ypos = list(map(func, xpos))

# Curve first, visited points on top
plt.plot(x_lin, f_lin, label='Original function', color=line1)
plt.scatter(xpos, ypos, label='Checked points', color=line2)
plt.legend()
plt.show()

**Does this strategy lead you to the minimal point 0?**

## **Exercise 6.2: Linear Regression**

We want to implement our own Linear Regression model with gradient descent.

### **Exercise 6.2.1: One-dimensional linear regression**

In [None]:
# Read the regression dataset and pull both columns out as NumPy arrays
reg_data = pd.read_csv('data/LinearRegression.csv')
x, y = reg_data['x'].to_numpy(), reg_data['y'].to_numpy()
# Display the first rows as a quick sanity check
reg_data.head()

In [None]:
# Straight-line model: y = intercept + slope * x
def f(x, intercept, slope):
    """Evaluate the one-dimensional linear model at ``x``."""
    scaled = slope * x
    return intercept + scaled

In [None]:
# Calculate the error as the difference between the desired output and the predicted output
def error(desired_y, predicted_y):
    """Return the prediction error for the given targets and predictions.

    Exercise stub: the body is still a TODO, so the function currently
    returns None.
    """
    # TODO
    pass

In [None]:
# Define the loss as the sum of squared errors
def squared_loss(error):
    """Return the squared loss computed from the prediction errors.

    Exercise stub: the body is still a TODO, so the function currently
    returns None.
    """
    # TODO
    pass

In [None]:
# Define the gradients of our function
# Returning one gradient for intercept and one gradient for slope separated by a comma
def gradient_f(error, x):
    """Return the gradients for the intercept and the slope, in that order.

    The training loop unpacks the result into two values, so the finished
    implementation has to return exactly two gradients.

    Exercise stub: the body is still a TODO, so the function currently
    returns None.
    """
    # TODO
    pass

In [None]:
# Our starting point will be intercept = 0 and slope = 0
intercept = 0
slope = 0

# NOTE: every `None # TODO` below is a placeholder. The cell cannot run as-is:
# unpacking `None` into two names raises a TypeError.
for i in range(10):
    # Make a prediction with the function f
    pred_y = None # TODO
    # Calculate the error of the prediction
    e = None # TODO
    # Calculate the loss
    l = None # TODO
    # Calculate the gradient
    # Tip: The actual gradient is mean of the gradients from all data points
    grad_intercept, grad_slope = None # TODO
    # Update the parameters based on your gradients
    # Tip: Use the learning rate of 0.1 for the intercept and 0.01 for the slope
    # TODO

    # Plot the results of the iteration
    # Evaluate the current line on 40 evenly spaced points across the data range
    x_lin = np.linspace(x.min(), x.max(), 40)
    f_lin = f(x_lin, intercept, slope)

    plt.plot(x_lin, f_lin, color=line1, label='Predicted Linear Regression')
    plt.scatter(x, y, color=line2, label='Data points')
    plt.legend()
    plt.show()

### **Exercise 6.2.2: Doing something different**

For your above optimization process, change the update rule from
```python
theta = theta - lr * gradient
```
to
```python
theta = theta + lr * gradient
```

**What happens when you do that?**

### **Bonus Exercise 6.2.3: Multidimensional Linear Regression**

Modify your code to allow optimizing any number of dimensions.

In [None]:
from sklearn.datasets import make_regression

# Fix the seed so every run generates the same synthetic dataset and the
# printed losses are comparable between runs.
x, y = make_regression(n_features=20, random_state=42)

In [None]:
# Define the linear regression for any number of input features
def f(x, params):
    """Return the linear-regression prediction for ``x`` given ``params``.

    NOTE(review): the training cell creates ``params`` with shape
    (1, n_features + 1), presumably one intercept plus one weight per
    feature; confirm the intended layout when implementing.

    Exercise stub: the body is still a TODO, so the function currently
    returns None.
    """
    # TODO
    pass

In [None]:
# Define the gradient function
def gradient_f(error, x):
    """Return the gradients of the loss with respect to the parameters.

    Exercise stub: the body is still a TODO, so the function currently
    returns None.
    """
    # TODO
    pass

In [None]:
# Our starting point will be random
# One row holding x.shape[1] + 1 values drawn uniformly from [0, 1)
params = np.random.random((1, x.shape[1] + 1))

# NOTE: every `None # TODO` below is a placeholder to be replaced
for i in range(30):
    # Make a prediction with the function f
    y_pred = None # TODO
    # Calculate the error of the prediction
    e = None # TODO
    # Calculate the loss
    l = None # TODO
    # Calculate the gradient
    gradients = None # TODO
    # Update the parameters based on your gradients
    # TODO

    # We can't plot 20 dimensions
    # Print the loss and see if it decreases
    print(l)

## **Exercise 6.3: Logistic Regression**

In [None]:
# Read the classification dataset: two feature columns and an integer class label
clf_data = pd.read_csv('data/LogisticRegression.csv')
x, y = clf_data['x'].to_numpy(), clf_data['y'].to_numpy()
z = clf_data['z'].astype(int).to_numpy()
# Display the first rows as a quick sanity check
clf_data.head()

### **Exercise 6.3.1: Two dimensional logistic regression**

In [None]:
# Logistic model: sigmoid of a linear combination of the two inputs
def logistic_regression(x, y, intercept, slope1, slope2):
    """Return 1 / (1 + e^-(intercept + slope1 * x + slope2 * y))."""
    logit = intercept + slope1 * x + slope2 * y
    denominator = 1 + np.e ** -logit
    return 1 / denominator

In [None]:
# Define the gradient function
# Return separated by comma the gradients for intercept, slope1, slope2
def gradient_f(error, x, y, output):
    """Return the gradients for intercept, slope1 and slope2, in that order.

    The training loop unpacks the result into three values, so the finished
    implementation has to return exactly three gradients.

    Exercise stub: the body is still a TODO, so the function currently
    returns None.
    """
    # TODO
    pass

In [None]:
# Our starting point will be all parameters at zero
intercept = 0
slope1 = 0
slope2 = 0

# The evaluation grid depends only on the data, so build it once up front
h = 0.2 # Resolution
xx, yy = np.meshgrid(np.arange(x.min()-0.5, x.max()+0.5, h), np.arange(y.min()-0.5, y.max()+0.5, h))
# One explicit colour per data point, looked up by class label.
# np.asarray keeps the lookup working when `colors` is a plain list.
point_colors = np.asarray(colors)[1-z]

# NOTE: every `None # TODO` below is a placeholder. The cell cannot run as-is:
# unpacking `None` into three names raises a TypeError.
for i in range(50+1):
    # Make a prediction with the function f
    pred_y = None # TODO
    # Calculate the error of the prediction
    e = None # TODO
    # Calculate the loss
    l = None # TODO
    # Calculate the gradient
    grad_intercept, grad_slope1, grad_slope2 = None # TODO
    # Update the parameters based on your gradients
    # Tip: A learning rate of 0.2 should work
    # TODO

    # Every tenth iteration visualize the results
    if i % 10 == 0:
        # Visualize the current decision surface and the loss
        zz = logistic_regression(xx.ravel(), yy.ravel(), intercept, slope1, slope2).reshape(xx.shape)

        plt.contourf(xx, yy, zz, cmap=bicmap, alpha=.8)
        # No `cmap` here: the points already get explicit colours, so a
        # colormap would be ignored (and newer matplotlib warns about it)
        plt.scatter(x, y, c=point_colors, edgecolors='k')
        plt.title(f'Iteration {i}: Loss {l}')
        plt.show()

### **Bonus Exercise 6.3.2: Multi-dimensional Logistic Regression**

In [None]:
# Rebuild the inputs: x becomes the two-column feature matrix, y the labels
x = clf_data[['x', 'y']].to_numpy()
y = clf_data['z'].to_numpy()

In [None]:
# Define the logistic model
def logistic_regression(x, params):
    """Return the logistic-regression prediction for ``x`` given ``params``.

    NOTE(review): the training cell creates ``params`` with shape
    (1, n_features + 1), presumably one intercept plus one weight per
    feature; confirm the intended layout when implementing.

    Exercise stub: the body is still a TODO, so the function currently
    returns None.
    """
    # TODO
    pass

In [None]:
# Define the gradient function
def gradient_f(error, x, output):
    """Return the gradients of the loss with respect to the parameters.

    Exercise stub: the body is still a TODO, so the function currently
    returns None.
    """
    # TODO
    pass

In [None]:
# Our starting point will be random
# One row holding x.shape[1] + 1 values drawn uniformly from [0, 1)
params = np.random.random((1, x.shape[1] + 1))

# NOTE: every `None # TODO` below is a placeholder to be replaced
for i in range(100):
    # Make a prediction with the function f
    pred_y = None # TODO
    # Calculate the error of the prediction
    e = None # TODO
    # Calculate the loss
    l = None # TODO
    # Calculate the gradient
    gradients = None # TODO
    # Update the parameters based on your gradients
    # TODO

    # We can't plot more than 3 dimensions
    # Print the loss and see if it decreases
    print(l)

## **Exercise 6.4: Categorical Loss**

Why would we prefer to use this loss function instead of the squared error:
```
l(y, y_pred) = -[y log(y_pred) + (1-y) log(1-y_pred)]
```