<img src="pics/otus.png">

In [2]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# `table` lives in pandas.plotting since pandas 0.20; the old
# pandas.tools.plotting location was deprecated then and later removed.
# Try the current location first and fall back for very old installations.
try:
    from pandas.plotting import table
except ImportError:  # pandas older than 0.20
    from pandas.tools.plotting import table

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height) in inches for every plot below.
plt.rcParams["figure.figsize"] = [12, 8]

# Seed NumPy's global random generator so the random draws are repeatable.
np.random.seed(42)


# Логистическая регрессия

## Как принимается решение

Простой вариант - узнать, с какой стороны от гиперплоскости находится точка

$$\hat{y} = sign(x\theta)$$

Уравнение прямой
      $$Ax+By+C=0$$

Расстояние от точки $(x_0, y_0)$ до прямой $Ax+By+C=0$ равно $$\frac{|Ax_0 + By_0 + C|}{\sqrt{A^2 + B^2}}$$

## Упражнение 1

In [13]:
import math  # NOTE(review): not used in the visible cells - confirm before removing
# Coefficients (A, B, C) of the line Ax + By + C = 0 that serves as the decision boundary.
w = np.array([7, -1, 1]) # Ax + By + C = 0

# Note: the distance from a point (x0, y0) to the line Ax + By + C = 0 is abs(A*x0 + B*y0 + C) / sqrt(A^2 + B^2)
    
def predict(x, w):
    """Classify points by the side of the hyperplane they fall on.

    x -- array of points, one per row; the caller in this notebook appends a
         constant 1 as the last feature so the bias is part of the product.
    w -- coefficient vector of the hyperplane.

    Returns the sign of x.dot(w) for every row: 1 or -1 for the two sides
    and 0 for a point lying exactly on the hyperplane.
    """
    signed_score = x.dot(w)
    return np.sign(signed_score)

# Ten random points drawn uniformly from [-2, 2) x [-2, 2), each extended with
# a constant 1 so that the bias term of w takes part in the dot product.
x_pred = np.hstack([4 * np.random.rand(10, 2) - 2, np.ones((10, 1))])
y_pred = predict(x_pred, w)

print(x_pred)
print(y_pred)

[[ 0.56812658 -1.66344014  1.        ]
 [-1.35348514  1.59421675  1.        ]
 [ 0.42571624 -1.96321179  1.        ]
 [-1.59411383  0.65400708  1.        ]
 [-1.97975366 -1.35676779  1.        ]
 [ 0.19493516  0.76758079  1.        ]
 [ 0.60784504 -1.10292276  1.        ]
 [ 0.84871689 -1.05100365  1.        ]
 [-0.69840121  0.98596562  1.        ]
 [ 0.5985316   1.39689364  1.        ]]
[ 1. -1.  1. -1. -1.  1.  1.  1. -1.  1.]


In [None]:
# Hand-picked boundary 7*x1 - x2 + 1 = 0, drawn as x2 = 7*x1 + 1.
x_naive = np.arange(-0.5, 0.5, 0.1)
y_naive = 1 + 7 * x_naive

# Colour every point by its predicted class: yellow when the prediction is
# positive, green otherwise.
plt.scatter(
    x_pred[:, 0],
    x_pred[:, 1],
    c=np.where(y_pred > 0, 'yellow', 'green').tolist(),
)
plt.plot(x_naive, y_naive, c='red', label='descision boundary naive')
plt.title("Classification problem \n What is the color for the new (x1, x2) pair?")
plt.xlabel("x1 (feature)")
plt.ylabel("x2 (feature)")
plt.legend()
plt.savefig('pics/classification_random_line.pdf')

# Градиентный спуск



Посчитаем, чему равен градиент функции потерь $RSS(\theta)$

$$ RSS = \mathcal{L}(\theta) = (\hat{y} - y)^2 $$

$$ \frac{\partial L}{\partial \theta_i} = 2(\hat{y} - y)\frac{\partial}{\partial \theta_i}(\hat{y} - y) = 2(\hat{y} - y)\frac{\partial}{\partial \theta_i}(\theta_0x_0 + ... + \theta_px_p - y) = 2(\hat{y} - y)\cdot x_i$$

$$ \theta_i:= \theta_i - \alpha(\hat{y} - y)\cdot x_i$$

Или:
$$ \frac{\partial RSS(\theta)}{\partial \theta_i} = 2\sum_{j=1}^{n}(\theta^T\cdot x_j - y_j)x_{ji}$$
где $x_j$ — $j$-й объект выборки, а $x_{ji}$ — его $i$-й признак.

$$\nabla_\theta RSS(\theta) = \left( \begin{matrix} \frac{\partial L}{\partial \theta_0} \\ \frac{\partial L}{\partial \theta_1} \\ ... \\ \frac{\partial L}{\partial \theta_p} \end{matrix} \right) = 2X^\top(X\theta - y)$$


Для MSE:
$$ \frac{\partial L}{\partial \theta} = \frac{2}{n} X^\top(X\theta - y)$$

In [None]:
# Synthetic data for a one-feature linear regression: y = 4 + 3 * X + Gaussian noise.
np.random.seed(42)  # re-seed so this cell produces the same data on every run
X = 2 * np.random.rand(100, 1)  # 100 feature values drawn uniformly from [0, 2)
# The targets must be built from the feature matrix X; the lowercase name `x`
# is not defined at notebook level and raised a NameError on a fresh kernel.
y = 4 + 3 * X + np.random.randn(100, 1)

In [None]:
# Fit ordinary least squares with scikit-learn (fit() returns the estimator
# itself) and display the learned intercept and coefficients.
lin_reg = LinearRegression().fit(X, y)
lin_reg.intercept_, lin_reg.coef_

In [None]:
# Stack the fitted intercept on top of the flattened coefficients, intercept first,
# so the result lines up with a design matrix whose first column is all ones.
theta_best = np.vstack((lin_reg.intercept_, np.ravel(lin_reg.coef_)))

In [None]:
# Predict at the two ends of the feature range, x = 0 and x = 2.
X_new = np.array([0, 2]).reshape(-1, 1)
X_new_b = np.hstack([np.ones((2, 1)), X_new])  # prepend x0 = 1 to each instance
y_predict = np.dot(X_new_b, theta_best)
y_predict

## Псевдокод алгоритма

```{python}
1.function gd(X, alpha, epsilon):
2.    initialise new_theta
3.    do:
4.        theta = new_theta
5.        new_theta = theta - alpha * grad(X, theta)
6.    until dist(new_theta, theta) < epsilon
7.    return new_theta
```

## Упражнение 2

In [None]:
# Batch gradient descent for linear regression on the MSE loss (reference solution).
m = len(X)                        # number of training samples
X_b = np.c_[np.ones((m, 1)), X]   # add x0 = 1 to each instance

alpha = 0.1                       # learning rate
n_iterations = 1000               # upper bound on the number of steps
theta = np.random.randn(2,1)      # random initialization
eps = 0.0001                      # stop once a step moves theta by less than this
dist = np.inf                     # size of the last step; set up front so the final print always works
for iteration in range(n_iterations):
    # Gradient of MSE(theta) = 1/m * ||X_b.theta - y||^2 over the whole training set.
    # Writing 2.0 keeps the division floating-point on Python 2 as well.
    gradient = 2.0 / m * X_b.T.dot(X_b.dot(theta) - y)
    new_theta = theta - alpha * gradient
    dist = np.linalg.norm(new_theta - theta)
    theta = new_theta
    if dist < eps:
        # Converged: the parameters barely moved on this step.
        break

# Index of the last iteration performed and the size of its step.
# The %-format gives the same output on Python 2 and Python 3.
print("%d %s" % (iteration, dist))

In [None]:
# Display the current parameter vector theta.
theta

## Стохастический градиентный спуск


Проблема - используется вся обучающая выборка на каждом шаге алгоритма  
Решение - использовать один случайный элемент выборки

### Градиентный спуск

```{python}
1.function gd(X, alpha, epsilon):
2.    initialise new_theta
3.    do:
4.        theta = new_theta
5.        new_theta = theta - alpha * grad(X, theta)
6.    until dist(new_theta, theta) < epsilon
7.    return new_theta
```

### Стохастический градиентный спуск

```{python}
1.function sgd(X, alpha, epsilon):
2.    initialise new_theta
3.    do:
4.        X = shuffle(X)
5.        for x in X:
6.            theta = new_theta
7.            new_theta = theta - alpha * grad(x, theta)
8.    until dist(new_theta, theta) < epsilon
9.    return new_theta
```

In [None]:
# Stochastic gradient descent for linear regression (reference solution):
# every update uses the gradient of a single training sample.
m = len(X_b)
alpha = 0.01  # constant step size; not used below, where the decaying schedule t0 / (t + t1) is applied instead
n_epochs = 100
t0, t1 = 5., 50  # learning schedule hyperparameters

theta = np.random.randn(2,1)  # random initialization

for epoch in range(n_epochs):
    # Visit the samples in a fresh random order on every pass over the data.
    for step, index in enumerate(np.random.permutation(m)):
        x_i = X_b[index:index + 1]  # slicing (not indexing) keeps a 2-D row
        y_i = y[index:index + 1]
        # Gradient of the squared error on this single sample.
        gradient = 2 * x_i.T.dot(x_i.dot(theta) - y_i)
        # Shrink the step as training progresses so that theta settles down
        # instead of bouncing around the minimum. t0 is a float, so the
        # division is floating-point on Python 2 as well.
        eta = t0 / (epoch * m + step + t1)
        theta = theta - eta * gradient

print(theta)