In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron

In [2]:
class Perceptron_2:
    """
    Binary linear classifier for labels in {-1, +1} that can be trained with
    four different update schemes: fit, fit_sgd, fit_sgd_one and fit_gd.

    Inputs:
        eta          : learning rate applied to every weight / bias step
        maxiter      : maximum number of passes (epochs) over the training data
        random_state : optional seed for the per-epoch shuffling done by
                       fit_sgd and fit_sgd_one. When None (the default) the
                       shuffle draws from NumPy's global random state.
    Output: object

    Attributes created by the fit methods:
        w_         : w_[0] is the bias b, w_[1:] is the weight vector w
        errors_    : one error value per epoch (its meaning depends on the
                     fit method that was used)
        iterations : number of epochs that were actually run
    """
    def __init__(self, eta, maxiter, random_state=None) -> None:
        self.eta = eta
        self.maxiter = maxiter
        self.random_state = random_state
        # The np.random module and a RandomState instance both expose
        # permutation(), so shuffle() can use either one transparently.
        if random_state is None:
            self._rng = np.random
        else:
            self._rng = np.random.RandomState(random_state)

    def _start(self, X, y):
        """
            Coerce X & y to arrays and reset w_, errors_ and iterations.
            Returns the coerced (X, y) pair.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.w_ = np.zeros(1 + X.shape[1])
        self.errors_ = []
        self.iterations = 0
        return X, y

    def fit(self, X, y):
        """
            Perceptron
            The activation function here is unit step function

            Samples are visited in the given order. For every sample
                update = eta * (target - predict(x))
            is added to the bias and, multiplied by x, to the weights.
            errors_ stores the number of non-zero updates per epoch and
            training stops after maxiter epochs or after the first epoch
            without any update. Returns self.
        """
        X, y = self._start(X, y)
        errors = 1
        while self.iterations < self.maxiter and errors != 0:
            errors = 0
            for xi, target in zip(X, y):
                update = self.eta * (target - self.predict(xi))
                self.w_[1:] += update * xi
                self.w_[0] += update
                errors += int(update != 0.0)
            self.iterations += 1
            self.errors_.append(errors)
        return self

    def fit_sgd(self, X, y):
        """
            Perceptron using Stochastic Gradient Descent.
            Per-point loss: L(w, b) = -y(w.x+b)

            This is a customized loss defined on a single point, because in
            SGD w & b are updated one point at a time. Off-the-shelf losses
            (least squares, logistic, unit step, ...) can be used as well,
            with either gradient or stochastic gradient descent.

            Input data background:
                X = [x1, x2, x3, ..., xd] is d-dimensional, each dimension
                being a real number, and y = -1 or +1.
                In 2-d Euclidean space a point is X = [x1, x2], so a data
                sample of X & y looks like
                X = [[1, 2], [3, 4], [5, 3], ...]
                y = [-1, +1, -1, ...]

            Algorithm:
                The linear classifier is estimated from the function
                w1x1 + w2x2 + ... + b = w.x+b

                w.x+b = 0   is the decision boundary
                w.x+b > 0   point is above the boundary, classified y = +1
                w.x+b <= 0  point is on/below the boundary, classified y = -1

                The goal is to find w, b from the training data X & y.

            How we arrive at the loss function (c is some positive number):
                Correctly classified point x
                    y=-1, w.x+b=-c  ->  y(w.x+b) = c > 0
                    y=+1, w.x+b= c  ->  y(w.x+b) = c > 0
                    The target and the sign of w.x+b agree, so when
                    y(w.x+b) > 0 there is nothing to do.

                Wrongly classified point x
                    y=-1, w.x+b= c  ->  y(w.x+b) = -c <= 0
                    y=+1, w.x+b=-c  ->  y(w.x+b) = -c <= 0
                    We are wrong by the amount c. Rearranging
                    y(w.x+b) = -c gives the loss
                        L(w, b) = c = -y(w.x+b)
                    which is a function of w & b that we want to minimise.
                    w & b are only updated on wrongly classified points,
                    because only those carry a loss.

            Derivative of L(w, b) with respect to w & b:
                dL/dw = -yx
                dL/db = -y

            Stochastic Gradient Descent step, repeated till convergence
            (eta is the learning rate):
                w = w - eta*(-yx) = w + eta*y*x
                b = b - eta*(-y)  = b + eta*y

            Implementation notes:
                * The samples are reshuffled at the start of every epoch.
                * A point counts as wrong when target * predict(x) <= 0.
                  predict() returns -1 for w.x+b == 0, so a point lying on
                  the boundary is wrong only when its target is +1.
                * errors_ stores the number of wrong points per epoch and
                  training stops after maxiter epochs or after the first
                  epoch without a wrong point. Returns self.
        """
        X, y = self._start(X, y)
        errors = 1
        while self.iterations < self.maxiter and errors != 0:
            errors = 0
            X, y = self.shuffle(X, y)
            for xi, target in zip(X, y):
                if (target * self.predict(xi)) <= 0:
                    self.w_[1:] += self.eta * target * xi
                    self.w_[0] += self.eta * target
                    errors += 1
            self.iterations += 1
            self.errors_.append(errors)
        return self

    def fit_sgd_one(self, X, y):
        """
            Stochastic Gradient Descent on a squared-error style loss.

            L(w, b) = sum((y - y_hat)**2)

            derivative (constant factor absorbed into eta)
            dL/dw = -sum((y - y_hat) x)
            dL/db = -sum(y - y_hat)

            In SGD we calculate on each point, implementation reflects this:
            the sum collapses to the single point that is being visited.

            Implementation notes:
                * y_hat is the thresholded prediction predict(x), not the
                  raw value w.x+b, so the residual y - y_hat is 0 for a
                  correct point and +2 / -2 for a wrong one.
                * The samples are reshuffled at the start of every epoch.
                * errors_ stores the summed squared residual per epoch and
                  training stops after maxiter epochs or after the first
                  epoch whose error is 0. Returns self.
        """
        X, y = self._start(X, y)
        errors = 1
        while self.iterations < self.maxiter and errors != 0:
            errors = 0
            X, y = self.shuffle(X, y)
            for xi, target in zip(X, y):
                # Evaluate the residual once, before touching the weights:
                # w and b then move along the same gradient, and the epoch
                # error counts exactly the mistakes that caused an update.
                # Measuring the error after the step would let an epoch
                # that changed the weights report 0 and end training while
                # other points are misclassified.
                residual = target - self.predict(xi)
                step = self.eta * residual
                self.w_[1:] += step * xi
                self.w_[0] += step
                errors += residual ** 2
            self.iterations += 1
            self.errors_.append(errors)
        return self

    def fit_gd(self, X, y):
        """
            In case of Gradient Descent, we act on the complete data set to
            calculate a single update of w & b per epoch.
            So the loss function needs to be defined keeping in mind that we
            are going to act on the complete data set, hence a summation
            over the loss of each point is required. Here we are using a
            simple loss function
            L(w, b) = sum((y - y_hat)**2)

            derivative (constant factor absorbed into eta)
            dL/dw = -sum((y - y_hat) X)
            dL/db = -sum(y - y_hat)

            Implementation notes:
                * y_hat is the thresholded prediction predict(X), not the
                  raw value w.X+b.
                * errors_ stores, per epoch, the summed squared residual of
                  the updated model on the whole data set. Training stops
                  after maxiter epochs or once that error is 0.
                  Returns self.
        """
        X, y = self._start(X, y)
        errors = 1
        while self.iterations < self.maxiter and errors != 0:
            # Both gradients are taken from the same residual, i.e. from
            # the model as it was at the start of the epoch.
            residual = y - self.predict(X)
            self.w_[1:] += self.eta * X.T.dot(residual)
            self.w_[0] += self.eta * residual.sum()
            # The stopping test looks at the model after the step.
            errors = ((y - self.predict(X)) ** 2).sum()
            self.iterations += 1
            self.errors_.append(errors)
        return self

    def net_input(self, X):
        """
            Returns dot product of w.x + b
        """
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def predict(self, X):
        """
            Returns the class label derived from the sign of w.x + b:
            +1 where w.x + b > 0, otherwise -1 (a value of exactly 0 maps
            to -1).
        """
        return np.where(self.net_input(X) > 0, 1, -1)

    def shuffle(self, X, y):
        """
            Returns X & y reordered by one common random permutation, so
            every row of X stays paired with its label.
        """
        # Fall back to NumPy's global state if the instance was created
        # without running __init__.
        rng = getattr(self, "_rng", np.random)
        order = rng.permutation(len(X))
        return (np.asarray(X)[order], np.asarray(y)[order])

In [None]:
# Toy training set: eleven 2-feature points, one label (-1 or +1) per point.
x = np.array([
    [1, 2], [1, 4], [1, 7], [2, 5],
    [2, 8], [2, 6], [3, 6], [3, 9],
    [4, 2], [2, 10], [10, 8],
])
y = np.array([
    -1, -1, 1, -1,
    1, 1, 1, 1,
    -1, 1, 1,
])
# Optional quick look at the points:
# plt.scatter(x[:, 0], x[:, 1])

In [None]:
# Batch variant: fit_gd makes one update of w & b per epoch from the whole data set.
test = np.array([[1, 4], [4, 4], [6, 7], [5, 4], [1, 4.9], [10, 10], [10, 3]])
prn = Perceptron_2(eta=0.01, maxiter=100).fit_gd(x, y)  # fit_gd hands back the fitted object

# Error per epoch.
plt.plot(range(len(prn.errors_)), prn.errors_)

print(f'Total number of iterations: {prn.iterations}')
print(f'Errors: {prn.errors_}')
print(f'Intercept & Co-efficients: {prn.w_}')
print(f'Predicted values: {prn.predict(test)}')

In [None]:
# fit_sgd reshuffles the samples every epoch through NumPy's global random
# state. Seeding it first makes the iteration count, the error curve and the
# learned weights identical on every Restart & Run All.
np.random.seed(0)
prn = Perceptron_2(eta=0.01, maxiter=1000)
prn.fit_sgd(x, y)
print(f'Total number of iterations: {prn.iterations}')
# Number of misclassified points per epoch.
plt.plot(range(len(prn.errors_)), prn.errors_)
print(f'Errors: {prn.errors_}')
print(f'Intercept & Co-efficients: {prn.w_}')
test = np.array([[1, 4], [4, 4], [6, 7], [5, 4], [1, 4.9], [10, 10], [10, 3]])
print(f'Predicted values: {prn.predict(test)}')

In [None]:
# fit_sgd_one reshuffles the samples every epoch through NumPy's global random
# state. Seeding it first makes the iteration count, the error curve and the
# learned weights identical on every Restart & Run All.
np.random.seed(0)
prn = Perceptron_2(eta=0.001, maxiter=100)
prn.fit_sgd_one(x, y)
print(f'Total number of iterations: {prn.iterations}')
# Summed squared residual per epoch.
plt.plot(range(len(prn.errors_)), prn.errors_)
print(f'Errors: {prn.errors_}')
print(f'Intercept & Co-efficients: {prn.w_}')
test = np.array([[1, 4], [4, 4], [6, 7], [5, 4], [1, 4.9], [10, 10], [10, 3]])
print(f'Predicted values: {prn.predict(test)}')

In [None]:
# Classic perceptron rule: samples are visited in their given order.
test = np.array([[1, 4], [4, 4], [6, 7], [5, 4], [1, 4.9], [10, 10], [10, 3]])
prn = Perceptron_2(eta=0.1, maxiter=100).fit(x, y)  # fit hands back the fitted object

# Number of weight updates per epoch.
plt.plot(range(len(prn.errors_)), prn.errors_)

print(f'Total number of iterations: {prn.iterations}')
print(f'Errors: {prn.errors_}')
print(f'Intercept & Co-efficients: {prn.w_}')
print(f'Predicted values: {prn.predict(test)}')

In [None]:
# sklearn's Perceptron is used below; its inputs are standardised first.
# The scaler is fitted on the training points only and then applied to both
# the training points and the test points.
sc = StandardScaler().fit(x)
x_p, x_t = (sc.transform(points) for points in (x, test))

In [None]:
# Reference model from scikit-learn, trained on the standardised points.
clf = Perceptron(
    random_state=0,
    max_iter=100,
    eta0=0.1,
)
clf.fit(x_p, y)

In [None]:
# Labels predicted by the sklearn model for the standardised test points.
clf.predict(x_t)

In [None]:
# Fitted parameters of the sklearn model, shown as a (coef_, intercept_) pair.
# NOTE(review): these are in standardised feature space, so presumably not
# directly comparable with Perceptron_2.w_ fitted on raw x - confirm.
clf.coef_, clf.intercept_