In [179]:
import numpy as np
import matplotlib.pyplot as plt
import random
from numpy import mean
import pandas as pd
from numpy import log, dot, e

In [180]:
#!pip install matplotlib
#!pip install sklearn

## Multivariate logistic regression equation

1) log(odds) = p / 1-p

2) p = e^log(odds)/ (1 + e^log(odds))

3) log(odds) = mx+c

In [181]:
class My_Logistic_Regression():
    def __init__(self):
        pass   

    def create_random_data(self,hm_features,obs):

        X = np.ones((obs))

        for i in range(hm_features):
            xs = np.random.rand(obs)*100
            X = np.append(X,xs)

        X = np.reshape(X, (-1,obs)).transpose()
        ys = np.random.randint(0,2,size=obs) # 2 high as it's exclusive
        
        return X,ys
    
    def sigma_fx(self,x):
        return 1/ (1+e**-x)
    
    def cost_function(self,x, y,weights):
        '''
            Cost function in Logistic is different than Linear
            
            Cost(x) = -y_actual ln(y_hat) - (1-y_actual) ln(1-y_hat)
            Easy way to remember this equation
            When y_actual = 1 then Cost(x) = -ln(y_hat)
            when y_actual = 0 then Cost(x) = -ln(1 - y_hat)
            
            this cost function is optimised using Gradient descent which 
            require derivatives
            
            cost(x) = x (y_hat - y)
        '''
        z = dot(x,weights)
        predict_1 = -y * np.log(self.sigma_fx(z))
        predict_0 = -(1-y) * np.log(self.sigma_fx(z))

        return sum(predict_1+predict_0)/len(x)
    
    
    def fit(self,x,y,epc = 3,lr=0.05):
        weights = np.random.rand(x.shape[1]) # random weights initialised
        loss = [] # to track of loss value
        N = len(x) # no of samples
        
        for _ in range(epc):
            #Gradient descent
            
            y_hat = self.sigma_fx(x.dot(weights)) # independent variables multiplied with weights
                                                  #  values scaled by sigmoid function
            weights = weights - (lr * (dot(x.T, y_hat - y)/N))
            loss.append(self.cost_function(x,y,weights))
        
        self.weights = weights
        
        self.loss = loss
        return weights, loss[-1]
        
    def predict_prob(self,x):
        '''
        
        '''
        return self.sigma_fx(x.dot(self.weights))
    
    def predict_class(self,x):
        '''
        '''
        return [1 if i > 0.5 else 0 for i in self.sigma_fx(x.dot(self.weights))]
    
    def model_stats(self,y_hat,y):
        '''
        1) MAE -> Y_hat - y
        2) MAPE -> (Y_hat - y)/y
        3) R2
        '''
        y_mean = np.mean(y)
        sse = sum((y_hat - y)**2)
        ssm =sum((y_mean-y)**2)
        return np.average(np.abs(y_hat - y)), np.mean(np.abs(y_hat - y)/y), 1 - (sse/ssm)

In [182]:
Xx = pd.read_csv('titanic.csv')

In [183]:
ys = Xx['Survived']

In [184]:
X = Xx.drop(['Name','Survived','Sex'],axis=1)

In [185]:
X_ = X.copy() ; X_['Intercept'] = 1

In [186]:
X_

Unnamed: 0,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Intercept
0,3,22.0,1,0,7.2500,1
1,1,38.0,1,0,71.2833,1
2,3,26.0,0,0,7.9250,1
3,1,35.0,1,0,53.1000,1
4,3,35.0,0,0,8.0500,1
...,...,...,...,...,...,...
882,2,27.0,0,0,13.0000,1
883,1,19.0,0,0,30.0000,1
884,3,7.0,1,2,23.4500,1
885,1,26.0,0,0,30.0000,1


In [187]:
clf_my_LR =  My_Logistic_Regression()

In [156]:
#random data generated
X,ys = clf_my_LR.create_random_data(3,100)

In [201]:
clf_my_LR.fit(X_,ys,10000,0.005)

(array([-0.48028579, -0.04113382, -0.37075653,  0.17132078,  0.00775823,
         0.99083312]),
 1.525775286428782)

In [189]:
clf_my_LR.predict_class(X_); sum(clf_my_LR.predict_class(X_))

286

In [190]:
clf_my_LR.predict_prob(X_)[:10]

0    0.000453
1    0.999988
2    0.029974
3    0.998414
4    0.004354
5    0.028287
6    0.998848
7    0.000109
8    0.038566
9    0.927744
dtype: float64

In [191]:
confusion_matrix(ys,clf_my_LR.predict_class(X_))

array([[445, 100],
       [156, 186]], dtype=int64)

### Linear regression by sklearn

In [192]:
import sklearn
from sklearn.linear_model import LogisticRegression

In [193]:
clf = LogisticRegression()

In [194]:
#X = [x for x in zip(xs_0,xs_1)]
#X = [x for x in zip(xs_0,xs_1,xs_2,xs_3)]
#X = [xs_1] + [xs_0]

In [195]:
#ys_ = ys.reshape(-1,1)
#clf.fit(xs_0.reshape(-1,1),ys)  #temp.reshape(-1,1)
clf.fit(X,ys)  #temp.reshape(-1,1)

LogisticRegression()

In [196]:
clf.coef_,clf.intercept_

(array([[-1.00777123, -0.04229388, -0.28125379,  0.21454081,  0.00474583]]),
 array([2.9612604]))

In [197]:
y_LR_hat = clf.predict(X)

In [198]:
sum(y_LR_hat)

239

In [199]:
from sklearn.metrics import confusion_matrix

In [200]:
confusion_matrix(ys,y_LR_hat)

array([[464,  81],
       [184, 158]], dtype=int64)