# Linear regression and logistic regression

### Small example of simple linear regression (one independent variable)

The path to the working directory that holds the datasets is built by the following function.

In [None]:
def towdir(s):
    """Return the path of the dataset file ``s`` inside ``./datasets_book/``."""
    base_dir = './datasets_book/'
    return str(base_dir + s)

import deepglmlib.utils as utils
import numpy as np

In [None]:
import importlib
# Reload deepglmlib.utils so that changes made to the module on disk take
# effect in the running session.
importlib.reload(utils)

In this example, the explanatory variable is univariate, i.e. $x_{i}=x_{i1}$, hence:

$$y_i = \beta_0 + \beta_1 x_{i} \,.$$

In Python, we generate the noise and the explanatory variable $x_i$, compute $y_i$, and show their linear relation together with the regression line.

In [None]:
import numpy as np

In [None]:
# Simulation settings: sample size, true intercept and slope, range of x.
n, b0, b1 = 10, -0.5, 3.5
xmin, xmax = 0, 1

# The design points are drawn before the noise.
x = np.random.uniform(low=xmin, high=xmax, size=n)
e = np.random.randn(n) / 3  # Gaussian noise with standard deviation 1/3
y = b0 + b1 * x + e

In [None]:
# Store the true coefficients (intercept first, then slope).
beta = [b0, b1]
np.savetxt(towdir("./beta_1d_reglinear.txt"), beta)

# Store the sample as a two-column table: x in column 0, y in column 1.
np.savetxt(
    towdir("./xy_1d_reglinear.txt"),
    np.hstack([v.reshape((n, 1)) for v in (x, y)]),
)

In [None]:
# Read the true coefficients and the saved sample back from disk.
beta = np.loadtxt(towdir("./beta_1d_reglinear.txt"))
xy = np.loadtxt(towdir("./xy_1d_reglinear.txt"))

# Split the table into two column vectors of shape (rows, 1).
x = xy[:, 0].reshape((-1, 1))
y = xy[:, 1].reshape((-1, 1))
x.shape, y.shape, beta.shape

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
xmin   = 0
xmax   = 1

fig, ax = plt.subplots()

ax.plot(np.array([xmin,xmax]), beta[0]+beta[1]*np.array([xmin,xmax]), "b-")
ax.plot(x, y ,'bo')
ax.axis([xmin,xmax,0.80*min(y), 1.20*max(y)])
ax.set_xlabel(r'$x$')
ax.set_xlabel(r'$y$')
ax.set_title(r'Sample points and the real linear regression line')

plt.show()

In [None]:
# Design matrix: a column of ones (intercept) next to the x column.
X = np.concatenate((np.ones((len(x), 1)), x.reshape(len(x), 1)), axis=1)
# Response as a column vector with one row per observation.
y = np.reshape(y, (len(x), 1))

### Example of linear regression with the small example

#### Implementation from Python with the algebra


In [None]:
# Closed-form least-squares estimate (X'X)^{-1} X'y, evaluated left to right.
XtX_inv = np.linalg.inv(np.dot(X.T, X))
betahat_np = np.dot(np.dot(XtX_inv, X.T), y)
print("betahat_np=", betahat_np.reshape(2, 1), sep="\n")

#### Implementation from **Numpy**
 

In [None]:
# np.linalg.lstsq returns a tuple; its first entry is the least-squares solution.
lstsq_result = np.linalg.lstsq(X, y, rcond=None)
betahat_np2 = lstsq_result[0]
print("betahat_np2=", betahat_np2.reshape(2, 1), sep="\n")

#### Implementation from **Scikit-Learn**


In [None]:
from sklearn.linear_model import LinearRegression

# X is passed as-is and the estimator's own intercept is switched off, so
# both coefficients (intercept and slope) are returned in coef_.
fit_skl = LinearRegression(fit_intercept=False)
fit_skl.fit(X, y)
betahat_skl = fit_skl.coef_.reshape(2, 1)

print("betahat_skl=", betahat_skl, sep="\n")

#### Quality indicator with numpy

In [None]:
def MSE_score(y,yhat):
    """Mean squared error: sum of squared residuals divided by ``len(yhat)``."""
    residuals = y - yhat
    n_obs = len(yhat)
    return np.sum(residuals ** 2) / n_obs

In [None]:
def R2_score(y, yhat):
    """Coefficient of determination: 1 - SS_res / SS_tot.

    SS_res is the sum of squared residuals ``y - yhat``; SS_tot is the sum of
    squared deviations of ``y`` from its mean.
    """
    ss_res = np.sum((y - yhat) ** 2)
    ss_tot = np.sum((y - np.mean(y)) ** 2)
    return 1 - ss_res / ss_tot

In [None]:
# Fitted values from the scikit-learn coefficients.
yhat = np.matmul(X, betahat_skl)
# Report both quality indicators with two decimals.
print(
    f" R^2 = {np.round(R2_score(y, yhat), 2):2.2f} "
    f"        \n MSE = {np.round(MSE_score(y, yhat), 2):2.2f}"
)

 #### Solution for $\sigma$

In [None]:
# Residual standard deviation: square root of the mean squared residual
# (the sum of squares is divided by the number of observations).
residuals = y - X @ betahat_skl
sigmahat = np.sqrt(np.sum(residuals ** 2) / len(y))
print("sigmahat=", np.round(sigmahat, 5))

In [None]:
import numpy as np

def f_loglik_gauss(y,beta,sigma):
    """Gaussian log-likelihood of the linear model with mean ``X @ beta``.

    Returns ``-(n/2 * log(2*pi*sigma**2) + RSS / (2*sigma**2))`` where ``n`` is
    ``len(y)`` and ``RSS`` is the residual sum of squares.

    Note: the design matrix ``X`` is not a parameter; it is read from the
    enclosing (module) scope.
    """
    n_obs = len(y)
    variance = sigma ** 2
    rss = np.sum((y - X @ beta) ** 2)
    return -(n_obs / 2 * np.log(2 * np.pi * variance) + rss / (2 * variance))


### Indicators with sklearn

In [None]:
# Regression indicators from the project helper. The meaning of the three
# positional flags is defined in deepglmlib.utils and is not visible here.
mse_yyhat, r2_yyhat = utils.f_metrics_regression(
    y, yhat, True, False, None, ndec=3, samplename="sample"
)

# Logistic Regression

$\newcommand{\bH}{H}$
$\newcommand{\bW}{W}$
$\newcommand{\bp}{p}$
$\newcommand{\bX}{X}$
$\newcommand{\by}{y}$
$\newcommand{\hy}{\hat{y}}$
$\newcommand{\bhy}{\hat{y}}$
$\newcommand{\bx}{x}$
$\newcommand{\bbeta}{\beta}$
$\newcommand{\hbeta}{\hat{\beta}}$
$\newcommand{\bepsilon}{\epsilon}$
$\newcommand{\bhbeta}{\hat{\beta}}$

### Small example



In [None]:
def f_sigmoid(a):
    """Logistic sigmoid ``1 / (1 + exp(-a))``, evaluated element-wise.

    Computed as ``exp(-log(1 + exp(-a)))`` through ``np.logaddexp`` so that
    large positive or negative inputs give 1.0 or 0.0 instead of the ``nan``
    produced by ``exp(a) / (1 + exp(a))`` once ``exp(a)`` overflows.

    Parameters
    ----------
    a : scalar or array-like
        Linear predictor value(s).

    Returns
    -------
    Value(s) in [0, 1] with the same shape as ``a``.
    """
    # logaddexp(0, -a) == log(exp(0) + exp(-a)) == log(1 + exp(-a)), without overflow.
    return np.exp(-np.logaddexp(0.0, np.negative(a)))

In [None]:
import numpy as np

# Number of points drawn around each of the two centres, and total sample size.
n1 = n2 = 50
n  = n1 + n2

# True coefficients (intercept, then one slope per feature) as a column vector.
beta = np.array([-0.5,3.5,2.0]).reshape((3,1))

# Two Gaussian clouds with unit standard deviation, centred at (1, 1) and
# (-1, -1). The sizes are given explicitly as (rows, 2): drawing `n` values
# and reshaping to (n1, 2) only works by coincidence when n1 == n2.
x = np.vstack([ np.random.normal(1, 1, (n1, 2)),
                np.random.normal(-1, 1, (n2, 2)) ])

# Success probabilities from the logistic model, then Bernoulli labels.
p            = f_sigmoid( np.hstack([ np.ones((n,1)), x]) @ beta )
y            = np.random.binomial(1,p)

Let us save the dataset to text files with NumPy.

In [None]:
# Write the features and the label side by side, then the true coefficients.
np.savetxt(towdir("./xy_2d_reglogistic.txt"), np.concatenate((x, y), axis=1))
np.savetxt(towdir("./beta_2d_reglogistic.txt"), beta)

In [None]:
import numpy as np
# Read the saved sample and the true coefficients back from disk.
xy = np.loadtxt(towdir("./xy_2d_reglogistic.txt"))
beta = np.loadtxt(towdir("./beta_2d_reglogistic.txt"))

# Columns 0 and 1 hold the features, column 2 the label.
x, y = xy[:, [0, 1]], xy[:, 2]
n = len(y)

print(xy.shape, x.shape, y.shape)

In [None]:
# Number of observations labelled 0 and labelled 1.
n0, n1 = (y == 0).sum(), (y == 1).sum()
n0, n1

In [None]:
# Flatten the labels to a 1-D vector.
y = np.ravel(y)

Let us plot the dataset with the known labels.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

fig, ax = plt.subplots()

ax.plot(x[y.ravel()==0,0], x[y.ravel()==0,1], 'bx', 
         x[y.ravel()==1,0], x[y.ravel()==1,1], 'bo')
ax.set_xlabel(r'$x1$')
ax.set_ylabel(r'$x2$')
ax.set_title(r'Sample points from two classes')

plt.show()

In [None]:
import numpy as np
from sklearn.model_selection import  train_test_split 
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the two features with the L-BFGS solver.
logreg = LogisticRegression(solver='lbfgs').fit(x, y.ravel())

In [None]:
# Flatten the intercept and the slopes into one vector, intercept first.
betahat_rg = np.concatenate((np.ravel(logreg.intercept_), np.ravel(logreg.coef_)))
# Show it as a column vector with one row per coefficient.
betahat_rg.reshape((x.shape[1] + 1, 1))

In [None]:
# Predicted labels on the training sample.
yhat = logreg.predict(x)
# Accuracy: share of observations whose predicted label equals the observed one.
print("acc=", (y == yhat).sum() / len(y))

### Indicators with sklearn

In [None]:
# Classification indicators from the project helper. The meaning of the three
# positional flags is defined in deepglmlib.utils and is not visible here.
acc_yyhat, prc_yyhat, rcc_yyhat, cm_yyhat = utils.f_metrics_classification(
    y, yhat, True, False, None, ndec=3, samplename="sample"
)