## Expectation of RV when distribution of RV is provided

In [6]:
# Lecture 2-3 LogRegression; Slide 8
import numpy as np

def expected_value(values, weights):
    values = np.asarray(values)
    weights = np.asarray(weights)
    return (values * weights).sum() / weights.sum()

#define values
values = [0, 1, 2]

#define probabilities
probs  = [0.5, 0.25, .25]

#calculate expected value
expected_value(values, probs)

0.75

# When data is provided

Reference: https://machinelearningmastery.com/introduction-to-expected-value-variance-and-covariance/

## Mean

In [16]:
# Lecture 2-3 LogRegression; Slide 12
from numpy import array
from numpy import mean
v = array([1.5, 0.5, 1])
print(v)
result = mean(v)
print(result)

[1.5 0.5 1. ]
1.0


## Variance and STD

In [17]:
# Lecture 2-3 LogRegression; Slide 12
from numpy import array
from numpy import var, std
v = array([1.5, 0.5, 1])
print(v)
result = var(v, ddof=1)
print(result)
r_std = std(v, ddof=1)
print(r_std)

[1.5 0.5 1. ]
0.25
0.5


In [12]:
from numpy import array
from numpy import var
M = array([[1,2,3,4,5,6],[1,2,3,4,5,6]])
print(M)
col_mean = var(M, ddof=1, axis=0)
print(col_mean)
c_std = std(M, ddof=1, axis=0)
print(c_std)
row_mean = var(M, ddof=1, axis=1)
print(row_mean)
r_std = std(M, ddof=1, axis=1)
print(r_std)

[[1 2 3 4 5 6]
 [1 2 3 4 5 6]]
[0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0.]
[3.5 3.5]
[1.87082869 1.87082869]


## Covariance

In [15]:
from numpy import array
from numpy import cov
x = array([1,2,3,4,5,6,7,8,9])
print(x)
y = array([9,8,7,6,5,4,3,2,1])
print(y)
# Covariance Matrix
Sigma = cov(x,y)
print(Sigma)

[1 2 3 4 5 6 7 8 9]
[9 8 7 6 5 4 3 2 1]
[[ 7.5 -7.5]
 [-7.5  7.5]]


### Computing covariance from samples
### Cov matrix

In [18]:
# Lecture 2-3 LogRegression; Slide 17 --- Provided by professor
import pandas as pd 
import numpy as np

D= [ [80,160],[85,170],[75,180]]

#convert to a data frame 
df = pd.DataFrame(D, columns = ['Weight', 'Height'])
df.cov() 

dnp=np.array(D)   #this converts D into a numpy array 
print(np.cov(dnp))
# vs
print(np.cov(dnp.T))  #numpy treats rows as features unfortunately. 



[[3200.  3400.  4200. ]
 [3400.  3612.5 4462.5]
 [4200.  4462.5 5512.5]]
[[ 25. -25.]
 [-25. 100.]]


# Projection

Problem: Project the vectors p1=[3,3,3]T,p2=[1,2,3]T and p3=[0,0,1] T on the subspace spanned by x1= [1,1,1] T, x2=[1,0,0]T

In [20]:
# Lecture 2-3 LogRegression; Slide 22 --- Provided by professor
# Projection P1
import numpy as np

xxinv= np.linalg.inv( [[3,1],[1,1]])

x= np.array([[1,1,1],[1,0,0]]).T
p1=np.array([3,3,3]).T
projmat= np.matmul(xxinv, x.T)
print(projmat)
beta_star = np.matmul(projmat, p1)
print(beta_star)


[[ 1.11022302e-16  5.00000000e-01  5.00000000e-01]
 [ 1.00000000e+00 -5.00000000e-01 -5.00000000e-01]]
[3.00000000e+00 3.33066907e-16]


In [21]:
# Lecture 2-3 LogRegression; Slide 25 --- Provided by professor
# Projection P2
import numpy as np

xxinv= np.linalg.inv( [[3,1],[1,1]])

x= np.array([[1,1,1],[1,0,0]]).T
p2=np.array([1,2,3]).T
projmat= np.matmul(xxinv, x.T)
print(projmat)
beta_star = np.matmul(projmat, p2)
print(beta_star)


[[ 1.11022302e-16  5.00000000e-01  5.00000000e-01]
 [ 1.00000000e+00 -5.00000000e-01 -5.00000000e-01]]
[ 2.5 -1.5]


In [22]:
# Lecture 2-3 LogRegression; Slide 26 --- Provided by professor
# Projection P3
import numpy as np

xxinv= np.linalg.inv( [[3,1],[1,1]])

x= np.array([[1,1,1],[1,0,0]]).T
p3=np.array([0,0,1]).T
projmat= np.matmul(xxinv, x.T)
print(projmat)
beta_star = np.matmul(projmat, p3)
print(beta_star)


[[ 1.11022302e-16  5.00000000e-01  5.00000000e-01]
 [ 1.00000000e+00 -5.00000000e-01 -5.00000000e-01]]
[ 0.5 -0.5]


## Inner product of matrix

In [25]:
# Python Program illustrating
# numpy.inner() method
import numpy as np

# Matrices
x = np.array([[2, 3, 4], [3, 2, 9]])
y = np.array([[1, 5, 0], [5, 10, 3]])
print("\nMatrices :")
print("x =", x)
print("\ny =", y)

# Inner product of matrices
print("\nInner product of matrices x and y =")
print(np.inner(x, y))



Matrices :
x = [[2 3 4]
 [3 2 9]]

y = [[ 1  5  0]
 [ 5 10  3]]

Inner product of matrices x and y =
[[17 52]
 [13 62]]


## Outer Product of Vectors and Matrices

In [26]:
# Python Program illustrating
# numpy.outer() method
import numpy as np

# Vectors
a = np.array([2, 6])
b = np.array([3, 10])
print("Vectors :")
print("a = ", a)
print("\nb = ", b)

# Outer product of vectors
print("\nOuter product of vectors a and b =")
print(np.outer(a, b))

print("------------------------------------")

# Matrices
x = np.array([[3, 6, 4], [9, 4, 6]])
y = np.array([[1, 15, 7], [3, 10, 8]])
print("\nMatrices :")
print("x =", x)
print("\ny =", y)

# Outer product of matrices
print("\nOuter product of matrices x and y =")
print(np.outer(x, y))


Vectors :
a =  [2 6]

b =  [ 3 10]

Outer product of vectors a and b =
[[ 6 20]
 [18 60]]
------------------------------------

Matrices :
x = [[3 6 4]
 [9 4 6]]

y = [[ 1 15  7]
 [ 3 10  8]]

Outer product of matrices x and y =
[[  3  45  21   9  30  24]
 [  6  90  42  18  60  48]
 [  4  60  28  12  40  32]
 [  9 135  63  27  90  72]
 [  4  60  28  12  40  32]
 [  6  90  42  18  60  48]]


## Cross Product of Vectors and Matrices

In [27]:
# Python Program illustrating
# numpy.cross() method
import numpy as np

# Vectors
a = np.array([3, 6])
b = np.array([9, 10])
print("Vectors :")
print("a = ", a)
print("\nb = ", b)

# Cross product of vectors
print("\nCross product of vectors a and b =")
print(np.cross(a, b))

print("------------------------------------")

# Matrices
x = np.array([[2, 6, 9], [2, 7, 3]])
y = np.array([[7, 5, 6], [3, 12, 3]])
print("\nMatrices :")
print("x =", x)
print("\ny =", y)

# Cross product of matrices
print("\nCross product of matrices x and y =")
print(np.cross(x, y))


Vectors :
a =  [3 6]

b =  [ 9 10]

Cross product of vectors a and b =
-24
------------------------------------

Matrices :
x = [[2 6 9]
 [2 7 3]]

y = [[ 7  5  6]
 [ 3 12  3]]

Cross product of matrices x and y =
[[ -9  51 -32]
 [-15   3   3]]


### Projection of a Vector on another vector

In [23]:
# import numpy to perform operations on vector
import numpy as np

u = np.array([1, 2, 3]) # vector u
v = np.array([5, 6, 2]) # vector v:

# Task: Project vector u on vector v

# finding norm of the vector v
v_norm = np.sqrt(sum(v**2))	

# Apply the formula as mentioned above
# for projecting a vector onto another vector
# find dot product using np.dot()
proj_of_u_on_v = (np.dot(u, v)/v_norm**2)*v

print("Projection of Vector u on Vector v is: ", proj_of_u_on_v)


Projection of Vector u on Vector v is:  [1.76923077 2.12307692 0.70769231]


## MSE loss 

In [29]:
# Lecture 4-5 LogRegression; Slide 10
from sklearn.metrics import mean_squared_error
  
# Given values
Y_true = [80,20,30,10,70]  # Y_true = Y (original values)
  
# calculated values
Y_pred = [90,40,30,40,100]  # Y_pred = Y'
  
# Calculation of Mean Squared Error (MSE)
mean_squared_error(Y_true,Y_pred)

460.0

## Gini Impurity

In [52]:
# Lecture 4-5 LogRegression; Slide 67
def Ginx(P1,P2):
    #P1 and P2 are the counts for each class after the split
    denom = P1 + P2
    Ginx = 2 * (P1/denom) * (P2/denom)
    return(Ginx)

print(Ginx(0,6))
print(Ginx(1,5))
print(Ginx(3,3))

0.0
0.2777777777777778
0.5


## Entropy

In [60]:
# Lecture 4-5 LogRegression; Slide 67
from collections import Counter
from scipy import stats

def entropy_x(a):
    return stats.entropy(a, base=2)

print(entropy_x([0,6]))
print(entropy_x([1,5]))
print(entropy_x([3,3]))

0.0
0.6500224216483541
1.0


## Gain based on Gini

In [7]:
# Lecture 4-5 LogRegression; Slide 75
# Calculate the gain of splitting on home ownership. [2 way split]

S = [1, 0, 0, 1, 0, 0, 1, 0, 0, 0] # Actual table
A = [0, 0, 0] # 1st split
B = [0, 0, 1, 0, 1, 0, 1] # 2nd split

def gini(p):
    """Gini impurity based on probability"""
    return 2 * p * (1-p)

def p(data):
    """Chance of success from bool array"""
    return sum(data) / len(data)

giniS = gini(p(S))
deltaA = gini(p(A)) * len(A) / len(S)
deltaB = gini(p(B)) * len(B) / len(S)
# If more splits add more deltas

gain = giniS - deltaA - deltaB

print(round(gain, 5))

0.07714


In [12]:
# Lecture 4-5 LogRegression; Slide 77
# Calculate the gain of splitting on Marital Status (3 way split)

S = [0,0,0,0,1,0,0,1,0,1] # Actual table
A = [0,0,0,0] # 1st split
B = [0,0,1,1] # 2nd split
C = [1,0] # 3rd split

def gini(p):
    """Gini impurity based on probability"""
    return 2 * p * (1-p)

def p(data):
    """Chance of success from bool array"""
    return sum(data) / len(data)

giniS = gini(p(S))
deltaA = gini(p(A)) * len(A) / len(S)
deltaB = gini(p(B)) * len(B) / len(S)
deltaC = gini(p(C)) * len(C) / len(S)

gain = giniS - deltaA - deltaB - deltaC

print(round(gain, 5))

0.12


## KL Divergence

Don't have an exact code, so below code is for reference only. Snippets taken from this link:<br>
https://datascience.stackexchange.com/questions/9262/calculating-kl-divergence-in-python

In [21]:
import numpy as np

def KL(a, b):
    a = np.asarray(a, dtype=np.float)
    b = np.asarray(b, dtype=np.float)

    return np.sum(np.where(a != 0, a * np.log(a / b), 0))


values1 = [0.7,0.2,0.1]
values2 = [0.4,0.4,0.2]

print(KL(values1, values2))

0.1837868973868122


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  a = np.asarray(a, dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  b = np.asarray(b, dtype=np.float)


In [20]:
import sklearn.metrics

sklearn.metrics.mutual_info_score([0.7,0.2,0.1],[0.4,0.4,0.2])



0.6365141682948129

## Softmax

In [22]:
import numpy as np

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0) # only difference

# Lecture 9-11 LogRegression; Slide 19
scores2D = np.array([1,2,5])

# scores2D = np.array([[1, 2, 3, 6],
#                      [2, 4, 5, 6],
#                      [3, 8, 7, 6]])


print(softmax(scores2D))
# probabilities = softmax (scores) 
# Therefore below values are the probability

[0.01714783 0.04661262 0.93623955]
