In [None]:
library(tidyverse)

**Data Matrix**

Patient dataset corresponding to 4 patients and 3 features:

![Patient dataset](https://bl3302files.storage.live.com/y4mDUt2YjMzGPgHdbnZnzzQ9GYvwHVIhAFiGFkCAW3d7QS2vr5v9x6cQLkfrgKK_O9MwAnqe6UF-r3jAeywfR3prvcRRxTumNRJjrIMA5eWbZuRAh1QnEi8bwzLsmrtg9AIjXY3Eh-8P2OxJcfkbGxH2-rEyKQYV3L7DjnomZ68lTEBjShqYOTH6oD65KoaFRK-?width=256&height=153&cropmode=none)

In [None]:
# One numeric vector per vital sign; position i belongs to patient i
HR = c(76, 74, 72, 78)
BP = c(126, 120, 118, 136)
Temp = c(38, 38, 37.5, 37)

# Bind the three feature vectors column-wise into the patient table
pData = data.frame(HR = HR, BP = BP, Temp = Temp)
print(pData)
cat('---------------------\n')

# Same table as a numeric matrix: rows are patients, columns are features
X = as.matrix(pData)
print(dim(X))
cat('---------------------\n')
print(X)

**Vectors from the data matrix**

![Patient dataset](https://bl3302files.storage.live.com/y4mOmQPnVqf4iHNia7wgkQQs8Oi7mrO3_h5zh1zK3ExVR7SB_IGCFwy72nvi8_f9q2eeK5B73A8v1qvr9UIhs8pChuvy0Y9YPznn7hsL3ktxkI33BJIox7-7LqQEx8XvH6yHrafSadsHxIopns4AbmrGKZaxGrb_DStRhZpD2y_EZhQo2mfNJSixwwFELAXO_e4?width=256&height=167&cropmode=none)

1st feature vector (heart rate) for all patients:
$$x_1 = \begin{bmatrix}76\\74\\72\\78\end{bmatrix}$$

1st patient vector for all features:
$$x^{(1)} = \begin{bmatrix}76\\126\\38\end{bmatrix}$$

In [None]:
# Feature vectors are the columns of X (one entry per patient)
x_1 = X[, 'HR']   # 1st feature: heart rate
print(x_1)
cat('\n')

x_2 = X[, 'BP']   # 2nd feature: blood pressure
print(x_2)
cat('\n')

# Patient vectors are the rows of X (one entry per feature)
x1 = X[1, ]       # 1st patient
print(x1)
cat('\n')

x2 = X[2, ]       # 2nd patient
print(x2)

**Projection of vectors and its relationship to the dot product**

![Vector projection](https://bl3302files.storage.live.com/y4miuCtKP9ptv6lIB8EqEU_u7cbEydy0UsEgHl4ECni2UVONtvKZgf73pIQ4vuA99ZHP8K96W_1i-QuhSIN12IudLaUTF3_jZzFqVfsaRK7QubMS9p5C1ErN6tB8I_UqQZnSY2JSGnu0IvJQrRcd2rX2Hzngfka3tCqJhbAMdElywcis2gRaoiuEGDVqaXpZYYp?width=256&height=209&cropmode=none)

In [None]:
# Scalar projection of every patient (row of X) onto the e_1 (HR) direction.
e_1 = c(1, 0, 0)

# A single matrix-vector product computes the dot product of each row of X with
# e_1, so the code no longer hard-codes the number of patients. Dividing by the
# Euclidean length of e_1 turns the dot products into scalar projections.
projected_samples = as.numeric(X %*% e_1) / norm(e_1, type = '2')
print(projected_samples)

# Variance of the projected data
var(projected_samples)

In [None]:
# All patients projected onto the unit vector along e_1, as one matrix product
X %*% (e_1 / norm(e_1, type = '2'))

In [None]:
# Variance of the patients after projecting onto an arbitrary direction v
v = c(1, 2, 3)
var(X %*% (v / norm(v, type = '2')))

In [None]:
# Spread of the patients along each individual feature ...
var(X[, 'HR'])    # heart rate
var(X[, 'BP'])    # blood pressure
var(X[, 'Temp'])  # temperature
# ... and the full sample covariance matrix of the three features
var(X)

In [None]:
# Toy data set: two samples (rows) described by two features (columns X1, X2)
sData = data.frame(X1 = c(5, 3), X2 = c(2, 6))
print(sData)

**Geometric representation of samples as vectors**

In [None]:
# Draw both samples of sData as arrows from the origin, labelled x^1 and x^2
p1 = ggplot(sData) +
  # Sample 1 (first row of sData) in red
  geom_segment(
    aes(x = 0, y = 0, xend = X1[1], yend = X2[1]),
    size = 1, color = 'red', arrow = arrow(length = unit(0.5, "cm"))
  ) +
  # Sample 2 (second row of sData) in blue
  geom_segment(
    aes(x = 0, y = 0, xend = X1[2], yend = X2[2]),
    size = 1, color = 'blue', arrow = arrow(length = unit(0.5, "cm"))
  ) +
  # Text labels anchored at the arrow tips
  geom_text(aes(x = X1[1], y = X2[1]), label = 'x^1', size = 10, hjust = -0.5, vjust = 0) +
  geom_text(aes(x = X1[2], y = X2[2]), label = 'x^2', size = 10, hjust = -0.5, vjust = 0.5) +
  labs(x = '', y = '', title = 'Vectors') +
  # Dashed coordinate axes through the origin
  geom_vline(xintercept = 0, linetype = 'dashed') +
  geom_hline(yintercept = 0, linetype = 'dashed') +
  theme(
    axis.text.x = element_text(size = 20),
    axis.text.y = element_text(size = 20),
    axis.line = element_line(colour = "darkblue", size = 1, linetype = "solid")
  ) +
  # Symmetric, unpadded window around the origin
  scale_x_continuous(expand = c(0, 0), limits = c(-11, 11)) +
  scale_y_continuous(expand = c(0, 0), limits = c(-11, 11))
p1

In [None]:
# Matrix form of the toy data: each row is one sample vector
X = as.matrix(sData)
print(X)

In [None]:
# Direction to project onto (not unit length; it is normalised below)
v = c(10, 10)

# Scalar projection ("shadow length") of x^2 on v: (x^2 . v) / ||v||
shadowLength = as.numeric(X[2, ] %*% v / norm(v, type = '2'))
print(shadowLength)

# Vector projection of x^2 on v: the shadow length laid along the unit vector of v
unitVector = v / norm(v, type = '2')
X2Projected = shadowLength * unitVector
print(X2Projected)

In [None]:
# Samples x^1 (red) and x^2 (blue), the direction v (green) and the vector
# projection of x^2 onto v (dashed black), all drawn as arrows from the origin.
p2 = sData %>% ggplot() + 
  geom_segment(aes(x = 0, y = 0, xend = X1[1], yend = X2[1]), size = 1,
   arrow = arrow(length = unit(0.5,"cm")), color = 'red') +
  geom_segment(aes(x = 0, y = 0, xend = X1[2], yend = X2[2]), size = 1,
   arrow = arrow(length = unit(0.5,"cm")), color = 'blue') +
  geom_segment(aes(x = 0, y = 0, xend = v[1], yend = v[2]), size = 1,
   arrow = arrow(length = unit(0.5,"cm")), color = 'green') +
  geom_segment(aes(x = 0, y = 0, xend = X2Projected[1], yend = X2Projected[2]), size = 1,
   arrow = arrow(length = unit(0.5,"cm")), color = 'black', linetype = 'dashed') +   
  # Labels sit at the arrow tips: sample i ends at (X1[i], X2[i]). The previous
  # coordinates mixed up rows and columns ((X1[1], X1[2]) and (X2[1], X2[2])),
  # which placed both labels away from their arrows.
  geom_text(aes(x = X1[1], y = X2[1]), label = 'x^1', hjust = -0.5, vjust = 0, size = 10) +
  geom_text(aes(x = X1[2], y = X2[2]), label = 'x^2', hjust = -0.5, vjust = 0.5, size = 10) +
  labs( x = '', y = '', title = 'Vectors') +
  geom_vline(xintercept = 0, linetype = 'dashed') + 
  geom_hline(yintercept = 0, linetype = 'dashed') +
  theme(axis.text.x = element_text(size = 20),
   axis.text.y= element_text(size = 20),
   axis.line = element_line(colour = "darkblue", size = 1, linetype = "solid")) +
   scale_x_continuous(expand = c(0, 0), limits = c(-11, 11)) +
   scale_y_continuous(expand = c(0, 0), limits = c(-11, 11))
p2

**A simulated data matrix**

In [None]:
# Number of simulated months (one row per month)
nsamples = 1000

# Skeleton of the simulation table: the month index and the constant fixed costs
# are known up front; every other column starts as NA and is filled in later.
sim_frame = data.frame(
  months = seq_len(nsamples),
  fixed_costs = 3995,
  meals_sold = NA,
  price_per_meal = NA,
  profit_per_meal = NA,
  labor_costs = NA,
  profit = NA
)
head(sim_frame)

In [None]:
# Seed the RNG inside the cell so re-running it reproduces the same draws
set.seed(123)

# Fill in the simulated columns. Later columns are derived from earlier ones, and
# the order of the random draws (rnorm, sample, runif) is kept as is so the
# seeded results stay the same.
sim_frame = sim_frame %>% 
  mutate(meals_sold = rnorm(n = nsamples, mean = 3000, sd = 100),
         # Price drawn from four price points with the given probabilities.
         # `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned.
         price_per_meal = sample(x = c(20, 18.50, 16.50, 15),
                                 size = nsamples,
                                 replace = TRUE,
                                 prob = c(.25, .35, .3, .1)),
         # Price minus a constant 11 per meal
         profit_per_meal = price_per_meal - 11,
         # Uniform base amount plus a part proportional to meals sold
         labor_costs = runif(n = nsamples, min = 5000, max = 7000) + 0.3*meals_sold,
         profit = meals_sold * profit_per_meal - labor_costs - fixed_costs)

head(sim_frame)

In [None]:
# Compact overview of the simulated columns and their types
str(sim_frame)

In [None]:
# Scatter plot of labor costs against meals sold.
# Written with ggplot() + geom_point() because qplot() is deprecated as of
# ggplot2 3.4.0; the axes are now labelled with the plain column names.
ggplot(sim_frame, aes(x = meals_sold, y = labor_costs)) +
  geom_point()

In [None]:
# Pearson (linear) correlation between meals sold and labor costs
with(sim_frame, cor(meals_sold, labor_costs, method = 'pearson'))

In [None]:
# Spearman (rank-based) correlation between the same two columns
with(sim_frame, cor(meals_sold, labor_costs, method = 'spearman'))

In [None]:
# Keep only the two columns studied from here on
simData = select(sim_frame, meals_sold, labor_costs)
head(simData, 5)

In [None]:
# Sample covariance matrix of the two columns (built-in)
cov(simData)

In [None]:
# Off-diagonal entry by hand: dot product of the two mean-centred columns,
# divided by (number of samples - 1)
S12 = with(simData,
           (1/(nsamples-1)) * (meals_sold - mean(meals_sold)) %*% (labor_costs - mean(labor_costs)))
as.numeric(S12)

In [None]:
# Matrix form of the two simulated columns (rows = samples)
X = as.matrix(simData)
str(X)

In [None]:
# Sample covariance matrix built by hand: sum over all samples of the outer
# product of the mean-centred sample with itself, divided by (n - 1).
# The dimensions and loop bound come from X itself rather than from `nsamples`
# and a literal 2, so the cell stays correct for whatever matrix `X` holds
# (the name `X` is reused for several matrices in this notebook).
S = matrix(0, nrow = ncol(X), ncol = ncol(X))
mu = colMeans(X)
for (i in seq_len(nrow(X))){
  deviation = X[i,] - mu            # sample i relative to the column means
  S = S + deviation %*% t(deviation)
}
S = (1/(nrow(X)-1)) * S

In [None]:
S

In [None]:
# Correlation matrix of the two columns, for comparison with the covariance S
cor(x = simData)

In [None]:
# Eigen-decomposition of the sample covariance matrix:
# eX$values holds the eigenvalues and eX$vectors the eigenvectors
eX = eigen(x = S)
str(eX)

In [None]:
# Eigenvalues of S
lambda = eX[['values']]
print(lambda)

In [None]:
# Eigenvectors of S, stored as the columns of u
u = eX[['vectors']]
print(u)

In [None]:
# Euclidean length of each eigenvector
norm(u[, 1], type = '2')
norm(u[, 2], type = '2')

In [None]:
# Dot product of the two eigenvectors (orthogonality check)
u[, 1] %*% u[, 2]

In [None]:
# Guard against `simDataScaled` being absent from the session (e.g. after a
# kernel restart): without it this cell stops with "object not found".
# NOTE(review): rebuilt here as the mean-centred copy of `simData`. Centring only
# is assumed because it leaves the covariance matrix S, and hence the eigenvectors
# in `u`, unchanged. Confirm the intended transformation (e.g. full
# standardisation with scale(simData)).
if (!exists("simDataScaled")) {
  simDataScaled = as.data.frame(scale(simData, center = TRUE, scale = FALSE))
}

# Data cloud with the two eigenvectors of S drawn as arrows from the origin
p4 = simDataScaled %>% ggplot(aes(x = meals_sold, y = labor_costs)) +
  geom_point(size = 1) +
  geom_segment(aes(x = 0, y = 0, xend = u[1, 1], yend = u[2, 1]), size = 1,
   arrow = arrow(length = unit(0.5,"cm")), color = 'red') +
  geom_segment(aes(x = 0, y = 0, xend = u[1, 2], yend = u[2, 2]), size = 1,
   arrow = arrow(length = unit(0.5,"cm")), color = 'blue') 
p4

In [None]:
# Sample covariance of X computed two ways; both should give the same matrix.
# (The previous cell did not parse because of an empty `()` factor, used
# `1/nsamples-1` instead of `1/(n-1)`, and took the covariance of S, not of X.)
cov(X)

# Matrix form: 1/(n-1) * t(X) %*% H %*% X, where H = I - (1/n) * 1 1^T is the
# centring matrix that subtracts the column means.
# NOTE(review): the empty `()` is assumed to have been a placeholder for H - confirm.
centering = diag(nrow(X)) - matrix(1 / nrow(X), nrow = nrow(X), ncol = nrow(X))
(1 / (nrow(X) - 1)) * t(X) %*% centering %*% X

In [None]:
# Flip the sign of both eigenvectors (an eigenvector times -1 is still an
# eigenvector). The flipped vectors are derived from a fresh eigen(S) rather than
# from eX$vectors itself, so re-running this cell gives the same result instead
# of toggling the sign back and forth.
eX$vectors = -eigen(S)$vectors
u = eX$vectors
print(u)

In [None]:
# The flipped eigenvectors should still have unit Euclidean length
norm(u[,1], type = '2')
norm(u[,2], type = '2')

In [None]:
# ... and should still be orthogonal (dot product close to zero)
u[, 1] %*% u[, 2] 

In [None]:
# Guard against `simDataScaled` being absent from the session (e.g. after a
# kernel restart): without it this cell stops with "object not found".
# NOTE(review): rebuilt here as the mean-centred copy of `simData`. Centring only
# is assumed because it leaves the covariance matrix S, and hence the eigenvectors
# in `u`, unchanged. Confirm the intended transformation (e.g. full
# standardisation with scale(simData)).
if (!exists("simDataScaled")) {
  simDataScaled = as.data.frame(scale(simData, center = TRUE, scale = FALSE))
}

# Same picture as before, now with the sign-flipped eigenvectors in `u`
p5 = simDataScaled %>% ggplot(aes(x = meals_sold, y = labor_costs)) +
  geom_point(size = 1) +
  geom_segment(aes(x = 0, y = 0, xend = u[1, 1], yend = u[2, 1]), size = 1,
   arrow = arrow(length = unit(0.5,"cm")), color = 'red') +
  geom_segment(aes(x = 0, y = 0, xend = u[1, 2], yend = u[2, 2]), size = 1,
   arrow = arrow(length = unit(0.5,"cm")), color = 'blue') 
p5

In [None]:
# First five samples, for reference
X[1:5,]

In [None]:
# Project all samples (rows of `samples`) onto `direction`.
# Returns a list with
#   shadowLength     - numeric vector of the dot products samples[i, ] . direction
#   projectedSamples - matrix with one column per sample: direction * shadowLength[i]
# `direction` is used as given and is not normalised here; the eigenvectors passed
# in below were checked to have unit length in earlier cells.
projectOntoDirection = function(samples, direction) {
  shadowLength = as.numeric(samples %*% direction)
  list(shadowLength = shadowLength,
       projectedSamples = direction %*% t(as.matrix(shadowLength)))
}

# Project samples onto the direction of the first eigenvector
unitVector = u[,1]
projection1 = projectOntoDirection(X, unitVector)
shadowLength1 = projection1$shadowLength
# Vector projection
projectedSamples1 = projection1$projectedSamples

In [None]:
# Project samples onto the direction of the second eigenvector
# (reuses projectOntoDirection() defined in the previous cell)
unitVector = u[,2]
projection2 = projectOntoDirection(X, unitVector)
shadowLength2 = projection2$shadowLength
# Vector projection
projectedSamples2 = projection2$projectedSamples

In [None]:
# Structure of the scalar projections (one value per sample)
str(shadowLength1)

In [None]:
# Structure of the vector projections (one column per sample)
str(projectedSamples1)

In [None]:
# Show only the first five projected samples instead of printing the whole
# matrix (one column per sample)
projectedSamples1[, 1:5]

In [None]:
projectedSamples2[, 1:5]

In [None]:
# Guard against `simDataScaled` being absent from the session (e.g. after a
# kernel restart): without it this cell stops with "object not found".
# NOTE(review): rebuilt here as the mean-centred copy of `simData`. Centring only
# is assumed because it leaves the covariance matrix S, and hence the eigenvectors
# in `u`, unchanged. Confirm the intended transformation (e.g. full
# standardisation with scale(simData)).
if (!exists("simDataScaled")) {
  simDataScaled = as.data.frame(scale(simData, center = TRUE, scale = FALSE))
}

# Data cloud, both eigenvectors, and the samples projected onto each of them.
# NOTE(review): projectedSamples1/2 are computed from `X`; if `X` is not
# transformed the same way as `simDataScaled`, the projected points will not
# overlay the plotted cloud - confirm.
p6 = simDataScaled %>% ggplot(aes(x = meals_sold, y = labor_costs)) +
  geom_point(size = 1) +
  geom_segment(aes(x = 0, y = 0, xend = u[1, 1], yend = u[2, 1]), size = 2,
   arrow = arrow(length = unit(0.5,"cm")), color = 'red') +
  geom_segment(aes(x = 0, y = 0, xend = u[1, 2], yend = u[2, 2]), size = 2,
   arrow = arrow(length = unit(0.5,"cm")), color = 'blue') +
  geom_point(aes(x = projectedSamples1[1, ], y = projectedSamples1[2, ]), color = 'magenta', size = 1) +
  geom_point(aes(x = projectedSamples2[1, ], y = projectedSamples2[2, ]), color = 'brown', size = 1)
p6

In [None]:
# Variance of the samples projected onto the 1st eigenvector, shown next to the
# 1st eigenvalue for comparison
var(X %*% u[, 1, drop = FALSE])
lambda[1]

In [None]:
# Same comparison for the 2nd eigenvector and the 2nd eigenvalue
var(X %*% u[, 2, drop = FALSE])
lambda[2]

In [None]:
# Correlation between the two sets of projections
cor(X %*% u[, 1, drop = FALSE], X %*% u[, 2, drop = FALSE])

In [None]:
# Eigenvector matrix as currently stored in eX
eX[['vectors']]

In [None]:
# First column of u
u[, 1]

In [None]:
# First two samples
X[1:2, ]

In [None]:
# First eigenvector, as a column matrix
u[, 1, drop = FALSE]
# First sample, as a column matrix
t(X[1, , drop = FALSE])
# Shadow length of the first sample along the first eigenvector (their dot product)
X[1, ] %*% u[, 1]