## Statistics

#### Mean

In [3]:
import numpy as np
import math
x = np.array([1,3,5,6])
mean_x = np.mean(x)
print(mean_x)

# in case the data contains Nan values
x_nan = np.array([1,3,5,6, math.nan])
mean_x_nan = np.nanmean(x_nan)
print(mean_x_nan)

def mean(x):
    return sum(x) / len(x)
mean([3, 6, 7, 8])

3.75
3.75


6.0

#### variance

In [10]:
x = np.array([1,3,5,6])
variance_x = np.var(x)
print(variance_x)

# here you need to specify the degrees of freedom (df) max number of logically independent data points that have freedom to vary
x_nan = np.array([1,3,5,6, math.nan])
mean_x_nan = np.nanvar(x_nan, ddof = 1)
print(mean_x_nan)

3.6875
4.916666666666667


#### Covariance

In [11]:
x = np.array([1,3,5,6])
y = np.array([-2,-4,-5,-6])

#this will return the covariance matrix of x,y containing x_variance, y_variance on diagonal elements and covariance of x,y
cov_xy = np.cov(x,y)
cov_xy

array([[ 4.91666667, -3.75      ],
       [-3.75      ,  2.91666667]])

#### Median

In [12]:
def median(x):
    n = len(x)
    sorted_x = sorted(x)
    middle = n // 2
    if n % 2 == 1:
        return sorted_x[middle]
    else:
        h = middle -1
        return (sorted_x[middle] + sorted_x[h]) / 2
median([3, 5, 6])

5

#### Quantile

In [13]:
def quantile(x, p):
    """returns the pth-percentile value in x"""
    p_index = int(p * len(x))
    return sorted(x)[p_index]

#### Dispersion

#### Range

In [14]:
def data_range(x):
    return max(x) - min(x)
data_range([983, 4, 5, 7])

979

In [15]:
def de_mean(x):
    x_bar = mean(x)
    return [x_i - x_bar for x_i in x]
def sum_of_squares(x):
    return sum(xi ** 2 for xi in x)

#### Variance

In [16]:
def variance(x):
    n = len(x)
    deviation = de_mean(x)

    return sum_of_squares(deviation) / n-1
variance([3, 5, 6, 8])

2.25

#### Deviation

In [17]:
import math
def standard_deviation(x):
    return math.sqrt(variance(x))
standard_deviation([3, 5, 6, 8])

1.5

In [18]:
def interquartile_range(x):
    return quantile(x, 0.75) - quantile(x, 0.25)
interquartile_range([3, 5, 6, 8])

3

In [19]:
def dot(x, y):
    return sum(x_i * y_j for x_i in x for y_j in y)

#### We’ll first look at covariance, the paired analogue of variance. Whereas variance measures how a single variable deviates from its mean, covariance measures how two variables vary in tandem from their means:

In [20]:
def covariance(x, y):
    n = len(x)
    return dot(de_mean(x), de_mean(y)) / n - 1
covariance([2, 3, 4, 6], [8, 9, 6, 4])

-1.0

#### Correlation

In [21]:
x = np.array([1,3,5,6])
y = np.array([-2,-4,-5,-6])

corr = np.corrcoef(x,y)
def correlation(x, y):
    stdev_x = standard_deviation(x)
    stdev_y = standard_deviation(y)
    if stdev_x > 0 and stdev_y > 0:
        return covariance(x, y) / stdev_x / stdev_y
    else:
        return 0
correlation([2, 3, 4, 6], [8, 9, 6, 4])

-0.5597691428330556