In [1]:
import math
import platform
import statistics

import numpy as np
import pandas as pd
import scipy.stats


In [2]:
# Report the library versions this notebook was executed with.
for label, module in (
    ('numpy version:  ', np),
    ('scipy version:  ', scipy),
    ('pandas version:  ', pd),
):
    print(label, module.__version__)

numpy version:   1.19.1
scipy version:   1.5.0
pandas version:   1.1.2


In [22]:
# Base sample, plus a copy with a NaN spliced in before the fourth value.
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = x[:3] + [math.nan] + x[3:]
for sample in (x, x_with_nan):
    print(sample, len(sample))

[8.0, 1, 2.5, 4, 28.0] 5
[8.0, 1, 2.5, nan, 4, 28.0] 6


In [5]:
# NumPy and pandas views of the same two samples.
y = np.array(x)
y_with_nan = np.array(x_with_nan)
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)
print(y, y_with_nan, z_with_nan, sep='\n')

[ 8.   1.   2.5  4.  28. ]
[ 8.   1.   2.5  nan  4.  28. ]
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


# Measures of Central Tendency

##### Mean

In [7]:
# Arithmetic mean two ways: by hand and with the statistics module.
print('x: ', x)
mean_native = sum(x) / len(x)
mean_stats = statistics.mean(x)
print('mean native: ', mean_native)
print('mean statistics: ', mean_stats)

x:  [8.0, 1, 2.5, 4, 28.0]
mean native:  8.7
mean statistics:  8.7


In [10]:
# None of these means is NaN-aware: plain Python, statistics.mean and np.mean
# all propagate the NaN and return nan.
print('x with nan: ', x_with_nan)
mean_native_with_nan = sum(x_with_nan) / len(x_with_nan)
print('mean with nan native: ', mean_native_with_nan)
mean_stats_with_nan = statistics.mean(x_with_nan)
print('mean with nan statistics: ', mean_stats_with_nan)
mean_np_with_nan = np.mean(y_with_nan)  # or y_with_nan.mean()
print('mean with nan np: ', mean_np_with_nan)

x with nan:  [8.0, 1, 2.5, nan, 4, 28.0]
mean with nan native:  nan
mean with nastatistics:  nan
mean with nan np:  nan


In [11]:
# np.nanmean drops the NaN entry before averaging, recovering the mean of the five real values.
mean_no_nan_np = np.nanmean(y_with_nan)
print('mean ignoring nan np: ', mean_no_nan_np)

mean ignoring nan np:  8.7


In [12]:
# pandas skips NaN by default; skipna=False makes the NaN propagate instead.
z_with_nan.mean(), z_with_nan.mean(skipna=False)

(8.7, nan)

In [15]:
# Ten observations: 2 occurs twice, 4 five times and 8 three times.
arr = [2, 2, 4, 4, 4, 4, 4, 8, 8, 8]
print(arr, len(arr))
print("mean:", np.mean(arr))
# Same mean obtained by weighting each distinct value with its relative
# frequency (2/10, 5/10 and 3/10).
print('weighted mean: ', .2*2 + .5*4 + .3*8)

[2, 2, 4, 4, 4, 4, 4, 8, 8, 8] 10
mean: 4.8
weighted mean:  4.8


In [24]:
# Weighted mean of the values in m using the weights in w, computed two
# equivalent ways. Both forms iterate over m itself, so the cell does not
# depend on (or break with) whatever the notebook-level x currently holds.
m = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]

print("sum of the weight:", sum(w))

# Index-based form.
wmean = sum(w[i] * m[i] for i in range(len(m))) / sum(w)
print("weighted mean:", wmean)

# Pairwise form with zip.
wmean = sum(m_ * w_ for (m_, w_) in zip(m, w)) / sum(w)
print("weighted mean:", wmean)

sum of the weight: 1.0
weighted mean: 6.95
weighted mean: 6.95


In [25]:
# Appends an outlier to x in place. Not idempotent: re-running this cell appends another 100.
x += [100]

In [26]:
# Arithmetic vs. harmonic mean of the same sample, both computed by hand.
print(x)
n_obs = len(x)
print('arithmetic mean:', sum(x) / n_obs)
print('harmonic mean:', n_obs / sum(1 / value for value in x))

[8.0, 1, 2.5, 4, 28.0, 100]
arithmetic mean: 23.916666666666668
harmonic mean: 3.2954099646920363


In [28]:
# Library harmonic mean of the same sample, for comparison with the hand-computed value.
scipy.stats.hmean(x)

3.2954099646920363

In [34]:
# Geometric mean: the n-th root of the product of all n values,
# printed next to scipy's implementation for comparison.
gmean = math.prod(x) ** (1 / len(x))
print(gmean)
print(scipy.stats.gmean(x))

7.793059696775923
7.7930596967759245


##### Median

In [38]:
print(x, len(x), sorted(x))

print("median np:", np.median(x))

# Hand-written median. The middle index is found with integer division
# rather than round(): round() rounds halves to the nearest even integer
# (round(0.5) == 0, round(2.5) == 2), which selects the wrong element for
# odd lengths such as 3 or 7.
ordered_x = sorted(x)
index = len(ordered_x) // 2
if len(ordered_x) % 2:
    # Odd length: the single middle element.
    med = ordered_x[index]
else:
    # Even length: average of the two middle elements.
    med = .5 * (ordered_x[index-1] + ordered_x[index])
print("median native:", med)

[8.0, 1, 2.5, 4, 28.0, 100] 6 [1, 2.5, 4, 8.0, 28.0, 100]
median np: 6.0
median native: 6.0


In [39]:
# With an even number of observations there are two middle values:
# statistics.median averages them, median_high returns the upper one
# and median_low returns the lower one.

statistics.median(x), statistics.median_high(x), statistics.median_low(x)

(6.0, 8.0, 4)

In [40]:
# Caution: NaN is NOT skipped here. It stays in the sorted list and is counted,
# so the 6.0 below is the average of 4 and 8.0 (the middle pair of six items),
# not the median of the five real numbers.
sorted(x_with_nan), statistics.median(x_with_nan) # nan is counted, not skipped

([1, 2.5, 4, 8.0, nan, 28.0], 6.0)

In [41]:
# z was built from the original five-value x, so it does not contain the 100
# appended to x afterwards; its median therefore differs from the one for x.
print(z)
z.median()

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64


4.0

##### Mode

In [44]:
# Count the occurrences of each value; all ones means every value is unique,
# so the sample has no single most frequent value.
print(x)
print([x.count(i) for i in x])

[8.0, 1, 2.5, 4, 28.0, 100]
[1, 1, 1, 1, 1, 1]


In [45]:
# With every value tied, scipy.stats.mode reports a single value only
# (the smallest one in the output below).
mode_ = scipy.stats.mode(x)
print(mode_.mode)

[1.]


In [50]:
# pandas, in contrast, returns every tied value, sorted ascending.
series_x = pd.Series(x)
series_x.mode()

0      1.0
1      2.5
2      4.0
3      8.0
4     28.0
5    100.0
dtype: float64

In [49]:
# z holds the original five values, all tied, so all five come back.
z.mode()

0     1.0
1     2.5
2     4.0
3     8.0
4    28.0
dtype: float64

### Variance

In [52]:
# Extends x in place with three more values (the right-hand side is a tuple).
# Not idempotent: re-running this cell appends them again.
x += 100, 1000, 1000

In [53]:
# Display the extended sample.
x

[8.0, 1, 2.5, 4, 28.0, 100, 100, 1000, 1000]

In [55]:
# Sample variance by hand: sum of squared deviations from the mean,
# divided by n - 1.
n = len(x)
mean_x = sum(x) / n
print(mean_x)
squared_deviations = [(point - mean_x) ** 2 for point in x]
variance_x = sum(squared_deviations) / (n - 1)
print(variance_x)

249.27777777777777
182702.06944444444


In [57]:
# statistics.variance returns the same value as the hand-written n - 1 formula.
var_ = statistics.variance(x)
var_

182702.06944444444

In [58]:
# ddof=1 makes NumPy divide by n - 1 instead of n.
print(np.var(np.array(x), ddof=1))

182702.06944444444


### Standard deviation

In [60]:
# Standard deviation three ways: square root of the hand-computed variance_x
# (divided by n - 1), NumPy with ddof=1, and statistics.stdev.
print('std native:', variance_x**.5)
print('std np:', np.std(x, ddof=1))
print('std statistics:', statistics.stdev(x))

std native: 427.4366262318245
std np: 427.4366262318245
std statistics: 427.4366262318245


### Skewness

In [63]:
# Sample skewness by hand, checked against scipy's bias-corrected value.
# Relies on n, mean_x and variance_x already computed earlier in the notebook.
print(x)
std_x = variance_x ** 0.5

cubed_deviation_sum = sum((item - mean_x) ** 3 for item in x)
skew_x = cubed_deviation_sum * n / ((n - 1) * (n - 2) * std_x ** 3)
print(skew_x)
print(scipy.stats.skew(x, bias=False))

[8.0, 1, 2.5, 4, 28.0, 100, 100, 1000, 1000]
1.5837470893089505
1.5837470893089503


### Percentiles

In [74]:
# n=4 asks for the three cut points that split the data into four groups
# (the quartiles); the sorted list is printed first for reference.
print(sorted(x))
print(x)
statistics.quantiles(x, n=4, method="inclusive")


[1, 2.5, 4, 8.0, 28.0, 100, 100, 1000, 1000]
[8.0, 1, 2.5, 4, 28.0, 100, 100, 1000, 1000]


[4.0, 28.0, 100.0]

In [65]:
# Report the version of the interpreter running this kernel. A shell call to
# `python --version` reports whichever python is first on PATH, which may be
# a different interpreter.
print('Python', platform.python_version())

Python 3.8.5


In [69]:
# The three quartiles via NumPy, expressed as percentages.
for pct in (25, 50, 75):
    print(np.percentile(x, pct))

4.0
28.0
100.0


In [70]:
# np.quantile takes fractions between 0 and 1 rather than percentages.
np.quantile(x, [.25, .5, .75])

array([  4.,  28., 100.])