# Introduction of Descriptive Statistics

In [1]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [2]:
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [8.0, 1, 2.5, math.nan, 4, 28.0]

In [3]:
x

[8.0, 1, 2.5, 4, 28.0]

In [4]:
x_with_nan

[8.0, 1, 2.5, nan, 4, 28.0]

In [5]:
y, y_with_nan = np.array(x), np.array(x_with_nan)
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)

In [6]:
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [7]:
y_with_nan

array([ 8. ,  1. ,  2.5,  nan,  4. , 28. ])

In [8]:
z

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64

In [9]:
z_with_nan

0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64

In [10]:
#Mean

mean_ = sum(x) / len(x)
mean_

8.7

In [11]:
mean_ = statistics.mean(x)
mean_

8.7

In [12]:
mean_ = statistics.mean(x_with_nan)
mean_

nan

In [13]:
# Statistics lib with missing value

statistics.mean(x_with_nan)

nan

In [14]:
# Numpy 1

mean_ = np.mean(y)
mean_

8.7

In [15]:
# Numpy 2

mean_ = y.mean()
mean_

8.7

In [16]:
# Numpy with ignore missing value

print(np.mean(y_with_nan))
print(y_with_nan.mean())

nan
nan


In [17]:
np.nanmean(y_with_nan)

8.7

In [18]:
mean_ = z.mean()
mean_

8.7

In [19]:
z_with_nan.mean()

8.7

In [20]:
sorted(x)

[1, 2.5, 4, 8.0, 28.0]

In [21]:
x[:-1]

[8.0, 1, 2.5, 4]

In [22]:
statistics.median_low(x[:-1])

2.5

In [23]:
statistics.median_high(x[:-1])

4

In [24]:
sorted(x_with_nan)

[1, 2.5, 4, 8.0, nan, 28.0]

In [25]:
# Statistics with missing value

statistics.median(x_with_nan)

6.0

In [26]:
# Numpy

median_ = np.median(y)
median_

4.0

In [27]:
np.median(y[:-1])

3.25

In [28]:
# Numpy with  missing values

np.median(y_with_nan)

nan

In [29]:
np.nanmedian(y_with_nan)

4.0

In [35]:
# Pandas

z.median()

4.0

In [36]:
# Pandas with missing values

z_with_nan.median()

4.0

In [37]:
# Mode

u = [2, 3, 2, 8, 12]
v = [12, 15, 12, 15, 21, 15, 12]

In [38]:
mode_ = statistics.mode(u)
mode_

2

In [39]:
mode_ = statistics.mode(v)
mode_

StatisticsError: no unique mode; found 2 equally common values

In [None]:
u, v = np.array(u), np.array(v)

In [None]:
# Scipy

mode_ = scipy.stats.mode(u)
mode_

In [40]:
mode_.mode

AttributeError: 'int' object has no attribute 'mode'

In [41]:
mode_.count

AttributeError: 'int' object has no attribute 'count'

In [42]:
u, v, w = pd.Series(u), pd.Series(v),  pd.Series([2, 2, math.nan])

In [43]:
# Pandas

u.mode()

0    2
dtype: int64

In [44]:
v.mode()

0    12
1    15
dtype: int64

In [45]:
u.mode()

0    2
dtype: int64

In [46]:
# Variability

In [47]:
# Statistics lib

var_ = statistics.variance(s)
var_

NameError: name 's' is not defined

In [48]:
# Statistics lib with missing values

statistics.variance(x_with_nan)

nan

In [49]:
# Numpy

var_ = np.var(y, ddof=1)
var_

123.19999999999999

In [50]:
var_ = y.var(ddof=1)
var_

123.19999999999999

In [51]:
# Numpy with missing value

np.var(y_with_nan, ddof=1)

nan

In [52]:
np.nanvar(y_with_nan, ddof=1)

123.19999999999999

In [53]:
# Pandas

z.var(ddof=1)

123.19999999999999

In [54]:
z_with_nan.var(ddof=1)

123.19999999999999

In [55]:
# Standard Deviation

In [56]:
# Statistics Lib

std_ = statistics.stdev(x)
std_

11.099549540409287

In [57]:
# Numpy

np.std(y, ddof=1)

11.099549540409285

In [58]:
y.std(ddof=1)

11.099549540409285

In [59]:
np.std(y_with_nan, ddof=1)

nan

In [60]:
# Pandas

z.std(ddof=1)

11.099549540409285

In [61]:
z_with_nan.std(ddof=1)

11.099549540409285

In [62]:
# Skewness

In [63]:
# Scipy

scipy.stats.skew(y)

1.3061163034727836

In [64]:
scipy.stats.skew(y_with_nan)

nan

In [65]:
# Pandas

z.skew()

1.9470432273905924

In [66]:
z_with_nan.skew()

1.9470432273905924

In [67]:
# Percentile

In [68]:
# Numpy

np.percentile(y, 5)   #5%

1.3

In [69]:
np.percentile(y, 95)   #95%

23.999999999999996

In [70]:
np.percentile(y, [25, 50, 75])

array([2.5, 4. , 8. ])

In [71]:
np.median(y)

4.0

In [72]:
# Numpy with missing values

np.percentile(y_with_nan, 5)

nan

In [73]:
np.nanpercentile(y_with_nan, 5)

1.3

In [74]:
# Quantiles

In [75]:
np.quantile(y, 0.05)

1.3

In [76]:
np.quantile(y, 0.95)

23.999999999999996

In [77]:
np.quantile(y, [0.25, 0.5, 0.95])

array([ 2.5,  4. , 24. ])

In [78]:
# Pandas

z.quantile(0.05)

1.3

In [79]:
z.quantile(0.95)

23.999999999999996

In [80]:
# Range

In [81]:
# Numpy

np.ptp(y)

27.0

In [82]:
np.ptp(z)

27.0

In [83]:
np.ptp(y_with_nan)

nan

In [84]:
np.ptp(z_with_nan)

nan

In [85]:
y.max() - y.min()

27.0

In [86]:
z.max() - z.min()

27.0

In [87]:
y_with_nan.max() - y_with_nan.min()

nan

In [88]:
# IQR

In [89]:
quartile_ = np.quantile(y, [0.25, 0.75])

In [90]:
quartile_

array([2.5, 8. ])

In [91]:
quartile_[1] - quartile_[0]

5.5

In [92]:
# Summary of Descriptive Stats

In [93]:
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [94]:
# Scipy

result = scipy.stats.describe(y, ddof=1, bias=False)

In [95]:
result

DescribeResult(nobs=5, minmax=(1.0, 28.0), mean=8.7, variance=123.19999999999999, skewness=1.9470432273905927, kurtosis=3.878019618875446)

In [96]:
result.nobs

5

In [97]:
result.minmax

(1.0, 28.0)

In [98]:
print(result.minmax[0], result.minmax[1])

1.0 28.0


In [99]:
result.variance

123.19999999999999

In [100]:
# Pandas

result = z.describe()

In [101]:
result

count     5.00000
mean      8.70000
std      11.09955
min       1.00000
25%       2.50000
50%       4.00000
75%       8.00000
max      28.00000
dtype: float64

In [102]:
result['count']

5.0

In [103]:
result['50%']

4.0