# Python for Data Science // S.9 // Descriptive Statistics // PYTN-KS09
---

In [15]:
# Import Modules
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

## Mean

In [3]:
# Sample data: the same five values, once clean and once with a missing value (math.nan).
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [8.0, 1, 2.5, math.nan, 4, 28.0]
print(x)
print(x_with_nan)

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


In [4]:
# Same sample again, this time using np.nan as the missing-value marker.
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [8.0, 1, 2.5, np.nan, 4, 28.0]
print(x)
print(x_with_nan)

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


In [6]:
# Define x and x_with_nan (np.nan marks the missing value); the cells below read both.
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [8.0, 1, 2.5, np.nan, 4, 28.0]
print(x)
print(x_with_nan)

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


In [16]:
# Convert the lists to 1-D NumPy arrays
y, y_with_nan = np.array(x), np.array(x_with_nan)
# Convert the lists to pandas Series
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)
print(y)
print(y_with_nan)
print(z_with_nan)

[ 8.   1.   2.5  4.  28. ]
[ 8.   1.   2.5  nan  4.  28. ]
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


In [17]:
# Mean in pure Python: the sum of the items divided by the number of items
mean_ = sum(x) / len(x)
mean_

8.7

In [18]:
# Show the number of items
print(len(x))

5


In [19]:
# Mean via Python's built-in statistics module
mean_ = statistics.mean(x)
print(mean_)

8.7


In [13]:
# statistics.mean() does not skip NaN: the result is nan when the data contains one.
mean_ = statistics.mean(x_with_nan)
print(mean_)

nan


In [20]:
# Mean via the NumPy function np.mean()
mean_ = np.mean(y)
mean_

8.7

In [21]:
# Mean via the ndarray method .mean()
mean = y.mean()
mean

8.7

In [22]:
# Both np.mean() and ndarray.mean() propagate NaN: each prints nan here.
print(np.mean(y_with_nan))
print(y_with_nan.mean())

nan
nan


In [23]:
# np.nanmean() ignores NaN values when computing the mean
np.nanmean(y_with_nan)

8.7

In [24]:
# Mean via the pandas Series method .mean()
mean_ = z.mean()
mean_

8.7

In [26]:
# Series.mean() called without arguments skips NaN, so the result matches the NaN-free mean
z_with_nan.mean()

8.7

In [27]:
# Weighted mean by hand: values 2, 4, 8 with weights 0.2, 0.5, 0.3 (the weights sum to 1)
0.2 * 2 + 0.5 * 4 + 0.3 * 8

4.8

In [32]:
# Weighted mean in pure Python: sum(w_i * x_i) / sum(w_i), written two ways.

x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]

# Variant 1: walk the weights by position and look up the matching value.
wmean = sum(weight * x[pos] for pos, weight in enumerate(w)) / sum(w)
print(wmean)

# Variant 2: pair every value with its weight.
wmean = sum(value * weight for value, weight in zip(x, w)) / sum(w)
print(wmean)

6.95
6.95


In [33]:
# Weighted mean with np.average(); x and w are first converted to NumPy / pandas objects
y, z, w = np.array(x), pd.Series(x), np.array(w)

# Values given as a NumPy array
wmean = np.average(y, weights=w)
print(wmean)

# Values given as a pandas Series
wmean = np.average(z, weights=w)
print(wmean)

6.95
6.95


In [34]:
# Weighted mean via the element-wise product (w * y) followed by .sum()
(w * y).sum() / w.sum()

6.95

In [36]:
# Same element-wise approach with the pandas Series z: (w * z) followed by .sum()
(w * z).sum() / w.sum()

6.95

In [37]:
x

[8.0, 1, 2.5, 4, 28.0]

In [38]:
# Harmonic mean in pure Python: item count divided by the sum of the reciprocals
hmean = len(x) / sum(1 / item for item in x)
hmean

2.7613412228796843

In [39]:
# Harmonic mean via statistics.harmonic_mean(x)
hmean = statistics.harmonic_mean(x)
hmean

2.7613412228796843

In [40]:
# Harmonic mean via SciPy
scipy.stats.hmean(y)

2.7613412228796843

In [41]:
# scipy.stats.hmean() also accepts the plain Python list
scipy.stats.hmean(x)

2.7613412228796843

In [42]:
x

[8.0, 1, 2.5, 4, 28.0]

In [44]:
# Geometric mean in pure Python: the n-th root of the product of all n items.
gmean = math.prod(x) ** (1 / len(x))
gmean

4.677885674856041

In [45]:
# Geometric mean via SciPy
scipy.stats.gmean(y)

4.67788567485604

In [46]:
# scipy.stats.gmean() gives the same value for the pandas Series
scipy.stats.gmean(z)

4.67788567485604

## Median

In [49]:
# Median in pure Python: sort once, then take the middle item (odd count)
# or the average of the two middle items (even count).
ordered = sorted(x)
n = len(ordered)
middle = n // 2
if n % 2 == 1:
    median_ = ordered[middle]
else:
    median_ = 0.5 * (ordered[middle - 1] + ordered[middle])

median_

4

In [50]:
x

[8.0, 1, 2.5, 4, 28.0]

In [51]:
# Even number of items (last one dropped): median_low returns the lower of the two middle values
statistics.median_low(x[:-1])

2.5

In [52]:
# Even number of items (last one dropped): median_high returns the higher of the two middle values
statistics.median_high(x[:-1])

4

In [53]:
# Caution: with NaN in the data these results are not meaningful. statistics.median()
# prints 6.0 here although the NaN-free data has median 4 -- presumably because NaN
# cannot be ordered reliably when the values are sorted. Remove NaN before calling these.
print(statistics.median(x_with_nan))
print(statistics.median_low(x_with_nan))
print(statistics.median_high(x_with_nan))

6.0
4
8.0


In [54]:
x_with_nan

[8.0, 1, 2.5, nan, 4, 28.0]

In [55]:
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [56]:
# Median via np.median()
median_ = np.median(y)
print(median_)

4.0


In [57]:
y[-1]

28.0

In [58]:
# Even number of items (last one dropped): np.median averages the two middle values
median_ = np.median(y[:-1])
print(median_)

3.25


In [59]:
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [60]:
median__ = np.median(y)

In [61]:
median__

4.0

## Mode

In [62]:
# Mode in pure Python: the distinct value with the highest occurrence count
# (when counts tie, the larger value is chosen).
u = [2, 3, 2, 8, 12]

v = [12, 15, 12, 15, 21, 15, 12]

mode_ = max(set(u), key=lambda candidate: (u.count(candidate), candidate))
mode_

2

In [63]:
# Mode via statistics.mode()
mode_ = statistics.mode(u)
mode_

2

In [64]:
# Mode via scipy.stats.mode(); u and v are converted to NumPy arrays first
u, v = np.array(u), np.array(v)

mode_ = scipy.stats.mode(u)
mode_

ModeResult(mode=array([2]), count=array([2]))

In [65]:
# v contains two values that each occur three times (12 and 15); the result shown reports only 12
mode_ = scipy.stats.mode(v)
mode_

ModeResult(mode=array([12]), count=array([3]))

In [66]:
# The mode value(s) and their occurrence count are available as attributes of the result
print(mode_.mode)
print(mode_.count)

[12]
[3]


In [72]:
# Mode via the pandas Series method .mode(), which ignores NaN values.
# NOTE: this rebinds w, which held the weights array in the weighted-mean cells.
u, v, w = pd.Series(u), pd.Series(v), pd.Series([2, 2, math.nan])

print(u.mode())

# Both 12 and 15 occur three times, so both are returned
print(v.mode())

print(w.mode())

0    2
dtype: int32
0    12
1    15
dtype: int32
0    2.0
dtype: float64


In [73]:
print(w.mode())
# print(w.mode(dropna=False))  # use this form instead to include NaN

0    2.0
dtype: float64


## Variance

In [74]:
# Sample variance in pure Python
n = len(x)

mean_ = sum(x) / n

# Divide by (n - 1), not n, to get the sample variance
var_ = sum((item - mean_)**2 for item in x) / (n - 1)
var_

123.19999999999999

In [75]:
# Sample variance via statistics.variance()
var_ = statistics.variance(x)
var_

123.2

In [76]:
# Sample variance via NumPy; ddof=1 makes the divisor (n - 1)
var_ = np.var(y, ddof=1)
var_

123.19999999999999

In [78]:
# Same computation via the ndarray method .var()
var_ = y.var(ddof=1)
var_

123.19999999999999

In [79]:
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [80]:
z

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64

In [81]:
# The pandas Series method .var() ignores NaN values (z itself contains none here)
z.var(ddof=1)

123.19999999999999

## Standard Deviation

In [82]:
# Standard deviation in pure Python: the square root of the sample variance
# (var_ comes from the variance cells above)
std_ = var_ ** 0.5
std_

11.099549540409285

In [83]:
# Standard deviation via statistics.stdev()
std_ = statistics.stdev(x)
std_

11.099549540409287

In [84]:
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [85]:
# Standard deviation via NumPy; ddof=1 makes the divisor (n - 1)
np.std(y, ddof=1)

11.099549540409285

In [86]:
# Same computation via the ndarray method .std()
y.std(ddof=1)

11.099549540409285

## Skewness

In [87]:
# Sample skewness in pure Python.
x = [8.0, 1, 2.5, 4, 28.0]
n = len(x)

# Deviations from the mean feed both the variance and the third moment.
mean_ = sum(x) / n
deviations = [value - mean_ for value in x]

var_ = sum(dev ** 2 for dev in deviations) / (n - 1)
std_ = var_ ** 0.5

# Sum of cubed deviations, scaled by n / ((n - 1)(n - 2) s^3).
skew_ = sum(dev ** 3 for dev in deviations) * n / ((n - 1) * (n - 2) * std_ ** 3)

In [88]:
skew_

1.9470432273905929

In [89]:
# Sample skewness via SciPy (bias=False, matching the pure-Python result above)
y, y_with_nan = np.array(x), np.array(x_with_nan)

scipy.stats.skew(y, bias=False)

1.9470432273905927

In [92]:
# Called this way, scipy.stats.skew() returns nan when the array contains NaN
scipy.stats.skew(y_with_nan, bias=False)

nan

In [93]:
y_with_nan

array([ 8. ,  1. ,  2.5,  nan,  4. , 28. ])

In [94]:
# Sample skewness of a pandas Series
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)

z.skew()

1.9470432273905924

In [95]:
# Series.skew() skips NaN: the result equals z.skew()
z_with_nan.skew()

1.9470432273905924

In [96]:
z_with_nan

0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64

In [97]:
z

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64

## Percentiles

In [99]:
# Split the data into intervals; n=2 yields a single cut point
x = [-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 21.0, 25.8, 41.0]
statistics.quantiles(x, n=2)

[8.0]

In [102]:
# Quartiles (n=4) using the 'inclusive' method
statistics.quantiles(x, n=4, method='inclusive')

[0.1, 8.0, 21.0]

In [101]:
# Quartiles (n=4) using the 'exclusive' method; the outer cut points differ from 'inclusive'
statistics.quantiles(x, n=4, method='exclusive')

[-0.5, 8.0, 23.4]

In [103]:
# Sample percentiles with NumPy: the 5th percentile
y = np.array(x)
np.percentile(y, 5)

-3.44

In [104]:
# 95th percentile
np.percentile(y, 95)

34.919999999999995

In [110]:
# 100th percentile: the largest value in y
np.percentile(y, 100)

41.0

In [111]:
# Several percentiles at once: pass a sequence to get an array back
np.percentile(y, [25, 50, 75])

array([ 0.1,  8. , 21. ])

In [112]:
# The median equals the 50th percentile computed above
np.median(y)

8.0

In [113]:
# Build a version of y with a missing value: insert np.nan at index 2
y_with_nan = np.insert(y, 2, np.nan)
y_with_nan

array([-5. , -1.1,  nan,  0.1,  2. ,  8. , 12.8, 21. , 25.8, 41. ])

In [114]:
# np.insert() returned a new array; y itself still has its nine original values
y

array([-5. , -1.1,  0.1,  2. ,  8. , 12.8, 21. , 25.8, 41. ])

In [115]:
# Same construction using math.nan as the missing-value marker
y_with_nan = np.insert(y, 2, math.nan)
y_with_nan

array([-5. , -1.1,  nan,  0.1,  2. ,  8. , 12.8, 21. , 25.8, 41. ])

In [116]:
# np.nanpercentile() ignores NaN: same quartiles as for y
np.nanpercentile(y_with_nan, [25, 50, 75])

array([ 0.1,  8. , 21. ])

In [117]:
# np.percentile() returns nan for every requested percentile when NaN is present
np.percentile(y_with_nan, [25, 50, 75])

array([nan, nan, nan])

In [118]:
# np.quantile() takes a fraction (0.05) where np.percentile() takes a percentage (5)
np.quantile(y, 0.05)

-3.44

In [119]:
np.percentile(y, 5)

-3.44

In [120]:
np.quantile(y, 0.95)

34.919999999999995

In [123]:
# Quartiles expressed as fractions
np.quantile(y, [0.25, 0.5, 0.75])

array([ 0.1,  8. , 21. ])

In [124]:
# np.quantile() returns nan for every requested quantile when NaN is present
np.quantile(y_with_nan, [0.25, 0.5, 0.75])

array([nan, nan, nan])

In [125]:
# np.nanquantile() ignores NaN
np.nanquantile(y_with_nan, [0.25, 0.5, 0.75])

array([ 0.1,  8. , 21. ])

In [126]:
# Quantiles of pandas Series objects
z, z_with_nan = pd.Series(y), pd.Series(y_with_nan)

z.quantile(0.05)

-3.44

In [127]:
z.quantile(0.95)

34.919999999999995

In [128]:
z.quantile([0.25, 0.5, 0.75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

In [130]:
# Series.quantile() skips NaN: same quartiles as for z
z_with_nan.quantile([0.25, 0.5, 0.75])

0.25     0.1
0.50     8.0
0.75    21.0
dtype: float64

## Ranges

In [131]:
# Range (maximum minus minimum) with NumPy's np.ptp()
np.ptp(y)

46.0

In [132]:
# np.ptp() gives the same range for the pandas Series
np.ptp(z)

46.0

In [133]:
# np.ptp() returns nan when the array contains NaN
np.ptp(y_with_nan)

nan

In [134]:
y_with_nan

array([-5. , -1.1,  nan,  0.1,  2. ,  8. , 12.8, 21. , 25.8, 41. ])

In [135]:
z_with_nan

0    -5.0
1    -1.1
2     NaN
3     0.1
4     2.0
5     8.0
6    12.8
7    21.0
8    25.8
9    41.0
dtype: float64

In [139]:
np.ptp(z_with_nan)

nan

In [137]:
np.ptp(y_with_nan)

nan

In [140]:
np.ptp(z)

46.0

In [141]:
np.ptp(z_with_nan)

nan

In [142]:
print(type(z_with_nan))

<class 'pandas.core.series.Series'>


In [154]:
# Range computed explicitly as maximum minus minimum
np.amax(y) - np.amin(y)

46.0

In [155]:
y

array([-5. , -1.1,  0.1,  2. ,  8. , 12.8, 21. , 25.8, 41. ])

In [156]:
# np.amax() / np.amin() propagate NaN, so the range of an array containing NaN is nan.
# Both calls operate on the same NumPy array so the maximum and minimum come from one
# data source (the minimum was previously taken from the pandas Series z_with_nan).
np.amax(y_with_nan) - np.amin(y_with_nan)

nan

In [161]:
len(y_with_nan)

10

In [162]:
len(y)

9

In [164]:
# np.nanmax() / np.nanmin() ignore NaN, so the range matches that of the NaN-free data.
# Both calls operate on the same NumPy array (the minimum was previously taken from
# the pandas Series z_with_nan).
np.nanmax(y_with_nan) - np.nanmin(y_with_nan)

46.0

In [165]:
y.max() - y.min()

46.0

In [166]:
z.max() - z.min()

46.0

In [167]:
# The pandas Series methods .max() / .min() skip NaN
z_with_nan.max() - z_with_nan.min()

46.0

In [168]:
# The ndarray methods .max() / .min() propagate NaN
y_with_nan.max() - y_with_nan.min()

nan

In [171]:
# Interquartile range of a NumPy array: third quartile minus first quartile
quartiles = np.quantile(y, [0.25, 0.75])
quartiles[1] - quartiles[0]

20.9

In [173]:
# Interquartile range of a pandas Series; the result is indexed by the quantile fractions
quartiles = z.quantile([0.25, 0.75])
quartiles[0.75] - quartiles[0.25]

20.9

## Summary of Descriptive Statistics

In [176]:
# One-call summary with SciPy; ddof=1 and bias=False are the same sample-statistic
# settings used for the variance and skewness cells above
result = scipy.stats.describe(y, ddof = 1, bias = False)
result

DescribeResult(nobs=9, minmax=(-5.0, 41.0), mean=11.622222222222222, variance=228.75194444444446, skewness=0.9249043136685094, kurtosis=0.14770623629658886)

In [177]:
# Individual values are accessed with dot notation
result.nobs

9

In [178]:
result.skewness

0.9249043136685094

In [179]:
result.minmax[0] #min

-5.0

In [180]:
result.minmax[1] #max

41.0

In [181]:
result.mean

11.622222222222222

In [182]:
result.variance

228.75194444444446

In [183]:
result.kurtosis

0.14770623629658886

In [184]:
# Summary of a pandas Series via .describe(); this rebinds result to a Series
result = z.describe()

In [185]:
result

count     9.000000
mean     11.622222
std      15.124548
min      -5.000000
25%       0.100000
50%       8.000000
75%      21.000000
max      41.000000
dtype: float64

In [186]:
# Individual statistics are accessed by their label
result['mean']

11.622222222222222

In [188]:
result['std']

15.12454774346805

In [189]:
result['min']

-5.0

In [190]:
result['max']

41.0

In [191]:
result['25%']

0.1

In [192]:
result['50%']

8.0

In [193]:
result['75%']

21.0

## Measures of Correlation Between Pairs of Data

In [195]:
# Two paired samples of equal length used for the covariance and correlation examples

x = list(range(-10, 11))
print(x)

y = [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]
print(y)

[-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]


In [196]:
# NumPy array versions of x and y
x_, y_ = np.array(x), np.array(y)

In [199]:
x_

array([-10,  -9,  -8,  -7,  -6,  -5,  -4,  -3,  -2,  -1,   0,   1,   2,
         3,   4,   5,   6,   7,   8,   9,  10])

In [198]:
# pandas Series versions of x and y
x__, y__ = pd.Series(x_), pd.Series(y_)

In [200]:
x__

0    -10
1     -9
2     -8
3     -7
4     -6
5     -5
6     -4
7     -3
8     -2
9     -1
10     0
11     1
12     2
13     3
14     4
15     5
16     6
17     7
18     8
19     9
20    10
dtype: int32

## Covariance

In [201]:
# Sample covariance in pure Python
n = len(x)
mean_x, mean_y = sum(x) / n, sum(y) / n

# Sum of the products of paired deviations, divided by (n - 1)
cov_xy = (sum((x[k] - mean_x) * (y[k] - mean_y) for k in range(n)) / (n - 1))

cov_xy

19.95

In [205]:
# Build the covariance matrix: variances on the diagonal, the covariance off the diagonal
cov_matrix = np.cov(x_, y_)
cov_matrix

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

In [206]:
# Sample variance of x: equals the upper-left entry of the covariance matrix
x_.var(ddof=1)

38.5

In [209]:
# Sample variance of y: equals the lower-right entry of the covariance matrix
y_.var(ddof=1)

13.914285714285711

In [210]:
# Covariance via the pandas Series method .cov()
cov_xy = x__.cov(y__)

In [211]:
cov_xy

19.95

In [212]:
# Swapping the two Series gives the same covariance
cov_xy = y__.cov(x__)

In [213]:
cov_xy

19.95

In [215]:
cov_xy = y__.cov(x__)

## Correlation Coefficient

In [217]:
# Correlation coefficient in pure Python: covariance divided by the product of the
# standard deviations (n, mean_x, mean_y and cov_xy come from the covariance cells above)
var_x = sum((item - mean_x)**2 for item in x) / (n-1)
var_y = sum((item - mean_y)**2 for item in y) / (n-1)
std_x, std_y = var_x ** 0.5, var_y ** 0.5
r = cov_xy / (std_x * std_y)
r

0.861950005631606

In [218]:
# Correlation coefficient and p-value via SciPy
r, p = scipy.stats.pearsonr(x_, y_)
r

0.8619500056316061

In [220]:
p

5.122760847201135e-07

In [221]:
# Build the correlation coefficient matrix: ones on the diagonal, r off the diagonal
corr_matrix = np.corrcoef(x_, y_)
corr_matrix

array([[1.        , 0.86195001],
       [0.86195001, 1.        ]])

In [222]:
# Correlation coefficient via SciPy's linear regression: it is the rvalue field of the result
scipy.stats.linregress(x_, y_)

LinregressResult(slope=0.5181818181818181, intercept=5.714285714285714, rvalue=0.861950005631606, pvalue=5.122760847201164e-07, stderr=0.06992387660074979, intercept_stderr=0.4234100995002589)

In [223]:
# Keep the regression result so its fields can be read by name
result = scipy.stats.linregress(x_, y_)

In [224]:
r = result.rvalue

In [225]:
r

0.861950005631606

In [226]:
# Correlation coefficient via the pandas Series method .corr()
r = x__.corr(y__)
r

0.8619500056316061

In [227]:
# Swapping the two Series gives the same coefficient (up to floating-point rounding)
r = y__.corr(x__)
r

0.861950005631606

## Working with 2D Data

In [232]:
# Fifteen values, created as a flat 1-D array
a = np.array([1, 1, 1, 2, 3, 1, 4, 9, 2, 8, 27, 4, 16, 1, 1])

In [233]:
# Arrange the 15 values as 5 rows x 3 columns. reshape() is used instead of assigning
# to the .shape attribute in place, which the NumPy documentation discourages.
a = a.reshape(5, 3)

In [234]:
a

array([[ 1,  1,  1],
       [ 2,  3,  1],
       [ 4,  9,  2],
       [ 8, 27,  4],
       [16,  1,  1]])

In [235]:
# Without an axis argument the mean is taken over all 15 elements
np.mean(a)

5.4

In [236]:
a.mean()

5.4

In [237]:
np.median(a)

2.0

In [240]:
a.var(ddof=1)

53.40000000000001

In [242]:
# axis=0: one mean per column
np.mean(a, axis = 0)

array([6.2, 8.2, 1.8])

In [243]:
# axis=1: one mean per row
np.mean(a, axis = 1)

array([ 1.,  2.,  5., 13.,  6.])

In [246]:
# axis=None: mean over all elements, same as omitting the argument
np.mean(a, axis = None)

5.4

In [247]:
# Mean of a slice: row 1, columns 0 and 1
np.mean(a[1, 0:2])

2.5

In [248]:
a.mean(axis=0)

array([6.2, 8.2, 1.8])

In [249]:
a.mean(axis=1)

array([ 1.,  2.,  5., 13.,  6.])

In [250]:
# axis=0: one median per column
np.median(a, axis = 0)

array([4., 3., 1.])

In [251]:
# axis=1: one median per row
np.median(a, axis = 1)

array([1., 2., 4., 8., 1.])

In [253]:
# Sample variance (ddof=1) of each column
a.var(axis = 0, ddof = 1)

array([ 37.2, 121.2,   1.7])

In [254]:
# Sample variance (ddof=1) of each row
a.var(axis = 1, ddof = 1)

array([  0.,   1.,  13., 151.,  75.])

In [256]:
scipy.stats.gmean(a) # default axis is 0: one geometric mean per column

array([4.        , 3.73719282, 1.51571657])

In [257]:
scipy.stats.gmean(a, axis = 0)

array([4.        , 3.73719282, 1.51571657])

In [258]:
# axis=1: one geometric mean per row
scipy.stats.gmean(a, axis = 1)

array([1.        , 1.81712059, 4.16016765, 9.52440631, 2.5198421 ])

In [259]:
# Summary statistics with SciPy; axis=None summarizes all 15 elements together

scipy.stats.describe(a, axis = None, ddof=1, bias = False)

DescribeResult(nobs=15, minmax=(1, 27), mean=5.4, variance=53.40000000000001, skewness=2.264965290423389, kurtosis=5.212690982795767)

In [261]:
# One summary per column (nobs=5 rows each)
scipy.stats.describe(a, ddof=1, bias = False) # default axis = 0

DescribeResult(nobs=5, minmax=(array([1, 1, 1]), array([16, 27,  4])), mean=array([6.2, 8.2, 1.8]), variance=array([ 37.2, 121.2,   1.7]), skewness=array([1.32531471, 1.79809454, 1.71439233]), kurtosis=array([1.30376344, 3.14969121, 2.66435986]))

In [262]:
# axis=1: one summary per row (nobs=3 columns each)
scipy.stats.describe(a, axis =1, ddof=1, bias = False)

DescribeResult(nobs=3, minmax=(array([1, 1, 2, 4, 1]), array([ 1,  3,  9, 27, 16])), mean=array([ 1.,  2.,  5., 13.,  6.]), variance=array([  0.,   1.,  13., 151.,  75.]), skewness=array([0.        , 0.        , 1.15206964, 1.52787436, 1.73205081]), kurtosis=array([-3. , -1.5, -1.5, -1.5, -1.5]))

In [263]:
# Keep the per-row summary so its fields can be read by name
result = scipy.stats.describe(a, axis = 1, ddof=1, bias=False)

In [264]:
result

DescribeResult(nobs=3, minmax=(array([1, 1, 2, 4, 1]), array([ 1,  3,  9, 27, 16])), mean=array([ 1.,  2.,  5., 13.,  6.]), variance=array([  0.,   1.,  13., 151.,  75.]), skewness=array([0.        , 0.        , 1.15206964, 1.52787436, 1.73205081]), kurtosis=array([-3. , -1.5, -1.5, -1.5, -1.5]))

In [265]:
result.mean

array([ 1.,  2.,  5., 13.,  6.])

## DataFrames

In [269]:
# Wrap the 5x3 array in a DataFrame with labelled rows and columns
row_names = ['first', 'second', 'third', 'fourth', 'fifth']
col_names = ['A', 'B', 'C']

df = pd.DataFrame(a, index=row_names, columns=col_names)
df

Unnamed: 0,A,B,C
first,1,1,1
second,2,3,1
third,4,9,2
fourth,8,27,4
fifth,16,1,1


In [270]:
# Without arguments, DataFrame.mean() returns one mean per column
df.mean()

A    6.2
B    8.2
C    1.8
dtype: float64

In [271]:
# axis=1: one mean per row
df.mean(axis=1)

first      1.0
second     2.0
third      5.0
fourth    13.0
fifth      6.0
dtype: float64

In [272]:
# Per-row variance; the values match a.var(axis = 1, ddof = 1) computed earlier
df.var(axis=1)

first       0.0
second      1.0
third      13.0
fourth    151.0
fifth      75.0
dtype: float64

In [274]:
df['A'].mean()

6.2

In [275]:
# A single column is a Series, so the Series methods apply
df['A'].var()

37.20000000000001

In [276]:
# The underlying data as a NumPy array, without the row and column labels
df.values

array([[ 1,  1,  1],
       [ 2,  3,  1],
       [ 4,  9,  2],
       [ 8, 27,  4],
       [16,  1,  1]])

In [277]:
df

Unnamed: 0,A,B,C
first,1,1,1
second,2,3,1
third,4,9,2
fourth,8,27,4
fifth,16,1,1


In [278]:
# .to_numpy() returns the same array as .values here
df.to_numpy()

array([[ 1,  1,  1],
       [ 2,  3,  1],
       [ 4,  9,  2],
       [ 8, 27,  4],
       [16,  1,  1]])

In [281]:
# .describe() returns a table of statistics; .at picks one statistic (row) for one column
df.describe().at['mean', 'A']

6.2

In [283]:
df['A'].mean()

6.2

In [285]:
df.describe().at['mean', 'A']

6.2

In [287]:
# The '50%' row of .describe() is the median of the column
df.describe().at['50%', 'B']

3.0

In [288]:
df['B'].median()

3.0