# Sesi 9

Descriptive Statistics

In [5]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [2]:
# Base sample, plus a second list with one missing value (NaN) inserted at index 3.
x = [8.0, 1, 2.5, 4, 28.0]
xWithNan = x[:3] + [math.nan] + x[3:]

x

[8.0, 1, 2.5, 4, 28.0]

In [3]:
# Show the list that contains the NaN entry.
xWithNan

[8.0, 1, 2.5, nan, 4, 28.0]

In [6]:
# NumPy arrays built from the two plain lists.
y = np.array(x)
yWithNan = np.array(xWithNan)
# pandas Series built from the same two lists.
z = pd.Series(x)
zWithNan = pd.Series(xWithNan)

In [7]:
# Show the NumPy array version of the sample.
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [8]:
# Show the NumPy array that contains the NaN entry.
yWithNan

array([ 8. ,  1. ,  2.5,  nan,  4. , 28. ])

In [9]:
# Show the pandas Series version of the sample.
z

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64

In [11]:
# Show the pandas Series that contains the NaN entry.
zWithNan

0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64

## Measures of Central Tendency

### Mean

In [12]:
# Arithmetic mean in plain Python:
# sum() adds up the values and len() counts them.
sum(x)/len(x)

8.7

In [13]:
# Mean via the statistics module, applied to the list that contains NaN.
mean = statistics.mean(xWithNan)

In [14]:
# statistics.mean did not skip the NaN, so the stored result is nan.
mean

nan

In [15]:
# Mean via NumPy.
np.mean(y)

8.7

In [16]:
# np.mean does not skip NaN: a single NaN makes the whole result nan.
np.mean(yWithNan)

nan

In [17]:
# np.nanmean ignores NaN / missing values when averaging.
np.nanmean(yWithNan)

8.7

In [18]:
# Mean via pandas, which skips missing values automatically.
zWithNan.mean()

8.7

### Weighted Mean

In [20]:
# One weight per observation in the five-value sample.
w = [0.1, 0.2, 0.3, 0.25, 0.15]

In [21]:
# Rebind w from a plain list to a NumPy array.
w= np.array(w)

In [22]:
# Weighted mean via NumPy.
np.average(y, weights=w)

6.95

In [23]:
# np.average accepts a pandas Series as well and gives the same weighted mean.
np.average(z, weights=w)

6.95

### Harmonic Mean

In [24]:
# Harmonic mean in plain Python: the number of values divided by the
# sum of their reciprocals.
hmean = len(x) / sum(1 / value for value in x)

In [25]:
# Show the harmonic mean computed by hand.
hmean

2.7613412228796843

In [26]:
# Harmonic mean via the statistics module.
statistics.harmonic_mean(x)

2.7613412228796843

In [27]:
# Harmonic mean via SciPy.
scipy.stats.hmean(y)

2.7613412228796843

### Geometric Mean

In [29]:
# Geometric mean via SciPy. The standard library offers the same statistic as
# statistics.geometric_mean (Python 3.8+), so SciPy is not the only option.
scipy.stats.gmean(y)

4.67788567485604

## Median

In [31]:
# Median via the statistics module. For an even number of values:
#   median_low  -> the smaller of the two middle values
#   median_high -> the larger of the two middle values
#   median      -> the average of the two middle values


In [33]:
# Drop the last element to get a sample with an even number of values.
x[:-1]

[8.0, 1, 2.5, 4]

In [34]:
# Smaller of the two middle values of the even-sized sample.
statistics.median_low(x[:-1])

2.5

In [35]:
# Larger of the two middle values of the even-sized sample.
statistics.median_high(x[:-1])

4

In [36]:
# Average of the two middle values of the even-sized sample.
statistics.median(x[:-1])

3.25

In [37]:
# The statistics module does not treat NaN as a missing value: it is sorted like
# any other element. Every comparison with NaN is False, so where it ends up
# after sorting (and therefore this result) depends on the input order and is
# not reliable. Strip NaN values before calling the median functions.

statistics.median_low(xWithNan)

4

In [38]:
# Median via NumPy.
np.median(y)

4.0

In [39]:
# np.median does not skip NaN, so the result is nan (np.nanmedian would skip it).
np.median(yWithNan)

nan

## Mode

In [40]:
# Two small integer samples: u has one most-common value,
# while in v both 12 and 15 occur three times.
u = [2, 3, 2, 8, 12]
v = [12, 15, 12, 15, 21, 15, 12]

In [41]:
# Mode via the statistics module: the single most common value.
statistics.mode(u)

2

In [42]:
# v has two modes (12 and 15, three occurrences each), but statistics.mode
# returns only one of them. statistics.multimode(v) would list every mode.
statistics.mode(v)

12

In [43]:
# Rebind u and v from plain lists to NumPy arrays (same names, new type).
u = np.array(u)
v = np.array(v)

In [44]:
# Mode via SciPy: the result reports both the modal value and its count.
scipy.stats.mode(u)

ModeResult(mode=array([2]), count=array([2]))

In [50]:
# v has two modes (12 and 15); SciPy reports only one of them, with its count.
scipy.stats.mode(v)

ModeResult(mode=array([12]), count=array([3]))

In [45]:
# Rebind u and v once more, this time to pandas Series.
u = pd.Series(u)
v = pd.Series(v)

In [46]:
# Mode via pandas: the result is a Series of modal values.
u.mode()

0    2
dtype: int32

In [51]:
# For the two-mode sample pandas returns both modal values (12 and 15).
v.mode()

0    12
1    15
dtype: int32

## Measures of Variability

In [53]:
# Sample variance (n - 1 in the denominator) via the statistics module.
statistics.variance(x)

123.2

In [54]:
# ddof=1 makes NumPy divide by n - 1, i.e. compute the sample variance.
np.var(y, ddof=1)

123.19999999999999

## Standard Deviation

In [55]:
# Sample standard deviation via the statistics module.
statistics.stdev(x)

11.099549540409287

In [56]:
# Sample standard deviation via NumPy (ddof=1 divides by n - 1).
np.std(y, ddof=1)

11.099549540409285

In [57]:
# The pandas way of writing it: a method call on the Series.
z.std(ddof=1)

11.099549540409285

## Skewness

In [58]:
# Sample skewness via SciPy; bias=False applies the statistical bias correction.
scipy.stats.skew(y, bias=False)

1.9470432273905927

In [60]:
# Skewness via pandas.
z.skew()

1.9470432273905924

## Percentiles

In [61]:
# New nine-value sample for the percentile examples.
# Note: this rebinds x, replacing the five-value list used in the earlier sections.
x = [-5.0, -1.1, 0.1, 0.2, 8.0, 12.8, 21.0, 25.8, 41.0]

In [62]:
# n=2 splits the data into two equal groups, so the single cut point is the median.
statistics.quantiles(x, n=2)

[8.0]

In [63]:
# Rebind y to a NumPy array of the new sample.
y =np.array(x)

In [67]:
# The 50th percentile is the median.
np.percentile(y, 50)

8.0

In [68]:
# Several percentiles in one call: the three quartiles.
np.percentile(y,[25,50,75])

array([ 0.1,  8. , 21. ])

In [69]:
# Percentile functions take their argument on a 0-100 scale; nanpercentile skips NaN.
# Note: yWithNan still holds the original five-value sample (plus NaN), not the
# nine-value list assigned to x in this section.
np.nanpercentile(yWithNan, [25,75])

array([2.5, 8. ])

In [70]:
# Quantile functions take their argument on a 0-1 scale.
np.quantile(y, 0.5)

8.0

In [71]:
# Rebind z to a pandas Series of the new sample.
z=pd.Series(y)

In [72]:
# 0.95 quantile via pandas; the result is interpolated between data points.
z.quantile(0.95)

34.919999999999995

## Ranges

In [74]:
# Range of the data: peak to peak (ptp) is the maximum minus the minimum.
np.ptp(y)

46.0

## Summary of Descriptive Statistics

In [75]:
# One-call summary via SciPy: count, min/max, mean, variance, skewness, kurtosis.
scipy.stats.describe(y, ddof=1, bias=False)

DescribeResult(nobs=9, minmax=(-5.0, 41.0), mean=11.422222222222222, variance=233.44194444444446, skewness=0.9206597142483607, kurtosis=0.07966042430381837)

In [76]:
# One-call summary via pandas: count, mean, std, min, quartiles, max.
z.describe()

count     9.000000
mean     11.422222
std      15.278807
min      -5.000000
25%       0.100000
50%       8.000000
75%      21.000000
max      41.000000
dtype: float64

## Measures of Correlation

In [77]:
# The integers from -10 to 10 inclusive (rebinds x for the correlation examples).
x = [*range(-10, 11)]

In [78]:
# Second variable for the correlation examples, one value per element of x.
y = [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]

In [79]:
# NumPy array versions of the two variables.
x_ = np.array(x)
y_ = np.array(y)
# pandas Series versions, built from the arrays.
x__ = pd.Series(x_)
y__ = pd.Series(y_)

### Covariance

In [80]:
# 2x2 covariance matrix: the variances of x and y on the diagonal,
# their covariance in the off-diagonal entries.
np.cov(x_,y_)

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

In [81]:
# pandas returns just the scalar covariance between the two Series.
x__.cov(y__)

19.95

### Correlation Coefficient

In [82]:
# Pearson correlation coefficient together with its p-value.
scipy.stats.pearsonr(x_,y_)

(0.861950005631606, 5.122760847201171e-07)

In [83]:
# Correlation matrix: ones on the diagonal, the Pearson coefficient off the diagonal.
np.corrcoef(x_,y_)

array([[1.        , 0.86195001],
       [0.86195001, 1.        ]])