# Chapter 3: Descriptive and Inferential Statistics

## Descriptive Statistics
##### Example 3-1. Calculating mean in Python

In [1]:
sample = [1, 3, 2, 5, 7, 0, 2, 3]
mean = sum(sample) / len(sample)
print(mean)

2.875


##### Example 3-2. Calculating a weighted mean in Python

In [2]:
sample = [90, 80, 63, 87]
weights = [0.20, 0.20, 0.20, 0.40]

weighted_mean = sum(s * w for s,w in zip(sample, weights)) / sum(weights)
print(weighted_mean)

81.4


##### Example 3-3. Calculating a weighted mean in Python

In [3]:
sample = [90, 80, 63, 87]
weights = [1.0, 1.0, 1.0, 2.0]

weighted_mean = sum(s * w for s,w in zip(sample, weights)) / sum(weights)
print(weighted_mean)

81.4


##### Example 3-4. Calculating the median in Python

In [4]:
sample = [0, 1, 5, 7, 9, 10, 14]

def median(values):
    ordered = sorted(values)
    n = len(ordered)
    mid = int(n/2) - 1 if n%2 == 0 else int(n/2)
    
    if n%2 == 0:
        return (ordered[mid] + ordered[mid+1]) / 2
    else:
        return ordered[mid]
print(median(sample))

7


##### Example 3-5. Calculating mode in Python

In [5]:
from collections import defaultdict 
sample = [1, 3, 2, 5, 7, 0, 2, 3]

def mode(values):
    counts = defaultdict(lambda: 0)
    for s in values:
        counts[s] += 1
    max_count = max(counts.values())
    modes = [v for v in set(values) if counts[v] == max_count]
    return modes

print(mode(sample))

[2, 3]


##### Example 3-6. Calculating variance in Python

In [6]:
data = [0, 1, 5, 7, 9, 10, 14]

def variance(values):
    mean = sum(values) / len(values)
    _variance = sum((v - mean)**2 for v in values) / len(values)
    return _variance

print(variance(data))

21.387755102040813


##### Example 3-7. Calculating standard deviation in Python

In [7]:
from math import sqrt

data = [0, 1, 5, 7, 9, 10, 14]

def variance(values):
    mean = sum(values) / len(values)
    _variance = sum((v - mean)**2 for v in values) / len(values)
    return _variance

def std_dev(values):
    return sqrt(variance(values))

print(std_dev(data))

4.624689730353898


##### Example 3-8. Calculating standard deviation for a sample

In [8]:
data = [0, 1, 5, 7, 9, 10, 14]

def variance(values, is_sample: bool=False):
    mean = sum(values) / len(values)
    _variance = sum((v - mean)**2 for v in values) / (len(values) - (1 if is_sample else 0))
    return _variance

def std_dev(values, is_sample: bool=False):
    return sqrt(variance(values, is_sample))

print(f'VARIANCE = {variance(data, True)}')
print(f'STD DEV = {std_dev(data, True)}')

VARIANCE = 24.95238095238095
STD DEV = 4.99523582550223


##### Example 3-9. The normal distribution function in Python

In [9]:
# normal distribution, returns likelyhood
def normal_pdf(x: float, mean: float, std_dev: float) -> float:
    return(1.0 / (2.0 * math.pi * std_dev**2)**0.5) * math.exp(-1.0 * ((x-mean)**2 / (2.0*std_dev**2)))

##### Example 3-10. The normal distribution CDF in Python

In [10]:
from scipy.stats import norm

mean = 64.43
std_dev = 2.99

x = norm.cdf(64.43, mean, std_dev)
print(x)

0.5


##### Example 3-11. Getting a middle range probability using CDF

In [11]:
from scipy.stats import norm

mean = 64.43
std_dev = 2.99

x = norm.cdf(66, mean, std_dev) - norm.cdf(62, mean, std_dev)
print(x)

0.4920450147062894


##### Example 3-12. Using inverse CDF (called ppf()) in Python

In [12]:
x = norm.ppf(0.95, loc=64.43, scale=2.99)
print(x)

69.3481123445849


##### Example 3-13. Generating random numbers from a normal distribution

In [13]:
import random

for i in range(0, 1000):
    random_p = random.uniform(0.0, 1.0)
    random_weight = norm.ppf(random_p, loc=64.43, scale=2.99)
    print(random_weight)

62.39910470888278
66.0921183966826
65.45943902656828
63.510121728023115
66.32979534610415
60.7563507666232
64.21349954184448
65.30147872813014
61.54877652100183
66.58546861800053
67.06176767752174
67.62464955808359
59.18556486220335
68.2141277192889
62.16738585540882
64.36650239090947
62.671887728275976
63.76857896108007
64.84580886517941
64.962223842078
63.03957130282924
61.95085818118003
59.67489236095384
65.0323580619656
64.15734284630753
65.03859916207196
64.24769166022926
63.95470466527772
66.92757044293174
62.200975670455875
64.46795196228258
61.474577340216555
67.0348075994853
60.82059526929434
65.2859451545539
62.830295533223165
65.59897488966004
65.35373980016925
63.1624615041307
62.89851664466613
59.408919968894736
64.93554365060086
59.61767468001124
68.38072880178242
65.29747395709055
67.40705697079486
69.31040562944145
55.81752638049874
61.29722224802199
65.1147728052869
65.97590589798654
59.85276832053425
65.10606175053564
66.20958592945337
66.43586313677113
62.28250796916

##### Example 3-14. Turn Z-scores in x-values and vice versa

In [16]:
mean = 140000
std = 3000
x = 150000

def z_score(x, mean, std):
    return (x - mean) / std

def z_to_x(z, mean, std):
    return (z * std) + mean

# Convert to Z-score and then back to x
z = z_score(x, mean, std_dev)
back_to_x = z_to_x(z, mean, std)

print(f'Z-Score = {z}')
print(f'Back to x = {back_to_x}')

Z-Score = 3344.4816053511704
Back to x = 10173444.816053512


##### Example 3-15. Exploring the central limit theorem in Python

In [14]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe'

sample_size = 31
sample_count = 1000

# Central Limit Theorem, 1000 samples each with 31 random numbers between 0 and 1
x_values = [(sum([random.uniform(0.0, 1.0) for i in range(sample_size)]) / sample_size) for _ in range(sample_count)]
y_values = [1 for _ in range(sample_count)]

px.histogram(x=x_values, y=y_values, nbins=20).show()

##### Example 3-16. Retrieving a critical z-value

In [15]:
def critical_z_value(p):
    norm_dist = norm(loc=0.0, scale=1.0)
    left_tail_area = (1.0 - p) / 2.0
    upper_area = 1.0 - ((1.0 - p) / 2.0)
    return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
print(critical_z_value(p=0.95))

(-1.959963984540054, 1.959963984540054)


##### Example 3-17. Calculating a confidence interval in Python

In [16]:
def critical_z_value(p):
    norm_dist = norm(loc=0.0, scale=1.0)
    left_tail_area = (1.0 - p) / 2.0
    upper_area = 1.0 - ((1.0 - p) / 2.0)
    return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)

def confidence_interval(p, sample_mean, sample_std, n):
    lower, upper = critical_z_value(p)
    lower_ci = lower * (sample_std / sqrt(n))
    upper_ci = upper * (sample_std / sqrt(n))
    return sample_mean + lower_ci, sample_mean + upper_ci

print(confidence_interval(p=0.95, sample_mean=64.408, sample_std=2.05, n=31))

(63.68635915701992, 65.12964084298008)


##### Example 3-18. Calculating probability of recovery between 15 and 21 days

In [17]:
# Cold has a 18 day mean recovery, 1.5 std dev
mean = 18
std_dev = 1.5

# 95% probability recovery time takes between 15 and 21 days
x = norm.cdf(21, mean, std_dev) - norm.cdf(15, mean, std_dev)

print(x)

0.9544997361036416


##### Example 3-19. Python code for getting x-value with 5% of area behind it

In [18]:
# Cold has a 18 day mean recovery, 1.5 std dev
mean = 18
std_dev = 1.5

# What x-value has 5% of area behind it?
x = norm.ppf(0.05, mean, std_dev)

print(x)

15.53271955957279


##### Example 3-20. Calculating the one-tailed p-value

In [19]:
# Cold has a 18 day mean recovery, 1.5 std dev
mean = 18
std_dev = 1.5

# Probability of 16 or less days
p_value = norm.cdf(16, mean, std_dev)

print(p_value)

0.09121121972586788


##### Example 3-21. Calculatin a range for a statistical significance of 5%

In [20]:
# Cold has a 18 day mean recovery, 1.5 std dev
mean = 18
std_dev = 1.5

# What x-value has 2.5% of area behind it?
x1 = norm.ppf(0.025, mean, std_dev)

# What x-value has 97.5% of area behind it?
x2 = norm.ppf(0.975, mean, std_dev)

print(x1)
print(x2)

15.060054023189918
20.93994597681008


##### Example 3-22. Calculating the two-tailed p-value

In [21]:
# Cold has a 18 day mean recovery, 1.5 std dev
mean = 18
std_dev = 1.5

# Probability of 16 or less days
p1 = norm.cdf(16, mean, std_dev)

# probability of 20 days or more
p2 = 1.0 - norm.cdf(20, mean, std_dev)

# p-value of both tails
p_value = p1 + p2

print(p_value)

0.18242243945173575


## The T-Distribution: Dealing with Small Samples
##### Example 3-23. Getting a critical value range with a T-distribution

In [22]:
from scipy.stats import t

# Get critical value range for 95% confidence with smaple size 25
n = 25
lower = t.ppf(0.025, df=n-1)
upper = t.ppf(0.975, df=n-1)

print(lower, upper)

-2.063898561628021 2.063898561628021
