<a href="https://colab.research.google.com/github/strzelnat/Statistics_basics/blob/main/lessons/basics/lesson9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install scipy



In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import math
from scipy.stats import skewnorm

# **Confidence interval for a population propotion**

In [31]:
p = 30 / 200
n = 200

z_critical = stats.norm.ppf(0.975)
margin_of_error = z_critical * math.sqrt((p*(1-p))/n)

intervals = (p - margin_of_error, p + margin_of_error)
print(f'We are 90% confident that the ratio of population falls into this interval: {intervals[0]:.5f} and {intervals[1]:.5f}')

We are 90% confident that the ratio of population falls into this interval: 0.10051 and 0.19949


In [32]:
#using fuction
p = 30 / 200
n = 200

confidence_interval = stats.norm.interval(confidence = 0.95, loc = p, scale = math.sqrt((p*(1-p))/n))
print(f'We are 90% confident that the ratio of population falls into this interval: {confidence_interval[0]:.5f} and {confidence_interval[1]:.5f}')

We are 90% confident that the ratio of population falls into this interval: 0.10051 and 0.19949


# **Confidence interval for a paired samples**

$$
\bar{d} \pm t_{n-1, \alpha/2} \cdot \frac{S_d}{\sqrt{n}}
$$


In [53]:
size = 30
results_before = stats.norm.rvs(size=size, loc=14, scale=1)
change = stats.norm.rvs(size=size, loc=0.2, scale=0.5)
results_after = [x + y for x, y in zip(results_before, change)]

df = pd.DataFrame(list(zip(results_before, results_after)), columns=['before', 'after'])
df['diff'] = df['before'] - df['after']

print(f"Before: {df['before'].mean():.3f},  {df['before'].std():.3f}")
print(f"After: {df['after'].mean():.3f}, {df['after'].std():.3f}")


alpha = 0.1
alpha2 = alpha / 2
sample_size = len(df)
sample_mean = df['diff'].mean()
sample_std = df['diff'].std()
confidence_level = 1 - alpha

t_critical = stats.t.ppf(1 - alpha2, df=sample_size - 1)
print(f'T-critical = {t_critical:.3f}')

margin_of_error =  t_critical * sample_std / math.sqrt(sample_size)
confidence_interval = (sample_mean - margin_of_error, sample_mean + margin_of_error)

print(f'We are {confidence_level * 100:.1f}% sure that the effect in population will be in interval {confidence_interval[0]:.5f} and {confidence_interval[1]:.5f}')


Before: 13.762, 1.000
After: 13.850, 1.067
T-critical = 1.699
We are 90.0% sure that the effect in population will be in interval -0.26671 and 0.09054


In [58]:
#using function
confidence_interval = stats.t.interval(confidence = confidence_level, loc = sample_mean, df = sample_size - 1, scale = sample_std / math.sqrt(sample_size))
confidence_interval
print(f'We are {confidence_level * 100:.1f}% sure that the effect in population will be in interval {confidence_interval[0]:.5f} and {confidence_interval[1]:.5f}')

We are 90.0% sure that the effect in population will be in interval -0.26671 and 0.09054


# **Confidence interval for two independent samples**

\begin{aligned}
s_p^2 &= \frac{(n_1 - 1)s_1^2 + (n_2 - 1)s_2^2}{n_1 + n_2 - 2} \\
SE &= \sqrt{ s_p^2 \left( \frac{1}{n_1} + \frac{1}{n_2} \right) } \\
CI &= (\bar{X}_1 - \bar{X}_2) \pm t_{\alpha/2} \cdot SE
\end{aligned}


In [67]:
#two independent values and variance is known for populations (sometimes: well known) -> variances have to be similar!

doctors = stats.norm.rvs(size = 70, loc = 55, scale = 10)
therapists = stats.norm.rvs(size = 40, loc = 75, scale = 10)

mean_doc = doctors.mean()
std_doc = doctors.std()
ther_mean = therapists.mean()
ther_std = therapists.std()
count_doc = len(doctors)
count_ther = len(therapists)

print(min(doctors), max(doctors))
print(min(therapists), max(therapists))

alpha = 0.05
alpha2 = alpha / 2
confidence_level = 1 - alpha
z_critical = stats.norm.ppf(q = 1 - alpha2)

margin_of_error = z_critical * math.sqrt((std_doc**2 / count_doc) + (ther_std ** 2 / count_ther))
confidence_interval = ((ther_mean - mean_doc) - margin_of_error, (ther_mean - mean_doc) + margin_of_error)

print(f'We are {confidence_level * 100}% sure that the difference between results of therapists and doctors falls into inerval {confidence_interval[0]:.2f} and {confidence_interval[1]:.2f}')

28.578781697369898 77.85751676938938
54.781528879336655 89.34427608494711
We are 90.0% sure that the difference between results of therapists and doctors falls into inerval 11.81 and 18.05


In [70]:
#two independent values and variance is unknown for populations (sometimes: well known) -> variances have to be similar!


mean_doc = doctors.mean()
std_doc = doctors.std()
ther_mean = therapists.mean()
ther_std = therapists.std()
count_doc = len(doctors)
count_ther = len(therapists)

alpha = 0.5
alpha2 = alpha / 2
confidence_level = 1 - alpha
t_critical = stats.t.ppf(q = 1 - alpha2, df = count_doc + count_ther - 2)
var_p = (((count_doc - 1) *  std_doc ** 2 )  +  ((count_ther - 1) *  ther_std ** 2 ) ) / (count_doc + count_ther - 2)
margin_of_error = t_critical * math.sqrt((var_p/ count_doc) + (var_p / count_ther))
confidence_interval = (((mean_doc - ther_mean) - margin_of_error), ((mean_doc - ther_mean) + margin_of_error))
print(f'We are {confidence_level * 100}% sure that the difference between results of therapists and doctors falls into inerval {confidence_interval[0]:.2f} and {confidence_interval[1]:.2f}')

We are 50.0% sure that the difference between results of therapists and doctors falls into inerval -16.29 and -13.57
