# Einführung Statistik - Konfidenzintervalle

In [2]:
# Imports:
import pandas as pd
import seaborn as sns
import scipy.stats as st
import statsmodels.stats.api as sms

In [3]:
# Load the penguins data set and keep only complete rows, i.e. rows without a
# missing value in any column (assumption: values are missing completely at
# random, MCAR).
penguins_df = sns.load_dataset('penguins').dropna()
penguins_df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [5]:
# The column we want a confidence interval for:
penguins_df.loc[:, 'body_mass_g'].head(n=5)

0    3750.0
1    3800.0
2    3250.0
4    3450.0
5    3650.0
Name: body_mass_g, dtype: float64

In [9]:
# Sample size: number of rows in the cleaned data frame.
n = penguins_df.shape[0]
print(n)

333


In [7]:
# Sample mean of the body-mass column (point estimate, centre of the interval):
sample_mean = penguins_df.loc[:, 'body_mass_g'].mean()
print(sample_mean)

4207.057057057057


In [8]:
# Standard error of the mean for the body-mass column
# (ddof=1 is scipy's default and is spelled out here for clarity):
std_error = st.sem(penguins_df.loc[:, 'body_mass_g'], ddof=1)
print(std_error)

44.12555413389445


In [None]:
# Two-sided 95% interval: alpha = 0.05, so the quantile we need is 1 - alpha/2 = 0.975

In [11]:
# Critical value: the 0.975 quantile of Student's t distribution with n - 1
# degrees of freedom. It plays the same role here as the z-value does in the
# normal-distribution version of the interval.
t_val = st.t(df=n - 1).ppf(0.975)
print(t_val)

1.9671350567188735


In [12]:
# Lower end of the interval: mean minus (standard error x critical value).
lower_bound = sample_mean - std_error * t_val
print(lower_bound)

4120.256132623127


In [13]:
# Upper end of the interval: mean plus (standard error x critical value).
upper_bound = sample_mean + std_error * t_val
print(upper_bound)

4293.857981490987


In [14]:
# The whole procedure wrapped in a reusable function:
def mean_confidence_interval(series: pd.Series, confidence: float=0.95) -> tuple:
    """Return the two-sided t-based confidence interval for the mean of ``series``.

    The interval is ``mean +/- t * standard_error``, where ``t`` is the
    ``1 - alpha/2`` quantile of Student's t distribution with ``n - 1``
    degrees of freedom and ``alpha = 1 - confidence``.

    Missing values (NaN) are dropped first, so that the mean, the standard
    error and the degrees of freedom are all based on the same observations.

    Args:
        series: Numeric sample.
        confidence: Confidence level, strictly between 0 and 1
            (e.g. 0.95 for a 95% interval).

    Returns:
        A ``(lower_bound, upper_bound)`` tuple.

    Raises:
        ValueError: If ``confidence`` is not strictly between 0 and 1, or if
            fewer than two non-missing observations are available.
    """
    if not 0 < confidence < 1:
        raise ValueError(
            f"confidence must be strictly between 0 and 1, got {confidence!r}"
        )

    # Series.mean() skips NaN, whereas len() counts them and st.sem()
    # propagates them by default; dropping them once keeps all three
    # quantities consistent.
    observations = series.dropna()
    n = len(observations)
    if n < 2:
        raise ValueError(
            "at least two non-missing observations are required, "
            f"got {n}"
        )

    alpha = 1 - confidence
    mean = observations.mean()
    std_error = st.sem(observations)
    t_val = st.t.ppf(1 - alpha/2, n - 1)
    margin = t_val * std_error
    return mean - margin, mean + margin

In [15]:
# 95% confidence interval for the mean body mass, using the helper function:
mean_confidence_interval(series=penguins_df['body_mass_g'], confidence=0.95)

(np.float64(4120.256132623127), np.float64(4293.857981490987))

In [16]:
# Option 2: let scipy compute the interval. The t distribution is centred on
# the sample mean, scaled by the standard error and has n - 1 degrees of
# freedom; interval(0.95) returns its central 95% range.
st.t(df=n - 1, loc=sample_mean, scale=std_error).interval(0.95)

(np.float64(4120.256132623127), np.float64(4293.857981490987))

In [19]:
# Alternative: statsmodels. alpha=0.05 is the default of tconfint_mean and is
# written out here, so this is the 95% interval.
# NOTE(review): a saved output of roughly (4092.74, 4321.37) is the 99%
# interval from an earlier run with alpha=0.01 - re-run this cell to refresh it.
body_mass_stats = sms.DescrStatsW(penguins_df['body_mass_g'])
body_mass_stats.tconfint_mean(alpha=0.05)

(np.float64(4092.740160248099), np.float64(4321.373953866015))

In [None]:
# 99% confidence interval: alpha = 1 - 0.99 = 0.01.
body_mass_stats = sms.DescrStatsW(penguins_df['body_mass_g'])
body_mass_stats.tconfint_mean(alpha=0.01)