# 05 - Statistics Basics for Data Engineering

Build intuition for descriptive statistics used in data quality and monitoring.


## 1. Build a metrics dataset


In [1]:
import pandas as pd
import numpy as np

# Capacity used as the occupancy denominator. The original cell hard-coded 5
# here; presumably this is the property's total room count — confirm.
TOTAL_ROOMS = 5

# Twelve consecutive daily rows of hand-written sample metrics.
daily_metrics = pd.DataFrame({
    'date': pd.date_range('2026-01-01', periods=12, freq='D'),
    'revenue': [120, 140, 90, 160, 200, 180, 130, 150, 170, 110, 190, 210],
    'room_nights': [2, 3, 1, 4, 4, 3, 2, 3, 3, 2, 4, 5],
})
# Occupancy is room_nights as a fraction of capacity, kept to two decimals.
daily_metrics['occupancy_rate'] = (daily_metrics['room_nights'] / TOTAL_ROOMS).round(2)

# Sanity check: a rate outside [0, 1] would mean room_nights exceeds capacity.
assert daily_metrics['occupancy_rate'].between(0, 1).all(), 'occupancy_rate outside [0, 1]'

daily_metrics


Unnamed: 0,date,revenue,room_nights,occupancy_rate
0,2026-01-01,120,2,0.4
1,2026-01-02,140,3,0.6
2,2026-01-03,90,1,0.2
3,2026-01-04,160,4,0.8
4,2026-01-05,200,4,0.8
5,2026-01-06,180,3,0.6
6,2026-01-07,130,2,0.4
7,2026-01-08,150,3,0.6
8,2026-01-09,170,3,0.6
9,2026-01-10,110,2,0.4


## 2. Descriptive statistics


In [2]:
# describe() reports count, mean, std, min, quartiles and max for each
# numeric column selected below.
summary = (
    daily_metrics
    .loc[:, ['revenue', 'room_nights', 'occupancy_rate']]
    .describe()
)
summary


Unnamed: 0,revenue,room_nights,occupancy_rate
count,12.0,12.0,12.0
mean,154.166667,3.0,0.6
std,37.527767,1.128152,0.22563
min,90.0,1.0,0.2
25%,127.5,2.0,0.4
50%,155.0,3.0,0.6
75%,182.5,4.0,0.8
max,210.0,5.0,1.0


## 3. Central tendency and spread


In [3]:
# Headline statistics for daily revenue, computed in a single aggregation.
# Note: pandas' std is the sample standard deviation (ddof=1) by default.
revenue_mean, revenue_median, revenue_std = (
    daily_metrics['revenue'].agg(['mean', 'median', 'std'])
)

# One print call, one statistic per line.
print(
    f'Mean: {revenue_mean:.2f}',
    f'Median: {revenue_median:.2f}',
    f'Standard deviation: {revenue_std:.2f}',
    sep='\n',
)


Mean: 154.17
Median: 155.00
Standard deviation: 37.53


## 4. Percentiles and outliers (IQR)


In [4]:
# First and third quartiles of daily revenue; the IQR is their difference.
q1, q3 = daily_metrics['revenue'].quantile([0.25, 0.75])
iqr = q3 - q1

# Fences sit 1.5 IQRs beyond each quartile.
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Keep only the days whose revenue falls strictly outside the fences.
outliers = daily_metrics.loc[
    daily_metrics['revenue'].lt(lower_bound) | daily_metrics['revenue'].gt(upper_bound)
]
print('Outliers:', outliers, sep='\n')


Outliers:
Empty DataFrame
Columns: [date, revenue, room_nights, occupancy_rate]
Index: []


## 5. Z-scores (standardization)


In [5]:
# Standardise revenue: distance from the mean in units of standard deviation.
# Relies on revenue_mean and revenue_std computed in section 3.
daily_metrics['revenue_z'] = (
    daily_metrics['revenue']
    .sub(revenue_mean)
    .div(revenue_std)
)
daily_metrics.loc[:, ['date', 'revenue', 'revenue_z']]


Unnamed: 0,date,revenue,revenue_z
0,2026-01-01,120,-0.910437
1,2026-01-02,140,-0.377498
2,2026-01-03,90,-1.709845
3,2026-01-04,160,0.15544
4,2026-01-05,200,1.221318
5,2026-01-06,180,0.688379
6,2026-01-07,130,-0.643968
7,2026-01-08,150,-0.111029
8,2026-01-09,170,0.42191
9,2026-01-10,110,-1.176906


## 6. Correlation for monitoring


In [6]:
# Pairwise Pearson correlation between the monitored metrics.
# occupancy_rate is room_nights divided by a constant, so those two columns
# correlate at 1.0 by construction.
correlation = (
    daily_metrics
    .loc[:, ['revenue', 'room_nights', 'occupancy_rate']]
    .corr(method='pearson')
)
correlation


Unnamed: 0,revenue,room_nights,occupancy_rate
revenue,1.0,0.923327,0.923327
room_nights,0.923327,1.0,1.0
occupancy_rate,0.923327,1.0,1.0


## Next Steps

Continue your learning with:
- **06_statistics_advanced.ipynb** - Sampling, confidence intervals, experiments
- **07_data_quality_checks.ipynb** - Data profiling and validation
