# Statistical Operations

**Module 03 | Notebook 02**

---

## Objective
By the end of this notebook, you will master:
- Measures of central tendency (mean, median, mode)
- Measures of dispersion (variance, std, range)
- Percentiles and quantiles
- Correlation and covariance
- Histogram and binning

In [2]:
import numpy as np
np.set_printoptions(precision=3)
np.random.seed(42)

---
## 1. Measures of Central Tendency

In [3]:
data = np.array([12, 15, 18, 22, 25, 28, 30, 35, 40, 100])
print(f"Data: {data}")

Data: [ 12  15  18  22  25  28  30  35  40 100]


In [4]:
# Mean (average)
mean = np.mean(data)
print(f"Mean: {mean}")
print(f"Method syntax: {data.mean()}")

Mean: 32.5
Method syntax: 32.5


In [5]:
# Median (middle value - robust to outliers)
median = np.median(data)
print(f"Median: {median}")

# Notice: median is 26.5, mean is 32.5 (affected by outlier 100)

Median: 26.5


In [6]:
# Weighted average
values = np.array([70, 80, 90])
weights = np.array([0.3, 0.3, 0.4])  # Must sum to 1 for percentage weights

weighted_avg = np.average(values, weights=weights)
print(f"Weighted average: {weighted_avg}")

Weighted average: 81.0


In [7]:
# 2D with axis
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(f"Array:\n{arr2d}")

print(f"Mean (all): {np.mean(arr2d)}")
print(f"Mean (axis=0, columns): {np.mean(arr2d, axis=0)}")
print(f"Mean (axis=1, rows): {np.mean(arr2d, axis=1)}")

Array:
[[1 2 3]
 [4 5 6]
 [7 8 9]]
Mean (all): 5.0
Mean (axis=0, columns): [4. 5. 6.]
Mean (axis=1, rows): [2. 5. 8.]


---
## 2. Measures of Dispersion

In [8]:
data = np.array([10, 12, 14, 16, 18, 20])
print(f"Data: {data}")

Data: [10 12 14 16 18 20]


In [9]:
# Variance
# Population variance (default)
var_pop = np.var(data)
print(f"Population variance: {var_pop}")

# Sample variance (ddof=1 for Bessel's correction)
var_sample = np.var(data, ddof=1)
print(f"Sample variance (ddof=1): {var_sample}")

Population variance: 11.666666666666666
Sample variance (ddof=1): 14.0


In [10]:
# Standard deviation
std_pop = np.std(data)
std_sample = np.std(data, ddof=1)

print(f"Population std: {std_pop}")
print(f"Sample std (ddof=1): {std_sample}")

Population std: 3.415650255319866
Sample std (ddof=1): 3.7416573867739413


In [11]:
# Range
data_range = np.ptp(data)  # peak-to-peak
print(f"Range: {data_range}")
print(f"Manual: {data.max() - data.min()}")

Range: 10
Manual: 10


In [12]:
# Interquartile range (IQR)
q1 = np.percentile(data, 25)
q3 = np.percentile(data, 75)
iqr = q3 - q1

print(f"Q1 (25th percentile): {q1}")
print(f"Q3 (75th percentile): {q3}")
print(f"IQR: {iqr}")

Q1 (25th percentile): 12.5
Q3 (75th percentile): 17.5
IQR: 5.0


---
## 3. Percentiles and Quantiles

In [13]:
data = np.random.normal(100, 15, 1000)  # Mean=100, Std=15, 1000 samples
print(f"Data shape: {data.shape}")
print(f"Data range: [{data.min():.1f}, {data.max():.1f}]")

Data shape: (1000,)
Data range: [51.4, 157.8]


In [14]:
# Percentiles (0-100 scale)
p10 = np.percentile(data, 10)
p50 = np.percentile(data, 50)  # Same as median
p90 = np.percentile(data, 90)

print(f"10th percentile: {p10:.2f}")
print(f"50th percentile (median): {p50:.2f}")
print(f"90th percentile: {p90:.2f}")

10th percentile: 81.33
50th percentile (median): 100.38
90th percentile: 119.58


In [15]:
# Multiple percentiles at once
percentiles = np.percentile(data, [25, 50, 75])
print(f"Quartiles: {percentiles}")

Quartiles: [ 90.286 100.38  109.719]


In [16]:
# Quantiles (0-1 scale)
q = np.quantile(data, [0.25, 0.5, 0.75])
print(f"Quantiles: {q}")

# Same as percentile but with 0-1 scale

Quantiles: [ 90.286 100.38  109.719]


In [17]:
# nanpercentile - handles NaN values
data_with_nan = np.array([1, 2, np.nan, 4, 5])

# Regular percentile with NaN
print(f"percentile with NaN: {np.percentile(data_with_nan, 50)}")

# nanpercentile ignores NaN
print(f"nanpercentile: {np.nanpercentile(data_with_nan, 50)}")

percentile with NaN: nan
nanpercentile: 3.0


---
## 4. Handling NaN Values

In [18]:
data = np.array([1, 2, np.nan, 4, 5, np.nan, 7])
print(f"Data: {data}")

Data: [ 1.  2. nan  4.  5. nan  7.]


In [19]:
# Regular functions propagate NaN
print(f"mean: {np.mean(data)}")
print(f"sum: {np.sum(data)}")

mean: nan
sum: nan


In [20]:
# nan-aware functions ignore NaN
print(f"nanmean: {np.nanmean(data)}")
print(f"nansum: {np.nansum(data)}")
print(f"nanstd: {np.nanstd(data)}")
print(f"nanmin: {np.nanmin(data)}")
print(f"nanmax: {np.nanmax(data)}")

nanmean: 3.8
nansum: 19.0
nanstd: 2.1354156504062622
nanmin: 1.0
nanmax: 7.0


In [21]:
# Count non-NaN values
count_valid = np.count_nonzero(~np.isnan(data))
print(f"Valid count: {count_valid}")

Valid count: 5


---
## 5. Correlation and Covariance

In [22]:
# Generate correlated data
np.random.seed(42)
x = np.random.randn(100)
y = 2 * x + np.random.randn(100) * 0.5  # y is correlated with x
z = np.random.randn(100)  # z is independent

print(f"x shape: {x.shape}")
print(f"y shape: {y.shape}")

x shape: (100,)
y shape: (100,)


In [23]:
# Correlation coefficient
# Returns correlation matrix
corr_xy = np.corrcoef(x, y)
print(f"Correlation matrix (x, y):\n{corr_xy}")
print(f"Correlation coefficient: {corr_xy[0, 1]:.4f}")

Correlation matrix (x, y):
[[1.    0.965]
 [0.965 1.   ]]
Correlation coefficient: 0.9655


In [24]:
# Compare correlations
print(f"Corr(x, y): {np.corrcoef(x, y)[0, 1]:.4f}")
print(f"Corr(x, z): {np.corrcoef(x, z)[0, 1]:.4f}")

Corr(x, y): 0.9655
Corr(x, z): 0.1908


In [25]:
# Covariance
# cov returns covariance matrix
cov_xy = np.cov(x, y)
print(f"Covariance matrix:\n{cov_xy}")
print(f"Cov(x, y): {cov_xy[0, 1]:.4f}")

Covariance matrix:
[[0.825 1.59 ]
 [1.59  3.29 ]]
Cov(x, y): 1.5905


In [26]:
# Multiple variables
data = np.vstack([x, y, z])
print(f"Data shape: {data.shape}")

# Full correlation matrix
corr_matrix = np.corrcoef(data)
print(f"Correlation matrix:\n{corr_matrix}")

Data shape: (3, 100)
Correlation matrix:
[[1.    0.965 0.191]
 [0.965 1.    0.181]
 [0.191 0.181 1.   ]]


---
## 6. Histogram and Binning

In [27]:
data = np.random.normal(0, 1, 1000)

In [28]:
# np.histogram - compute histogram
counts, bin_edges = np.histogram(data, bins=10)

print(f"Counts: {counts}")
print(f"Bin edges: {bin_edges}")
print(f"Number of bins: {len(counts)}, edges: {len(bin_edges)}")

Counts: [  8  25  88 180 219 231 139  78  25   7]
Bin edges: [-2.896 -2.299 -1.701 -1.104 -0.506  0.091  0.689  1.286  1.884  2.481
  3.079]
Number of bins: 10, edges: 11


In [29]:
# Custom bins
custom_bins = [-3, -2, -1, 0, 1, 2, 3]
counts, edges = np.histogram(data, bins=custom_bins)
print(f"Custom bin counts: {counts}")

Custom bin counts: [ 19 126 335 349 142  28]


In [30]:
# Normalized histogram (density)
counts_density, edges = np.histogram(data, bins=10, density=True)
print(f"Density (sums to integral of 1): {counts_density}")

Density (sums to integral of 1): [0.013 0.042 0.147 0.301 0.367 0.387 0.233 0.131 0.042 0.012]


In [31]:
# np.bincount - count occurrences of integers
int_data = np.array([0, 1, 1, 2, 2, 2, 3, 3, 3, 3])
counts = np.bincount(int_data)
print(f"Data: {int_data}")
print(f"Bincount: {counts}")  # [1, 2, 3, 4] - count of 0, 1, 2, 3

Data: [0 1 1 2 2 2 3 3 3 3]
Bincount: [1 2 3 4]


In [32]:
# np.digitize - bin indices
data = np.array([0.5, 1.5, 2.5, 3.5, 4.5])
bins = np.array([1, 2, 3, 4])

bin_indices = np.digitize(data, bins)
print(f"Data: {data}")
print(f"Bins: {bins}")
print(f"Bin indices: {bin_indices}")

Data: [0.5 1.5 2.5 3.5 4.5]
Bins: [1 2 3 4]
Bin indices: [0 1 2 3 4]


---
## 7. Summary Statistics

In [33]:
def describe(arr):
    """Generate summary statistics for an array."""
    return {
        'count': len(arr),
        'mean': np.mean(arr),
        'std': np.std(arr),
        'min': np.min(arr),
        '25%': np.percentile(arr, 25),
        '50%': np.percentile(arr, 50),
        '75%': np.percentile(arr, 75),
        'max': np.max(arr)
    }

data = np.random.normal(100, 15, 1000)
stats = describe(data)

for key, value in stats.items():
    print(f"{key}: {value:.2f}")

count: 1000.00
mean: 100.39
std: 14.78
min: 54.71
25%: 90.72
50%: 100.10
75%: 110.17
max: 147.90


---
## Key Points Summary

**Central Tendency:**
- `mean`: Average, sensitive to outliers
- `median`: Middle value, robust to outliers
- `average`: Weighted average

**Dispersion:**
- `var`: Variance (ddof=0 population, ddof=1 sample)
- `std`: Standard deviation
- `ptp`: Peak-to-peak (range)

**Percentiles:**
- `percentile`: 0-100 scale
- `quantile`: 0-1 scale
- Use `nanpercentile` for data with NaN

**Correlation:**
- `corrcoef`: Correlation matrix (-1 to 1)
- `cov`: Covariance matrix

---
## Interview Tips

**Q1: Difference between population and sample variance?**
> - Population: Divide by N (`ddof=0`)
> - Sample: Divide by N-1 (`ddof=1`) - Bessel's correction
> - Use sample when data is a subset of larger population

**Q2: When to use median vs mean?**
> - Mean: Symmetric distributions, no extreme outliers
> - Median: Skewed data, presence of outliers
> - Example: Income data uses median (skewed by billionaires)

**Q3: How to detect outliers using IQR?**
> - Calculate IQR = Q3 - Q1
> - Lower bound = Q1 - 1.5 * IQR
> - Upper bound = Q3 + 1.5 * IQR
> - Points outside bounds are outliers

**Q4: Correlation vs Covariance?**
> - Covariance: Measures linear relationship, units depend on variables
> - Correlation: Normalized covariance, always -1 to 1, dimensionless

---
## Practice Exercises

### Exercise 1: Calculate Z-scores

In [34]:
# Standardize data to have mean=0 and std=1 (Z-score normalization)
data = np.array([85, 90, 78, 92, 88, 76, 95, 89])


In [35]:
# Solution
data = np.array([85, 90, 78, 92, 88, 76, 95, 89])

z_scores = (data - np.mean(data)) / np.std(data)
print(f"Original: {data}")
print(f"Z-scores: {z_scores}")
print(f"New mean: {z_scores.mean():.10f}")
print(f"New std: {z_scores.std():.10f}")

Original: [85 90 78 92 88 76 95 89]
Z-scores: [-0.262  0.544 -1.39   0.866  0.222 -1.713  1.35   0.383]
New mean: -0.0000000000
New std: 1.0000000000


### Exercise 2: Find outliers using IQR method

In [36]:
data = np.array([10, 12, 14, 15, 18, 22, 24, 100, 105])
# Find outliers using IQR method


In [37]:
# Solution
data = np.array([10, 12, 14, 15, 18, 22, 24, 100, 105])

q1 = np.percentile(data, 25)
q3 = np.percentile(data, 75)
iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

outliers = data[(data < lower_bound) | (data > upper_bound)]
print(f"Q1: {q1}, Q3: {q3}, IQR: {iqr}")
print(f"Bounds: [{lower_bound:.1f}, {upper_bound:.1f}]")
print(f"Outliers: {outliers}")

Q1: 14.0, Q3: 24.0, IQR: 10.0
Bounds: [-1.0, 39.0]
Outliers: [100 105]


### Exercise 3: Moving average

In [38]:
# Calculate 3-period moving average
prices = np.array([100, 102, 104, 103, 105, 107, 109, 108, 110])


In [41]:
# Solution
prices = np.array([100, 102, 104, 103, 105, 107, 109, 108, 110])
window = 3

# Method 1: Using convolution
weights = np.ones(window) / window
ma = np.convolve(prices, weights, mode='valid')
print(f"Moving average (convolve): {ma}")

# Method 2: Using cumsum
cumsum = np.cumsum(prices)
ma2 = (cumsum[window-1:] - np.concatenate([[0], cumsum[:-window]])[:-window+1 or None]) / window
print(f"Moving average (cumsum): {ma2}")

Moving average (convolve): [102. 103. 104. 105. 107. 108. 109.]


ValueError: operands could not be broadcast together with shapes (7,) (5,) 

In [42]:
prices = np.array([100, 102, 104, 103, 105, 107, 109, 108, 110])
window = 3

cumsum = np.cumsum(prices)
# Take cumsum from the end of the window
ma2 = cumsum.copy()
# Subtract the cumulative sum from before the window started
ma2[window:] = cumsum[window:] - cumsum[:-window]
# Slice to get only valid window results and divide
ma2 = ma2[window - 1:] / window

print(f"Moving average (cumsum): {ma2}")

Moving average (cumsum): [102. 103. 104. 105. 107. 108. 109.]


---
## Next Notebook
**03_linear_algebra.ipynb** - Matrix operations, dot products, eigenvalues, and decompositions.