In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the per-state table (population, murder rate, abbreviation) and preview
# its first rows.
state = pd.read_csv(
    filepath_or_buffer="./practical-statistics-for-data-scientists/data/state.csv"
)
state.head()

Unnamed: 0,State,Population,Murder.Rate,Abbreviation
0,Alabama,4779736,5.7,AL
1,Alaska,710231,5.6,AK
2,Arizona,6392017,4.7,AZ
3,Arkansas,2915918,5.6,AR
4,California,37253956,4.4,CA


## Measures of Location/Centrality
- mean
- median
- trimmed mean
- weighted mean

In [3]:
# Two basic location estimates for state population; the recorded output shows
# the mean sitting well above the median.
print("Arithmetic mean:", state["Population"].mean())
print("The Median is:", state["Population"].median())

Arithmetic mean: 6162876.3
The Median is: 4436369.5


In [4]:
from scipy.stats import hmean

# Location estimates for the murder rate.
# Open question: is the plain mean the right summary for a rate, or is there a
# better choice? The author's recollection is that rates are usually averaged with
# the harmonic mean, so it is printed next to the mean and median for comparison.
murder_rate = state["Murder.Rate"]

# Measures of location/center
print("Arithmetic mean:", murder_rate.mean())
print("The Median is:", murder_rate.median())
print("Harmonic mean is:", hmean(murder_rate))

Arithmetic mean: 4.066
The Median is: 4.0
Harmonic mean is: 3.168854094682334


In [9]:
def weighted_mean(x, weights):
    """Return the weighted arithmetic mean of ``x``.

    Computes ``sum(x_i * w_i) / sum(w_i)``, pairing values and weights by position.

    Parameters
    ----------
    x : array-like
        One-dimensional sequence of numeric values.
    weights : array-like
        One-dimensional sequence of numeric weights, one per value in ``x``.

    Returns
    -------
    numpy.float64
        The weighted mean of ``x``.

    Raises
    ------
    ValueError
        If ``x`` and ``weights`` do not have the same length.
    ZeroDivisionError
        If the weights sum to zero, so the mean cannot be normalized. This
        mirrors ``np.average`` instead of silently returning nan/inf.
    """
    x = pd.Series(x)
    weights = pd.Series(weights)
    if len(x) != len(weights):
        raise ValueError(
            f"x and weights must have the same length, got {len(x)} and {len(weights)}"
        )
    denominator = weights.sum()
    if denominator == 0:
        raise ZeroDivisionError("weights sum to zero, can't be normalized")
    # np.dot works on the underlying arrays, so the pairing is positional and
    # ignores any pandas index labels.
    numerator = np.dot(x, weights)
    return numerator / denominator

# Sanity checks for weighted_mean. assert_allclose compares with a tolerance and
# reports both values on failure, so the checks do not hinge on the last bit of a
# floating-point quotient.
np.testing.assert_allclose(weighted_mean([1, 2, 3, 4, 5], [1, 1, 1, 1, 1]), 3.0)
np.testing.assert_allclose(weighted_mean([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]), 35 / 15)
np.testing.assert_allclose(weighted_mean([1, 2, 3], [0, 0, 1]), 3)
np.testing.assert_allclose(weighted_mean([1, 2, 3], [1, 0, 0]), 1)
np.testing.assert_allclose(weighted_mean([1, 2, 3], [1, 1, 0]), 1.5)

In [10]:
# Population-weighted murder rate using numpy's built-in weighted average.
a = np.average(
    state["Murder.Rate"].to_numpy(),
    weights=state["Population"].to_numpy(),
)
print("Weighted mean is:", a)

Weighted mean is: 4.445833981123393


In [11]:
# Same population-weighted murder rate, this time through the hand-written helper.
# The recorded result differs from np.average only in the final digit
# (presumably floating-point rounding).
print("Weighted mean:", weighted_mean(state["Murder.Rate"], state["Population"]))

Weighted mean: 4.445833981123392


In [12]:
from scipy.stats import trim_mean

# Show how the population mean moves as a larger proportion of the sorted values
# is cut away before averaging.
population = state["Population"]
print("Population Mean:", population.mean())
for label, proportion in (
    ("5% Trimmed mean:", 0.05),
    ("10% Trimmed mean:", 0.1),
    ("20% Trimmed mean:", 0.2),
):
    print(label, trim_mean(population, proportion))

Population Mean: 6162876.3
5% Trimmed mean: 5316411.543478261
10% Trimmed mean: 4783697.125
20% Trimmed mean: 4413915.966666667


## Let's Work with Variability!

In [15]:
# Small sample: the integers 1 through 9 plus a single large outlier (100).
x = pd.Series([*range(1, 10), 100])

# The outlier drags the mean far above the median.
print("Median is:", x.median())
print("Mean is:", x.mean())

Median is: 5.5
Mean is: 14.5


In [29]:
# Mean absolute deviation from the mean
# (AKA average absolute deviation from the mean).
# Computed by hand: Series.mad() was deprecated in pandas 1.5 and removed in 2.0.
deviations = (x - x.mean()).abs()
mean_abs_deviation = deviations.mean()

# Cross-check against an independent NumPy computation of the same quantity.
values = x.to_numpy()
assert np.isclose(mean_abs_deviation, np.abs(values - values.mean()).mean())
print("Mean absolute deviation from the mean", mean_abs_deviation)

Mean absolute deviation from the mean 17.1


In [30]:
# Average (mean) absolute deviation from the median.
# Less sensitive to the outlier than the mean absolute deviation from the mean,
# but because the deviations are still averaged, one extreme value (the 100 in x)
# continues to pull the result up.
deviations = x.sub(x.median()).abs()

print("Average deviation from the median is", deviations.mean())

Average deviation from the median is 11.5


In [31]:
# Median absolute deviation from the median.
# Taking the median of the deviations (rather than their mean) makes this very
# robust to the single high outlier.
deviations = np.abs(x - x.median())
median_deviation_from_the_median = deviations.median()
print("Median deviation from the median is:", median_deviation_from_the_median)

Median deviation from the median is: 2.5


In [None]:
# Something about the relationships between these measures makes me want to dive deeper into Taleb's "Statistical Consequences of Fat Tails"

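### The Gedeck/Bruce/Bruce book says "Mean Absolute Deviation" == L1
- Is the mean absolute deviation from the mean the same scalar as the L1 norm of that same vector? Only once the L1 norm is taken on the deviation vector `x - x.mean()` (not on `x` itself) and divided by the number of elements.
- `x.mad() == np.linalg.norm(x - x.mean(), 1) / len(x)`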

In [86]:
# The Gedeck/Bruce/Bruce book says "Mean Absolute Deviation" is also known as the L1 norm or the Manhattan norm
# TODO: Prove it!
# Series.mad() was removed in pandas 2.0, so both lines compute the statistic
# directly: first with pandas methods, then on the raw NumPy array. The print
# labels are left unchanged so they match the recorded output.
print("Pandas .mad() is", (x - x.mean()).abs().mean())
print("Numpy  .mad() is", np.abs(x.to_numpy() - x.to_numpy().mean()).sum() / len(x))

Pandas .mad() is 17.1
Numpy  .mad() is 17.1


In [85]:
# L1 norm of the deviation vector (x minus its mean), computed two ways:
# with np.linalg.norm and as a plain sum of absolute values.
centered = x - x.mean()
print("Numpy's L1 norm is", np.linalg.norm(centered, ord=1))
print("Manual L1 norm is", centered.abs().sum())

Numpy's L1 norm is 171.0
Manual L1 norm is 171.0


In [93]:
# L1 norm of the raw values in x (not of the deviations), via NumPy and by hand.
print(np.linalg.norm(x, ord=1))
print(np.abs(x).sum())

145.0
145
