# Using Scipy statistics
*This notebook is based on course material (https://people.duke.edu/~ccc14/bios-821-2017/) from Cliburn Chan.*

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
# Print NumPy floats with three digits of precision.
np.set_printoptions(precision=3)

# Seed NumPy's global random state. SciPy's `rvs` draws from it whenever no
# `random_state` is passed, so the samples below repeat on every
# Restart & Run All.
SEED = 42
np.random.seed(SEED)

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names. Prefer the new name, fall back to the old one on
# older matplotlib, and use the default style if neither is installed.
_style_candidates = ('seaborn-v0_8-notebook', 'seaborn-notebook')
plt.style.use(next((s for s in _style_candidates if s in plt.style.available),
                   'default'))

## Random number generators

### Create a distribution object

In [None]:
# Frozen standard normal: location (mean) 0 and scale (standard deviation) 1,
# which are the defaults of stats.norm.
d = stats.norm(loc=0, scale=1)

In [None]:
# Display the mean, standard deviation and variance of the frozen distribution `d`.
d.mean(), d.std(), d.var()

### Get random variates

In [None]:
# Draw ten random variates from the frozen distribution `d`.
d.rvs(size=10)

### Get the PDF

In [None]:
# Evaluation grid: 100 evenly spaced points covering [-3, 3].
x = np.linspace(-3, 3, num=100)

In [None]:
# Probability density of `d` evaluated at every grid point in `x`.
y = d.pdf(x)

In [None]:
# Plot `y` against `x`; when the cells are run in order, `y` holds the PDF values.
plt.plot(x, y)

### Get the CDF

In [None]:
# Cumulative distribution function of `d` at every grid point in `x`.
# Note: this rebinds `y`, replacing the values assigned from `d.pdf(x)` earlier.
y = d.cdf(x)

In [None]:
# Plot `y` against `x`; when the cells are run in order, `y` holds the CDF values.
plt.plot(x, y)

### Get percentiles

In [None]:
# ppf is the percent point function (the inverse of the CDF): it maps each
# probability in `y` to the corresponding quantile of `d`.
# Assumes `y` still holds the CDF values from the preceding section.
xp = d.ppf(y)

In [None]:
# Plot the probabilities `y` against the quantiles `xp` recovered from them.
plt.plot(xp, y)

## Example 

Assume height has a normal distribution. Suppose the average male height is 70 inches (5'10"), with a standard deviation of 4 inches.

Make a plot of the distribution of heights between 50 and 90 inches.


In [None]:
# Frozen normal for heights in inches: mean (loc) 70, standard deviation (scale) 4.
# Note: this rebinds `d`, which previously held the standard normal.
d = stats.norm(loc=70, scale=4)

In [None]:
# Height grid for plotting: 100 evenly spaced values from 50 to 90 inches.
hs = np.linspace(start=50, stop=90, num=100)

In [None]:
# Density of `d` evaluated over the height grid `hs`.
plt.plot(hs, d.pdf(hs))

How tall would a man need to be to fall in the top 10% of heights (at or above the 90th percentile)?

In [None]:
# 0.9 quantile of `d`: the value below which 90% of the distribution lies.
d.ppf(0.9)

In [None]:
# Draw the density over the full height grid first; the shaded tail goes on top.
plt.plot(hs, d.pdf(hs))

# Tail region: 100 points from the 0.9 quantile of `d` up to 90.
xp = np.linspace(d.ppf(0.9), 90, num=100)
y1 = np.zeros(xp.shape, dtype=xp.dtype)  # lower edge of the shading (zero line)
y2 = d.pdf(xp)                           # upper edge of the shading (the density)
plt.fill_between(xp, y1, y2, alpha=0.5, color='red')

What fraction of men have heights between 60 and 70 inches?

In [None]:
# Probability mass of `d` between 60 and 70: CDF at the upper bound minus
# CDF at the lower bound.
d.cdf(70) - d.cdf(60)

In [None]:
# Draw the density over the full height grid first; the shaded band goes on top.
plt.plot(hs, d.pdf(hs))

# Band between 60 and 70, sampled at 100 points.
xp = np.linspace(60, 70, num=100)
y1 = np.zeros(xp.shape, dtype=xp.dtype)  # lower edge of the shading (zero line)
y2 = d.pdf(xp)                           # upper edge of the shading (the density)
plt.fill_between(xp, y1, y2, alpha=0.5, color='red')

What percentile would you be in if your height were 76 inches?

In [None]:
# CDF of `d` at 76: the fraction of the distribution at or below 76.
d.cdf(76)

In [None]:
# CDF of `d` over the height grid.
plt.plot(hs, d.cdf(hs))

# Red guide line: horizontal at cdf(76) from x=50 to x=76, then straight down
# to y=0 at x=76.
xs = (50, 76, 76)
ys = (d.cdf(76),) * 2 + (0,)
plt.plot(xs, ys, alpha=0.5, color='red')

## Using simulation

Simulate heights of 1 million men

In [None]:
# Number of simulated draws: one million.
n = 10 ** 6
# Draw `n` random variates from the frozen distribution `d`.
data = d.rvs(size=n)

How tall do you need to be to fall in the top 10% of heights (at or above the 90th percentile)?

In [None]:
# Empirical 90th percentile of the simulated sample.
# np.percentile leaves `data` unmodified (no in-place sort) and does not depend
# on the notebook-level `n`, which a later cell rebinds to 50. Re-running this
# cell after that point therefore still gives the right answer.
np.percentile(data, 90)

What fraction of men have heights between 60 and 70 inches?

In [None]:
# Fraction of simulated values v with 60 <= v < 70.
# The mean of the boolean mask divides by the size of `data` itself rather than
# by the notebook-level `n`, which a later cell rebinds to 50.
np.mean((data >= 60) & (data < 70))

What percentile would you be in if your height were 76 inches?

In [None]:
# Fraction of simulated values strictly below 76.
# The mean of the boolean mask divides by the size of `data` itself rather than
# by the notebook-level `n`, which a later cell rebinds to 50.
np.mean(data < 76)

## Estimating parameters

In [None]:
# Sample size, and the mean / standard deviation of the distribution sampled from.
n, mu_, sd_ = 50, 100, 15

# Frozen normal with those parameters, and `n` random variates drawn from it.
d_ = stats.norm(loc=mu_, scale=sd_)
xs = d_.rvs(size=n)

In [None]:
# Fit a normal distribution to the sample: returns the estimated location
# (mean) and scale (standard deviation).
mu, sd = stats.norm.fit(xs)

# Frozen normal at the fitted parameters.
# Note: this rebinds `d`, which earlier cells used for the height distribution.
d = stats.norm(loc=mu, scale=sd)

In [None]:
# Normal distribution centred on the fitted mean with scale sd / sqrt(n).
d1 = stats.norm(loc=mu, scale=sd / np.sqrt(n))

# Central interval of `d1` containing 95% of its probability mass: (low, high).
ci = d1.interval(0.95)

In [None]:
# Histogram of the sample, normalised to a density (area 1) so it shares a
# y-scale with the fitted PDF drawn below. With raw counts the PDF curve would
# be flattened against the x axis.
plt.hist(xs, histtype='step', linewidth=1, density=True)

# Fitted density over a grid padded by one `sd_` on each side of the sample range.
xp = np.linspace(xs.min() - sd_, xs.max() + sd_, 100)
yp = d.pdf(xp)
plt.plot(xp, yp, color='red')

# Vertical line at the fitted mean.
plt.axvline(mu, color='black')

# Grey band spanning the current y-limits between the two ends of the interval `ci`.
ymin, ymax = plt.ylim()
plt.fill_between(ci, ymin, ymax, color='grey', alpha=0.5)
plt.title('Fitted normal distribution with 95% CI',
         fontsize=16)