# 04 Testing the Data Demo

# Types of Testing

1. Non Parametric Testing     2. Parametric Testing   3.A/B Testing

## 1. Non Parametric Testing

### Chi Square Testing

In [10]:
import numpy as np
import pandas as pd
import scipy.stats as stats

national = pd.DataFrame(["white"] * 100000 + ["hispanic"] * 60000 + \
                        ["black"] * 50000 + ["asian"] * 15000 + ["other"] * 35000)

minnesota = pd.DataFrame(["white"] * 600 + ["hispanic"] * 300 + \
                         ["black"] * 250 + ["asian"] * 75 + ["other"] * 150)

national_table = pd.crosstab(index=national[0], columns="count")
minnesota_table = pd.crosstab(index=minnesota[0], columns="count")

print("National")
print(national_table)
print(" ")
print("Minnesota")
print(minnesota_table)

observed = minnesota_table

national_ratios = national_table/len(national)  # Get population ratios

expected = national_ratios * len(minnesota)   # Get expected counts

chi_squared_stat = (((observed-expected)**2)/expected).sum()

print(chi_squared_stat)


crit = stats.chi2.ppf(q = 0.95, # Find the critical value for 95% confidence*
                      df = 4)   # Df = number of variable categories - 1

print("Critical value")
print(crit)

p_value = 1 - stats.chi2.cdf(x=chi_squared_stat,  # Find the p-value
                             df=4)
print("P value")
print(p_value)

stats.chisquare(f_obs= observed,   # Array of observed counts
                f_exp= expected)   # Array of expected counts

National
col_0      count
0               
asian      15000
black      50000
hispanic   60000
other      35000
white     100000
 
Minnesota
col_0     count
0              
asian        75
black       250
hispanic    300
other       150
white       600
col_0
count    18.194805
dtype: float64
Critical value
9.487729036781154
P value
[0.00113047]


Power_divergenceResult(statistic=array([18.19480519]), pvalue=array([0.00113047]))

## 2. Parametric Testing

Z Testing              T Testing       P  Testing ANOVA Testing

### Z Testing

In [1]:
def twoSampZ(X1, X2, mudiff, sd1, sd2, n1, n2):
    from numpy import sqrt, abs, round
    from scipy.stats import norm
    pooledSE = sqrt(sd1**2/n1 + sd2**2/n2)
    z = ((X1 - X2) - mudiff)/pooledSE
    pval = 2*(1 - norm.cdf(abs(z)))
    return round(z, 3), round(pval, 4)

z, p = twoSampZ(28, 33, 0, 14.1, 9.5, 75, 50)
print(z, p)

-2.369 0.0179


### T Testing

In [3]:
import pandas as pd
from scipy.stats import ttest_ind, ttest_ind_from_stats

df=pd.read_csv('T-test.csv')
print(df)

var1=df['Low_Light'].values
var2=df['High_Light'].values

print(var1)
print(var2)

t, p = ttest_ind(var1, var2, equal_var=False)
print("ttest_ind:            t = %g  p = %g" % (t, p))

   Low_Light  High_Light
0         49          45
1         31          40
2         43          59
3         31          58
4         40          55
5         44          50
6         49          46
7         48          53
8         33          43
[49 31 43 31 40 44 49 48 33]
[45 40 59 58 55 50 46 53 43]
ttest_ind:            t = -2.66075  p = 0.0172063


In [2]:
## Import the packages
import numpy as np
from scipy import stats

## Define 2 random distributions
#Sample Size
N = 10
#Gaussian distributed data with mean = 2 and var = 1
a = np.random.randn(N) + 2
#Gaussian distributed data with with mean = 0 and var = 1
b = np.random.randn(N)

## Calculate the Standard Deviation
#Calculate the variance to get the standard deviation

#For unbiased max likelihood estimate we have to divide the var by N-1, and therefore the parameter ddof = 1
var_a = a.var(ddof=1)
var_b = b.var(ddof=1)

#std deviation
s = np.sqrt((var_a + var_b)/2)
print(s)

## Calculate the t-statistics
t = (a.mean() - b.mean())/(s*np.sqrt(2/N))
print(t)

## Compare with the critical t-value
#Degrees of freedom
df = 2*N - 2

#p-value after comparison with the t
p = 1 - stats.t.cdf(t,df=df)

print("t = " + str(t))
print("p = " + str(2*p))
#Note that we multiply the p value by 2 because its a twp tail t-test
### You can see that after comparing the t statistic with the critical t value (computed internally)
# we get a good p value of 0.0005 and thus we reject the null hypothesis and thus it proves that the mean
# of the two distributions are different and statistically significant.
## Cross Checking with the internal scipy function
t2, p2 = stats.ttest_ind(a,b)
print("t = " + str(t2))
print("p = " + str(2*p2))

1.3666602264872862
3.8682657022682223
t = 3.8682657022682223
p = 0.0011263185513268326
t = 3.8682657022682214
p = 0.0022526371026537254


### Anova Testing

In [11]:
import pandas as pd
from scipy.stats import ttest_ind, ttest_ind_from_stats
import numpy as np


df=pd.read_csv("fish.csv")
print(df)

var1=df['food 1'].values
var2=df['food 2'].values
var3=df['food 3'].values
var4=df['food 4'].values

print(var1)
print(var2)
print(var3)
print(var4)

v1=np.array(var1)
v2=np.array(var2)
v3=np.array(var3)
v4=np.array(var3)

from scipy import stats
print(stats.f_oneway(v1, v2, v3, v4))

import numpy as np
from scipy import stats
food1 = np.array([ 31, 32, 53 ])
food2 = np.array([ 79, 48, 49 ])
food3= np.array([ 55, 46, 45])

print(stats.f_oneway(food1, food2, food3))

   food 1  food 2  food 3  food 4
0    60.8    68.7   102.6    87.9
1    57.0    67.7   102.1    84.2
2    65.0    74.0   100.2    83.1
3    58.6    66.3    96.5    85.7
4    61.7    69.8     NaN    90.3
[60.8 57.  65.  58.6 61.7]
[68.7 67.7 74.  66.3 69.8]
[102.6 102.1 100.2  96.5   nan]
[87.9 84.2 83.1 85.7 90.3]
F_onewayResult(statistic=nan, pvalue=nan)
F_onewayResult(statistic=1.8181818181818181, pvalue=0.24138718539465462)


## 3. A/B Testing

In [7]:
import numpy as np
from scipy import stats
data={
    'A':{'views':42,'signup':2},
    'B':{'views':85,'signup':11}
}

posteriors={variations:stats.beta(logs['signup'],logs['views']-logs['signup'])
            for variations, logs in data.items()}
print(posteriors['A'].mean)

print(posteriors['A'].ppf(0.025),posteriors['A'].ppf(0.975))

<bound method rv_frozen.mean of <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000028C9D3CC080>>
0.00596311824104283 0.12855402038414632


# 4. Distribution Testing

## Binomial Distribution using A/B Testing - Not Working Pymc3

In [None]:
import pymc3 as pm
import seaborn as sb

n = 1000
obs_v1 = 680
obs_v2 = 700

print("Visitors on page A ",obs_v1)
print("Visitors on page B ",obs_v2)

with pm.Model() as model: # context management
    # define priors
    prior_v1 = pm.Beta('prior_v1', alpha=2, beta=2)
    prior_v2 = pm.Beta('prior_v2', alpha=2, beta=2)

    # define likelihood
    like_v1 = pm.Binomial('like_v1', n=n, p=prior_v1, observed=obs_v1)
    like_v2 = pm.Binomial('like_v2', n=n, p=prior_v2, observed=obs_v2)

# define metrics
pm.Deterministic('difference', prior_v2 - prior_v1)
pm.Deterministic('relation', (prior_v2/prior_v1) - 1)

# inference
trace = pm.sample(draws=50000, step=pm.Metropolis(), start=pm.find_MAP(), progressbar=True)

_ = pm.traceplot(trace[1000:], grid=True)

## Poison Distribution using t testing

In [8]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import math


np.random.seed(6)

population_ages1 = stats.poisson.rvs(loc=18, mu=35, size=150000)
population_ages2 = stats.poisson.rvs(loc=18, mu=10, size=100000)
population_ages = np.concatenate((population_ages1, population_ages2))

minnesota_ages1 = stats.poisson.rvs(loc=18, mu=30, size=30)
minnesota_ages2 = stats.poisson.rvs(loc=18, mu=10, size=20)
minnesota_ages = np.concatenate((minnesota_ages1, minnesota_ages2))

print( population_ages.mean() )
print( minnesota_ages.mean() )

print(stats.ttest_1samp(a= minnesota_ages,               # Sample data
                 popmean= population_ages.mean()) ) # Pop mean

stats.t.ppf(q=0.025,  # Quantile to check
            df=49)  # Degrees of freedom

43.000112
39.26
Ttest_1sampResult(statistic=-2.5742714883655027, pvalue=0.013118685425061678)


-2.0095752344892093

# End of Testing the Data Demo