# School weight case

In [1]:
from scipy.stats import ttest_1samp

In [2]:
from scipy.stats import norm

## If we know the data

In [5]:
# Example weights for the sample. They are typed in by hand here, but they
# could just as well come from a DataFrame column or from any other list.
data = [
    87, 68, 69, 71, 85, 86, 76,
]

In [6]:
# One-sample t-test of H0: the mean weight equals 85 (popmean = hypothesised population mean).
# NOTE: ttest_1samp is TWO-sided by default, so the p-value it reports covers both tails.
# The question asked here is one-sided (are the weights *below* 85? we don't care if they are above).
# For that, pass alternative='less' (SciPy >= 1.6) or, when the statistic is negative,
# halve the two-sided p-value.
test = ttest_1samp(data, popmean=85)

In [7]:
# Show the test result (t statistic and p-value).
print(test)


Ttest_1sampResult(statistic=-2.378172554291756, pvalue=0.054904857540624415)


In [12]:
# The result is not extreme enough: with such a small sample (n = 7, and three of the seven weights are at or above 85) we don't have enough evidence to draw a conclusion.
# We can't claim that the boys are underweight: the reported p-value (0.0549) is higher than 0.05, and we are working at the standard 95% confidence level (alpha = 0.05).
# Caveat: ttest_1samp reports a two-sided p-value by default. Because the statistic is negative, the one-sided (lower-tail) p-value is half of it (about 0.027), which would be below 0.05.
# We could also look up the critical value for alpha = 0.05 and compare it with our statistic (for a lower-tail test, a statistic below the critical value means we reject H0).
# The p-value and the critical value are two views of the same test: when they use the same distribution and the same tail(s) they always lead to the same decision, so checking both is only a sanity check.
# Conclusion for the two-sided test: we fail to reject H0 (this does not prove H0 is true); there is no significant evidence for HA.

## If we don't know the data

In [11]:
# If we don't have the individual readings/observations but only summary statistics (mean, standard deviation, sample size), we can compute the test by hand:

In [10]:
# Summary statistics used in place of the raw observations.
n = 25               # sample size
sample_mean = 80.94  # mean of the sample
sample_std = 11.6    # standard deviation of the sample
pop_mean = 85        # hypothesised population mean (H0)

In [14]:
import math

In [15]:
# Test statistic: (sample mean - hypothesised mean) / standard error of the mean.
# NOTE(review): dividing by sqrt(n - 1) is only correct if sample_std was computed
# with the population formula (divisor n). If it is the usual sample standard
# deviation (divisor n - 1), the standard error is sample_std / sqrt(n) - confirm.
standard_error = sample_std / math.sqrt(n - 1)
t_statistic = (sample_mean - pop_mean) / standard_error

In [16]:
# Echo the computed test statistic.
t_statistic

-1.7146428199482255

In [17]:
# Lower-tail p-value: P(Z <= t_statistic) under the standard normal distribution.
# NOTE: this is a normal approximation. With n = 25 the exact reference
# distribution is Student's t with n - 1 = 24 degrees of freedom
# (scipy.stats.t.cdf(t_statistic, df=n - 1)), which gives a slightly larger p-value.
p_value = norm.cdf(t_statistic, loc=0, scale=1)

In [18]:
# Echo the computed p-value.
p_value

0.04320536648684992

In [20]:
# Critical value for a lower-tail test at the 5% significance level:
# the 5th percentile of the standard normal distribution.
# NOTE: the corresponding t-distribution value for 24 degrees of freedom is about -1.711.
alpha = 0.05
critical_value = norm.ppf(alpha)

In [21]:
# Echo the critical value.
critical_value

-1.6448536269514729

In [22]:
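# As the t_statistic (-1.715) is lower than the critical_value (-1.645), and the p-value (0.043) is lower than 0.05, we reject H0 at the 5% level and can say that the boys are underweight.
# Note: both numbers above come from the normal distribution. With n = 25 a t distribution with 24 degrees of freedom is more appropriate (critical value about -1.711); the decision is the same here, but only narrowly.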

# Anova test example

In [23]:
import pandas as pd

In [24]:
# Load the ANOVA example data from the 'data_collected' sheet of the workbook.
df = pd.read_excel(
    'anova_class_example_data.xlsx',
    sheet_name='data_collected',
)

In [25]:
# Display the loaded data.
df

Unnamed: 0,Display_design,Percent_increase_in_sales
0,1,575
1,2,565
2,3,600
3,4,725
4,1,542
5,2,593
6,3,651
7,4,700
8,1,530
9,2,590


In [26]:
import statsmodels.api as sm

In [27]:
from statsmodels.formula.api import ols #ordinary least squares

In [29]:
# One-way ANOVA set-up. Assumptions and notes for this example:
#   - the samples (one per display design) should be independent
#   - the samples should cover a similar period of time (in this case, the same 5 days)
#   - there are more than 2 samples (groups) to compare
#   - we only test whether the groups differ, not which one is better or worse

# Formula: the left-hand side of '~' is the response we want to see the impact on;
# the right-hand side is the variation we apply. C(...) marks Display_design as categorical.
anova_formula = 'Percent_increase_in_sales ~ C(Display_design)'
model = ols(anova_formula, data=df).fit()

In [30]:
# Build the ANOVA table from the fitted OLS model (typ=1 is anova_lm's default).
results_table = sm.stats.anova_lm(model, typ=1)

In [31]:
# Display the ANOVA table.
results_table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(Display_design),3.0,66870.55,22290.183333,66.797073,2.882866e-09
Residual,16.0,5339.2,333.7,,


In [32]:
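# F is the F statistic of the ANOVA test (mean square between designs / mean square of the residuals)
# PR(>F) is the p-value
# df are the degrees of freedom used to look up the F table: 3 = number of display designs minus 1 (4 - 1), and 16 = number of observations minus number of designs (20 - 4)
# We are trying to understand whether there is a difference in sales when we use different designs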

In [33]:
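# As the p-value (2.88e-09) is < 0.01, we can reject H0 (all designs have the same mean sales) and say there is a difference in sales depending on the display: at least one design differs from the others.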

In [None]:
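# We can also check the F value; this is equivalent to the p-value check and must lead to the same decision.
# The F value (66.80) is higher than the critical value from the F table for (3, 16) degrees of freedom (about 3.24 at alpha = 0.05, about 5.29 at alpha = 0.01).

# We can say that the design does matter and reject H0. There is a significant difference.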