In [None]:
# load in our libraries
import pandas as pd # pandas for dataframes
from scipy.stats import ttest_ind # just the t-test from scipy.stats
from scipy.stats import probplot # for a qqplot
import matplotlib.pyplot as plt 
import pylab # for a qqplot

# load the cereal table from the CSV in the relative input directory
cereals = pd.read_csv(filepath_or_buffer='../input/cereal.csv')

# preview the top five rows to confirm the file parsed as expected
cereals.head(n=5)


In [None]:
# plot a qqplot to check normality
# The sodium values are plotted against the quantiles of a normal distribution;
# the closer the points sit to the fitted line, the more normal the data look.
# pyplot is used as the plotting target rather than pylab, whose use matplotlib
# discourages.
probplot(cereals['sodium'], dist='norm', plot=plt)
# render the figure explicitly so the cell shows the plot instead of echoing
# the tuple of arrays that probplot returns
plt.show()

So, if we decided to set our alpha (the highest p-value that would still allow us to reject the null) to 0.05 before we started, we would reject the null hypothesis (i.e. we can be fairly confident that there really is a difference between these two groups). The `statistic` is the value of the t statistic itself, and the `pvalue` is the probability of seeing a difference at least this large between our two groups purely by chance if they were actually drawn from the same underlying population.

In [None]:
# boolean masks selecting each cereal type ('H' = hot, 'C' = cold)
isHot = cereals['type'] == 'H'
isCold = cereals['type'] == 'C'

# sodium values for each group, selected with a single .loc lookup
hotCereals = cereals.loc[isHot, 'sodium']
coldCereals = cereals.loc[isCold, 'sodium']

# two-sample t-test that does not assume equal variances (equal_var=False)
ttest_ind(hotCereals, coldCereals, equal_var=False)

In [None]:
# report the average sodium of each group so we can see which one is larger
groupSummaries = (
    ('Mean sodium for the hot cereals:', hotCereals),
    ('Mean sodium for the cold cereals:', coldCereals),
)
for heading, sodiumValues in groupSummaries:
    print(heading)
    print(sodiumValues.mean())

Now plot histograms of the sodium content for the two cereal types, with each type in a different color.

In [None]:
# Overlay the sodium histograms of the cold and hot cereals on one set of axes.
# Both groups are binned over the same value range so their bars line up:
# if each plt.hist call chose its own default bins, the two groups would get
# different bin edges and widths and the overlay could not be compared bar by bar.
bothGroups = pd.concat([coldCereals, hotCereals])
sharedRange = (bothGroups.min(), bothGroups.max())

# plot the cold cereals
plt.hist(coldCereals, range=sharedRange, alpha=0.5, label='cold')
# and the hot cereals, semi-transparent so overlapping bars stay visible
plt.hist(hotCereals, range=sharedRange, alpha=0.5, label='hot')
# and add a legend
plt.legend(loc='upper right')
# add a title
plt.title("Sodium(mg) content of cereals by type")
# label the axes
plt.xlabel('Sodium in milligrams') # label the x axes
plt.ylabel('Count') # label the y axes
# render the figure explicitly so the cell shows only the plot
plt.show()