In [1]:
import numpy as np
import pandas as pd


In [2]:
# Load the dog dataset (one row per dog) from the CSV next to this notebook.
dogs = pd.read_csv("dog_data.csv")

In [3]:
# Rows for the three breeds compared in the weight analysis:
# whippets, terriers, and pitbulls.
dogs_wtp = dogs.loc[dogs["breed"].isin(['whippet', 'terrier', 'pitbull'])]

# Rows for the two breeds compared in the color analysis:
# poodles and shihtzus.
dogs_ps = dogs.loc[dogs["breed"].isin(['poodle', 'shihtzu'])]

In [4]:
# Preview the first five rows to check the columns loaded as expected.
print(dogs.head(5))

   is_rescue  weight  tail_length  age  color  likes_children  \
0          0       6         2.25    2  black               1   
1          0       4         5.36    4  black               0   
2          0       7         3.63    3  black               0   
3          0       5         0.19    2  black               0   
4          0       5         0.37    1  black               1   

   is_hypoallergenic      name      breed  
0                  0      Huey  chihuahua  
1                  0   Cherish  chihuahua  
2                  1     Becka  chihuahua  
3                  0     Addie  chihuahua  
4                  1  Beverlee  chihuahua  


In [5]:
# Rescue indicator (0/1 in the preview above) for whippets only.
whippet_rescue = dogs.loc[dogs["breed"] == "whippet", "is_rescue"]

In [8]:
# Summing the rescue indicator counts the rescued whippets.
num_whippet_rescues = whippet_rescue.sum()
print(num_whippet_rescues)

6


In [9]:
# Sample size: total number of whippets in the data.
num_whippet = whippet_rescue.shape[0]
print(num_whippet)

100


Testing These hypotheses:
    
Null: 8% of whippets are rescues
    
Alternative: more or less than 8% of whippets are rescues

In [11]:
# This test focuses on a single binary categorical variable, which indicates whether or not each whippet is a rescue. We want to compare the proportion of rescues in our sample to a hypothesized population-level proportion of 0.08.
from scipy.stats import binom_test


In [12]:
# Exact two-sided binomial test of H0: P(whippet is a rescue) = 0.08.
#
# `scipy.stats.binom_test` is deprecated and was removed in SciPy 1.12; its
# replacement `binomtest` returns a result object whose `.pvalue` is the same
# two-sided p-value. Import it locally so this cell works on current SciPy,
# and fall back to the legacy function only on SciPy releases that predate
# `binomtest`.
try:
    from scipy.stats import binomtest
except ImportError:
    # Old SciPy without `binomtest`: use the legacy function imported in the
    # previous cell.
    pval = binom_test(num_whippet_rescues, num_whippet, 0.08)
else:
    pval = binomtest(
        int(num_whippet_rescues),  # successes: rescued whippets
        int(num_whippet),          # trials: all whippets
        p=0.08,
        alternative='two-sided',
    ).pvalue
print(pval)
# Decide at the 5% significance level.
if pval < 0.05:
    print("The proportion of whippets who are rescues is significantly different from 8%")
else:
    print("The proportion of whippets who are rescues is not significantly different from 8%")

0.5811780106238105
The proportion of whippets who are rescues is not significantly different from 8%


In [13]:
# Weight samples for each of the three breeds being compared.
wt_whippets = dogs.loc[dogs["breed"] == "whippet", "weight"]
wt_terrier = dogs.loc[dogs["breed"] == "terrier", "weight"]
wt_pitbull = dogs.loc[dogs["breed"] == "pitbull", "weight"]

Testing these hypotheses:
    
Null: whippets, terriers, and pitbulls all weigh the same amount on average
    
Alternative: whippets, terriers, and pitbulls do not all weigh the same amount on average (at least one pair of breeds has differing average weights)

In [14]:
#This test addresses an association between two variables: a non-binary categorical variable (breed, with three possible options) and a quantitative variable (weight).

from scipy.stats import f_oneway

In [16]:
# One-way ANOVA across the three breeds' weight samples.
Fstat, pval = f_oneway(wt_whippets, wt_terrier, wt_pitbull)
print(pval)
# Report the decision at the 5% significance level.
print(
    "At least one pair of breeds have significantly different weights"
    if pval < 0.05
    else "No pairs have significantly different weights"
)

3.276415588274815e-17
At least one pair of breeds have significantly different weights


In [17]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [19]:
# Restrict to the three breeds, then run Tukey's range test to see which
# specific pairs of breeds differ in mean weight.
dogs_wtp = dogs.loc[dogs["breed"].isin(['whippet', 'terrier', 'pitbull'])]
output = pairwise_tukeyhsd(dogs_wtp["weight"], dogs_wtp["breed"])
print(output)

 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1  group2 meandiff p-adj   lower  upper  reject
-----------------------------------------------------
pitbull terrier   -13.24  0.001 -16.728 -9.752   True
pitbull whippet    -3.34 0.0639  -6.828  0.148  False
terrier whippet      9.9  0.001   6.412 13.388   True
-----------------------------------------------------


In [20]:
# Pairs whose `reject` column is True in the Tukey table printed above.
print(
    "pitbull and terrier ; terrier and whippet have significantly different weights"
)

pitbull and terrier ; terrier and whippet have significantly different weights


Do poodles and shihtzu come in different colors?

In [21]:
# Keep only poodles and shihtzus for the breed-vs-color comparison.
dogs_ps = dogs.loc[dogs["breed"].isin(['poodle', 'shihtzu'])]

In [22]:
# Contingency table of counts: one row per color, one column per breed.
Xtab = pd.crosstab(index=dogs_ps["color"], columns=dogs_ps["breed"])
print(Xtab)

breed  poodle  shihtzu
color                 
black      17       10
brown      13       36
gold        8        6
grey       52       41
white      10        7


Testing these hypotheses:
    
Null: There is not an association between breed (poodle vs. shihtzu) and color.
    
Alternative: There is an association between breed (poodle vs. shihtzu) and color.

In [23]:
from scipy.stats import chi2_contingency

In [24]:
# Chi-square test on the color-by-breed contingency table.
chi2, pval, dof, exp = chi2_contingency(Xtab)
print(pval)
# Report the decision at the 5% significance level.
print(
    "poodles and shihtzu come in different colors"
    if pval < 0.05
    else "poodles and shihtzu don't come in different colors"
)

0.005302408293244593
poodles and shihtzu come in different colors
