# Explore notes


In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy import stats

import wrangle
import explore

RAND_SEED = 357

In [2]:
df = wrangle.make_pet_dataframe()

Returning saved csv files.


In [3]:
df

Unnamed: 0,animal_id,name,outcome_type,animal_type,sex_upon_outcome,breed,color,outcome_subtype,found_location,intake_type,intake_condition,sex_upon_intake,outcome_date,intake_date,target_outcome,age_at_outcome,age_at_intake
0,A859339,*Bodhi,Adoption,Dog,Spayed Female,German Shepherd,Black/Brown,no subtype,1156 W Cesar Chavez Street in Austin (TX),Stray,Normal,Spayed Female,2022-06-29,2022-06-12,Adoption,365,365
1,A860179,Alloy,Transfer,Dog,Neutered Male,German Shepherd Mix,Black/Brown,Partner,Austin (TX),Owner Surrender,Normal,Neutered Male,2022-06-29,2022-06-23,Transfer,1461,1461
2,A860475,no name,Transfer,Cat,Intact Female,Domestic Shorthair,Black,Partner,On Ih 35 Between Exit 240 And 241 in Austin (TX),Stray,Injured,Intact Female,2022-06-29,2022-06-29,Transfer,28,28
3,A860434,no name,Euthanasia,Other,Unknown,Bat,Brown,Rabies Risk,8350 Bluff Springs Road Apt 1515 in Austin (TX),Wildlife,Normal,Unknown,2022-06-29,2022-06-28,Other,365,365
4,A854024,Charles,Transfer,Cat,Neutered Male,Domestic Medium Hair,Orange Tabby,Snr,16600 Sydney Carol in Austin (TX),Stray,Injured,Intact Male,2022-06-29,2022-03-29,Transfer,730,730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113939,A664258,Sylvio,Return to Owner,Dog,Neutered Male,Weimaraner Mix,Silver,no subtype,Fm 1626/Manchaca Rd in Travis (TX),Stray,Normal,Neutered Male,2013-10-01,2013-10-01,Other,2557,2557
113940,A648744,Claire,Return to Owner,Dog,Spayed Female,Anatol Shepherd Mix,White/Tricolor,no subtype,Fm 1626/Manchaca Rd in Travis (TX),Stray,Normal,Spayed Female,2013-10-01,2013-10-01,Other,365,365
113941,A664236,no name,Transfer,Cat,Unknown,Domestic Shorthair Mix,Orange/White,Partner,Abia in Austin (TX),Stray,Normal,Unknown,2013-10-01,2013-10-01,Transfer,7,7
113942,A664237,no name,Transfer,Cat,Unknown,Domestic Shorthair Mix,Orange/White,Partner,Abia in Austin (TX),Stray,Normal,Unknown,2013-10-01,2013-10-01,Transfer,7,7


In [4]:
df['target_outcome'].value_counts(dropna=False)

Adoption           49533
Transfer           37751
Other              26128
Return to Owner      532
Name: target_outcome, dtype: int64

## Split the data into `train`, `validate`, and `test` subsets

In [5]:
def split_data(df, target_col='target_outcome'):
    '''Splits the pet dataframe into train, validate and test subsets.

    The split happens in two stratified steps so that each subset keeps the
    class balance of ``target_col``: 80% / 20% into (train + validate) / test,
    then the 80% is split 70% / 30% into train / validate. Overall this is
    roughly 56% / 24% / 20% of the original rows.

    Both steps use the notebook-level RAND_SEED so the split is reproducible.

    Args:
        df (DataFrame) : dataframe to split
        target_col (str) : name of the column to stratify on
    Return:
        train, validate, test (DataFrame) : dataframes split from the original
            dataframe, returned in exactly this order
    '''
    # hold out the test set first
    train_validate, test = train_test_split(
        df,
        train_size=0.8,
        stratify=df[target_col],
        random_state=RAND_SEED,
    )
    # carve the validate set out of the remaining rows
    train, validate = train_test_split(
        train_validate,
        train_size=0.7,
        stratify=train_validate[target_col],
        random_state=RAND_SEED,
    )
    return train, validate, test

train, validate, test = split_data(df)

In [6]:
train.shape, validate.shape, test.shape

((63808, 17), (27347, 17), (22789, 17))

## Data exploration

In [7]:
df = wrangle.make_pet_dataframe()
train, validate, test = wrangle.split_data(df)

Returning saved csv files.


### Exploring the target variable

In [8]:
train.head()

Unnamed: 0,animal_id,name,outcome_type,animal_type,sex_upon_outcome,breed,color,outcome_subtype,found_location,intake_type,intake_condition,sex_upon_intake,outcome_date,intake_date,target_outcome,age_at_outcome,age_at_intake
10281,A839456,A839456,Transfer,Dog,Intact Male,Labrador Retriever/Great Pyrenees,Tan/Red Tick,Partner,Vargas Street in Austin (TX),Stray,Normal,Intact Male,2021-07-25,2021-07-22,Transfer,92,61
47922,A772120,*Fancy,Transfer,Cat,Intact Female,Domestic Medium Hair Mix,Calico/White,Partner,7704 Lazy Creek Drive in Austin (TX),Stray,Normal,Intact Female,2018-05-19,2018-05-14,Transfer,30,30
60340,A750706,Charlie,Return to Owner,Dog,Spayed Female,Maltese/Toy Poodle,White,no subtype,4501 Belfield in Austin (TX),Public Assist,Normal,Spayed Female,2017-06-09,2017-05-31,Other,730,730
80855,A698675,Parker,Transfer,Dog,Neutered Male,Pit Bull Mix,Black/White,Partner,5317 Wellington in Austin (TX),Stray,Injured,Intact Male,2015-12-08,2015-03-16,Transfer,730,730
112519,A666918,no name,Transfer,Other,Intact Female,Californian Mix,Seal Point,Partner,Austin (TX),Owner Surrender,Normal,Intact Female,2013-11-13,2013-11-09,Transfer,365,365


In [9]:
train.target_outcome.value_counts(normalize=True)

Adoption           0.434710
Transfer           0.331322
Other              0.229297
Return to Owner    0.004670
Name: target_outcome, dtype: float64

In [10]:
# Select the rate by label: a positional [0] lookup on a string-labelled Series is
# deprecated in pandas 2.x and only works while 'Adoption' is the most frequent outcome.
adoption_rate = train.target_outcome.value_counts(normalize=True)['Adoption']

Adoption rate is 0.43

Break down the outcome rates by the following columns: animal_type, sex_upon_outcome, intake_type, intake_condition, sex_upon_intake.

In [11]:
def get_percent_outcome(df, cat_cols = ['animal_type', 'sex_upon_outcome', 'intake_type', 'intake_condition', 'sex_upon_intake']):
    '''Tabulates how often each target outcome occurs within each category value.

    For every column in ``cat_cols``, every distinct value of that column and
    every distinct value of ``df['target_outcome']``, one row is produced with
    the count and the share of rows having that outcome among the rows that
    have that category value.

    Args:
        df (DataFrame) : dataframe containing ``target_outcome`` and the columns in ``cat_cols``
        cat_cols (list of str) : categorical columns to break the outcomes down by
            (the default list is only read, never modified)
    Return:
        DataFrame : one row per (column, column_subcat, outcome) with the columns
            ``column``, ``column_subcat``, ``outcome``, ``total`` and ``proportion``.
            The columns are present even when there are no rows.
    '''
    result_columns = ['column', 'column_subcat', 'outcome', 'total', 'proportion']
    # the set of outcomes does not depend on the category, so compute it once
    outcomes = list(df['target_outcome'].unique())
    outputs = []
    for cat in cat_cols:
        for subcat in list(df[cat].unique()):
            # outcomes of the rows in this sub-category; filtered once and reused for every outcome
            subcat_outcomes = df.loc[df[cat] == subcat, 'target_outcome']
            for outcome in outcomes:
                is_outcome = subcat_outcomes == outcome
                outputs.append({
                        'column':cat,
                        'column_subcat':subcat,
                        'outcome':outcome,
                        'total':is_outcome.sum(),
                        'proportion':is_outcome.mean()
                })
    # explicit columns keep the schema stable for empty input
    return pd.DataFrame(outputs, columns=result_columns)

get_percent_outcome(train)

Unnamed: 0,column,column_subcat,outcome,total,proportion
0,animal_type,Dog,Transfer,8528,0.265182
1,animal_type,Dog,Other,8478,0.263628
2,animal_type,Dog,Adoption,14940,0.464567
3,animal_type,Dog,Return to Owner,213,0.006623
4,animal_type,Cat,Transfer,11952,0.440643
...,...,...,...,...,...
143,sex_upon_intake,Unknown,Return to Owner,1,0.000160
144,sex_upon_intake,Neutered Male,Transfer,1519,0.232904
145,sex_upon_intake,Neutered Male,Other,2880,0.441582
146,sex_upon_intake,Neutered Male,Adoption,2059,0.315701


In [12]:
prop_df = explore.get_percent_outcome(train)

What categories have a higher/lower adoption rate than the general rate?

In [13]:
#these have a higher adoption rate
prop_df[(prop_df.proportion > adoption_rate) & (prop_df.outcome == 'Adoption')]

Unnamed: 0,column,column_subcat,outcome,total,proportion
2,animal_type,Dog,Adoption,14940,0.464567
6,animal_type,Cat,Adoption,12300,0.453473
18,animal_type,Livestock,Adoption,4,0.444444
30,sex_upon_outcome,Spayed Female,Adoption,13288,0.697203
34,sex_upon_outcome,Neutered Male,Adoption,13086,0.647565
42,intake_type,Stray,Adoption,20755,0.449505
50,intake_type,Owner Surrender,Adoption,6250,0.585919
62,intake_type,Abandoned,Adoption,225,0.551471
66,intake_condition,Normal,Adoption,25238,0.46971
102,intake_condition,Medical,Adoption,40,0.57971


In [14]:
#these have a lower adoption rate
prop_df[(prop_df.proportion < adoption_rate) & (prop_df.outcome == 'Adoption')]

Unnamed: 0,column,column_subcat,outcome,total,proportion
10,animal_type,Other,Adoption,364,0.088008
14,animal_type,Bird,Adoption,130,0.342105
22,sex_upon_outcome,Intact Male,Adoption,586,0.06441
26,sex_upon_outcome,Intact Female,Adoption,672,0.073219
38,sex_upon_outcome,Unknown,Adoption,106,0.016919
46,intake_type,Public Assist,Adoption,498,0.147294
54,intake_type,Wildlife,Adoption,5,0.001639
58,intake_type,Euthanasia Request,Adoption,5,0.039062
70,intake_condition,Injured,Adoption,1100,0.277358
74,intake_condition,Aged,Adoption,39,0.178082


What does this look like as a crosstab?

In [15]:
pd.crosstab(train.target_outcome, train.animal_type)

animal_type,Bird,Cat,Dog,Livestock,Other
target_outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Adoption,130,12300,14940,4,364
Other,134,2789,8478,2,3228
Return to Owner,0,83,213,0,2
Transfer,116,11952,8528,3,542


In [16]:
prop_df[(prop_df.column_subcat != 'Cat') & (prop_df.column_subcat != 'Dog')]

Unnamed: 0,column,column_subcat,outcome,total,proportion
8,animal_type,Other,Transfer,542,0.131044
9,animal_type,Other,Other,3228,0.780464
10,animal_type,Other,Adoption,364,0.088008
11,animal_type,Other,Return to Owner,2,0.000484
12,animal_type,Bird,Transfer,116,0.305263
...,...,...,...,...,...
143,sex_upon_intake,Unknown,Return to Owner,1,0.000160
144,sex_upon_intake,Neutered Male,Transfer,1519,0.232904
145,sex_upon_intake,Neutered Male,Other,2880,0.441582
146,sex_upon_intake,Neutered Male,Adoption,2059,0.315701


In [17]:
color_prop = explore.get_percent_outcome(train, cat_cols=['color'])

In [20]:
# both conditions must come from color_prop; masking with prop_df.outcome compared
# against a different frame whose index does not line up with color_prop
color_prop[(color_prop.proportion > adoption_rate) & (color_prop.outcome == 'Adoption')]

Unnamed: 0,column,column_subcat,outcome,total,proportion
6,color,Calico/White,Adoption,44,0.60274
14,color,Black/White,Adoption,3009,0.465357
30,color,White/Tan,Adoption,405,0.46445
34,color,Red/White,Adoption,303,0.509244
38,color,Buff,Adoption,124,0.459259
42,color,Blue,Adoption,545,0.447823
46,color,Orange Tabby,Adoption,894,0.439096
50,color,Brown Tabby,Adoption,1883,0.452862
54,color,Orange Tabby/White,Adoption,512,0.517172
58,color,Tricolor,Adoption,692,0.496413


In [19]:
train[train.color == 'Orange']

Unnamed: 0,animal_id,name,outcome_type,animal_type,sex_upon_outcome,breed,color,outcome_subtype,found_location,intake_type,intake_condition,sex_upon_intake,outcome_date,intake_date,target_outcome,age_at_outcome,age_at_intake
51543,A765080,Wesley,Return to Owner,Cat,Neutered Male,Domestic Shorthair Mix,Orange,no subtype,2901 Barton Skwy in Austin (TX),Stray,Normal,Neutered Male,2018-01-22,2018-01-12,Other,1826,1826
19660,A817986,no name,Disposal,Cat,Intact Female,Domestic Shorthair,Orange,no subtype,Austin (TX),Stray,Sick,Intact Female,2020-05-30,2020-05-30,Other,7,7
8648,A841929,no name,Transfer,Cat,Neutered Male,Domestic Shorthair,Orange,Partner,7581 Chevy Chase Dr in Austin (TX),Stray,Normal,Neutered Male,2021-09-10,2021-09-07,Transfer,2922,2922
65807,A741707,no name,Transfer,Cat,Intact Female,Domestic Shorthair Mix,Orange,Partner,Mcneil & Parmer in Austin (TX),Stray,Normal,Intact Female,2017-01-08,2017-01-07,Transfer,28,28
104294,A681278,no name,Euthanasia,Other,Unknown,Bat,Orange,Rabies Risk,3205 French Pl in Austin (TX),Wildlife,Nursing,Unknown,2014-06-14,2014-06-14,Other,365,365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31608,A798911,no name,Euthanasia,Cat,Unknown,Domestic Shorthair,Orange,At Vet,Unknown in Austin (TX),Stray,Sick,Unknown,2019-07-02,2019-07-02,Other,365,365
18954,A819865,*95 Grams,Transfer,Cat,Intact Female,Domestic Shorthair,Orange,Partner,19043 Littig Road in Travis (TX),Stray,Nursing,Intact Female,2020-07-07,2020-07-07,Transfer,3,3
3212,A852994,no name,Died,Other,Unknown,Lizard,Orange,In Kennel,8416 North Ih 35 in Austin (TX),Stray,Normal,Unknown,2022-03-14,2022-03-10,Other,365,365
42034,A781741,no name,Transfer,Cat,Unknown,Domestic Shorthair Mix,Orange,Partner,Parmer Lane And Ih 35 in Austin (TX),Stray,Normal,Unknown,2018-10-05,2018-10-04,Transfer,7,7


In [21]:
breed_prop = explore.get_percent_outcome(train, cat_cols=['breed'])

In [26]:
breed_prop[(breed_prop.proportion > adoption_rate) & (breed_prop.outcome == 'Adoption')].sort_values('total', ascending=False)

Unnamed: 0,column,column_subcat,outcome,total,proportion
66,breed,Domestic Shorthair,Adoption,3016,0.501413
58,breed,Labrador Retriever Mix,Adoption,1559,0.510478
42,breed,Chihuahua Shorthair Mix,Adoption,1366,0.460864
6,breed,Domestic Medium Hair Mix,Adoption,760,0.469426
46,breed,German Shepherd Mix,Adoption,724,0.525781
...,...,...,...,...,...
4498,breed,Queensland Heeler/Basset Hound,Adoption,1,1.000000
4510,breed,Dachshund Wirehair/Toy Poodle,Adoption,1,1.000000
4522,breed,Basset Hound/Australian Shepherd,Adoption,1,1.000000
4534,breed,Spanish Water Dog,Adoption,1,1.000000


In [45]:
train.groupby(['animal_type', 'outcome_type']).outcome_subtype.value_counts()['Cat']

outcome_type     outcome_subtype
Adoption         no subtype         8513
                 Foster             3736
                 Offsite              49
                 Barn                  2
Died             In Kennel           232
                 In Foster           139
                 At Vet               29
                 Enroute              24
                 no subtype           13
                 In Surgery           10
                 Emergency             1
Disposal         no subtype           52
Euthanasia       Suffering          1003
                 At Vet               68
                 Rabies Risk          50
                 Medical              47
                 no subtype           10
                 Aggressive            2
                 Underage              1
Missing          In Foster             7
                 In Kennel             5
                 Possible Theft        2
                 no subtype            2
Relocate         no subt

In [43]:
train[train['outcome_type']=='Transfer'].outcome_subtype.value_counts()

Partner      17521
SCRP          1657
Snr           1630
Out State      317
In State         6
Barn             6
Emer             4
Name: outcome_subtype, dtype: int64

## What types of animals besides cats and dogs are there?

In [47]:
train[~train['animal_type'].isin(['Cat', 'Dog'])].breed.value_counts()

Bat                   1029
Bat Mix                949
Raccoon                319
Raccoon Mix            309
Rabbit Sh Mix          182
                      ... 
Lop-Mini/Hotot           1
Prairie Dog Mix          1
Dutch/Angora-Satin       1
Jersey Wooly Mix         1
Flemish Giant            1
Name: breed, Length: 175, dtype: int64

In [49]:
train[~train['animal_type'].isin(['Cat', 'Dog'])].intake_type.value_counts()

Wildlife              3050
Stray                  741
Owner Surrender        462
Public Assist          251
Abandoned               15
Euthanasia Request       6
Name: intake_type, dtype: int64

In [51]:
train[~train['animal_type'].isin(['Cat', 'Dog'])].shape

(4525, 17)

In [52]:
# share of non-cat/dog animals that arrived as wildlife, derived from the data
# rather than from counts copied by hand out of earlier outputs
other_animals = train[~train['animal_type'].isin(['Cat', 'Dog'])]
(other_animals.intake_type == 'Wildlife').mean()

0.6740331491712708

#### Hypothesis test

Consider the following hypotheses:

$H_0$ : Owner-surrender intakes have the same proportion of adoptions as non-owner-surrender intakes

$H_a$ : Owner-surrender intakes have a different proportion of adoptions than non-owner-surrender intakes

In [94]:
def chi_square_test(df, cat_cols, target_col = 'target_outcome', alpha=0.05):
    '''Runs one-vs-rest chi-square tests of independence between category values and target values.

    For every column in ``cat_cols``, every distinct value of that column and
    every distinct value of ``df[target_col]``, a contingency table of
    "target equals this value" against "column equals this value" is built from
    ``df`` and passed to ``scipy.stats.chi2_contingency``.

    Args:
        df (DataFrame) : dataframe holding ``target_col`` and the columns in ``cat_cols``
        cat_cols (list of str) : categorical columns to test against the target
        target_col (str) : name of the target column
        alpha (float) : significance level used for the ``reject_null`` flag
    Return:
        DataFrame : one row per (column value, target value) pair with the columns
            ``target_column``, ``column``, ``target_col_subcat``, ``column_subcat``,
            ``null_hypothesis``, ``chi2``, ``p`` and ``reject_null`` (True when p < alpha)
    '''
    result_columns = [
        'target_column', 'column', 'target_col_subcat', 'column_subcat',
        'null_hypothesis', 'chi2', 'p', 'reject_null',
    ]
    # enumerate the values of the requested target column, not a hard-coded one
    target_values = list(df[target_col].unique())
    outputs = []
    for cat in cat_cols:
        for subcat in list(df[cat].unique()):
            # membership in this sub-category is the same for every target value
            is_subcat = df[cat] == subcat
            for target_col_subcat in target_values:
                # build the table from the dataframe that was passed in, not a notebook global
                observed = pd.crosstab(df[target_col] == target_col_subcat, is_subcat)
                # degrees of freedom and expected counts are not reported
                chi2, p, _, _ = stats.chi2_contingency(observed)
                outputs.append({
                        'target_column':target_col,
                        'column':cat,
                        'target_col_subcat':target_col_subcat,
                        'column_subcat':subcat,
                        'null_hypothesis':f"{target_col_subcat} independent of {subcat}",
                        'chi2':chi2,
                        'p':p,
                        'reject_null':p < alpha
                })
    return pd.DataFrame(outputs, columns=result_columns)
    
chi_square_test(train, cat_cols=['intake_type'])

Unnamed: 0,target_column,column,target_col_subcat,column_subcat,null_hypothesis,chi2,p,reject_null
0,target_outcome,intake_type,Transfer,Stray,Transfer independent of Stray,949.43573,1.7595510000000002e-208,True
1,target_outcome,intake_type,Other,Stray,Other independent of Stray,2331.172012,0.0,True
2,target_outcome,intake_type,Adoption,Stray,Adoption independent of Stray,148.592018,3.521442e-34,True
3,target_outcome,intake_type,Return to Owner,Stray,Return to Owner independent of Stray,13.349043,0.0002585545,True
4,target_outcome,intake_type,Transfer,Public Assist,Transfer independent of Public Assist,495.223676,1.040428e-109,True
5,target_outcome,intake_type,Other,Public Assist,Other independent of Public Assist,4302.285964,0.0,True
6,target_outcome,intake_type,Adoption,Public Assist,Adoption independent of Public Assist,1198.933667,1.040023e-262,True
7,target_outcome,intake_type,Return to Owner,Public Assist,Return to Owner independent of Public Assist,0.924705,0.3362427,False
8,target_outcome,intake_type,Transfer,Owner Surrender,Transfer independent of Owner Surrender,1.358723,0.2437589,False
9,target_outcome,intake_type,Other,Owner Surrender,Other independent of Owner Surrender,1619.270879,0.0,True


In [82]:
observed = pd.crosstab(train.target_outcome=='Adoption', train.intake_type=='Owner Surrender')
observed

intake_type,False,True
target_outcome,Unnamed: 1_level_1,Unnamed: 2_level_1
False,31653,4417
True,21488,6250


In [73]:
chi2, p, degf, expected = stats.chi2_contingency(observed)

chi2, p

(1190.9720525606824, 5.5889486451688915e-261)

In [91]:
train[(train.animal_type=='Livestock')]

Unnamed: 0,animal_id,name,outcome_type,animal_type,sex_upon_outcome,breed,color,outcome_subtype,found_location,intake_type,intake_condition,sex_upon_intake,outcome_date,intake_date,target_outcome,age_at_outcome,age_at_intake
8707,A841166,no name,Adoption,Livestock,Unknown,Potbelly Pig,Tan,Offsite,6500 W Parmer Ln in Austin (TX),Stray,Normal,Unknown,2021-09-08,2021-08-22,Adoption,730,730
80893,A715047,no name,Transfer,Livestock,Unknown,Goat Mix,Brown,Partner,Quicksilver & S Pleasant Valley in Austin (TX),Public Assist,Normal,Unknown,2015-12-07,2015-10-30,Transfer,365,365
17956,A808934,no name,Adoption,Livestock,Unknown,Pig,Pink,Foster,East Martin Luther King Boulevard And Greenwoo...,Stray,Normal,Unknown,2020-08-21,2019-11-16,Adoption,365,92
90705,A701250,no name,Transfer,Livestock,Intact Female,Pig Mix,Pink,Partner,12Th St And Airport Blvd in Austin (TX),Stray,Normal,Intact Female,2015-05-11,2015-04-26,Transfer,730,730
18656,A811675,no name,Adoption,Livestock,Intact Female,Goat,Black/White,Foster,1012 Arthur Stiles Rd in Austin (TX),Stray,Normal,Intact Female,2020-07-22,2020-01-07,Adoption,730,730
34158,A795191,Loki,Return to Owner,Livestock,Intact Male,Pig,White,no subtype,7259 Wardman Dr in Travis (TX),Stray,Normal,Intact Male,2019-05-18,2019-05-17,Other,152,152
36030,A778432,no name,Adoption,Livestock,Intact Female,Pig Mix,White/Black,Foster,600 West William Cannon Drive in Austin (TX),Stray,Normal,Intact Female,2019-04-02,2018-08-14,Adoption,305,92
41680,A782370,no name,Died,Livestock,Unknown,Pig,Black,At Vet,5704 Melody Lane in Travis (TX),Stray,Normal,Unknown,2018-10-14,2018-10-14,Other,14,14
79157,A718910,no name,Transfer,Livestock,Intact Male,Pig Mix,White,Partner,12707 N Mopac Expy in Austin (TX),Stray,Normal,Intact Male,2016-01-27,2016-01-09,Transfer,365,365
