### Notebook: Time Independent Modeling

29 May 2024

Objectives:
- Make file dataframe file from AK and RY on 28 May
- Look over confusion matrix output
- Are there any better classification methods to use OR ways to maximize RandomForest?
- Utilize new package: `CatBoost`



In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------------------------------
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error

# ------------------------------------------------
from catboost import CatBoostClassifier, Pool

# ------------------------------------------------

data_path = "../../main_data/locale_specific_data/"

### Update to dataframe --> New file:

In [None]:
dog_df = pd.read_csv(data_path + "Sonoma_nontime_cleaned_ry.csv").drop(["Unnamed: 0"], axis=1)

dog_df['sex_bin'] = 1*(dog_df.sex=="Female")

size_dict = {'PUPPY':0, 'TOY':1, 'SMALL':2, 'MED':3, 'LARGE':4,'X-LRG':5}
dog_df['size_bin'] = dog_df['size'].map(size_dict)
del size_dict

dog_df['breed_pit'] = 1*(dog_df.breed.str.contains("PIT"))

dog_df['color_mix'] = 1*((dog_df.color.str.contains("/"))|(dog_df.color.str.contains("TRICOLOR"))|(dog_df.color.str.contains("BRINDLE")))

intake_dict = {'STRAY':0, 'OWNER SURRENDER':1, 'ADOPTION RETURN':2, 'CONFISCATE':3, 'TRANSFER':4,'QUARANTINE':5, "BORN HERE":6}
dog_df['intake_bin'] = dog_df['intake_type'].map(intake_dict)
del intake_dict

dog_df.head(10)

In [None]:
dog_df.to_csv(data+path + "Sonoma_time_independent_data_akry.csv", index=False)

In [None]:
del dog_df

### Return to time-independent modeling


In [2]:
dog_df = pd.read_csv(data_path + "Sonoma_time_independent_data_akry.csv")

dog_df.head()

Unnamed: 0,breed,color,sex,size,date_of_birth,animal_id,intake_date,outcome_date,days_in_shelter,intake_type,...,outcome_adopt,outcome_adopt_subtype,intake_age,outcome_age,pop_control,sex_bin,size_bin,breed_pit,color_mix,intake_bin
0,PIT BULL,GRAY/WHITE,Female,MED,2012-09-03,A296009,2013-08-22,2014-04-14,235,TRANSFER,...,False,none,0.97,1.61,1,1,3,1,1,4
1,PIT BULL,BLUE/WHITE,Female,MED,2012-10-09,A294143,2013-08-31,2014-04-14,226,STRAY,...,False,none,0.89,1.51,1,1,3,1,1,0
2,CAROLINA DOG/MIX,GOLD/WHITE,Male,MED,2007-09-07,A281788,2013-09-19,2014-07-30,314,STRAY,...,False,none,6.03,6.9,0,0,3,0,1,0
3,CHIHUAHUA SH,TAN,Male,TOY,2007-10-01,A297574,2013-10-01,2014-01-01,92,OWNER SURRENDER,...,False,none,6.0,6.25,1,0,1,0,0,1
4,PIT BULL,TAN/WHITE,Male,MED,2012-10-29,A298002,2013-10-08,2014-05-22,226,STRAY,...,True,realtime,0.95,1.57,1,0,3,1,1,0


In [3]:
dog_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6505 entries, 0 to 6504
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   breed                  6505 non-null   object 
 1   color                  6505 non-null   object 
 2   sex                    6505 non-null   object 
 3   size                   6505 non-null   object 
 4   date_of_birth          6505 non-null   object 
 5   animal_id              6505 non-null   object 
 6   intake_date            6505 non-null   object 
 7   outcome_date           6505 non-null   object 
 8   days_in_shelter        6505 non-null   int64  
 9   intake_type            6505 non-null   object 
 10  intake_subtype         6505 non-null   object 
 11  outcome_type           6505 non-null   object 
 12  outcome_subtype        6505 non-null   object 
 13  intake_condition       6505 non-null   object 
 14  outcome_condition      6505 non-null   object 
 15  dob_

In [4]:
dog_df.columns

Index(['breed', 'color', 'sex', 'size', 'date_of_birth', 'animal_id',
       'intake_date', 'outcome_date', 'days_in_shelter', 'intake_type',
       'intake_subtype', 'outcome_type', 'outcome_subtype', 'intake_condition',
       'outcome_condition', 'dob_season', 'intake_season', 'outcome_season',
       'outcome_adopt', 'outcome_adopt_subtype', 'intake_age', 'outcome_age',
       'pop_control', 'sex_bin', 'size_bin', 'breed_pit', 'color_mix',
       'intake_bin'],
      dtype='object')

#### Try: `CatBoost`

In [5]:
feature_list = ['sex',
                'size',
                'breed', 
                'color',
                'days_in_shelter',
                'outcome_season',
                'intake_age',
                'intake_condition',
                'intake_type'
                #, 'outcome_adopt_subtype'
                ]

dog_df[feature_list]

Unnamed: 0,sex,size,breed,color,days_in_shelter,outcome_season,intake_age,intake_condition,intake_type
0,Female,MED,PIT BULL,GRAY/WHITE,235,1,0.97,HEALTHY,TRANSFER
1,Female,MED,PIT BULL,BLUE/WHITE,226,1,0.89,HEALTHY,STRAY
2,Male,MED,CAROLINA DOG/MIX,GOLD/WHITE,314,2,6.03,UNTREATABLE,STRAY
3,Male,TOY,CHIHUAHUA SH,TAN,92,0,6.00,TREATABLE/MANAGEABLE,OWNER SURRENDER
4,Male,MED,PIT BULL,TAN/WHITE,226,1,0.95,HEALTHY,STRAY
...,...,...,...,...,...,...,...,...,...
6500,Female,PUPPY,LABRADOR RETR/MIX,BR BRINDLE,16,1,0.52,UNKNOWN,OWNER SURRENDER
6501,Female,PUPPY,LABRADOR RETR/MIX,BL BRINDLE,16,1,0.52,UNKNOWN,OWNER SURRENDER
6502,Male,SMALL,LABRADOR RETR/MIX,BLACK,0,1,0.25,UNKNOWN,STRAY
6503,Male,MED,SCHNAUZER MIN/MIX,WHITE,16,1,3.35,UNKNOWN,STRAY


In [18]:
pd.crosstab(dog_df.outcome_adopt, dog_df.intake_condition)#, normalize=True)

intake_condition,HEALTHY,TREATABLE/MANAGEABLE,TREATABLE/REHAB,UNKNOWN,UNTREATABLE
outcome_adopt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,938,386,229,855,508
True,2747,216,322,302,2


In [19]:
pd.crosstab(dog_df.outcome_adopt, dog_df.intake_type)#, normalize=True)

intake_type,ADOPTION RETURN,BORN HERE,CONFISCATE,OWNER SURRENDER,QUARANTINE,STRAY,TRANSFER
outcome_adopt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,76,6,175,686,104,1813,56
True,243,0,113,600,14,2517,102


In [20]:
pd.crosstab(dog_df.outcome_adopt, dog_df.outcome_season)#, normalize=True)

outcome_season,0,1,2,3
outcome_adopt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,790,779,457,890
True,1029,838,640,1082


In [None]:
### BEWARE ------
### Executing this code block will take > 2 hours, even if you use GPU
### -------------

kfold = StratifiedKFold(n_splits=9, shuffle=True, random_state=1342)

scores = np.zeros((9, 15))

i = 1
for t_idx, h_idx in kfold.split(dog_df, dog_df.outcome_adopt):
    X_t = dog_df[feature_list].iloc[t_idx]
    X_t = X_t.astype(str)
    y_t = dog_df.outcome_adopt.iloc[t_idx]
    X_h = dog_df[feature_list].iloc[h_idx]
    X_h = X_h.astype(str)
    y_h = dog_df.outcome_adopt.iloc[h_idx]

    for j in range(2, 15):

        print('Iter: ', i, ', Training Adopted length: ', len(y_t == True), 'Test Adopted length: ', len(y_h == True))
        pool = Pool(X_t, y_t, cat_features=feature_list, feature_names=feature_list)
        clf  = CatBoostClassifier(iterations=1000, learning_rate=0.1, loss_function="MultiClass", depth=j, task_type="GPU").fit(pool, eval_set=(X_h, y_h), verbose=False)
        print("Tree Depth Level: ", j)
        #print(clf.tree_count_)
        scores[i, j] = accuracy_score(y_true = y_h, y_pred = clf.predict(X_h))
        #print()
        #print(confusion_matrix(y_true = y_h, y_pred = clf.predict(X_h)))
        #print()
        #print(classification_report(y_true = y_h, y_pred = clf.predict(X_h)))
        #print()
        del clf, pool   

    del X_t, y_t, X_h, y_h
    i+=1

    print('-'*50)

del i, kfold

print(scores)

Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  2
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  3
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  4
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  5
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  6
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  7
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  8
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  9
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  10
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  11
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  12
Iter:  1 , Trainin

IndexError: index 9 is out of bounds for axis 0 with size 9

In [9]:
tree_depth = 6
cv_splits  = 9

kfold  = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=1342)
scores = pd.DataFrame(columns=['accuracy','tree_count'], index=range(cv_splits))

i = 0
for t_idx, h_idx in kfold.split(dog_df, dog_df.outcome_adopt):
    X_t = dog_df[feature_list].iloc[t_idx]
    X_t = X_t.astype(str)
    y_t = dog_df.outcome_adopt.iloc[t_idx]
    X_h = dog_df[feature_list].iloc[h_idx]
    X_h = X_h.astype(str)
    y_h = dog_df.outcome_adopt.iloc[h_idx]

    pool = Pool(X_t, y_t, cat_features=feature_list, feature_names=feature_list)

    clf  = CatBoostClassifier(iterations=1000, learning_rate=0.1, loss_function="MultiClass", depth=tree_depth, task_type="GPU").fit(pool, eval_set=(X_h, y_h), verbose=False)

    scores.loc[i, "tree_count"] = clf.tree_count_
    scores.loc[i, "accuracy"]   = accuracy_score(y_true = y_h, y_pred = clf.predict(X_h))
    print()
    print(confusion_matrix(y_true = y_h, y_pred = clf.predict(X_h)))
    print()
    print(classification_report(y_true = y_h, y_pred = clf.predict(X_h)))
    print()
    del clf, pool   
    del X_t, y_t, X_h, y_h
    i+=1
    print('-'*50)

del i, kfold



[[205 119]
 [ 62 337]]

              precision    recall  f1-score   support

       False       0.77      0.63      0.69       324
        True       0.74      0.84      0.79       399

    accuracy                           0.75       723
   macro avg       0.75      0.74      0.74       723
weighted avg       0.75      0.75      0.75       723


--------------------------------------------------

[[192 132]
 [ 56 343]]

              precision    recall  f1-score   support

       False       0.77      0.59      0.67       324
        True       0.72      0.86      0.78       399

    accuracy                           0.74       723
   macro avg       0.75      0.73      0.73       723
weighted avg       0.75      0.74      0.73       723


--------------------------------------------------

[[201 123]
 [ 39 360]]

              precision    recall  f1-score   support

       False       0.84      0.62      0.71       324
        True       0.75      0.90      0.82       399

   

In [10]:
print(scores)

   accuracy tree_count
0  0.749654        325
1  0.739972        156
2  0.775934        183
3  0.759336         67
4  0.760719        381
5  0.775934        341
6  0.788382        225
7  0.757618        101
8  0.767313        174


In [14]:
scores.accuracy.max()

0.7883817427385892

In [21]:
feature_list = ['sex',
                'size',
                'breed', 
                'color',
                'days_in_shelter',
                #'outcome_season',
                'intake_age',
                'intake_condition',
                'intake_type'
                #, 'outcome_adopt_subtype'
                ]

In [22]:
tree_depth = 6
cv_splits  = 9

kfold  = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=1342)
scores = pd.DataFrame(columns=['accuracy','tree_count'], index=range(cv_splits))

i = 0
for t_idx, h_idx in kfold.split(dog_df, dog_df.outcome_adopt):
    X_t = dog_df[feature_list].iloc[t_idx]
    X_t = X_t.astype(str)
    y_t = dog_df.outcome_adopt.iloc[t_idx]
    X_h = dog_df[feature_list].iloc[h_idx]
    X_h = X_h.astype(str)
    y_h = dog_df.outcome_adopt.iloc[h_idx]

    pool = Pool(X_t, y_t, cat_features=feature_list, feature_names=feature_list)

    clf  = CatBoostClassifier(iterations=1000, learning_rate=0.1, loss_function="MultiClass", depth=tree_depth, task_type="GPU").fit(pool, eval_set=(X_h, y_h), verbose=False)

    scores.loc[i, "tree_count"] = clf.tree_count_
    scores.loc[i, "accuracy"]   = accuracy_score(y_true = y_h, y_pred = clf.predict(X_h))
    print()
    print(confusion_matrix(y_true = y_h, y_pred = clf.predict(X_h)))
    print()
    print(clf.get_feature_importance(data=pool, prettified=True, verbose=True))
    #print(classification_report(y_true = y_h, y_pred = clf.predict(X_h)))
    #print()
    del clf, pool   
    del X_t, y_t, X_h, y_h
    i+=1
    print('-'*50)

del i, kfold


[[204 120]
 [ 50 349]]

Used dataset leave statistics for fstr calculation
         Feature Id  Importances
0  intake_condition    44.687183
1        intake_age    14.809136
2   days_in_shelter    12.106054
3             breed     7.723649
4       intake_type     7.296947
5              size     6.716718
6             color     4.099134
7               sex     2.561180
--------------------------------------------------

[[197 127]
 [ 53 346]]

Used dataset leave statistics for fstr calculation
         Feature Id  Importances
0  intake_condition    41.118999
1        intake_age    17.173321
2   days_in_shelter    14.696060
3              size     6.775665
4             breed     6.596914
5       intake_type     6.221486
6             color     4.520786
7               sex     2.896768
--------------------------------------------------

[[205 119]
 [ 43 356]]

Used dataset leave statistics for fstr calculation
         Feature Id  Importances
0  intake_condition    37.949987
1   days_i

In [23]:
print(scores)

   accuracy tree_count
0  0.764869        134
1  0.751037        158
2  0.775934        218
3  0.762102         71
4  0.762102        326
5   0.77455        157
6  0.782849        240
7  0.764543        204
8  0.765928        247


In [24]:
feature_list = ['sex',
                'size',
                'breed', 
                'color',
                'days_in_shelter',
                'outcome_season',
                'intake_age',
                'intake_condition'#,
                #'intake_type'
                #, 'outcome_adopt_subtype'
                ]

tree_depth = 6
cv_splits  = 9

kfold  = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=1342)
scores = pd.DataFrame(columns=['accuracy','tree_count'], index=range(cv_splits))

i = 0
for t_idx, h_idx in kfold.split(dog_df, dog_df.outcome_adopt):
    X_t = dog_df[feature_list].iloc[t_idx]
    X_t = X_t.astype(str)
    y_t = dog_df.outcome_adopt.iloc[t_idx]
    X_h = dog_df[feature_list].iloc[h_idx]
    X_h = X_h.astype(str)
    y_h = dog_df.outcome_adopt.iloc[h_idx]

    pool = Pool(X_t, y_t, cat_features=feature_list, feature_names=feature_list)

    clf  = CatBoostClassifier(iterations=1000, learning_rate=0.1, loss_function="MultiClass", depth=tree_depth, task_type="GPU").fit(pool, eval_set=(X_h, y_h), verbose=False)

    scores.loc[i, "tree_count"] = clf.tree_count_
    scores.loc[i, "accuracy"]   = accuracy_score(y_true = y_h, y_pred = clf.predict(X_h))
    print()
    print(confusion_matrix(y_true = y_h, y_pred = clf.predict(X_h)))
    print()
    print(clf.get_feature_importance(data=pool, prettified=True, verbose=True))
    #print(classification_report(y_true = y_h, y_pred = clf.predict(X_h)))
    #print()
    del clf, pool   
    del X_t, y_t, X_h, y_h
    i+=1
    print('-'*50)

del i, kfold, feature_list, tree_depth, cv_splits

print(scores)


[[198 126]
 [ 56 343]]

Used dataset leave statistics for fstr calculation
         Feature Id  Importances
0  intake_condition    37.575473
1        intake_age    15.705557
2   days_in_shelter    14.860988
3             breed    10.034758
4              size     7.842613
5             color     5.959523
6    outcome_season     4.876211
7               sex     3.144877
--------------------------------------------------

[[193 131]
 [ 68 331]]

Used dataset leave statistics for fstr calculation
         Feature Id  Importances
0  intake_condition    39.890494
1        intake_age    16.730834
2   days_in_shelter    15.662778
3              size     8.170861
4             breed     6.774359
5             color     5.544953
6    outcome_season     4.377750
7               sex     2.847970
--------------------------------------------------

[[201 123]
 [ 53 346]]

Used dataset leave statistics for fstr calculation
         Feature Id  Importances
0  intake_condition    30.110737
1        i

In [25]:
feature_list = ['sex',
                'size',
                'breed', 
                'color',
                'days_in_shelter',
                'outcome_season',
                'intake_age',
                #'intake_condition',
                'intake_type'
                #, 'outcome_adopt_subtype'
                ]

tree_depth = 6
cv_splits  = 9

kfold  = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=1342)
scores = pd.DataFrame(columns=['accuracy','tree_count'], index=range(cv_splits))

i = 0
for t_idx, h_idx in kfold.split(dog_df, dog_df.outcome_adopt):
    X_t = dog_df[feature_list].iloc[t_idx]
    X_t = X_t.astype(str)
    y_t = dog_df.outcome_adopt.iloc[t_idx]
    X_h = dog_df[feature_list].iloc[h_idx]
    X_h = X_h.astype(str)
    y_h = dog_df.outcome_adopt.iloc[h_idx]

    pool = Pool(X_t, y_t, cat_features=feature_list, feature_names=feature_list)

    clf  = CatBoostClassifier(iterations=1000, learning_rate=0.1, loss_function="MultiClass", depth=tree_depth, task_type="GPU").fit(pool, eval_set=(X_h, y_h), verbose=False)

    scores.loc[i, "tree_count"] = clf.tree_count_
    scores.loc[i, "accuracy"]   = accuracy_score(y_true = y_h, y_pred = clf.predict(X_h))
    print()
    print(confusion_matrix(y_true = y_h, y_pred = clf.predict(X_h)))
    print()
    print(clf.get_feature_importance(data=pool, prettified=True, verbose=True))
    #print(classification_report(y_true = y_h, y_pred = clf.predict(X_h)))
    #print()
    del clf, pool   
    del X_t, y_t, X_h, y_h
    i+=1
    print('-'*50)

del i, kfold, feature_list, tree_depth, cv_splits

print(scores)


[[188 136]
 [ 76 323]]

Used dataset leave statistics for fstr calculation
        Feature Id  Importances
0       intake_age    32.693962
1  days_in_shelter    18.016858
2      intake_type    14.430935
3             size    11.639833
4            breed    11.327696
5              sex     4.980864
6   outcome_season     4.060819
7            color     2.849033
--------------------------------------------------

[[189 135]
 [ 88 311]]

Used dataset leave statistics for fstr calculation
        Feature Id  Importances
0       intake_age    26.321440
1  days_in_shelter    19.887734
2            breed    12.860188
3             size    11.787302
4            color     9.631042
5      intake_type     9.386849
6   outcome_season     5.741136
7              sex     4.384309
--------------------------------------------------

[[193 131]
 [ 80 319]]

Used dataset leave statistics for fstr calculation
        Feature Id  Importances
0       intake_age    25.989050
1  days_in_shelter    19.19905

In [26]:
feature_list = ['sex',
                'size',
                #'breed', 
                #'color',
                'days_in_shelter',
                'outcome_season',
                'intake_age',
                'intake_condition',
                'intake_type'
                #, 'outcome_adopt_subtype'
                ]

tree_depth = 6
cv_splits  = 9

kfold  = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=1342)
scores = pd.DataFrame(columns=['accuracy','tree_count'], index=range(cv_splits))

i = 0
for t_idx, h_idx in kfold.split(dog_df, dog_df.outcome_adopt):
    X_t = dog_df[feature_list].iloc[t_idx]
    X_t = X_t.astype(str)
    y_t = dog_df.outcome_adopt.iloc[t_idx]
    X_h = dog_df[feature_list].iloc[h_idx]
    X_h = X_h.astype(str)
    y_h = dog_df.outcome_adopt.iloc[h_idx]

    pool = Pool(X_t, y_t, cat_features=feature_list, feature_names=feature_list)

    clf  = CatBoostClassifier(iterations=1000, learning_rate=0.1, loss_function="MultiClass", depth=tree_depth, task_type="GPU").fit(pool, eval_set=(X_h, y_h), verbose=False)

    scores.loc[i, "tree_count"] = clf.tree_count_
    scores.loc[i, "accuracy"]   = accuracy_score(y_true = y_h, y_pred = clf.predict(X_h))
    print()
    print(confusion_matrix(y_true = y_h, y_pred = clf.predict(X_h)))
    print()
    print(clf.get_feature_importance(data=pool, prettified=True, verbose=True))
    #print(classification_report(y_true = y_h, y_pred = clf.predict(X_h)))
    #print()
    del clf, pool   
    del X_t, y_t, X_h, y_h
    i+=1
    print('-'*50)

del i, kfold, feature_list, tree_depth, cv_splits

print(scores)


[[204 120]
 [ 53 346]]

Used dataset leave statistics for fstr calculation
         Feature Id  Importances
0  intake_condition    33.609949
1        intake_age    19.097442
2   days_in_shelter    18.064091
3              size     9.901519
4       intake_type     8.118878
5    outcome_season     7.670436
6               sex     3.537684
--------------------------------------------------

[[188 136]
 [ 54 345]]

Used dataset leave statistics for fstr calculation
         Feature Id  Importances
0  intake_condition    45.705706
1        intake_age    16.933106
2   days_in_shelter    15.113082
3              size     7.775112
4       intake_type     7.263494
5    outcome_season     4.819298
6               sex     2.390202
--------------------------------------------------

[[195 129]
 [ 45 354]]

Used dataset leave statistics for fstr calculation
         Feature Id  Importances
0  intake_condition    48.125656
1   days_in_shelter    14.810101
2        intake_age    13.331008
3         

In [27]:
dog_df.columns

Index(['breed', 'color', 'sex', 'size', 'date_of_birth', 'animal_id',
       'intake_date', 'outcome_date', 'days_in_shelter', 'intake_type',
       'intake_subtype', 'outcome_type', 'outcome_subtype', 'intake_condition',
       'outcome_condition', 'dob_season', 'intake_season', 'outcome_season',
       'outcome_adopt', 'outcome_adopt_subtype', 'intake_age', 'outcome_age',
       'pop_control', 'sex_bin', 'size_bin', 'breed_pit', 'color_mix',
       'intake_bin'],
      dtype='object')