### Notebook: Time Independent Modeling

29 May 2024

Objectives:
- Make file dataframe file from AK and RY on 28 May
- Look over confusion matrix output
- Are there any better classification methods to use OR ways to maximize RandomForest?
- Utilize new package: `CatBoost`



In [43]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------------------------------
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error

# ------------------------------------------------
from catboost import CatBoostClassifier, Pool

# ------------------------------------------------

data_path = "../../main_data/locale_specific_data/"

### Update to dataframe --> New file:

In [None]:
dog_df = pd.read_csv(data_path + "Sonoma_nontime_cleaned_ry.csv").drop(["Unnamed: 0"], axis=1)

dog_df['sex_bin'] = 1*(dog_df.sex=="Female")

size_dict = {'PUPPY':0, 'TOY':1, 'SMALL':2, 'MED':3, 'LARGE':4,'X-LRG':5}
dog_df['size_bin'] = dog_df['size'].map(size_dict)
del size_dict

dog_df['breed_pit'] = 1*(dog_df.breed.str.contains("PIT"))

dog_df['color_mix'] = 1*((dog_df.color.str.contains("/"))|(dog_df.color.str.contains("TRICOLOR"))|(dog_df.color.str.contains("BRINDLE")))

intake_dict = {'STRAY':0, 'OWNER SURRENDER':1, 'ADOPTION RETURN':2, 'CONFISCATE':3, 'TRANSFER':4,'QUARANTINE':5, "BORN HERE":6}
dog_df['intake_bin'] = dog_df['intake_type'].map(intake_dict)
del intake_dict

dog_df.head(10)

In [None]:
dog_df.to_csv(data+path + "Sonoma_time_independent_data_akry.csv", index=False)

In [None]:
del dog_df

### Return to time-independent modeling


In [9]:
dog_df = pd.read_csv(data_path + "Sonoma_time_independent_data_akry.csv")

dog_df.head()

Unnamed: 0,breed,color,sex,size,date_of_birth,animal_id,intake_date,outcome_date,days_in_shelter,intake_type,...,outcome_adopt,outcome_adopt_subtype,intake_age,outcome_age,pop_control,sex_bin,size_bin,breed_pit,color_mix,intake_bin
0,PIT BULL,GRAY/WHITE,Female,MED,2012-09-03,A296009,2013-08-22,2014-04-14,235,TRANSFER,...,False,none,0.97,1.61,1,1,3,1,1,4
1,PIT BULL,BLUE/WHITE,Female,MED,2012-10-09,A294143,2013-08-31,2014-04-14,226,STRAY,...,False,none,0.89,1.51,1,1,3,1,1,0
2,CAROLINA DOG/MIX,GOLD/WHITE,Male,MED,2007-09-07,A281788,2013-09-19,2014-07-30,314,STRAY,...,False,none,6.03,6.9,0,0,3,0,1,0
3,CHIHUAHUA SH,TAN,Male,TOY,2007-10-01,A297574,2013-10-01,2014-01-01,92,OWNER SURRENDER,...,False,none,6.0,6.25,1,0,1,0,0,1
4,PIT BULL,TAN/WHITE,Male,MED,2012-10-29,A298002,2013-10-08,2014-05-22,226,STRAY,...,True,realtime,0.95,1.57,1,0,3,1,1,0


In [16]:
dog_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6505 entries, 0 to 6504
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   breed                  6505 non-null   object 
 1   color                  6505 non-null   object 
 2   sex                    6505 non-null   object 
 3   size                   6505 non-null   object 
 4   date_of_birth          6505 non-null   object 
 5   animal_id              6505 non-null   object 
 6   intake_date            6505 non-null   object 
 7   outcome_date           6505 non-null   object 
 8   days_in_shelter        6505 non-null   int64  
 9   intake_type            6505 non-null   object 
 10  intake_subtype         6505 non-null   object 
 11  outcome_type           6505 non-null   object 
 12  outcome_subtype        6505 non-null   object 
 13  intake_condition       6505 non-null   object 
 14  outcome_condition      6505 non-null   object 
 15  dob_

In [10]:
dog_df.columns

Index(['breed', 'color', 'sex', 'size', 'date_of_birth', 'animal_id',
       'intake_date', 'outcome_date', 'days_in_shelter', 'intake_type',
       'intake_subtype', 'outcome_type', 'outcome_subtype', 'intake_condition',
       'outcome_condition', 'dob_season', 'intake_season', 'outcome_season',
       'outcome_adopt', 'outcome_adopt_subtype', 'intake_age', 'outcome_age',
       'pop_control', 'sex_bin', 'size_bin', 'breed_pit', 'color_mix',
       'intake_bin'],
      dtype='object')

#### Try: `CatBoost`

In [74]:
feature_list = ['sex',
                'size',
                'breed', 
                'color',
                'days_in_shelter',
                'outcome_season',
                'intake_age',
                'intake_condition',
                'intake_type'
                #, 'outcome_adopt_subtype'
                ]

dog_df[feature_list]

Unnamed: 0,sex,size,breed,color,days_in_shelter,outcome_season,intake_age,intake_condition,intake_type
0,Female,MED,PIT BULL,GRAY/WHITE,235,1,0.97,HEALTHY,TRANSFER
1,Female,MED,PIT BULL,BLUE/WHITE,226,1,0.89,HEALTHY,STRAY
2,Male,MED,CAROLINA DOG/MIX,GOLD/WHITE,314,2,6.03,UNTREATABLE,STRAY
3,Male,TOY,CHIHUAHUA SH,TAN,92,0,6.00,TREATABLE/MANAGEABLE,OWNER SURRENDER
4,Male,MED,PIT BULL,TAN/WHITE,226,1,0.95,HEALTHY,STRAY
...,...,...,...,...,...,...,...,...,...
6500,Female,PUPPY,LABRADOR RETR/MIX,BR BRINDLE,16,1,0.52,UNKNOWN,OWNER SURRENDER
6501,Female,PUPPY,LABRADOR RETR/MIX,BL BRINDLE,16,1,0.52,UNKNOWN,OWNER SURRENDER
6502,Male,SMALL,LABRADOR RETR/MIX,BLACK,0,1,0.25,UNKNOWN,STRAY
6503,Male,MED,SCHNAUZER MIN/MIX,WHITE,16,1,3.35,UNKNOWN,STRAY


In [89]:
kfold = StratifiedKFold(n_splits=9, shuffle=True, random_state=1342)

scores = np.zeros((9, 15))

i = 1
for t_idx, h_idx in kfold.split(dog_df, dog_df.outcome_adopt):
    X_t = dog_df[feature_list].iloc[t_idx]
    X_t = X_t.astype(str)
    y_t = dog_df.outcome_adopt.iloc[t_idx]
    X_h = dog_df[feature_list].iloc[h_idx]
    X_h = X_h.astype(str)
    y_h = dog_df.outcome_adopt.iloc[h_idx]

    for j in range(2, 15):

        print('Iter: ', i, ', Training Adopted length: ', len(y_t == True), 'Test Adopted length: ', len(y_h == True))
        pool = Pool(X_t, y_t, cat_features=feature_list, feature_names=feature_list)
        clf  = CatBoostClassifier(iterations=1000, learning_rate=0.1, loss_function="MultiClass", depth=j, task_type="GPU").fit(pool, eval_set=(X_h, y_h), verbose=False)
        print("Tree Depth Level: ", j)
        #print(clf.tree_count_)
        scores[i, j] = accuracy_score(y_true = y_h, y_pred = clf.predict(X_h))
        #print()
        #print(confusion_matrix(y_true = y_h, y_pred = clf.predict(X_h)))
        #print()
        #print(classification_report(y_true = y_h, y_pred = clf.predict(X_h)))
        #print()
        del clf, pool   

    del X_t, y_t, X_h, y_h
    i+=1

    print('-'*50)

del i, kfold

print(scores)

Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  2
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  3
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  4
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  5
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  6
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  7
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  8
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  9
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  10
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  11
Iter:  1 , Training Adopted length:  5782 Test Adopted length:  723
Tree Depth Level:  12
Iter:  1 , Trainin

IndexError: index 9 is out of bounds for axis 0 with size 9

In [101]:
score_df = pd.DataFrame(scores, columns=["depth_"+str(i) for i in range(15)], index=["split_"+str(i) for i in range(9)])
score_df.drop(['depth_0','depth_1'], axis=1, inplace=True)
score_df.drop(['split_0'], axis=0, inplace=True)
score_df

Unnamed: 0,depth_2,depth_3,depth_4,depth_5,depth_6,depth_7,depth_8,depth_9,depth_10,depth_11,depth_12,depth_13,depth_14
split_1,0.757953,0.755187,0.762102,0.759336,0.749654,0.748271,0.751037,0.741355,0.745505,0.726141,0.737206,0.733057,0.731674
split_2,0.746888,0.748271,0.749654,0.749654,0.744122,0.742739,0.748271,0.75242,0.746888,0.73444,0.724758,0.737206,0.726141
split_3,0.764869,0.7787,0.771784,0.781466,0.775934,0.767635,0.766252,0.753804,0.751037,0.746888,0.748271,0.749654,0.746888
split_4,0.760719,0.766252,0.763485,0.75657,0.759336,0.755187,0.763485,0.753804,0.764869,0.75242,0.757953,0.739972,0.737206
split_5,0.759336,0.759336,0.763485,0.771784,0.760719,0.757953,0.773167,0.762102,0.755187,0.763485,0.763485,0.762102,0.75657
split_6,0.773167,0.775934,0.782849,0.773167,0.775934,0.780083,0.773167,0.7787,0.757953,0.759336,0.749654,0.745505,0.741355
split_7,0.781466,0.781466,0.788382,0.782849,0.788382,0.777317,0.780083,0.782849,0.780083,0.784232,0.781466,0.770401,0.773167
split_8,0.754848,0.756233,0.760388,0.759003,0.757618,0.764543,0.761773,0.764543,0.759003,0.759003,0.753463,0.742382,0.738227


In [103]:
score_df.max()


depth_2     0.781466
depth_3     0.781466
depth_4     0.788382
depth_5     0.782849
depth_6     0.788382
depth_7     0.780083
depth_8     0.780083
depth_9     0.782849
depth_10    0.780083
depth_11    0.784232
depth_12    0.781466
depth_13    0.770401
depth_14    0.773167
dtype: float64

In [None]:
kfold = StratifiedKFold(n_splits=9, shuffle=True, random_state=1342)

i = 1
for t_idx, h_idx in kfold.split(dog_df, dog_df.outcome_adopt):
    X_t = dog_df[feature_list].iloc[t_idx]
    X_t = X_t.astype(str)
    y_t = dog_df.outcome_adopt.iloc[t_idx]
    X_h = dog_df[feature_list].iloc[h_idx]
    X_h = X_h.astype(str)
    y_h = dog_df.outcome_adopt.iloc[h_idx]

    pool = Pool(X_t, y_t, cat_features=feature_list, feature_names=feature_list)
    clf  = CatBoostClassifier(iterations=1000, learning_rate=0.1, loss_function="MultiClass", depth=6, task_type="GPU").fit(pool, eval_set=(X_h, y_h), verbose=False)
    print(clf.tree_count_)
    #scores[i, j] = accuracy_score(y_true = y_h, y_pred = clf.predict(X_h))
    print()
    print(confusion_matrix(y_true = y_h, y_pred = clf.predict(X_h)))
    print()
    print(classification_report(y_true = y_h, y_pred = clf.predict(X_h)))
    print()
    del clf, pool   
    del X_t, y_t, X_h, y_h
    i+=1
    print('-'*50)

del i, kfold


#### Question: Is there a connection between adoption and how the dogs came to the shelter?


In [None]:
pd.crosstab(dog_df.intake_type, dog_df.outcome_type)                

In [None]:
sns.violinplot(data=dog_df[dog_df.outcome_adopt==True],
            x='days_in_shelter', 
            y='intake_type',
            hue="intake_type", split=True, palette="muted", inner="quartile")

#### Question: What can we determine about use of adoption strategies?

In [None]:
dog_df.outcome_adopt.sum()

In [None]:
sns.scatterplot(data=dog_df[dog_df.outcome_adopt==True],
                y='days_in_shelter', x='intake_age', 
                hue='outcome_adopt_subtype', palette='hls', edgecolor='black', alpha=0.50 )

In [None]:
sns.stripplot(data=dog_df[dog_df.outcome_adopt==True],
              y='days_in_shelter', x='outcome_adopt_subtype',
              hue='size', palette="muted")

In [None]:
sns.catplot(data=dog_df[dog_df.outcome_adopt==True],
            y='days_in_shelter', x='outcome_adopt_subtype',
            col="sex",
            hue='size', palette="muted")

In [None]:
sns.catplot(data=dog_df[dog_df.outcome_adopt==True],
            y='days_in_shelter', x='outcome_adopt_subtype',
            col="size",
            hue='sex', palette="muted", aspect=0.5)