### Notebook: Visualizing Time Independent Models

30 May 2024

Objectives:
- Further analysis of features
- Use data from work begun in nontemporal_modeling_03 workbook to visualize models

Follow-up on 03


In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------------------------------
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error

# ------------------------------------------------
from catboost import CatBoostClassifier, Pool

# ------------------------------------------------

data_path = "../../main_data/locale_specific_data/"

In [2]:
dog_df = pd.read_csv(data_path + "Sonoma_time_independent_data_akry.csv")

dog_df.head()

Unnamed: 0,breed,color,sex,size,date_of_birth,animal_id,intake_date,outcome_date,days_in_shelter,intake_type,...,outcome_adopt,outcome_adopt_subtype,intake_age,outcome_age,pop_control,sex_bin,size_bin,breed_pit,color_mix,intake_bin
0,PIT BULL,GRAY/WHITE,Female,MED,2012-09-03,A296009,2013-08-22,2014-04-14,235,TRANSFER,...,False,none,0.97,1.61,1,1,3,1,1,4
1,PIT BULL,BLUE/WHITE,Female,MED,2012-10-09,A294143,2013-08-31,2014-04-14,226,STRAY,...,False,none,0.89,1.51,1,1,3,1,1,0
2,CAROLINA DOG/MIX,GOLD/WHITE,Male,MED,2007-09-07,A281788,2013-09-19,2014-07-30,314,STRAY,...,False,none,6.03,6.9,0,0,3,0,1,0
3,CHIHUAHUA SH,TAN,Male,TOY,2007-10-01,A297574,2013-10-01,2014-01-01,92,OWNER SURRENDER,...,False,none,6.0,6.25,1,0,1,0,0,1
4,PIT BULL,TAN/WHITE,Male,MED,2012-10-29,A298002,2013-10-08,2014-05-22,226,STRAY,...,True,realtime,0.95,1.57,1,0,3,1,1,0


In [3]:
X_train, X_hold, y_train, y_hold = train_test_split(dog_df,
                                      dog_df.outcome_adopt,
                                      shuffle=True,
                                      stratify=dog_df.outcome_adopt,
                                      test_size=0.2,
                                      random_state=1342)

### Shouldn't all the cross-validation be done using only the training data?  So Kfold should be applied to X_train, y_train rather than to the original data set...  


feature_list = ['sex',
                'size',
                #'breed', 
                #'color',
                'days_in_shelter',
                'outcome_season',
                'intake_age',
                'intake_condition',
                'intake_type'
                #, 'outcome_adopt_subtype'
                ]

tree_depth = 6
cv_splits  = 9

kfold  = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=1342)
scores = pd.DataFrame(columns=['accuracy','tree_count'], index=range(cv_splits))

i = 0
for t_idx, h_idx in kfold.split(dog_df, dog_df.outcome_adopt):
    X_t = dog_df[feature_list].iloc[t_idx]
    X_t = X_t.astype(str)
    y_t = dog_df.outcome_adopt.iloc[t_idx]
    X_h = dog_df[feature_list].iloc[h_idx]
    X_h = X_h.astype(str)
    y_h = dog_df.outcome_adopt.iloc[h_idx]

    pool = Pool(X_t, y_t, cat_features=feature_list, feature_names=feature_list)

    clf  = CatBoostClassifier(iterations=1000, learning_rate=0.1, loss_function="MultiClass", depth=tree_depth).fit(pool, eval_set=(X_h, y_h), verbose=False)

    scores.loc[i, "tree_count"] = clf.tree_count_
    scores.loc[i, "accuracy"]   = accuracy_score(y_true = y_h, y_pred = clf.predict(X_h))
    print()
    print(confusion_matrix(y_true = y_h, y_pred = clf.predict(X_h)))
    print()
    print(clf.get_feature_importance(data=pool, prettified=True, verbose=True))
    #print(classification_report(y_true = y_h, y_pred = clf.predict(X_h)))
    #print()
    del clf, pool   
    del X_t, y_t, X_h, y_h
    i+=1
    print('-'*50)

del i, kfold, feature_list, tree_depth, cv_splits

print(scores)

In [21]:
feature_list = ['sex',
                'size',
                'breed', 
                #'color',
                'days_in_shelter',
                'outcome_season',
                'intake_age',
                'intake_condition',
                'intake_type'
                #, 'outcome_adopt_subtype'
                ]
tree_depth = 6
X_t = X_train[feature_list]
X_t = X_t.astype(str)
y_t = y_train
pool = Pool(X_t, y_t, cat_features=feature_list, feature_names=feature_list)
clf  = CatBoostClassifier(iterations=1000, learning_rate=0.1, loss_function="MultiClass", depth=tree_depth).fit(pool, eval_set=(X_t, y_t), verbose=False)

In [29]:
feature = 'intake_type'
# res = clf.calc_feature_statistics(X_t, y_t, feature, plot=True)
clf.calc_feature_statistics(X_t, target=none, feature=feature,plot=True)
# clf.calc_feature_statistics(X_t,
#                         target=None,
#                         feature=None,
#                         prediction_type='Class',
#                         cat_feature_values=None,
#                         plot=True,
#                         max_cat_features_on_plot=10,
#                         thread_count=-1,
#                         plot_file=None)

NameError: name 'none' is not defined