In [1]:
# import libraries 
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import os
from pycaret.classification import *

In [2]:
# move to the top of the work directory so the relative data paths used later resolve
work_dir = '/home/rstudio/work'
os.chdir(work_dir)

In [3]:
# never truncate columns when a wide frame is displayed
pd.set_option('display.max_columns', None)

# load the training dataset and preview its first five rows
data_all_cols = pd.read_csv('data/source_data/petfinder_train_dataset.csv')
data_all_cols.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,2,Nibble,3,299,0,1,1,7,0,1,1,2,2,2,1,1,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2
1,2,No Name Yet,1,265,0,1,1,2,0,2,2,3,3,3,1,1,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0
2,1,Brisco,1,307,0,1,2,7,0,2,2,1,1,2,1,1,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3
3,1,Miko,4,307,0,2,1,2,0,2,1,1,1,2,1,1,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2
4,1,Hunter,1,307,0,1,1,0,0,2,1,2,2,2,1,1,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2


In [4]:
# Discard the columns that are not used as model inputs
data = data_all_cols.drop(columns=['Name', 'Description', 'PetID'])

# Columns whose numeric codes are labels rather than quantities; listing them
# here makes it explicit which features should be handled as categorical
# (e.g. one hot encoded) during set up
categorical_columns = [
    'Type',
    'Breed1', 'Breed2',
    'Gender',
    'Color1', 'Color2', 'Color3',
    'MaturitySize', 'FurLength',
    'Vaccinated', 'Dewormed', 'Sterilized',
    'Health', 'State',
]

# Cast every listed column to the pandas 'category' dtype in a single call
data = data.astype({column: 'category' for column in categorical_columns})


In [5]:
# list the dtype of every remaining column to confirm the 'category' casts were applied
print(data.dtypes)

Type             category
Age                 int64
Breed1           category
Breed2           category
Gender           category
Color1           category
Color2           category
Color3           category
MaturitySize     category
FurLength        category
Vaccinated       category
Dewormed         category
Sterilized       category
Health           category
Quantity            int64
Fee                 int64
State            category
RescuerID          object
VideoAmt            int64
PhotoAmt          float64
AdoptionSpeed       int64
dtype: object


In [6]:
# Split the data so that we can train on 90% of the rows and predict on the
# remaining 10%, which the model never sees during training.
#
# The hold-out rows must be selected with the ORIGINAL index labels of the
# sample. If the sample's index is reset first, its labels become 0..n-1 and
# data.drop(...) simply removes the first n rows of `data`, leaving a
# "hold-out" set that overlaps the training sample.
train_sample = data.sample(frac=0.9, random_state=786)
data_predict = data.drop(train_sample.index).reset_index(drop=True)
data_train = train_sample.reset_index(drop=True)

# every row must land in exactly one of the two sets
assert len(data_train) + len(data_predict) == len(data)

print('Data for modeling:' + str(data_train.shape))
print('Data for predictions:' + str(data_predict.shape))

Data for modeling:(13494, 21)
Data for predictions:(1499, 21)


In [7]:
# Initialize the PycCaret experiment on the modeling data.
# This builds the transformation pipeline that prepares the data for modeling
# and deployment; session_id fixes the seed so the run is repeatable.
s = setup(
    data=data_train,
    target='AdoptionSpeed',
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,AdoptionSpeed
2,Target type,Multiclass
3,Original data shape,"(13494, 21)"
4,Transformed data shape,"(13494, 66)"
5,Transformed train set shape,"(9445, 66)"
6,Transformed test set shape,"(4049, 66)"
7,Numeric features,5
8,Categorical features,15
9,Preprocess,True


In [9]:
# Evaluate the available classifiers against each other, skipping LightGBM,
# and keep the model returned by the comparison in `best`
models_to_skip = ['lightgbm']
best = compare_models(exclude=models_to_skip)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.3985,0.0,0.3985,0.3704,0.3622,0.1886,0.1963,0.073
gbc,Gradient Boosting Classifier,0.3925,0.0,0.3925,0.428,0.3936,0.1932,0.198,0.858
et,Extra Trees Classifier,0.3868,0.6606,0.3868,0.3887,0.3845,0.185,0.1858,0.44
rf,Random Forest Classifier,0.3861,0.682,0.3861,0.4209,0.3862,0.185,0.1893,0.238
lda,Linear Discriminant Analysis,0.3786,0.0,0.3786,0.3812,0.3697,0.1686,0.1722,0.074
dt,Decision Tree Classifier,0.3736,0.5894,0.3736,0.3889,0.3772,0.1761,0.1771,0.079
lr,Logistic Regression,0.3705,0.0,0.3705,0.3597,0.3601,0.1575,0.1594,0.827
ada,Ada Boost Classifier,0.3656,0.0,0.3656,0.3839,0.3639,0.1576,0.1613,0.14
knn,K Neighbors Classifier,0.3299,0.5987,0.3299,0.3348,0.3292,0.115,0.1157,0.517
svm,SVM - Linear Kernel,0.3244,0.0,0.3244,0.3601,0.2852,0.1048,0.1204,0.101


In [None]:
## The top four models by accuracy are the Ridge Classifier, Gradient Boosting Classifier, Extra Trees Classifier, and Random Forest Classifier

In [10]:
# Fit LightGBM on its own, since it was excluded from the compare_models() run.
# NOTE(review): the last recorded run of this cell stopped with a
# KeyboardInterrupt at 0% progress, so `lgbm` may never have been assigned --
# confirm LightGBM trains in this environment before relying on it.
lgbm = create_model('lightgbm')

Processing:   0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 