In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from datetime import datetime

from imblearn.over_sampling import SMOTE

In [2]:
train = pd.read_csv("classification/train.csv")
test = pd.read_csv("classification/test.csv")

In [3]:
train.head(2)

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2


In [4]:
test.head(2)

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2
0,ANSL_75005,2005-08-17 00:00:00,2017-09-07 15:35:00,0.0,Black,0.87,42.73,0,7
1,ANSL_76663,2018-11-15 00:00:00,2019-05-08 17:24:00,1.0,Orange Tabby,0.06,6.71,0,1


In [5]:
# # TODO
# 1. Build model to predict breed category
# 2. Build model to predict pet categor

In [6]:
# check shape of train data
train.shape

(18834, 11)

In [7]:
# check datatype
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18834 entries, 0 to 18833
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   pet_id          18834 non-null  object 
 1   issue_date      18834 non-null  object 
 2   listing_date    18834 non-null  object 
 3   condition       17357 non-null  float64
 4   color_type      18834 non-null  object 
 5   length(m)       18834 non-null  float64
 6   height(cm)      18834 non-null  float64
 7   X1              18834 non-null  int64  
 8   X2              18834 non-null  int64  
 9   breed_category  18834 non-null  float64
 10  pet_category    18834 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 1.6+ MB


In [8]:
train.columns

Index(['pet_id', 'issue_date', 'listing_date', 'condition', 'color_type',
       'length(m)', 'height(cm)', 'X1', 'X2', 'breed_category',
       'pet_category'],
      dtype='object')

In [9]:
numeric_cols = ['length(m)', 'height(cm)']
categorical_cols = ['X1', 'X2', 'condition', 'color_type']
datetime_cols = ['issue_date', 'listing_date']

# since we are going to predict breed_category and pet_category, we are not considering them in column names

In [10]:
# drop duplicates
train.drop_duplicates(inplace=True)

In [11]:
# check missing values
train.isna().sum()

pet_id               0
issue_date           0
listing_date         0
condition         1477
color_type           0
length(m)            0
height(cm)           0
X1                   0
X2                   0
breed_category       0
pet_category         0
dtype: int64

In [12]:
# Feature engineering: convert issue_date and listing_date to timestamp to take the duration
for time_col in datetime_cols:
    train[time_col] = pd.to_datetime(train[time_col])
    train[time_col] = train[time_col].map(lambda date_var: datetime.timestamp(date_var))

In [13]:
# Let us take difference between listing and issue date
train.insert(1, "timedelta", train["listing_date"] - train["issue_date"])

In [14]:
train.head(2)

Unnamed: 0,pet_id,timedelta,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,6366300.0,1468089000.0,1474455000.0,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,160940820.0,1384972000.0,1545913000.0,1.0,White,0.72,14.19,13,9,0.0,2


In [15]:
# add timedelta to numeric col as it is continuous value
numeric_cols.append("timedelta")

In [16]:
# Remove redundant cols
redundant_cols = ["pet_id", "issue_date", "listing_date"]
train.drop(redundant_cols, axis=1, inplace=True)

In [17]:
# Convert string values from color_type to one hot encoded cols
train = pd.get_dummies(train, prefix=["color_type",])

In [18]:
# predict missing values using KNN inputer
imputer = KNNImputer(missing_values=np.NaN)
train[train.columns] = imputer.fit_transform(train)

In [19]:
# Check missing values
train.isna().sum()

timedelta                    0
condition                    0
length(m)                    0
height(cm)                   0
X1                           0
                            ..
color_type_Tortie Point      0
color_type_Tricolor          0
color_type_White             0
color_type_Yellow            0
color_type_Yellow Brindle    0
Length: 64, dtype: int64

In [20]:
# Form 
features = train.drop(["breed_category", "pet_category"], axis=1)
breed_target = train["breed_category"]
pet_target = train["pet_category"]

In [21]:
# split data into train - validation
train_features_breed, val_features_breed, train_target_breed, val_target_breed = train_test_split(features, breed_target, test_size = 0.2, random_state=42)
train_features_pet, val_features_pet, train_target_pet, val_target_pet = train_test_split(features, pet_target, test_size = 0.2, random_state=42)

In [22]:
train_features_breed.shape, train_target_breed.shape

((15067, 62), (15067,))

In [23]:
# Scale the numeric data
scaler_breed = StandardScaler()
scaler_pet = StandardScaler()

train_features_breed[numeric_cols] = scaler_breed.fit_transform(train_features_breed[numeric_cols])
val_features_breed[numeric_cols] = scaler_breed.transform(val_features_breed[numeric_cols])

train_features_pet[numeric_cols] = scaler_pet.fit_transform(train_features_pet[numeric_cols])
val_features_pet[numeric_cols] = scaler_pet.transform(val_features_pet[numeric_cols])

In [24]:
# Apply ML model

# random forest
# breed
rf_breed = RandomForestClassifier(n_estimators=200)
rf_breed.fit(train_features_breed, train_target_breed)
pred_breed_rf = rf_breed.predict(val_features_breed)
print(accuracy_score(pred_breed_rf, val_target_breed))

# pet
rf_pet = RandomForestClassifier(n_estimators=200)
rf_pet.fit(train_features_pet, train_target_pet)
pred_pet_rf = rf_pet.predict(val_features_pet)
print(accuracy_score(pred_pet_rf, val_target_pet))

# svm
# breed
svm_breed = SVC()
svm_breed.fit(train_features_breed, train_target_breed)
pred_breed_svm = svm_breed.predict(val_features_breed)
print(accuracy_score(pred_breed_svm, val_target_breed))

# pet
svm_pet = SVC()
svm_pet.fit(train_features_pet, train_target_pet)
pred_pet_svm = svm_pet.predict(val_features_pet)
print(accuracy_score(pred_pet_svm, val_target_pet))

0.8704539421290152
0.8911600743297053
0.84550039819485
0.8534642951951155


In [25]:
# Best accuracy is for Random Forest for both

In [26]:
# Pipelines
numeric_cols = ['length(m)', 'height(cm)']
categorical_cols = ['X1', 'X2', 'condition', 'color_type']
datetime_cols = ['issue_date', 'listing_date']
test.drop_duplicates(inplace=True)
# Feature engineering: convert issue_date and listing_date to timestamp to take the duration
for time_col in datetime_cols:
    test[time_col] = pd.to_datetime(test[time_col])
    test[time_col] = test[time_col].map(lambda date_var: datetime.timestamp(date_var))
test.insert(1, "timedelta", test["listing_date"] - test["issue_date"])
# add timedelta to numeric col as it is continuous value
numeric_cols.append("timedelta")
# Remove redundant cols
redundant_cols = ["pet_id", "issue_date", "listing_date"]
test.drop(redundant_cols, axis=1, inplace=True)

In [28]:
test

Unnamed: 0,timedelta,condition,color_type,length(m),height(cm),X1,X2
0,380561700.0,0.0,Black,0.87,42.73,0,7
1,15096240.0,1.0,Orange Tabby,0.06,6.71,0,1
2,172774260.0,1.0,Black,0.24,41.21,0,7
3,99213900.0,1.0,Black,0.29,8.46,7,1
4,40052520.0,1.0,Brown,0.71,30.92,0,7
...,...,...,...,...,...,...,...
8067,34095360.0,2.0,Brown,0.82,36.08,13,9
8068,68947200.0,0.0,Tan,0.49,27.54,13,9
8069,34017540.0,0.0,Black,0.98,37.19,0,7
8070,33502740.0,,Black,0.79,23.83,0,2


In [30]:
# Convert string values from color_type to one hot encoded cols
test = pd.get_dummies(test, prefix=["color_type",])

In [31]:
# impute for missing values
test[test.columns] = imputer.fit_transform(test)

In [32]:
test_breed = test.copy()
test_pet = test.copy()
test_breed[numeric_cols] = scaler_breed.transform(test[numeric_cols])
test_pet[numeric_cols] = scaler_pet.transform(test[numeric_cols])

In [33]:
train_cols = train_features_breed.columns
test_cols = test.columns
len(train_cols), len(test_cols)

(62, 60)

In [34]:
# Find cols missing from test_cols
for col in train_cols:
    if col not in test_cols:
        print(col)

color_type_Black Tiger
color_type_Brown Tiger


In [35]:
index_black_tiger = train_cols.get_loc("color_type_Black Tiger")
index_brown_tiger = train_cols.get_loc("color_type_Brown Tiger")

In [36]:
# Add cols with 0 value in respective indices
test_breed.insert(index_black_tiger, "color_type_Black Tiger",0)
test_breed.insert(index_brown_tiger, "color_type_Brown Tiger",0)

test_pet.insert(index_black_tiger, "color_type_Black Tiger",0)
test_pet.insert(index_brown_tiger, "color_type_Brown Tiger",0)

In [37]:
# Test model on test data pipeline
test_breed_preds = rf_breed.predict(test_breed)
test_pet_preds = rf_pet.predict(test_pet)

In [40]:
test["breed_category"] = test_breed_preds
test["pet_category"] = test_pet_preds

In [41]:
test

Unnamed: 0,timedelta,condition,length(m),height(cm),X1,X2,color_type_Agouti,color_type_Apricot,color_type_Black,color_type_Black Brindle,...,color_type_Tan,color_type_Torbie,color_type_Tortie,color_type_Tortie Point,color_type_Tricolor,color_type_White,color_type_Yellow,color_type_Yellow Brindle,breed_category,pet_category
0,380561700.0,0.0,0.87,42.73,0.0,7.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
1,15096240.0,1.0,0.06,6.71,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,172774260.0,1.0,0.24,41.21,0.0,7.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,99213900.0,1.0,0.29,8.46,7.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,40052520.0,1.0,0.71,30.92,0.0,7.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8067,34095360.0,2.0,0.82,36.08,13.0,9.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
8068,68947200.0,0.0,0.49,27.54,13.0,9.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
8069,34017540.0,0.0,0.98,37.19,0.0,7.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
8070,33502740.0,0.6,0.79,23.83,0.0,2.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0
