# Pet Adopting

In [56]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.neighbors import KNeighborsClassifier

random.seed(0)

In [57]:
train=pd.read_csv('train.csv')
train.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1


In [58]:
train['breed_category']=train['breed_category'].astype(int)
train['issue_date']=pd.to_datetime(train['issue_date'])
train['listing_date']=pd.to_datetime(train['listing_date'])

In [59]:
train['Days_Shelter']=train['listing_date']-train['issue_date']
train['Days_Shelter']=train['Days_Shelter']/np.timedelta64(1,'D')
train.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,Days_Shelter
0,ANSL_69903,2016-07-10,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0,1,73.684028
1,ANSL_66892,2013-11-21,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0,2,1862.740972
2,ANSL_69750,2014-09-28,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2,4,752.35
3,ANSL_71623,2016-12-31,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0,2,755.770833
4,ANSL_57969,2017-09-28,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0,1,52.401389


In [60]:
train.columns

Index(['pet_id', 'issue_date', 'listing_date', 'condition', 'color_type',
       'length(m)', 'height(cm)', 'X1', 'X2', 'breed_category', 'pet_category',
       'Days_Shelter'],
      dtype='object')

In [61]:
le=LabelEncoder()
le.fit(train['color_type'])
train['color_type']=le.transform(train['color_type'])

In [62]:
train['year']=(train['issue_date'].dt.year)-1990

In [63]:
train.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,Days_Shelter,year
0,ANSL_69903,2016-07-10,2016-09-21 16:25:00,2.0,18,0.8,7.78,13,9,0,1,73.684028,26
1,ANSL_66892,2013-11-21,2018-12-27 17:47:00,1.0,53,0.72,14.19,13,9,0,2,1862.740972,23
2,ANSL_69750,2014-09-28,2016-10-19 08:24:00,,15,0.15,40.9,15,4,2,4,752.35,24
3,ANSL_71623,2016-12-31,2019-01-25 18:30:00,1.0,53,0.62,17.82,0,1,0,2,755.770833,26
4,ANSL_57969,2017-09-28,2017-11-19 09:38:00,2.0,2,0.5,11.06,18,4,0,1,52.401389,27


In [128]:
train['prod']=train['X1']*train['X2']

In [156]:
train['pet_id'][0][5:]

'69903'

In [157]:
_id=[]
for i in range(train.shape[0]):
    _id.append(int(train['pet_id'][i][5:]))
    

0         597.461538
1         571.726496
2        1162.500000
3                inf
4         805.125000
            ...     
18829            inf
18830     998.333333
18831     454.786325
18832     542.461538
18833            inf
Length: 18834, dtype: float64

# finding condition 

In [None]:
#not_useful as parameter finding

In [25]:
train_condition=train[~train['condition'].isnull()]
test_condition=train[train['condition'].isnull()]
x=train_condition[['color_type','year','prod',
       'length(m)', 'height(cm)', 'X1', 'X2','Days_Shelter']]
y=train_condition['condition']
x_test=test_condition[['color_type','year','prod',
       'length(m)', 'height(cm)', 'X1', 'X2','Days_Shelter']]
k = 13
neigh = KNeighborsClassifier(n_neighbors = k).fit(x,y)
yhat = neigh.predict(x_test)

In [43]:
#stacking
k=10
x=train[['color_type','year','prod',
       'length(m)', 'height(cm)', 'X1', 'X2','Days_Shelter']]
y1=train['pet_category']
neigh = KNeighborsClassifier(n_neighbors = k).fit(x,y1)
pet_knn=neigh.predict(x)
train['knn1']=pet_knn

In [26]:
train.loc[test_condition.index,'condition']=yhat

In [27]:
train['condition'].isnull().value_counts()

False    18834
Name: condition, dtype: int64

In [187]:
train['pet_category'].value_counts()

2    10621
1     7184
4      941
0       88
Name: pet_category, dtype: int64

In [190]:
#for lgbm
train.loc[train['pet_category']==4,'pet_category']=3

# Modelling

In [235]:
#XG_boost implementation
n_estimators=[30,50,100]
Scores=[]
for i in n_estimators:
    X=train[[ 'condition','color_type','year','prod',
       'length(m)', 'height(cm)', 'X1', 'X2','Days_Shelter']]   
    y1=train['pet_category']
    X_train, X_test, y1_train, y1_test = train_test_split(
    X, y1, test_size=0.33)
    xgb_cl = xgb.XGBClassifier(n_estimators=i,learning_rate=0.05,max_delta_step=5,max_depth=10,gamma=0)
    xgb_cl.fit(X_train,y1_train)
    prediction1=xgb_cl.predict(X_test)
    
    
    #Using the first prediction to get the second prediction
    X=train[[ 'condition','color_type','year','prod',
       'length(m)', 'height(cm)', 'X1', 'X2','Days_Shelter','pet_category']]
    y2=train['breed_category']
    X_train, X_test, y2_train, y2_test = train_test_split(
    X, y2, test_size=0.33)
    xgb_cl.fit(X_train,y2_train)
    prediction2=xgb_cl.predict(X_test)
    s1=f1_score(prediction1,y1_test,average='weighted')
    s2=f1_score(prediction2,y2_test,average='weighted')
    accuracy=(s1+s2)/2*100
    Scores.append(accuracy)
    

In [236]:
#Pytorch implementation
train.shape

(18834, 15)

In [237]:
print(Scores)

[90.12993780129028, 89.94543542596143, 90.2935768566926]


# Metrics

In [111]:
s1=f1_score(prediction1,y1_test,average='weighted')
s2=f1_score(prediction2,y2_test,average='weighted')

In [112]:
accuracy=(s1+s2)/2*100

In [113]:
accuracy

90.41758488965613

# Submission

### After the correct model is selected with cross-validation and feature engineering

In [248]:
#Feature engineering on the test dataset
test=pd.read_csv('test.csv')
test['issue_date']=pd.to_datetime(test['issue_date'])
test['listing_date']=pd.to_datetime(test['listing_date'])
test['Days_Shelter']=test['listing_date']-test['issue_date']
test['Days_Shelter']=test['Days_Shelter']/np.timedelta64(1,'D')
test['color_type']=le.transform(test['color_type']) #The label encoder is fit on the training data
test['year']=(test['issue_date'].dt.year)-1990

In [254]:
#Taking the whole training dataset to train
Features=train[[ 'condition','color_type','year',
       'length(m)', 'height(cm)', 'X1', 'X2','Days_Shelter']] 
target1=train['pet_category']

xgb_cl = xgb.XGBClassifier(n_estimators=100,learning_rate=0.05,max_delta_step=5,max_depth=10)
xgb_cl.fit(Features,target1)

test_features=test[[ 'condition','color_type','year',
       'length(m)', 'height(cm)', 'X1', 'X2','Days_Shelter']] 
pet_predictions=xgb_cl.predict(test_features)

In [255]:
#Addding the first prediction to feature set to get the second prediction

Features=train[[ 'condition','color_type','year',
       'length(m)', 'height(cm)', 'X1', 'X2','Days_Shelter','pet_category']]


test['pet_category']=pet_predictions
test_features=test[[ 'condition','color_type','year',
       'length(m)', 'height(cm)', 'X1', 'X2','Days_Shelter','pet_category']]

target2=train['breed_category']

xgb_cl = xgb.XGBClassifier(n_estimators=100,learning_rate=0.05,max_delta_step=5,max_depth=10)
xgb_cl.fit(Features,target2)

breed_predictions=xgb_cl.predict(test_features)

In [256]:
submit=pd.DataFrame()
submit['pet_id']=test['pet_id']
submit['breed_category']=breed_predictions
submit['pet_category']=pet_predictions

In [257]:
submit.head()

Unnamed: 0,pet_id,breed_category,pet_category
0,ANSL_75005,1,2
1,ANSL_76663,0,1
2,ANSL_58259,0,2
3,ANSL_67171,0,2
4,ANSL_72871,0,2


In [258]:
submit.to_csv('Submission.csv',index=False)