# Hackerearth ML Challenge : Adopt a Buddy 

Problem Description: [Hackerearth Link](https://www.hackerearth.com/challenges/competitive/hackerearth-machine-learning-challenge-pet-adoption/machine-learning/pet-adoption-9-5838c75b/) 

Leaderboard [Link](https://www.hackerearth.com/challenges/competitive/hackerearth-machine-learning-challenge-pet-adoption/leaderboard/pet-adoption-9-5838c75b/)

Rank: 115th

Final Score: 90.4

We have been given training and testing datasets with columns such as Pet_Id, Condition, Color, Issue Date and Listing Date. The target variables we need to predict are breed_category and pet_category.

As there are two target variables, the approach taken is to train a separate model for each target, predict each one separately, and then combine the two sets of predictions into the final result.

## Importing Dataset

In [None]:
import pandas as pd

# Read the training and test tables from the ../input competition folder.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/hackerearth-ml-challenge-pet-adoption/train.csv',
        '../input/hackerearth-ml-challenge-pet-adoption/test.csv',
    )
)

In [None]:
# Peek at the first and the last rows of the training table.
for preview in (df_train.head(), df_train.tail()):
    print(preview)

# Viewing the columns

In [None]:
# Column labels of the training table.
train_columns = df_train.columns
print(train_columns)

# Displaying the Unique values

In [None]:
# Distinct labels present in the pet_category target.
pet_category_labels = df_train['pet_category'].unique()
print(pet_category_labels)

In [None]:
# Raw pet_category labels of the training rows as an array.
y_train = df_train.loc[:, 'pet_category'].values
print(y_train)

In [None]:
# Distinct values of the categorical-looking training columns.
for column_name in ('condition', 'color_type', 'breed_category'):
    print(df_train[column_name].unique())

In [None]:
# Distinct values of the numeric training columns.
for column_name in ('length(m)', 'height(cm)', 'X1', 'X2'):
    print(df_train[column_name].unique())

# Calculating the no. of NAN values

Training dataset stats

In [None]:
# Number of missing entries per training column, printed in this order:
# the four numeric columns first, then the three categorical ones.
columns_to_check = (
    'length(m)', 'height(cm)', 'X1', 'X2',
    'condition', 'color_type', 'breed_category',
)
for column_name in columns_to_check:
    print(df_train[column_name].isna().sum())

Testing dataset stats

In [None]:
# Number of missing entries per test column.
# breed_category is deliberately not checked for the test frame.
columns_to_check = (
    'length(m)', 'height(cm)', 'X1', 'X2',
    'condition', 'color_type',
)
for column_name in columns_to_check:
    print(df_test[column_name].isna().sum())

**Observation:** The 'condition' column has many NaN values.

In [None]:
# Row count for each 'condition' value.
condition_counts = df_train.groupby(['condition']).size()
print(condition_counts)

# Rows whose 'condition' entry is missing.
condition_missing = df_train['condition'].isnull()
print(df_train[condition_missing])

In [None]:
# Which breed_category values occur among rows with a missing 'condition'?
rows_without_condition = df_train[df_train['condition'].isnull()]
print(rows_without_condition['breed_category'].unique())

**Observation:** The condition value is null only for rows where breed_category is 2.0.

In [None]:
# Non-null count of every column among the breed_category == 2 rows
# (left as the cell's final expression so the notebook displays it).
is_breed_two = df_train['breed_category'] == 2
df_train[is_breed_two].count()

So, we can give a unique value for condition where it is null

# Clearing NAN value in 'condition' column

In [None]:
import numpy as np

# Missing 'condition' entries get the dedicated code 3.
df_train['condition'] = df_train['condition'].fillna(3)

In [None]:
# Same imputation for the test frame: missing 'condition' becomes 3.
df_test['condition'] = df_test['condition'].fillna(3)

In [None]:
# Re-check after imputation: row count per 'condition' value ...
condition_counts = df_train.groupby(['condition']).size()
print(condition_counts)

# ... and any rows still missing a 'condition'.
still_missing = df_train['condition'].isnull()
print(df_train[still_missing])

# Finding difference between issue_date and listing_date in days

Calculating Difference and adding feature for training data

In [None]:
# Absolute gap, in whole days, between the listing date and the issue date.
listing_dates = pd.to_datetime(df_train['listing_date'].values)
issue_dates = pd.to_datetime(df_train['issue_date'].values)
day_gap = (listing_dates - issue_dates).days
df_train['diff_days'] = np.abs(day_gap)

print(df_train['diff_days'].values)

Calculating Difference and adding feature for testing data

In [None]:
# Absolute gap, in whole days, between the listing date and the issue date.
listing_dates = pd.to_datetime(df_test['listing_date'].values)
issue_dates = pd.to_datetime(df_test['issue_date'].values)
day_gap = (listing_dates - issue_dates).days
df_test['diff_days'] = np.abs(day_gap)

print(df_test['diff_days'].values)

Checking the correctness of difference (in days)

In [None]:
# Spot-check one row: both dates next to the derived day difference.
sample_row = 5
print(
    df_train['issue_date'][sample_row], " ",
    df_train['listing_date'][sample_row], " ",
    df_train['diff_days'][sample_row],
)

So, we have added a new feature, 'diff_days', which holds the absolute difference in days between the listing date and the issue date.


# Removing date-time columns

Dropping columns from training dataset

In [None]:
# The raw date columns are dropped; 'diff_days' is kept in their place.
date_columns = ['issue_date', 'listing_date']
df_train_new = df_train.drop(date_columns, axis=1)

print(df_train_new.head())
print(df_train_new.columns)

Dropping columns from testing dataset

In [None]:
# The raw date columns are dropped; 'diff_days' is kept in their place.
date_columns = ['issue_date', 'listing_date']
df_test_new = df_test.drop(date_columns, axis=1)

print(df_test_new.head())
print(df_test_new.columns)

# Encoding columns of categorical names with numbers

Encoding the color_type column in training dataset

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the color_type -> integer mapping on the training data, then apply it.
# lb_make stays in scope so the test frame can reuse the same mapping.
lb_make = LabelEncoder()
lb_make.fit(df_train_new["color_type"])
df_train_new["color_type_code"] = lb_make.transform(df_train_new["color_type"])

# Final expression: the notebook shows the original/encoded pairs.
df_train_new[["color_type", "color_type_code"]].head(11)

Encoding the color_type column in testing dataset

In [None]:
# Apply the mapping already fitted on the training colors; no re-fitting here.
test_color_codes = lb_make.transform(df_test_new["color_type"])
df_test_new["color_type_code"] = test_color_codes

# Final expression: the notebook shows the original/encoded pairs.
df_test_new[["color_type", "color_type_code"]].head(11)

Dropping the column from both the training and testing dataset

In [None]:
# Bare expression: evaluated, but not displayed because it is not the
# last statement of the cell.
df_train_new['color_type_code'].unique()

# The textual color column is dropped; 'color_type_code' remains.
df_train_new = df_train_new.drop('color_type', axis=1)

print(df_train_new.head(25))

In [None]:
# Bare expression: evaluated, but not displayed because it is not the
# last statement of the cell.
df_test_new['color_type_code'].unique()

# The textual color column is dropped; 'color_type_code' remains.
df_test_new = df_test_new.drop('color_type', axis=1)

print(df_test_new.head(25))

# Distribution of values in some features

In [None]:
# Columns available after date removal and color encoding.
current_columns = df_train_new.columns
print(current_columns)

Checking skewness of Length(m)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew

# Distribution plot of length(m) in the training data, then its skewness.
length_values = df_train_new['length(m)']

sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
sns.distplot(length_values, color="b")
ax.xaxis.grid(False)
ax.set(
    ylabel="Frequency",
    xlabel="Length",
    title="Length distribution",
)
sns.despine(trim=True, left=True)
plt.show()

print("skew value: ", skew(length_values))

Checking skewness of Height(cm)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew

# Distribution plot of height(cm) in the training data, then its skewness.
height_values = df_train_new['height(cm)']

sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
sns.distplot(height_values, color="b")
ax.xaxis.grid(False)
ax.set(
    ylabel="Frequency",
    xlabel="Height",
    title="Height distribution",
)
sns.despine(trim=True, left=True)
plt.show()

print("skew value: ", skew(height_values))

Checking skewness of X1

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew

# Distribution plot of X1 in the training data, then its skewness.
x1_values = df_train_new['X1']

sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
sns.distplot(x1_values, color="b")
ax.xaxis.grid(False)
ax.set(
    ylabel="Frequency",
    xlabel="X1",
    title="X1 distribution",
)
sns.despine(trim=True, left=True)
plt.show()

print("skew value: ", skew(x1_values))

Checking skewness of X2

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew

# Distribution plot of X2 in the training data, then its skewness.
x2_values = df_train_new['X2']

sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
sns.distplot(x2_values, color="b")
ax.xaxis.grid(False)
ax.set(
    ylabel="Frequency",
    xlabel="X2",
    title="X2 distribution",
)
sns.despine(trim=True, left=True)
plt.show()

print("skew value: ", skew(x2_values))

We noticed that only in column X1, there is high skewness, so we take the log transform of that column both in training and testing dataset

For Training Dataset

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew

# Log-transform the training X1 values (log(1 + x)) and re-check the skewness.
# X1_trans is kept in scope; a later cell writes it back into the frame.
raw_x1 = df_train_new['X1'].values
X1_trans = np.log(1 + raw_x1)

sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
sns.distplot(X1_trans, color="b")
ax.xaxis.grid(False)
ax.set(
    ylabel="Frequency",
    xlabel="X1",
    title="X1 distribution",
)
sns.despine(trim=True, left=True)
plt.show()

print("skew value: ", skew(X1_trans))

For Testing Dataset

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew

# Log-transform the test X1 values (log(1 + x)) and inspect the result.
# X1_trans_test is kept in scope; a later cell writes it back into the frame.
X1_trans_test = np.log(1 + df_test_new['X1'].values)

sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
# Plot the transformed *test* values. The cell previously plotted the
# training array (X1_trans), so the figure did not match the skew printed below.
sns.distplot(X1_trans_test, color="b")
ax.xaxis.grid(False)
ax.set(ylabel="Frequency")
ax.set(xlabel="X1")
ax.set(title="X1 distribution")
sns.despine(trim=True, left=True)
plt.show()

print("skew value: ", skew(X1_trans_test))

In [None]:
# Build the "normalised" frames, in which X1 holds its log-transformed values.
# .copy() makes these independent frames. Plain assignment only created a
# second name for df_train_new / df_test_new, so writing X1 below also
# overwrote the raw X1 there (and re-running the log-transform cells would
# then transform already-transformed values).
df_train_norm = df_train_new.copy()
df_train_norm['X1'] = X1_trans

df_test_norm = df_test_new.copy()
df_test_norm['X1'] = X1_trans_test

# Creating features on basis of magnitude of Length and Height

* Length: low (0 to 0.3) , medium (0.3 to 0.6) , high( 0.6 to 1.0)
* Height: low (0 to 15) , medium (15 to 30) , high(30 to 45)

In [None]:
# One-hot height bands, applied identically to the train and test frames:
# low (<= 15), medium (15 < h <= 30), high (> 30).
for frame in (df_train_norm, df_test_norm):
    height = frame['height(cm)']
    frame['Low_Height'] = np.where(height <= 15, 1, 0)
    frame['Medium_Height'] = np.where((height > 15) & (height <= 30), 1, 0)
    frame['High_Height'] = np.where(height > 30, 1, 0)

In [None]:
# One-hot length bands, applied identically to the train and test frames:
# low (<= 0.3), medium (0.3 < l <= 0.6), high (> 0.6).
for frame in (df_train_norm, df_test_norm):
    length = frame['length(m)']
    frame['Low_Length'] = np.where(length <= 0.3, 1, 0)
    frame['Medium_Length'] = np.where((length > 0.3) & (length <= 0.6), 1, 0)
    frame['High_Length'] = np.where(length > 0.6, 1, 0)

In [None]:
# First 20 training rows with the engineered columns included.
engineered_preview = df_train_norm.head(20)
print(engineered_preview)

# XGBoost training and validation

In [None]:
from sklearn.model_selection import train_test_split

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Baseline: one default XGBClassifier per target, scored on a 20% hold-out.
# Both targets and the row id are excluded from the feature matrix.

# ---- pet_category ----
Y = df_train_norm['pet_category'].values
X = df_train_norm.drop(columns=['pet_category', 'pet_id', 'breed_category'])

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred). With average='weighted' the
# per-class F1 is weighted by the support of the FIRST argument, so passing
# the predictions first weighted by predicted counts instead of true counts.
print(f1_score(y_test, y_pred, average='weighted'))
print(accuracy_score(y_test, y_pred))


# ---- breed_category ----
Y = df_train_norm['breed_category'].values
X = df_train_norm.drop(columns=['pet_category', 'pet_id', 'breed_category'])

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

model2 = XGBClassifier()
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)
print(f1_score(y_test, y_pred, average='weighted'))
print(accuracy_score(y_test, y_pred))

XGBoost without any parameter tuning gave performance f1 score of 89.71 on submission 

# XGBoost Parameter tuning

Observations of performance with respect to learning rate:

* learning rate=0.01 (89.75)
* learning rate=0.1 (90.04)
* learning rate=0.4 (90.17)
* learning rate=0.6 (90.14)

Now the 'max_depth' parameter is decreased from 4 and tried with 3, 2 and 1. 

It is found that 4 is optimal

The 'subsample' parameter was increased from 0.8, but 0.8 was found to be optimal.


Performance with respect to the 'gamma' parameter:
* gamma= 5 (90.57)
* gamma= 4 (90.7)

gamma=4 is optimal

In [None]:
from sklearn.model_selection import train_test_split

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Hyper-parameters found by the manual search; both target models share them,
# so they are declared once instead of being repeated per model.
# NOTE(review): 'silent' and objective='binary:logistic' are kept as-is;
# confirm they are still what the installed xgboost version expects for
# these targets.
tuned_params = dict(
    silent=False,
    scale_pos_weight=1,
    learning_rate=0.4,
    colsample_bytree=0.4,
    subsample=0.8,
    objective='binary:logistic',
    n_estimators=1000,
    reg_alpha=0.3,
    max_depth=4,
    gamma=4,
)

# ---- pet_category ----
Y = df_train_norm['pet_category'].values
X = df_train_norm.drop(columns=['pet_category', 'pet_id', 'breed_category'])

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

model = XGBClassifier(**tuned_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# scikit-learn metrics expect (y_true, y_pred). With average='weighted' the
# per-class F1 is weighted by the support of the FIRST argument, so passing
# the predictions first weighted by predicted counts instead of true counts.
print(f1_score(y_test, y_pred, average='weighted'))
print(accuracy_score(y_test, y_pred))


# ---- breed_category ----
Y = df_train_norm['breed_category'].values
X = df_train_norm.drop(columns=['pet_category', 'pet_id', 'breed_category'])

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

model2 = XGBClassifier(**tuned_params)
model2.fit(X_train, y_train)
y_pred = model2.predict(X_test)
print(f1_score(y_test, y_pred, average='weighted'))
print(accuracy_score(y_test, y_pred))

# Training the final XGBoost tuned model

In [None]:
from sklearn.model_selection import train_test_split

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Fit the tuned configuration on the complete training frame (no hold-out),
# one classifier per target. Both classifiers use the same settings.
full_fit_params = {
    'silent': False,
    'scale_pos_weight': 1,
    'learning_rate': 0.4,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'n_estimators': 1000,
    'reg_alpha': 0.3,
    'max_depth': 4,
    'gamma': 4,
}
non_feature_columns = ['pet_category', 'pet_id', 'breed_category']

# pet_category model
Y = df_train_norm['pet_category'].values
X = df_train_norm.drop(columns=non_feature_columns)
modelx = XGBClassifier(**full_fit_params)
modelx.fit(X, Y)

# breed_category model
Y = df_train_norm['breed_category'].values
X = df_train_norm.drop(columns=non_feature_columns)
modelx2 = XGBClassifier(**full_fit_params)
modelx2.fit(X, Y)


# Observations

90.81 performance score without train_test_split, i.e training on whole training Dataset

After this, a check showed that the diff_days column had negative values in **two cases** in the training dataset, so np.abs() was used when calculating the difference.

After retraining the model and testing it, the final f1 score achieved on submission is 90.83

Apart from XgBoost classifier, I had tried with RandomForest, LGBM and CatBoost Classifier too, but it was found that XgBoost performed the best

# Further Feature Engineering

I did not stop at 90.83, but tried to add more features to train the model more efficiently and check whether any improvement is made.

# Converting Length to cm units, same as height

In [None]:
# Express length in centimetres (metres * 100), the unit used by height(cm).
for frame in (df_test_norm, df_train_norm):
    frame['length(cm)'] = frame['length(m)'] * 100

# The *_today frames carry only the centimetre version of the length.
df_test_today = df_test_norm.drop('length(m)', axis=1)
df_train_today = df_train_norm.drop('length(m)', axis=1)

# Finding ratio of X2 : X1 and adding it as a feature

* Removing the X1 and X2 columns was tried, but the performance decreased, so they are kept

In [None]:
# 'ratio' = X2 / (1 + X1), computed the same way for both frames.
for frame in (df_train_today, df_test_today):
    frame['ratio'] = frame['X2'] / (1 + frame['X1'])

# Finding ratio of Height(cm) : Length(cm) and adding it as a feature

In [None]:
# 'lhratio' = height(cm) / (1 + length(cm)), computed the same way for both frames.
for frame in (df_train_today, df_test_today):
    frame['lhratio'] = frame['height(cm)'] / (1 + frame['length(cm)'])

# Arriving at the final model

After tuning parameters like 'learning_rate' and 'max_depth' , I arrived at the final model which gave the final Public score of **91.06 (an improvement of 0.23)** ! Eureka!


In [None]:
from sklearn.model_selection import train_test_split

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Final models, fitted on the complete engineered training frame.
# Settings common to both classifiers:
shared_params = {
    'silent': False,
    'scale_pos_weight': 1,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'reg_alpha': 0.3,
    'gamma': 4,
    'random_state': 42,
}
non_feature_columns = ['pet_category', 'pet_id', 'breed_category']

# pet_category model: learning_rate 0.47, 1500 estimators, max_depth 7.
Y = df_train_today['pet_category'].values
X = df_train_today.drop(columns=non_feature_columns)
modelx = XGBClassifier(
    learning_rate=0.47,
    n_estimators=1500,
    max_depth=7,
    **shared_params,
)
modelx.fit(X, Y)

# breed_category model: learning_rate 0.4, 1000 estimators, max_depth 4.
Y = df_train_today['breed_category'].values
X = df_train_today.drop(columns=non_feature_columns)
modelx2 = XGBClassifier(
    learning_rate=0.4,
    n_estimators=1000,
    max_depth=4,
    **shared_params,
)
modelx2.fit(X, Y)


In [None]:
from sklearn.model_selection import train_test_split

# The ids go into the submission; every remaining column is a model input.
idx = df_test_today['pet_id'].values
X_test_fin = df_test_today.drop('pet_id', axis=1)

# modelx's output fills the 'pet_category' column and modelx2's output fills
# the 'breed_category' column of the submission.
y_pred_fin = modelx.predict(X_test_fin)
y_pred_fin2 = modelx2.predict(X_test_fin)

submission_columns = {
    'pet_id': idx,
    'breed_category': y_pred_fin2,
    'pet_category': y_pred_fin,
}
df_sub = pd.DataFrame(submission_columns)
df_sub.to_csv('submit.csv', index=False)

# Points to learn

* Feature Engineering is very important, always try to incorporate new features to see performance improvement.

* How to smartly convert DateTime values into numerical features in order to train your model more efficiently

* Parameter Tuning is equally critical

* Never lose Hope!