In [None]:
#imports 
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE 



In [None]:
# Load the DF
train_df = pd.read_csv('trainPetID.csv')
test_df = pd.read_csv('testPetID.csv')

In [None]:
train_df.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1


In [None]:
train_df.shape

(18834, 11)

In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18834 entries, 0 to 18833
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   pet_id          18834 non-null  object 
 1   issue_date      18834 non-null  object 
 2   listing_date    18834 non-null  object 
 3   condition       17357 non-null  float64
 4   color_type      18834 non-null  object 
 5   length(m)       18834 non-null  float64
 6   height(cm)      18834 non-null  float64
 7   X1              18834 non-null  int64  
 8   X2              18834 non-null  int64  
 9   breed_category  18834 non-null  float64
 10  pet_category    18834 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 1.6+ MB


# Preprocessing

1. handle missing values
2. convert into respective datatypes

In [None]:
#Fill missing values for condition column
train_df['condition'].fillna('missing', inplace = True)

In [None]:
train_df.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,missing,Brown,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2,Black,0.5,11.06,18,4,0.0,1


In [None]:
#function to convert the columns in the respective data types
def convert_to_datatype(df, cols, data_type):
  for i in cols:
    df[i] = df[i].astype(data_type)
  return df

# converting respective columns into respective data_types
train_df = convert_to_datatype(train_df, ['breed_category'], 'int64')
train_df = convert_to_datatype(train_df, ['pet_id', 'condition', 'color_type', 'X1', 'X2', 'breed_category', 'pet_category'], 'category')
train_df = convert_to_datatype(train_df, ['issue_date', 'listing_date'], 'datetime64[ns]')

In [None]:
#validate changes
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18834 entries, 0 to 18833
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   pet_id          18834 non-null  category      
 1   issue_date      18834 non-null  datetime64[ns]
 2   listing_date    18834 non-null  datetime64[ns]
 3   condition       18834 non-null  category      
 4   color_type      18834 non-null  category      
 5   length(m)       18834 non-null  float64       
 6   height(cm)      18834 non-null  float64       
 7   X1              18834 non-null  category      
 8   X2              18834 non-null  category      
 9   breed_category  18834 non-null  category      
 10  pet_category    18834 non-null  category      
dtypes: category(7), datetime64[ns](2), float64(2)
memory usage: 1.5 MB


#Feature Engineering

1. DateTime Features
2. Date Difference Features
3. One-Hot Encoding

In [None]:
 # def condition_breed_category(df, col1, col2):
  # for i in col1: 
    # print (df[i])
    # if df[i] == 'missing':
      # for j in col2:
        # df[j] = 2
  # return df

# train_df = condition_breed_category(train_df, ['condition'], ['breed_category'])

In [None]:
train_df.loc[train_df['condition'] == 'missing', ['breed_category']] = 2

In [None]:
#For extracting additional date time features
def create_datetime_features(df, cols): 
    for i in cols: 
        df[i + '_year'] = df[i].dt.year
        df[i + '_month'] = df[i].dt.month
        df[i + '_week'] = df[i].dt.week
        df[i + '_day'] = df[i].dt.day
        df[i + '_hour'] = df[i].dt.hour
        df[i + '_dayofyear'] = df[i].dt.dayofyear
        df[i + '_dayofweek'] = df[i].dt.dayofweek
        df[i + '_quarter'] = df[i].dt.quarter
        df[i + '_isweekend'] = np.where(df[ i + '_dayofweek'].isin([5,6]), 1, 0)
    return df

# Convert time data into differences
train_df = create_datetime_features(train_df, ['issue_date', 'listing_date'])
train_df['listing_issue_diff'] = train_df['listing_date'] - train_df['issue_date']
train_df['listing_issue_diff'] = train_df['listing_issue_diff'].dt.days

#One-Hot Encoding Categorical Data 
train_df = pd.get_dummies(columns = ['condition', 'color_type', 'X1', 'X2'], data= train_df)

In [None]:
#area feature
train_df['area(m)'] = train_df['length(m)'] * train_df['height(cm)'] /100

In [None]:
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18834 entries, 0 to 18833
Columns: 117 entries, pet_id to area(m)
dtypes: category(3), datetime64[ns](2), float64(3), int64(19), uint8(90)
memory usage: 5.9 MB


#EDA

#Modeling

In [None]:
#Splitting X & Y (dropping the categories in x that aren't needed)
X = train_df.drop( ['breed_category', 'pet_category', 'issue_date', 'listing_date', 'pet_id'], axis = 1)
y = train_df[ ['breed_category', 'pet_category'] ]

In [None]:
cols_to_scale = list(X)
cols_to_scale.sort()

In [None]:
#performing z-score for normalizing the variables
sc = StandardScaler()
sc.fit(X[cols_to_scale])
X_Scaled = pd.DataFrame(sc.transform(X[cols_to_scale]), columns= X.columns)

In [None]:
#Validate the original output
X.describe()

Unnamed: 0,length(m),height(cm),issue_date_year,issue_date_month,issue_date_week,issue_date_day,issue_date_hour,issue_date_dayofyear,issue_date_dayofweek,issue_date_quarter,issue_date_isweekend,listing_date_year,listing_date_month,listing_date_week,listing_date_day,listing_date_hour,listing_date_dayofyear,listing_date_dayofweek,listing_date_quarter,listing_date_isweekend,listing_issue_diff,condition_0.0,condition_1.0,condition_2.0,condition_missing,color_type_Agouti,color_type_Apricot,color_type_Black,color_type_Black Brindle,color_type_Black Smoke,color_type_Black Tabby,color_type_Black Tiger,color_type_Blue,color_type_Blue Cream,color_type_Blue Merle,color_type_Blue Point,color_type_Blue Smoke,color_type_Blue Tabby,color_type_Blue Tick,color_type_Blue Tiger,...,color_type_Silver Tabby,color_type_Tan,color_type_Torbie,color_type_Tortie,color_type_Tortie Point,color_type_Tricolor,color_type_White,color_type_Yellow,color_type_Yellow Brindle,X1_0,X1_1,X1_2,X1_3,X1_4,X1_5,X1_6,X1_7,X1_8,X1_9,X1_10,X1_11,X1_12,X1_13,X1_14,X1_15,X1_16,X1_17,X1_18,X1_19,X2_0,X2_1,X2_2,X2_3,X2_4,X2_5,X2_6,X2_7,X2_8,X2_9,area(m)
count,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,...,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0
mean,0.502636,27.448832,2015.080121,6.824519,27.881332,15.750292,0.0,192.357757,3.01115,2.618031,0.294308,2017.426728,6.766805,27.523681,15.635818,14.20617,190.483487,3.063927,2.595784,0.291122,855.306786,0.333493,0.362058,0.226027,0.078422,0.000212,0.00069,0.245301,0.003504,0.001699,0.00292,5.3e-05,0.045237,0.000531,0.005522,0.001062,0.000319,0.020495,0.001115,0.000531,...,0.000956,0.071626,0.012849,0.019433,0.00138,0.024902,0.130243,0.007593,0.000796,0.570882,0.002708,0.001274,5.3e-05,0.000372,0.000159,0.000584,0.077519,0.002442,0.005628,0.000319,0.002814,0.003398,0.227249,0.000106,0.029415,0.032017,0.013805,0.029096,0.000159,0.000212,0.45094,0.009716,0.003398,0.06706,0.000637,0.000265,0.191568,0.00308,0.273123,0.1378
std,0.288705,13.019781,3.103141,3.300786,14.423269,8.798332,0.0,101.024393,2.015388,1.077423,0.455743,0.945423,3.599175,15.648101,8.81104,4.06714,110.171288,1.978714,1.155092,0.454292,1096.67499,0.471473,0.480608,0.418268,0.268842,0.014572,0.026264,0.430277,0.059095,0.041186,0.053962,0.007287,0.20783,0.023037,0.074106,0.032571,0.017846,0.141689,0.033374,0.023037,...,0.030901,0.257874,0.112626,0.138045,0.03713,0.15583,0.33658,0.086807,0.028211,0.494963,0.051968,0.035675,0.007287,0.019276,0.01262,0.024161,0.267421,0.049361,0.074811,0.017846,0.052974,0.058196,0.419066,0.010305,0.168971,0.176049,0.116683,0.168081,0.01262,0.014572,0.4976,0.098095,0.058196,0.250132,0.025234,0.016292,0.393546,0.05541,0.445575,0.109404
min,0.0,5.0,1994.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,2015.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,-76.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,16.1725,2014.0,4.0,16.0,8.0,0.0,110.0,1.0,2.0,0.0,2017.0,4.0,13.0,8.0,12.0,91.0,1.0,2.0,0.0,119.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049447
50%,0.5,27.34,2016.0,7.0,29.0,16.0,0.0,197.0,3.0,3.0,0.0,2017.0,7.0,30.0,16.0,15.0,206.0,3.0,3.0,0.0,392.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109425
75%,0.76,38.89,2017.0,10.0,40.0,23.0,0.0,277.0,5.0,4.0,1.0,2018.0,10.0,41.0,23.0,17.0,286.0,5.0,4.0,1.0,1117.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.204836
max,1.0,50.0,2019.0,12.0,53.0,31.0,0.0,366.0,6.0,4.0,1.0,2019.0,12.0,52.0,31.0,23.0,366.0,6.0,4.0,1.0,8056.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.4978


In [None]:
#Validate Scaled Output (Mean = 0, STDEV = 1)
X_Scaled.describe()

Unnamed: 0,length(m),height(cm),issue_date_year,issue_date_month,issue_date_week,issue_date_day,issue_date_hour,issue_date_dayofyear,issue_date_dayofweek,issue_date_quarter,issue_date_isweekend,listing_date_year,listing_date_month,listing_date_week,listing_date_day,listing_date_hour,listing_date_dayofyear,listing_date_dayofweek,listing_date_quarter,listing_date_isweekend,listing_issue_diff,condition_0.0,condition_1.0,condition_2.0,condition_missing,color_type_Agouti,color_type_Apricot,color_type_Black,color_type_Black Brindle,color_type_Black Smoke,color_type_Black Tabby,color_type_Black Tiger,color_type_Blue,color_type_Blue Cream,color_type_Blue Merle,color_type_Blue Point,color_type_Blue Smoke,color_type_Blue Tabby,color_type_Blue Tick,color_type_Blue Tiger,...,color_type_Silver Tabby,color_type_Tan,color_type_Torbie,color_type_Tortie,color_type_Tortie Point,color_type_Tricolor,color_type_White,color_type_Yellow,color_type_Yellow Brindle,X1_0,X1_1,X1_2,X1_3,X1_4,X1_5,X1_6,X1_7,X1_8,X1_9,X1_10,X1_11,X1_12,X1_13,X1_14,X1_15,X1_16,X1_17,X1_18,X1_19,X2_0,X2_1,X2_2,X2_3,X2_4,X2_5,X2_6,X2_7,X2_8,X2_9,area(m)
count,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,...,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0
mean,6.47801e-16,1.412463e-17,-1.003676e-15,7.030432e-16,-6.201199e-16,1.855677e-16,-8.279614e-16,8.66521e-16,6.141065e-16,-3.2546560000000003e-17,-1.78155e-16,8.22244e-16,-1.328046e-15,1.2715e-15,-1.288066e-15,1.114749e-16,3.747515e-16,1.624879e-15,-4.122581e-16,3.689028e-16,1.611762e-15,8.913027e-16,-2.474791e-16,-5.439122e-16,-5.320217e-16,-7.345925e-16,-2.987064e-15,-1.219176e-15,-1.37187e-15,3.654764e-19,-6.827335000000001e-17,1.039915e-15,-6.860585e-16,-3.147459e-16,4.956428e-16,7.069124e-16,-5.334651e-16,1.262446e-15,-1.482676e-15,7.581974e-16,...,6.830419e-16,9.302554000000001e-17,7.529757e-16,1.483068e-16,3.917207e-16,1.208553e-15,1.38972e-16,6.228573e-16,-1.973204e-16,-5.018433e-16,-1.43254e-15,-2.307922e-15,-2.080669e-15,2.398778e-16,1.388446e-15,2.431479e-16,-1.443632e-16,2.845764e-16,-6.464276e-16,2.11994e-16,9.978096e-17,-2.862565e-16,7.379087e-17,0.0,-1.794135e-16,-2.279748e-16,-6.981071e-16,9.243016000000001e-17,-8.146823e-15,2.018962e-16,8.986004000000001e-17,1.108956e-16,5.888297e-17,2.41462e-16,-5.702375e-16,1.373071e-16,1.496331e-16,1.384802e-16,-1.094144e-13,6.733019e-17
std,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,...,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,0.0,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027,1.000027
min,-1.153414,-0.05210778,-0.01785145,-0.05312251,-0.05839261,-0.5422889,-0.01030545,-0.1740872,-0.1818668,-0.1183134,-0.1731135,-0.01262188,-0.03571998,-0.007286857,-0.01928228,-0.01262188,-0.02417419,-0.2898855,-0.04948102,-0.0752328,-0.01457487,-0.9062527,-0.09905456,-0.05839261,-0.2681041,-0.02524979,-0.01629564,-0.4867885,-0.05557922,-0.6129835,-1.259587,-0.01457487,-0.02628151,-0.5701155,-0.05930113,-0.04125466,-0.05411842,-0.007286857,-0.2176711,-0.02304857,...,-0.02628151,-0.03496697,-0.09458472,-0.06448774,-0.03571998,-0.01457487,-0.03092948,-0.2777622,-0.1140892,-0.1407766,-0.03718051,-0.1598052,-0.3869711,-0.08746851,-0.02823237,-0.7073602,-0.7533529,-0.540403,-0.2917111,-1.724255,-1.676532,-1.494119,-1.894224,0.0,-0.6457934,-1.764633,-1.5018,-1.863797,-6.793336,-1.741047,-1.661121,-1.548484,-1.719945,-3.493007,-0.6408439,-1.6023,-1.381558,-1.695055,-2.566887,-0.849232
25%,-1.153414,-0.05210778,-0.01785145,-0.05312251,-0.05839261,-0.5422889,-0.01030545,-0.1740872,-0.1818668,-0.1183134,-0.1731135,-0.01262188,-0.03571998,-0.007286857,-0.01928228,-0.01262188,-0.02417419,-0.2898855,-0.04948102,-0.0752328,-0.01457487,-0.9062527,-0.09905456,-0.05839261,-0.2681041,-0.02524979,-0.01629564,-0.4867885,-0.05557922,-0.6129835,-0.8076044,-0.01457487,-0.02628151,-0.5701155,-0.05930113,-0.04125466,-0.05411842,-0.007286857,-0.2176711,-0.02304857,...,-0.02628151,-0.03496697,-0.09458472,-0.06448774,-0.03571998,-0.01457487,-0.03092948,-0.2777622,-0.1140892,-0.1407766,-0.03718051,-0.1598052,-0.3869711,-0.08746851,-0.02823237,-0.7073602,-0.7533529,-0.540403,-0.2917111,-0.8661153,-0.8809054,-0.9979238,-0.8152481,0.0,-0.6457934,-0.855734,-0.5736349,-0.8237832,-0.3480827,-0.8750881,-0.8666424,-1.043092,-0.9030132,-0.542452,-0.6408439,-0.7687535,-0.515803,-0.928168,-0.4513744,-0.6714171
50%,0.8669914,-0.05210778,-0.01785145,-0.05312251,-0.05839261,-0.5422889,-0.01030545,-0.1740872,-0.1818668,-0.1183134,-0.1731135,-0.01262188,-0.03571998,-0.007286857,-0.01928228,-0.01262188,-0.02417419,-0.2898855,-0.04948102,-0.0752328,-0.01457487,-0.9062527,-0.09905456,-0.05839261,-0.2681041,-0.02524979,-0.01629564,-0.4867885,-0.05557922,-0.6129835,-0.2593659,-0.01457487,-0.02628151,-0.5701155,-0.05930113,-0.04125466,-0.05411842,-0.007286857,-0.2176711,-0.02304857,...,-0.02628151,-0.03496697,-0.09458472,-0.06448774,-0.03571998,-0.01457487,-0.03092948,-0.2777622,-0.1140892,-0.1407766,-0.03718051,-0.1598052,-0.3869711,-0.08746851,-0.02823237,-0.7073602,-0.7533529,-0.540403,-0.2917111,-0.008359228,0.02838204,-0.005532604,0.04595292,0.0,-0.6457934,0.05316467,0.35453,0.07756203,0.2964426,-0.009129487,0.04133354,-0.03230817,0.1408436,0.1951866,-0.6408439,0.06479304,0.3499517,0.1582547,-0.4513744,-0.4224762
75%,0.8669914,-0.05210778,-0.01785145,-0.05312251,-0.05839261,-0.5422889,-0.01030545,-0.1740872,-0.1818668,-0.1183134,-0.1731135,-0.01262188,-0.03571998,-0.007286857,-0.01928228,-0.01262188,-0.02417419,-0.2898855,-0.04948102,-0.0752328,-0.01457487,1.103445,-0.09905456,-0.05839261,-0.2681041,-0.02524979,-0.01629564,-0.4867885,-0.05557922,1.631365,0.612755,-0.01457487,-0.02628151,-0.5701155,-0.05930113,-0.04125466,-0.05411842,-0.007286857,-0.2176711,-0.02304857,...,-0.02628151,-0.03496697,-0.09458472,-0.06448774,-0.03571998,-0.01457487,-0.03092948,-0.2777622,-0.1140892,-0.1407766,-0.03718051,-0.1598052,-0.3869711,-0.08746851,-0.02823237,1.413707,1.327399,-0.540403,-0.2917111,0.878776,0.8240085,0.9868586,0.8378619,0.0,1.548483,0.9620633,1.282695,0.8402388,0.6187053,0.8914675,0.8358125,0.978476,0.8670049,0.6869458,1.560443,0.8983395,1.215706,0.8612341,0.6063817,0.2386306
max,0.8669914,19.19099,56.01785,18.82441,17.12546,1.844036,97.03608,5.744248,5.49853,8.452128,5.776557,79.22752,27.99554,137.2334,51.86108,79.22752,41.36643,3.449638,20.20977,13.29207,68.61122,1.103445,10.09545,17.12546,3.729895,39.60429,61.36611,2.05428,17.99234,1.631365,3.290653,68.61122,38.04956,1.754031,16.86309,24.23969,18.478,137.2334,4.594087,43.38663,...,38.04956,28.59842,10.57253,15.50682,27.99554,68.61122,32.33162,3.600202,8.76507,7.103451,26.89581,6.257618,2.584172,11.43269,35.42033,1.413707,1.327399,1.850471,3.428049,1.732116,1.733296,1.483054,1.718861,0.0,1.548483,1.567996,1.282695,1.741584,1.263231,1.722788,1.743788,1.483868,1.593166,2.162223,1.560443,1.454037,1.215706,1.564213,1.664138,6.566106


In [None]:
sm = SMOTE(random_state = 2) 
X_Scaled_os, y_pet_os = sm.fit_sample(X_Scaled, y['pet_category'].ravel()) 
# X_Scaled_os, y_breed_os = sm.fit_sample(X_Scaled, y['breed_category'].ravel())



In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_Scaled, y, random_state = 2020, stratify=y, train_size = 0.8)

In [None]:
# MLP Classfier Model
mlp_breed = MLPClassifier(hidden_layer_sizes=(30, 8, 2), activation = 'relu', solver = 'adam', alpha = 0.0001, learning_rate = 'adaptive', learning_rate_init= 0.0005, max_iter= 2000, random_state= 2020, verbose = True)

In [None]:
mlp_breed.fit(X_Scaled, y['breed_category'])

Iteration 1, loss = 1.14801967
Iteration 2, loss = 1.03751937
Iteration 3, loss = 0.87623991
Iteration 4, loss = 0.74461400
Iteration 5, loss = 0.68703672
Iteration 6, loss = 0.65513451
Iteration 7, loss = 0.63221034
Iteration 8, loss = 0.61351476
Iteration 9, loss = 0.59752734
Iteration 10, loss = 0.58265863
Iteration 11, loss = 0.56824266
Iteration 12, loss = 0.55395912
Iteration 13, loss = 0.53976589
Iteration 14, loss = 0.52775202
Iteration 15, loss = 0.51580081
Iteration 16, loss = 0.50425011
Iteration 17, loss = 0.49456370
Iteration 18, loss = 0.48392698
Iteration 19, loss = 0.47460304
Iteration 20, loss = 0.46513494
Iteration 21, loss = 0.45517912
Iteration 22, loss = 0.44718875
Iteration 23, loss = 0.43859692
Iteration 24, loss = 0.43100198
Iteration 25, loss = 0.42255507
Iteration 26, loss = 0.41565653
Iteration 27, loss = 0.40844068
Iteration 28, loss = 0.40234934
Iteration 29, loss = 0.39593080
Iteration 30, loss = 0.38892555
Iteration 31, loss = 0.38327574
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(30, 8, 2), learning_rate='adaptive',
              learning_rate_init=0.0005, max_fun=15000, max_iter=2000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=2020, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=True,
              warm_start=False)

In [None]:
y_pred_breed = mlp_breed.predict(X_test)
breed_val = pd.DataFrame({'ACTUALS': y_test['breed_category'], 'PREDICTED': y_pred_breed})
pd.crosstab(breed_val['ACTUALS'], breed_val['PREDICTED'])

PREDICTED,0,1,2
ACTUALS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1740,60,0
1,127,1544,0
2,0,5,291


In [None]:
f1_score(breed_val['ACTUALS'], breed_val['PREDICTED'], average= 'weighted')

0.9489971902921917

In [None]:
mlp_pet = MLPClassifier(hidden_layer_sizes=(30, 8, 2), activation = 'relu', solver = 'adam', alpha = 0.0001, learning_rate = 'adaptive', learning_rate_init= 0.0005, max_iter= 2000, random_state= 2020, verbose = True)

In [None]:
mlp_pet.fit(X_Scaled_os, y_pet_os)

Iteration 1, loss = 1.40409023
Iteration 2, loss = 0.88234668
Iteration 3, loss = 0.60042901
Iteration 4, loss = 0.50784020
Iteration 5, loss = 0.46810762
Iteration 6, loss = 0.43835303
Iteration 7, loss = 0.41393099
Iteration 8, loss = 0.39282983
Iteration 9, loss = 0.37419701
Iteration 10, loss = 0.35680630
Iteration 11, loss = 0.34090776
Iteration 12, loss = 0.32722268
Iteration 13, loss = 0.31584526
Iteration 14, loss = 0.30442567
Iteration 15, loss = 0.29457555
Iteration 16, loss = 0.28601637
Iteration 17, loss = 0.27791093
Iteration 18, loss = 0.27072588
Iteration 19, loss = 0.26383869
Iteration 20, loss = 0.25816070
Iteration 21, loss = 0.25210085
Iteration 22, loss = 0.24751473
Iteration 23, loss = 0.24219497
Iteration 24, loss = 0.23829526
Iteration 25, loss = 0.23426175
Iteration 26, loss = 0.23066076
Iteration 27, loss = 0.22679414
Iteration 28, loss = 0.22374476
Iteration 29, loss = 0.21940828
Iteration 30, loss = 0.21489177
Iteration 31, loss = 0.21215269
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(30, 8, 2), learning_rate='adaptive',
              learning_rate_init=0.0005, max_fun=15000, max_iter=2000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=2020, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=True,
              warm_start=False)

In [None]:
y_pred_pet = mlp_pet.predict(X_test)
pet_val = pd.DataFrame({'ACTUALS': y_test['pet_category'], 'PREDICTED': y_pred_pet})
pd.crosstab(pet_val['ACTUALS'], pet_val['PREDICTED'])

PREDICTED,0,1,2,4
ACTUALS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,17,0,0,0
1,3,1256,177,1
2,0,113,1999,12
4,0,0,2,187


In [None]:
f1_score(pet_val['ACTUALS'], pet_val['PREDICTED'], average= 'weighted')

0.9178867234714158

#Test Data

In [None]:
test_df = pd.read_csv('testPetID.csv')
#Fill missing values for condition column
test_df['condition'].fillna('missing', inplace = True)

In [None]:
test_df['area(m)'] = test_df['length(m)'] * test_df['height(cm)'] /100

In [None]:
# converting respective columns into respective data_types
test_df = convert_to_datatype(test_df, ['pet_id', 'condition', 'color_type', 'X1', 'X2'], 'category')
test_df = convert_to_datatype(test_df, ['issue_date', 'listing_date'], 'datetime64[ns]')

In [None]:
# Convert time data into differences
test_df = create_datetime_features(test_df, ['issue_date', 'listing_date'])
test_df['listing_issue_diff'] = test_df['listing_date'] - test_df['issue_date']
test_df['listing_issue_diff'] = test_df['listing_issue_diff'].dt.days

#One-Hot Encoding Categorical Data 
test_df = pd.get_dummies(columns = ['condition', 'color_type', 'X1', 'X2'], data= test_df)

In [None]:
X_test_upload = test_df.drop( ['pet_id', 'issue_date', 'listing_date'] , axis = 1)

In [None]:
set(list(X_Scaled)) - set(list(X_test_upload))

{'X1_19', 'X1_3', 'color_type_Black Tiger', 'color_type_Brown Tiger'}

In [None]:
for i in ['X1_19', 'X1_3', 'color_type_Black Tiger', 'color_type_Brown Tiger']:
  X_test_upload[i] = 0
X_test_scaled = sc.transform(X_test_upload[cols_to_scale])

In [None]:
y_test_breed_op = mlp_breed.predict(X_test_scaled)
y_test_pet_op = mlp_pet.predict(X_test_scaled)
results = pd.DataFrame( { 'pet_id': test_df['pet_id'], 'breed_category': y_test_breed_op, 'pet_category': y_test_pet_op  })

In [None]:
results.head()

Unnamed: 0,pet_id,breed_category,pet_category
0,ANSL_75005,1,2
1,ANSL_76663,0,1
2,ANSL_58259,0,2
3,ANSL_67171,0,2
4,ANSL_72871,0,2


In [None]:
results.to_csv('result005.csv', index = False)