In this kernel I want to demonstrate how to extract features from the pet images using a pretrained network. Since a pet often has no image at all, or several images with different resolutions and aspect ratios, I apply the following preprocessing steps:

- Take only the profile picture (if it exists; otherwise use an all-black image)
- pad to square aspect ratio
- resize to 256


In [None]:
import cv2
import pandas as pd
import numpy as np
import os
from tqdm import tqdm, tqdm_notebook
# Tabular training data; its PetID column is used below to enumerate the pets.
train_df = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv')
# Side length of the square images (see "resize to 256" above).
img_size = 256
# Number of pets handled per batch during feature extraction.
batch_size = 16

In [None]:
# PetIDs of the training rows, in the order they appear in train.csv.
pet_ids = train_df['PetID'].values
# Ceiling division. The previous `len(pet_ids) // batch_size + 1` scheduled one
# extra, empty batch whenever the row count was an exact multiple of batch_size.
n_batches = -(-len(pet_ids) // batch_size)

In [None]:
from keras.applications.densenet import preprocess_input, DenseNet121

In [None]:
#def resize_to_square(im):
#    old_size = im.shape[:2] # old_size is in (height, width) format
#    ratio = float(img_size)/max(old_size)
#    new_size = tuple([int(x*ratio) for x in old_size])
#    # new_size should be in (width, height) format
#    im = cv2.resize(im, (new_size[1], new_size[0]))
#    delta_w = img_size - new_size[1]
#    delta_h = img_size - new_size[0]
#    top, bottom = delta_h//2, delta_h-(delta_h//2)
#    left, right = delta_w//2, delta_w-(delta_w//2)
#    color = [0, 0, 0]
#    new_im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT,value=color)
#    return new_im

#def load_image(path, pet_id):
#    image = cv2.imread(f'{path}{pet_id}-1.jpg')
#    new_image = resize_to_square(image)
#    new_image = preprocess_input(new_image)
#    return new_image

Let's define our model for feature extraction. Normally DenseNet121 would output 1024 features after GlobalAveragePooling. To narrow this down further, I additionally average-pool every 4 consecutive features, which leaves 256 features per image.

In [None]:
#from keras.models import Model
#from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D
#import keras.backend as K
#inp = Input((256,256,3))
#backbone = DenseNet121(input_tensor = inp, include_top = False)
#x = backbone.output
#x = GlobalAveragePooling2D()(x)
#x = Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
#x = AveragePooling1D(4)(x)
#out = Lambda(lambda x: x[:,:,0])(x)

#m = Model(inp,out)

In [None]:
#features = {}
#for b in tqdm_notebook(range(n_batches)):
#    start = b*batch_size
#    end = (b+1)*batch_size
#    batch_pets = pet_ids[start:end]
#    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
#    for i,pet_id in enumerate(batch_pets):
#        try:
#            batch_images[i] = load_image("../input/train_images/", pet_id)
#        except:
#            pass
#    batch_preds = m.predict(batch_images)
#    for i,pet_id in enumerate(batch_pets):
#        features[pet_id] = batch_preds[i]

In [None]:
#train_feats = pd.DataFrame.from_dict(features, orient='index')

We save the features as a csv to disk, so others can link and join the data frame with their train.csv

In [None]:
#train_feats.to_csv('train_img_features.csv')
#Getting features from csv
# The extraction cells above are commented out; the image features are instead
# loaded from previously saved CSV files.
test_feats=pd.read_csv('../input/testimagefeatures/test_img_features.csv')
train_feats=pd.read_csv('../input/trainimagefeatures/train_img_features.csv')

In [None]:
def _drop_unnamed_columns(frame):
    """Return *frame* without the columns whose label contains 'unnamed' (any case)."""
    is_unnamed = frame.columns.str.contains('unnamed', case=False)
    return frame.drop(frame.columns[is_unnamed], axis=1)


# Remove the "unnamed" column(s) from both image-feature tables.
train_feats = _drop_unnamed_columns(train_feats)
test_feats = _drop_unnamed_columns(test_feats)

In [None]:
train_feats.head()

We then repeat the same procedure for the test images.

In [None]:
# Tabular test data; its PetID column is read in the next cell.
test_df = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')

In [None]:
# Rebind pet_ids to the test-set PetIDs (it previously held the training ones).
pet_ids = test_df['PetID'].values
#n_batches = len(pet_ids) // batch_size + 1

In [None]:
#features = {}
#for b in tqdm_notebook(range(n_batches)):
#    start = b*batch_size
#    end = (b+1)*batch_size
#    batch_pets = pet_ids[start:end]
#    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
#    for i,pet_id in enumerate(batch_pets):
#        try:
#            batch_images[i] = load_image("../input/test_images/", pet_id)
#        except:
#            pass
#    batch_preds = m.predict(batch_images)
#    for i,pet_id in enumerate(batch_pets):
#        features[pet_id] = batch_preds[i]

In [None]:
#test_feats = pd.DataFrame.from_dict(features, orient='index')

In [None]:
#test_feats.to_csv('test_img_features.csv')
test_feats.head()

In [None]:
# Load the tabular training data for the feature-engineering steps below
# (the same file that train_df was read from).
train_set=pd.read_csv("../input/petfinder-adoption-prediction/train/train.csv")
# Show the first rows as the cell output.
train_set.head()

In [None]:
train_set.describe()

In [None]:
# Count dogs vs. cats: Type == 1 is counted as a dog, every other value as a cat.
lst = train_set["Type"].tolist()
# Test each Type value directly. The earlier loop used the value as a list
# index (lst[i]), so a row was classified by whatever Type happened to be
# stored at position 1 or 2 of the list rather than by its own Type.
dog = sum(1 for pet_type in lst if pet_type == 1)
cat = len(lst) - dog
print("Dog:"+str(dog)+"Cat:"+str(cat))
# NOTE(review): the figures recorded here before (dogs=6861, cats=8132) came
# from the index-based loop and are probably swapped -- re-run to confirm.

In [None]:
#divide into output and input
# X_train: every column except the target; y_train: the AdoptionSpeed target.
X_train=train_set.drop(columns=["AdoptionSpeed"])
y_train=train_set.AdoptionSpeed

In [None]:
# Inclusive (low, high) Age bands; the position in the tuple is the bin label.
# The last band has no upper limit.
age_bands = (
    (0, 6),
    (7, 12),
    (13, 36),
    (37, 60),
    (61, 96),
    (97, float("inf")),
)

l = X_train["Age"].tolist()
newlst = []
for age in l:
    for label, (low, high) in enumerate(age_bands):
        if low <= age <= high:
            newlst.append(label)
            break
    # A value that matches no band (e.g. negative or NaN) adds nothing to newlst.

In [None]:
# Replace the raw Age column with its binned version.
# newlst must hold exactly one label per row for this assignment to succeed.
X_train["Age binned"]=newlst
X_train=X_train.drop(columns=["Age"])

In [None]:
X_train

In [None]:
# Bin Fee into six ordinal classes and swap the raw column for the binned one.
# Inclusive (low, high) bands; the position in the tuple is the bin label.
# Band 0 is exactly 0 and the last band has no upper limit.
fee_bands = (
    (0, 0),
    (1, 25),
    (26, 60),
    (61, 108),
    (109, 210),
    (211, float("inf")),
)

feelst = []
l = X_train["Fee"].tolist()
for fee in l:
    for label, (low, high) in enumerate(fee_bands):
        if low <= fee <= high:
            feelst.append(label)
            break
    # A value that matches no band adds nothing to feelst.

X_train["Fee binned"] = feelst
X_train = X_train.drop(columns=["Fee"])

In [None]:
# Turn the free-text Description into a single numeric sentiment-polarity feature.
from textblob import TextBlob

desc = X_train["Description"].tolist()
# str() makes non-string entries (such as missing values) acceptable to TextBlob.
l = [TextBlob(str(text)).sentiment.polarity for text in desc]
X_train["Description Polarity"] = l

# Keep the PetIDs aside before the identifier/text columns are removed.
Pid = X_train.PetID
X_train = X_train.drop(columns=["Description", "RescuerID", "Name", "PetID"])

In [None]:
#one hot encoding state names
from sklearn.preprocessing import LabelBinarizer
# NOTE(review): `lb` is not used by any live code in this cell; the encoding
# below is done with pd.get_dummies instead.
lb = LabelBinarizer()
#X_train['States'] = lb.fit_transform(X_train['State']).tolist()
# Append one indicator column per distinct State value (labelled with the
# state value itself), then drop the original State column.
X_train = pd.concat([X_train, pd.get_dummies(X_train['State'])], axis=1)
X_train=X_train.drop(columns=["State"])

In [None]:
#getting correlations
# Correlation of every feature column with AdoptionSpeed, largest first.
X_train.join(y_train).corr()["AdoptionSpeed"].sort_values(ascending=False)

In [None]:
#add images to the df
#X_train.join(train_feats)

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Drop the one-hot column for state value 41415 before splitting.
# NOTE(review): presumably removed because that state does not occur in the
# test set -- confirm. Re-running this cell raises KeyError once it is gone.
X_train=X_train.drop(columns=[41415])
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
xTrain, xTest, yTrain, yTest = train_test_split(X_train, y_train, test_size = 0.2, random_state = 1)
#X_test=X_test.drop(columns=[41415])

In [None]:
from textblob import TextBlob
from sklearn.preprocessing import LabelBinarizer

# Apply to the test set the same feature engineering that was applied to X_train.
testdf = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')

# Bin Age into the same six bands as the training data:
# 0-6, 7-12, 13-36, 37-60, 61-96 and 97+.
# pd.cut returns exactly one label per row, so the new column can never come
# out shorter than the frame (the if/elif loop silently skipped any value that
# matched no band, which made the assignment below fail).
newlst = pd.cut(
    testdf["Age"],
    bins=[-1, 6, 12, 36, 60, 96, np.inf],
    labels=False,
).tolist()
testdf["Age binned"] = newlst
testdf = testdf.drop(columns=["Age"])

# Bin Fee into the same six bands as the training data:
# 0, 1-25, 26-60, 61-108, 109-210 and 211+.
feelst = pd.cut(
    testdf["Fee"],
    bins=[-1, 0, 25, 60, 108, 210, np.inf],
    labels=False,
).tolist()
testdf["Fee binned"] = feelst
testdf = testdf.drop(columns=["Fee"])

# Sentiment polarity of the description; str() makes non-string entries
# (such as missing values) acceptable to TextBlob.
desc = testdf["Description"].tolist()
l = [TextBlob(str(text)).sentiment.polarity for text in desc]
testdf["Description Polarity"] = l
testdf = testdf.drop(columns=["Description", "RescuerID", "Name", "PetID"])

# One-hot encode State, one indicator column per distinct value.
lb = LabelBinarizer()  # kept for parity with the training cell; unused here
testdf = pd.concat([testdf, pd.get_dummies(testdf['State'])], axis=1)
testdf = testdf.drop(columns=["State"])

# Align with the training feature matrix. get_dummies only creates columns for
# the states that occur in this file, so the set and order of indicator columns
# can differ from X_train. Training columns missing here are added as all-zero;
# columns the model was never trained on are dropped.
testdf = testdf.reindex(columns=X_train.columns, fill_value=0)



In [None]:
#concatenating image features to get the final set of training features
# Index-based join; the image features get a fresh 0..n-1 index first.
# NOTE(review): assumes train_feats rows are in the same order as train.csv -- confirm.
TrainFeatures=X_train.join(train_feats.reset_index(drop=True))

In [None]:
#concatenating image features to get the final set of test features
# Index-based join; the image features get a fresh 0..n-1 index first.
# NOTE(review): assumes test_feats rows are in the same order as test.csv -- confirm.
TestFeatures=testdf.join(test_feats.reset_index(drop=True))

In [None]:
TrainFeatures.head()

In [None]:
TestFeatures.head()

In [None]:
# Correlation of every tabular and image feature with AdoptionSpeed, largest first.
TrainFeatures.join(y_train).corr()["AdoptionSpeed"].sort_values(ascending=False)

In [None]:
#image train-test
# 80/20 split of the image features, using the same test_size and random_state
# as the tabular split.
imgX_train, imgX_test, imgY_train, imgY_test = train_test_split(train_feats, y_train, test_size = 0.2, random_state = 1)

In [None]:
# Report the (rows, columns) shape of each image-feature split.
print('Training shape:', imgX_train.shape)
#print(X_train.shape[0], 'sample,',X_train.shape[1] ,'x',X_train.shape[2] ,'size grayscale image.\n')
print('Test shape:', imgX_test.shape)
#print(X_test.shape[0], 'sample,',X_test.shape[1] ,'x',X_test.shape[2] ,'size grayscale image.\n')

In [None]:
imgX_test

In [None]:
import tensorflow as tf
import warnings

In [None]:
imgX_train.shape

In [None]:
from tensorflow import keras
# Classifier on the image features alone: two ReLU hidden layers followed by a
# 5-way softmax output.
# The input width is taken from the training matrix instead of being
# hard-coded to 256, so the network still builds if the number of
# image-feature columns changes.
model = keras.Sequential([
    keras.layers.Dense(256, activation='relu', input_shape=(imgX_train.shape[1],)),
    keras.layers.Dense(128, activation=tf.nn.relu),
    keras.layers.Dense(5, activation=tf.nn.softmax)
])

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# and accuracy as the reported metric.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
X_train.shape

In [None]:
# Train the image-feature model for 50 epochs on the training split.
model.fit(imgX_train, imgY_train , epochs=50)

In [None]:
# Loss and accuracy of the image-feature model on the held-out split.
test_loss, test_acc = model.evaluate(imgX_test, imgY_test)
print('Test accuracy:', test_acc)

In [None]:
predictions = model.predict(imgX_test)

In [None]:
predictions[0]

In [None]:
# Predicted class per row: the index of the highest softmax output.
allp = [np.argmax(row_probs) for row_probs in predictions]

In [None]:
# PetID column of the test set, read straight from test.csv.
test_rows = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')
Pid = test_rows.PetID

In [None]:
# Start the result table as a one-column frame holding the test PetIDs.
output=pd.DataFrame(Pid)

In [None]:
Pid.shape

In [None]:
# Score the test-set image features and keep the most probable class per row.
predictions = model.predict(test_feats)
allp = [np.argmax(row_probs) for row_probs in predictions]


In [None]:
len(allp)

In [None]:
# Store the image-model class predictions next to the PetIDs.
output['AdoptionRate']=allp

In [None]:
output.head()

In [None]:
#output.to_csv('submission.csv', sep=',', encoding='utf-8')

In [None]:
# Re-index the prediction table by PetID. set_index returns a new frame, so the
# preceding output.copy(), whose result was immediately overwritten, is dropped.
imgoutput=output.set_index("PetID")

In [None]:
output.head()

In [None]:
#output.to_csv('submission.csv', sep=',', encoding='utf-8')

In [None]:
#checking with all features
y_train.shape

In [None]:
#training on features
xTrain.shape

In [None]:
#X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size = 0.2, random_state = 1)

In [None]:
# Classifier on the tabular features: two ReLU hidden layers followed by a
# 5-way softmax output. Rebinding `model` replaces the image-feature network.
# The input width is taken from the training matrix instead of being
# hard-coded to 32, so the network still builds if the number of engineered
# columns changes (e.g. a different set of one-hot state columns).
model = keras.Sequential([
    keras.layers.Dense(32, activation='relu', input_shape=(xTrain.shape[1],)),
    keras.layers.Dense(128, activation=tf.nn.relu),
    keras.layers.Dense(5, activation=tf.nn.softmax)
])

In [None]:
from sklearn.model_selection import cross_val_score
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# and accuracy as the reported metric.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train the tabular-feature model for 25 epochs on the training split.
model.fit(xTrain, yTrain , epochs=25)

In [None]:
# Score the engineered test-set features and keep the most probable class per row.
predictions = model.predict(testdf)
allp = [np.argmax(row_probs) for row_probs in predictions]

In [None]:
allp

In [None]:
# Loss and accuracy of the tabular-feature model on the held-out split.
test_loss, test_acc = model.evaluate(xTest, yTest)
print('Test accuracy:', test_acc)

In [None]:
#cross validation
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification


def build_feature_network(optimizer='adam'):
    """Build and compile a fresh, untrained tabular-feature classifier.

    The scikit-learn wrapper calls this each time it needs a new model,
    e.g. once per cross-validation fold.

    Args:
        optimizer: Optimizer name or instance passed to ``compile``. Exposed
            as a parameter so a hyperparameter search can vary it.

    Returns:
        A compiled ``keras.Sequential`` with the same layers as the
        tabular-feature model defined earlier in this notebook.
    """
    network = keras.Sequential([
        keras.layers.Dense(32, activation='relu', input_shape=(xTrain.shape[1],)),
        keras.layers.Dense(128, activation=tf.nn.relu),
        keras.layers.Dense(5, activation=tf.nn.softmax)
    ])
    network.compile(optimizer=optimizer,
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
    return network


# build_fn must be a callable that returns a compiled model. Passing the
# already-built `model` instance made the wrapper call the model itself with
# no inputs as soon as fit() was invoked.
neural_network = KerasClassifier(build_fn=build_feature_network,
                                 epochs=25,
                                 batch_size=100,
                                 verbose=0)

In [None]:
#cross_val_score(neural_network, xTrain, yTrain, cv=3)

In [None]:
# Candidate values for each tunable training setting.
epochs = [5, 10]
batches = [5, 10, 100]
optimizers = ['rmsprop', 'adam']

# Search grid, keyed by the keyword each list of candidates belongs to.
hyperparameters = {
    'optimizer': optimizers,
    'epochs': epochs,
    'batch_size': batches,
}

In [None]:
# Create grid search
#from sklearn.model_selection import GridSearchCV
#from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import cross_validate
#from sklearn.svm.libsvm import cross_validation
#grid = GridSearchCV(estimator=model, param_grid=hyperparameters)
#kf = cross_validation.KFold(len(xTrain), n_folds=10)
#score = cross_val_score(model, xTrain, yTrain, cv=kf ,n_jobs=-1,scoring="accuracy").mean()
# Fit grid search
#grid_result = grid.fit(xTrain, yTrain)

In [None]:
# View hyperparameters of best neural network
#grid_result.best_params_

In [None]:
# Wrap Keras model so it can be used by scikit-learn
def build_feature_network(optimizer='adam'):
    """Build and compile a fresh, untrained tabular-feature classifier.

    Args:
        optimizer: Optimizer name or instance passed to ``compile``.

    Returns:
        A compiled ``keras.Sequential`` with the same layers as the
        tabular-feature model defined earlier in this notebook.
    """
    network = keras.Sequential([
        keras.layers.Dense(32, activation='relu', input_shape=(xTrain.shape[1],)),
        keras.layers.Dense(128, activation=tf.nn.relu),
        keras.layers.Dense(5, activation=tf.nn.softmax)
    ])
    network.compile(optimizer=optimizer,
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])
    return network


# build_fn must be a callable that returns a compiled model; handing it the
# already-built `model` instance fails once the wrapper tries to fit.
neural_network = KerasClassifier(build_fn=build_feature_network,
                                 epochs=50,
                                 batch_size=100,
                                 verbose=0)

In [None]:
# Evaluate neural network using three-fold cross-validation
#cross_val_score(neural_network, xTrain, yTrain, cv=3)

In [None]:
allp

In [None]:
# Overwrite the AdoptionRate column of `output` with the current `allp`.
# NOTE(review): `allp` was last filled from model.predict(testdf), so this
# replaces the image-model predictions stored in `output` earlier. `imgoutput`
# was derived from `output` before this point and looks unaffected -- confirm
# that overwriting here is intended.
output['AdoptionRate']=allp

In [None]:
# Add the tabular-model predictions as a second prediction column.
imgoutput['AdoptionRateFeat']=allp

In [None]:
imgoutput.head()

In [None]:
# Save the per-model predictions (indexed by PetID) for later inspection.
imgoutput.to_csv('combined.csv', sep=',', encoding='utf-8')

In [None]:
imgoutput.head()

In [None]:
# Bring in the predictions stored in random.csv as a third column; the
# assignment aligns rows on the PetID index.
random_preds = pd.read_csv('../input/random/random.csv')
imgoutput['Random'] = random_preds.set_index('PetID').AdoptionSpeed

In [None]:
imgoutput.head()

In [None]:
#creating a new list that takes voting from dataframe columns
# One list of predictions per model, all in imgoutput row order.
adimg = imgoutput.AdoptionRate.tolist()
adfeat = imgoutput.AdoptionRateFeat.tolist()
rand = imgoutput.Random.tolist()
voting = []
# Print the first (up to) five prediction triples as a sanity check. Slicing
# instead of range(5) avoids an IndexError when there are fewer than five rows.
for img_vote, feat_vote, rf_vote in zip(adimg[:5], adfeat[:5], rand[:5]):
    print(str(img_vote)+' '+str(feat_vote)+' '+str(rf_vote))

In [None]:
#adimg adfeat rand
# Majority vote over the three prediction lists.
#  - If the image-model vote matches either other vote, it is the majority.
#  - Otherwise the tabular-model vote is used: it is either the majority
#    (when it matches `rand`) or the tie-break when all three disagree.
# The former `elif adimg[i]==rand[i]` branch repeated an earlier condition and
# could never run, so it has been removed; the results are unchanged.
voting = [
    img_vote if (img_vote == feat_vote or img_vote == rf_vote) else feat_vote
    for img_vote, feat_vote, rf_vote in zip(adimg, adfeat, rand)
]

In [None]:
voting[0:10]

In [None]:
# Work on a copy so the per-model columns in imgoutput stay intact.
ans=imgoutput.copy()

In [None]:
ans.head()

In [None]:
# Keep only the voted result: add it as AdoptionSpeed and drop the three
# per-model prediction columns.
ans['AdoptionSpeed']=voting
ans=ans.drop(columns=['AdoptionRateFeat','Random','AdoptionRate'])

In [None]:
ans.head()

In [None]:
# Write the final predictions; the PetID index is written as the first column.
ans.to_csv('submission.csv', sep=',', encoding='utf-8')

In [None]:
ans.head()