In [1]:
import cv2
import gc
import glob
import os
import json
import matplotlib.pyplot as plt
import warnings
import re

import numpy as np
import pandas as pd

from PIL import Image

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

warnings.filterwarnings("ignore")

plt.rcParams['figure.figsize'] = (12, 9)
plt.style.use('ggplot')

pd.options.display.max_rows = 64
pd.options.display.max_columns = 512

## Load Data

In [2]:
train = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv')
train['AdoptionSpeed'].astype(np.int32)
test = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')

In [3]:
from keras.applications.densenet import preprocess_input, DenseNet121
from tqdm import tqdm, tqdm_notebook

img_size = 256
batch_size = 16
pet_ids = train.index
n_batches = len(pet_ids) // batch_size + 1

def resize_to_square(im):
    old_size = im.shape[:2] # old_size is in (height, width) format
    ratio = float(img_size)/max(old_size)
    new_size = tuple([int(x*ratio) for x in old_size])
    # new_size should be in (width, height) format
    im = cv2.resize(im, (new_size[1], new_size[0]))
    delta_w = img_size - new_size[1]
    delta_h = img_size - new_size[0]
    top, bottom = delta_h//2, delta_h-(delta_h//2)
    left, right = delta_w//2, delta_w-(delta_w//2)
    color = [0, 0, 0]
    new_im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT,value=color)
    return new_im

def load_image(path, pet_id):
    image = cv2.imread(f'{path}{pet_id}-1.jpg')
    new_image = resize_to_square(image)
    new_image = preprocess_input(new_image)
    return new_image

from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D
import keras.backend as K
inp = Input((256,256,3))
backbone = DenseNet121(input_tensor = inp, 
                       weights="../input/densenet-keras/DenseNet-BC-121-32-no-top.h5",
                       include_top = False)
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
x = AveragePooling1D(4)(x)
out = Lambda(lambda x: x[:,:,0])(x)

m = Model(inp,out)

pet_ids = train['PetID'].values
n_batches = len(pet_ids) // batch_size + 1

features = {}
for b in tqdm_notebook(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size
    batch_pets = pet_ids[start:end]
    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
    for i,pet_id in enumerate(batch_pets):
        try:
            batch_images[i] = load_image("../input/petfinder-adoption-prediction/train_images/", pet_id)
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_pets):
        features[pet_id] = batch_preds[i]

train_id = train['PetID'].values
test_id = test['PetID'].values

vertex_xs = []
vertex_ys = []
bounding_confidences = []
bounding_importance_fracs = []
dominant_blues = []
dominant_greens = []
dominant_reds = []
dominant_pixel_fracs = []
dominant_scores = []
label_descriptions = []
label_scores = []
nf_count = 0
nl_count = 0
for pet in train_id:
    try:
        with open('../input/petfinder-adoption-prediction/train_metadata/' + pet + '-1.json', 'r') as f:
            data = json.load(f)
        vertex_x = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['x']
        vertex_xs.append(vertex_x)
        vertex_y = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['y']
        vertex_ys.append(vertex_y)
        bounding_confidence = data['cropHintsAnnotation']['cropHints'][0]['confidence']
        bounding_confidences.append(bounding_confidence)
        bounding_importance_frac = data['cropHintsAnnotation']['cropHints'][0].get('importanceFraction', -1)
        bounding_importance_fracs.append(bounding_importance_frac)
        dominant_blue = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['blue']
        dominant_blues.append(dominant_blue)
        dominant_green = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['green']
        dominant_greens.append(dominant_green)
        dominant_red = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['red']
        dominant_reds.append(dominant_red)
        dominant_pixel_frac = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['pixelFraction']
        dominant_pixel_fracs.append(dominant_pixel_frac)
        dominant_score = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['score']
        dominant_scores.append(dominant_score)
        if data.get('labelAnnotations'):
            label_description = data['labelAnnotations'][0]['description']
            label_descriptions.append(label_description)
            label_score = data['labelAnnotations'][0]['score']
            label_scores.append(label_score)
        else:
            nl_count += 1
            label_descriptions.append('nothing')
            label_scores.append(-1)
    except FileNotFoundError:
        nf_count += 1
        vertex_xs.append(-1)
        vertex_ys.append(-1)
        bounding_confidences.append(-1)
        bounding_importance_fracs.append(-1)
        dominant_blues.append(-1)
        dominant_greens.append(-1)
        dominant_reds.append(-1)
        dominant_pixel_fracs.append(-1)
        dominant_scores.append(-1)
        label_descriptions.append('nothing')
        label_scores.append(-1)

print(nf_count)
print(nl_count)
train.loc[:, 'vertex_x'] = vertex_xs
train.loc[:, 'vertex_y'] = vertex_ys
train.loc[:, 'bounding_confidence'] = bounding_confidences
train.loc[:, 'bounding_importance'] = bounding_importance_fracs
train.loc[:, 'dominant_blue'] = dominant_blues
train.loc[:, 'dominant_green'] = dominant_greens
train.loc[:, 'dominant_red'] = dominant_reds
train.loc[:, 'dominant_pixel_frac'] = dominant_pixel_fracs
train.loc[:, 'dominant_score'] = dominant_scores
train.loc[:, 'label_score'] = label_scores


vertex_xs = []
vertex_ys = []
bounding_confidences = []
bounding_importance_fracs = []
dominant_blues = []
dominant_greens = []
dominant_reds = []
dominant_pixel_fracs = []
dominant_scores = []
label_descriptions = []
label_scores = []
nf_count = 0
nl_count = 0
for pet in test_id:
    try:
        with open('../input/petfinder-adoption-prediction/test_metadata/' + pet + '-1.json', 'r') as f:
            data = json.load(f)
        vertex_x = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['x']
        vertex_xs.append(vertex_x)
        vertex_y = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['y']
        vertex_ys.append(vertex_y)
        bounding_confidence = data['cropHintsAnnotation']['cropHints'][0]['confidence']
        bounding_confidences.append(bounding_confidence)
        bounding_importance_frac = data['cropHintsAnnotation']['cropHints'][0].get('importanceFraction', -1)
        bounding_importance_fracs.append(bounding_importance_frac)
        dominant_blue = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['blue']
        dominant_blues.append(dominant_blue)
        dominant_green = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['green']
        dominant_greens.append(dominant_green)
        dominant_red = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['red']
        dominant_reds.append(dominant_red)
        dominant_pixel_frac = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['pixelFraction']
        dominant_pixel_fracs.append(dominant_pixel_frac)
        dominant_score = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['score']
        dominant_scores.append(dominant_score)
        if data.get('labelAnnotations'):
            label_description = data['labelAnnotations'][0]['description']
            label_descriptions.append(label_description)
            label_score = data['labelAnnotations'][0]['score']
            label_scores.append(label_score)
        else:
            nl_count += 1
            label_descriptions.append('nothing')
            label_scores.append(-1)
    except FileNotFoundError:
        nf_count += 1
        vertex_xs.append(-1)
        vertex_ys.append(-1)
        bounding_confidences.append(-1)
        bounding_importance_fracs.append(-1)
        dominant_blues.append(-1)
        dominant_greens.append(-1)
        dominant_reds.append(-1)
        dominant_pixel_fracs.append(-1)
        dominant_scores.append(-1)
        label_descriptions.append('nothing')
        label_scores.append(-1)

print(nf_count)
test.loc[:, 'vertex_x'] = vertex_xs
test.loc[:, 'vertex_y'] = vertex_ys
test.loc[:, 'bounding_confidence'] = bounding_confidences
test.loc[:, 'bounding_importance'] = bounding_importance_fracs
test.loc[:, 'dominant_blue'] = dominant_blues
test.loc[:, 'dominant_green'] = dominant_greens
test.loc[:, 'dominant_red'] = dominant_reds
test.loc[:, 'dominant_pixel_frac'] = dominant_pixel_fracs
test.loc[:, 'dominant_score'] = dominant_scores
test.loc[:, 'label_score'] = label_scores

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.


HBox(children=(IntProgress(value=0, max=938), HTML(value='')))


341
2
128


In [4]:
df = pd.concat([train,test],ignore_index=True)

In [5]:
train_sentiment_files = sorted(glob.glob('../input/petfinder-adoption-prediction/train_sentiment/*.json'))
test_sentiment_files = sorted(glob.glob('../input/petfinder-adoption-prediction/test_sentiment/*.json'))
sentimental_analysis = train_sentiment_files + test_sentiment_files

In [6]:
score=[]
magnitude=[]
petid=[]
for filename in sentimental_analysis:
    with open(filename, 'r') as f:
        sentiment_file = json.load(f)
        file_sentiment = sentiment_file['documentSentiment']
        file_score =  sentiment_file['documentSentiment']['score']
        file_magnitude = sentiment_file['documentSentiment']['magnitude']
        score.append(file_score)
        magnitude.append(file_magnitude)
        petid.append(filename.replace('.json','').replace('../input/petfinder-adoption-prediction/train_sentiment/', '').replace('../input/petfinder-adoption-prediction/test_sentiment/', ''))

In [7]:
score_dict = dict(zip(petid,score))
magnitude_dict = dict(zip(petid,magnitude))

In [8]:
df['Score'] = df['PetID'].map(score_dict)
df['Score'][df.Score.isnull()] = 0
df['Magnitude'] = df['PetID'].map(magnitude_dict)
df['Magnitude'][df.Magnitude.isnull()] = 0
df.set_index('PetID',inplace=True)

## Core features

In [9]:
df.isnull().sum()

AdoptionSpeed          3948
Age                       0
Breed1                    0
Breed2                    0
Color1                    0
Color2                    0
Color3                    0
Description              14
Dewormed                  0
Fee                       0
FurLength                 0
Gender                    0
Health                    0
MaturitySize              0
Name                   1560
PhotoAmt                  0
Quantity                  0
RescuerID                 0
State                     0
Sterilized                0
Type                      0
Vaccinated                0
VideoAmt                  0
bounding_confidence       0
bounding_importance       0
dominant_blue             0
dominant_green            0
dominant_pixel_frac       0
dominant_red              0
dominant_score            0
label_score               0
vertex_x                  0
vertex_y                  0
Score                     0
Magnitude                 0
dtype: int64

### Name 
Categorize to with meaningful name, with meaningless name and without name.

#### Meaningless Rule
1. 1 or 2 letters
2. With the word "NO" "NOT" "YET" "NAME"
3. Start with numbers

In [10]:
def namevaild(name):
    if name == np.nan:
        return 0
    elif len(str(name)) < 3:
        return 1
    elif re.match(u'[0-9]', str(name).lower()):
        return 1
    elif len(set(str(name).lower().split(' ')+['no','not','yet','male','female','unnamed'])) != len(set(str(name).lower().split(' ')))+6:
        return 1
    else:
        return 2
df['Name_state'] = df['Name'].apply(namevaild)

### Fee

Binning into 0, (0,50], (50,100], (100,200], (200,500], (500, +inf)

In [11]:
df['Fee_per_pet'] = df.Fee/df.Quantity

df['Fee_Bin']=pd.factorize(pd.cut(df.Fee_per_pet,bins=[0,0.01,50,100,200,500,3000],right=False))[0]
fee_bin_dummies_df = pd.get_dummies(df['Fee_Bin']).rename(columns=lambda x: 'Fee_Bin_' + str(x))
df = pd.concat([df, fee_bin_dummies_df], axis=1)

### Quantity

Binning to [1,2,4,22]

In [12]:
df['Quantity_Bin']=pd.factorize(pd.cut(df.Quantity,bins=[1,2,4,22],right=False))[0]
quantity_bin_dummies_df = pd.get_dummies(df['Quantity_Bin']).rename(columns=lambda x: 'Quantity_Bin_' + str(x))
df = pd.concat([df, quantity_bin_dummies_df], axis=1)

### VideoAmt & PhotoAmt

In [13]:
df.VideoAmt = df.VideoAmt.apply(lambda x: 1 if x > 0 else 0)

df['PhotoAmt_Bin']=pd.factorize(pd.cut(df.PhotoAmt,bins=[0,1,2,4,31],right=False))[0]
photo_bin_dummies_df = pd.get_dummies(df['PhotoAmt_Bin']).rename(columns=lambda x: 'PhotoAmt_Bin_' + str(x))
df = pd.concat([df, photo_bin_dummies_df], axis=1)

### State

In [14]:
def map_state(state):
    if state == 41326:
        return 'Selangor'
    elif state == 41401:
        return 'Kuala_Lumpur'
    else:
        return 'Other_State'
df['State_Bin'] = df.State.apply(map_state)
state_bin_dummies_df = pd.get_dummies(df['State_Bin']).rename(columns=lambda x: 'State_' + str(x))
df = pd.concat([df, state_bin_dummies_df], axis=1)

### Rescuer
Binning the saving number of animals in total

In [15]:
rescuer_dict = df.RescuerID.value_counts().to_dict()
df['Rescuer_Num'] = df.RescuerID.map(rescuer_dict)
#df['Rescuer_Bin']=pd.factorize(pd.cut(df.Rescuer_Num,bins=[1,2,5],right=False))[0]
#df['Rescuer_Bin'].value_counts()
#rescuer_bin_dummies_df = pd.get_dummies(df['Rescuer_Bin']).rename(columns=lambda x: 'Rescuer_Bin_' + str(x))
#df = pd.concat([df, rescuer_bin_dummies_df], axis=1)

### Breed

#### Mix or Pure
We consider a cat/dog is mixed breed if:
1. Breed1_name or Breed2_name is Mixed_Breed
2. Breed1_name is NA
3. Breed1_name != Breed2_name

In [16]:
breeds = pd.read_csv('../input/petfinder-adoption-prediction/breed_labels.csv')
breeds_dict = {k: v for k, v in zip(breeds['BreedID'], breeds['BreedName'])}
df['Breed1_name'] = df['Breed1'].apply(lambda x: '_'.join(breeds_dict[x].split()) if x in breeds_dict else 'NA')
df['Breed2_name'] = df['Breed2'].apply(lambda x: '_'.join(breeds_dict[x].split()) if x in breeds_dict else 'NA')

In [17]:
df['Breed'] = df['Breed1_name'] + '--' + df['Breed2_name']
def mix_breed(string):
    breed = string.split('--')
    if breed[0] in ['Mixed_Breed','NA']:
        return 1
    elif breed[1] == 'Mixed_Breed':
        return 1
    elif breed[1] == 'NA':
        return 0
    elif breed[0] != breed[1]:
        return 1
    else:
        return 0
df['Mixed_Breed'] = df.Breed.apply(mix_breed)
df[df.Mixed_Breed == 0].Breed.value_counts()

Domestic_Short_Hair--NA                                   4042
Domestic_Medium_Hair--NA                                  1264
Tabby--NA                                                  379
Domestic_Short_Hair--Domestic_Short_Hair                   320
Domestic_Long_Hair--NA                                     244
Shih_Tzu--NA                                               204
Poodle--NA                                                 153
Siamese--NA                                                152
Golden_Retriever--NA                                       123
Domestic_Medium_Hair--Domestic_Medium_Hair                 110
Calico--NA                                                 101
Persian--NA                                                100
American_Shorthair--NA                                      80
Oriental_Short_Hair--NA                                     79
Rottweiler--NA                                              76
Labrador_Retriever--NA                                 

### Description

In [18]:
df.Description[df.Description.isnull()] = ''
des_list = df.Description.values.tolist()

In [19]:
import unicodedata
import re

from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

'''
def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words 
'''

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    #words = replace_numbers(words)
    words = remove_stopwords(words)
    #words = stem_words(words)
    words = lemmatize_verbs(words)
    return words

'''
'''
word_bag = []

for i,item in enumerate(des_list):
    words = word_tokenize(item)
    words = normalize(words)
    word_bag.append(words)
df['Word_bag'] = word_bag

def wordjoin(x):
    return ' '.join(x)

df['Word_list'] = df['Word_bag'].apply(wordjoin)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer(min_df = 0.02)
transformer=TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(df.Word_list))
weight=tfidf.toarray()


In [21]:
from sklearn.decomposition import PCA

n_components = 25
pca = PCA(n_components=n_components, random_state=42)
pca.fit(weight)
text_feature = pca.transform(weight)

columns = []
for i in range(n_components):
    columns.append('text_feature_'+str(i+1))

In [22]:
df = pd.concat([df,pd.DataFrame(text_feature, index = df.index, columns = columns)],axis = 1)

## Baseline

In [23]:
df.head()

Unnamed: 0_level_0,AdoptionSpeed,Age,Breed1,Breed2,Color1,Color2,Color3,Description,Dewormed,Fee,FurLength,Gender,Health,MaturitySize,Name,PhotoAmt,Quantity,RescuerID,State,Sterilized,Type,Vaccinated,VideoAmt,bounding_confidence,bounding_importance,dominant_blue,dominant_green,dominant_pixel_frac,dominant_red,dominant_score,label_score,vertex_x,vertex_y,Score,Magnitude,Name_state,Fee_per_pet,Fee_Bin,Fee_Bin_0,Fee_Bin_1,Fee_Bin_2,Fee_Bin_3,Fee_Bin_4,Fee_Bin_5,Quantity_Bin,Quantity_Bin_0,Quantity_Bin_1,Quantity_Bin_2,PhotoAmt_Bin,PhotoAmt_Bin_0,PhotoAmt_Bin_1,PhotoAmt_Bin_2,PhotoAmt_Bin_3,State_Bin,State_Kuala_Lumpur,State_Other_State,State_Selangor,Rescuer_Num,Breed1_name,Breed2_name,Breed,Mixed_Breed,Word_bag,Word_list,text_feature_1,text_feature_2,text_feature_3,text_feature_4,text_feature_5,text_feature_6,text_feature_7,text_feature_8,text_feature_9,text_feature_10,text_feature_11,text_feature_12,text_feature_13,text_feature_14,text_feature_15,text_feature_16,text_feature_17,text_feature_18,text_feature_19,text_feature_20,text_feature_21,text_feature_22,text_feature_23,text_feature_24,text_feature_25
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1
86e1089a3,2.0,3,299,0,1,7,0,Nibble is a 3+ month old ball of cuteness. He ...,2,100,1,1,1,1,Nibble,1.0,1,8480853f516546f6cf33aa88cd76c379,41326,2,2,2,0,0.8,1.0,21,20,0.39391,25,0.302789,0.990786,359,479,0.3,2.4,2,100.0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,Selangor,0,0,1,8,Tabby,,Tabby--NA,0,"[nibble, 3, month, old, ball, cuteness, energe...",nibble 3 month old ball cuteness energetic pla...,-0.093209,0.209134,-0.06559,-0.089364,0.057371,-0.101052,-0.041325,0.077994,-0.120373,0.086416,-0.055779,0.080694,-0.067014,-0.02415,-0.008719,0.068664,0.107313,0.155906,0.035038,0.147614,0.197929,-0.168415,-0.149806,-0.041435,0.100738
6296e909a,0.0,1,265,0,1,2,0,I just found it alone yesterday near my apartm...,3,0,2,1,1,2,No Name Yet,2.0,1,3082c7125d8fb66f7dd4bff4192c8b14,41401,3,2,3,0,0.8,1.0,17,24,0.261856,19,0.348178,0.981269,398,376,-0.2,0.7,1,0.0,1,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,Kuala_Lumpur,1,0,0,1,Domestic_Medium_Hair,,Domestic_Medium_Hair--NA,0,"[find, alone, yesterday, near, apartment, shak...",find alone yesterday near apartment shake brin...,-0.072781,-0.028536,-0.009663,-0.030343,-0.023296,0.05103,0.015351,0.147047,-0.023531,0.088097,0.209926,0.015091,0.028685,0.001174,0.017918,-0.03507,-0.011162,-0.015263,-0.09923,0.002396,-0.001354,-0.015721,-0.103807,-0.019219,-0.056726
3422e4906,3.0,1,307,0,2,7,0,Their pregnant mother was dumped by her irresp...,1,0,2,1,1,2,Brisco,7.0,1,fa90fa5b1ee11c86938398b60abc32cb,41326,2,1,1,0,0.8,1.0,51,55,0.055064,61,0.333318,0.960457,299,399,0.2,3.7,2,0.0,1,0,1,0,0,0,0,0,1,0,0,2,0,0,1,0,Selangor,0,0,1,459,Mixed_Breed,,Mixed_Breed--NA,1,"[pregnant, mother, dump, irresponsible, owner,...",pregnant mother dump irresponsible owner roads...,-0.019952,-0.161695,0.063043,-0.010414,-0.120409,-0.101301,0.009107,-0.026191,-0.08525,-0.13686,-0.04211,0.006422,0.152133,-0.161359,0.091899,-0.098695,0.039399,0.127994,-0.108353,-0.010166,-0.146156,0.009666,0.119911,-0.067618,-0.069331
5842f1ff5,2.0,4,307,0,1,2,0,"Good guard dog, very alert, active, obedience ...",1,150,1,2,1,2,Miko,8.0,1,9238e4f44c71a75282e62f7136c6b240,41401,2,1,1,0,0.8,1.0,47,50,0.127818,54,0.136823,0.978698,399,299,0.9,0.9,2,150.0,0,1,0,0,0,0,0,0,1,0,0,2,0,0,1,0,Kuala_Lumpur,1,0,0,50,Mixed_Breed,,Mixed_Breed--NA,1,"[good, guard, dog, alert, active, obedience, w...",good guard dog alert active obedience wait goo...,-0.024435,-0.167924,-0.075133,0.067833,-0.058635,-0.103897,0.026455,-0.133483,0.07171,-0.187254,0.095608,0.122091,-0.053384,-0.035501,-0.147009,-0.039287,-0.008487,0.171649,-0.00583,-0.113098,0.020278,0.229791,-0.053664,-0.029351,0.087653
850a43f90,2.0,1,307,0,1,0,0,This handsome yet cute boy is up for adoption....,2,0,1,1,1,2,Hunter,3.0,1,95481e953f8aed9ec3d16fc4509537e8,41326,2,1,2,0,0.8,0.98,21,19,0.126334,24,0.256168,0.984346,517,426,0.6,3.7,2,0.0,1,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,Selangor,0,0,1,134,Mixed_Breed,,Mixed_Breed--NA,1,"[handsome, yet, cute, boy, adoption, playful, ...",handsome yet cute boy adoption playful pal see...,-0.038015,-0.0517,0.006714,-0.007782,0.134559,0.240326,-0.099964,0.180326,0.179231,-0.066217,-0.242733,0.173212,0.122368,-0.041389,-0.021115,0.076349,0.067746,0.035073,0.000975,0.1136,-0.015166,0.018511,-0.010924,0.028255,0.06501


In [24]:
df_copy = df.drop(columns=['Description','Fee','Fee_per_pet','Name','PhotoAmt','Quantity','RescuerID','State','State_Bin','Fee_Bin','Quantity_Bin','PhotoAmt_Bin','Breed','Breed1_name','Breed2_name','Word_bag','Word_list'])

train = df_copy[df.AdoptionSpeed.notnull()]
test  = df_copy[df.AdoptionSpeed.isnull()]
print(train.shape, test.shape)

(14993, 72) (3948, 72)


In [25]:
train.head()

Unnamed: 0_level_0,AdoptionSpeed,Age,Breed1,Breed2,Color1,Color2,Color3,Dewormed,FurLength,Gender,Health,MaturitySize,Sterilized,Type,Vaccinated,VideoAmt,bounding_confidence,bounding_importance,dominant_blue,dominant_green,dominant_pixel_frac,dominant_red,dominant_score,label_score,vertex_x,vertex_y,Score,Magnitude,Name_state,Fee_Bin_0,Fee_Bin_1,Fee_Bin_2,Fee_Bin_3,Fee_Bin_4,Fee_Bin_5,Quantity_Bin_0,Quantity_Bin_1,Quantity_Bin_2,PhotoAmt_Bin_0,PhotoAmt_Bin_1,PhotoAmt_Bin_2,PhotoAmt_Bin_3,State_Kuala_Lumpur,State_Other_State,State_Selangor,Rescuer_Num,Mixed_Breed,text_feature_1,text_feature_2,text_feature_3,text_feature_4,text_feature_5,text_feature_6,text_feature_7,text_feature_8,text_feature_9,text_feature_10,text_feature_11,text_feature_12,text_feature_13,text_feature_14,text_feature_15,text_feature_16,text_feature_17,text_feature_18,text_feature_19,text_feature_20,text_feature_21,text_feature_22,text_feature_23,text_feature_24,text_feature_25
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1
86e1089a3,2.0,3,299,0,1,7,0,2,1,1,1,1,2,2,2,0,0.8,1.0,21,20,0.39391,25,0.302789,0.990786,359,479,0.3,2.4,2,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,8,0,-0.093209,0.209134,-0.06559,-0.089364,0.057371,-0.101052,-0.041325,0.077994,-0.120373,0.086416,-0.055779,0.080694,-0.067014,-0.02415,-0.008719,0.068664,0.107313,0.155906,0.035038,0.147614,0.197929,-0.168415,-0.149806,-0.041435,0.100738
6296e909a,0.0,1,265,0,1,2,0,3,2,1,1,2,3,2,3,0,0.8,1.0,17,24,0.261856,19,0.348178,0.981269,398,376,-0.2,0.7,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,-0.072781,-0.028536,-0.009663,-0.030343,-0.023296,0.05103,0.015351,0.147047,-0.023531,0.088097,0.209926,0.015091,0.028685,0.001174,0.017918,-0.03507,-0.011162,-0.015263,-0.09923,0.002396,-0.001354,-0.015721,-0.103807,-0.019219,-0.056726
3422e4906,3.0,1,307,0,2,7,0,1,2,1,1,2,2,1,1,0,0.8,1.0,51,55,0.055064,61,0.333318,0.960457,299,399,0.2,3.7,2,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,459,1,-0.019952,-0.161695,0.063043,-0.010414,-0.120409,-0.101301,0.009107,-0.026191,-0.08525,-0.13686,-0.04211,0.006422,0.152133,-0.161359,0.091899,-0.098695,0.039399,0.127994,-0.108353,-0.010166,-0.146156,0.009666,0.119911,-0.067618,-0.069331
5842f1ff5,2.0,4,307,0,1,2,0,1,1,2,1,2,2,1,1,0,0.8,1.0,47,50,0.127818,54,0.136823,0.978698,399,299,0.9,0.9,2,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,50,1,-0.024435,-0.167924,-0.075133,0.067833,-0.058635,-0.103897,0.026455,-0.133483,0.07171,-0.187254,0.095608,0.122091,-0.053384,-0.035501,-0.147009,-0.039287,-0.008487,0.171649,-0.00583,-0.113098,0.020278,0.229791,-0.053664,-0.029351,0.087653
850a43f90,2.0,1,307,0,1,0,0,2,1,1,1,2,2,1,2,0,0.8,0.98,21,19,0.126334,24,0.256168,0.984346,517,426,0.6,3.7,2,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,134,1,-0.038015,-0.0517,0.006714,-0.007782,0.134559,0.240326,-0.099964,0.180326,0.179231,-0.066217,-0.242733,0.173212,0.122368,-0.041389,-0.021115,0.076349,0.067746,0.035073,0.000975,0.1136,-0.015166,0.018511,-0.010924,0.028255,0.06501


In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import cohen_kappa_score, make_scorer

kappa_scorer = make_scorer(cohen_kappa_score,weights='quadratic')

X_train = train.drop(columns = ['AdoptionSpeed'])
Y_train = train['AdoptionSpeed']
X_test = test.drop(columns = ['AdoptionSpeed'])

In [27]:
def test_rf_model(n_splits,params):
    X = train.drop(columns=['AdoptionSpeed'])
    y = train.AdoptionSpeed
    kf = KFold(n_splits=n_splits, shuffle=True)
    kf.get_n_splits([X,y])
    score = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        rf = RandomForestClassifier(random_state=42)
        rf.set_params(**params)
        rf.fit(X_train,y_train)
        y_test = y_test.astype(np.int32)
        y_learned = rf.predict(X_train).astype(np.int32)
        y_predict = rf.predict(X_test).astype(np.int32)
        print('On training set: ', cohen_kappa_score(y_train,y_learned,weights='quadratic'))
        score.append(cohen_kappa_score(y_test,y_predict,weights='quadratic'))
        print('On testing set: ', score[-1])
    print('The final score: ', np.mean(score))
    return rf,np.mean(score)

In [28]:
def test_gb_model(n_splits,params):
    X = train.drop(columns=['AdoptionSpeed'])
    y = train.AdoptionSpeed
    kf = KFold(n_splits=n_splits, shuffle=True)
    kf.get_n_splits([X,y])
    score = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        gb = GradientBoostingClassifier(random_state=42)
        gb.set_params(**params)
        gb.fit(X_train,y_train)
        y_test = y_test.astype(np.int32)
        y_learned = gb.predict(X_train).astype(np.int32)
        y_predict = gb.predict(X_test).astype(np.int32)
        print('On training set: ', cohen_kappa_score(y_train,y_learned,weights='quadratic'))
        score.append(cohen_kappa_score(y_test,y_predict,weights='quadratic'))
        print('On testing set: ', score[-1])
    print('The final score: ', np.mean(score))
    return gb,np.mean(score)

In [29]:
def test_ada_model(n_splits,params):
    X = train.drop(columns=['AdoptionSpeed'])
    y = train.AdoptionSpeed
    kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)
    kf.get_n_splits([X,y])
    score = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        ada = AdaBoostClassifier(random_state=42)
        ada.set_params(**params)
        ada.fit(X_train,y_train)
        y_test = y_test.astype(np.int32)
        y_learned = ada.predict(X_train).astype(np.int32)
        y_predict = ada.predict(X_test).astype(np.int32)
        print('On training set: ', cohen_kappa_score(y_train,y_learned,weights='quadratic'))
        score.append(cohen_kappa_score(y_test,y_predict,weights='quadratic'))
        print('On testing set: ', score[-1])

    print('The final score: ', np.mean(score))
    return ada,np.mean(score)

In [None]:
# Deep rf
rf1,rf1_score = test_rf_model(4,{'criterion': 'gini', 'max_depth': 13, 'n_estimators': 1800})
rf1_train= rf1.predict(X_train).astype(np.int32)
rf1_pred = rf1.predict(X_test).astype(np.int32)

# Shallow rf
rf2,rf2_score = test_rf_model(4,{'criterion': 'gini', 'max_depth': 10, 'n_estimators': 1500})
rf2_train= rf2.predict(X_train).astype(np.int32)
rf2_pred = rf2.predict(X_test).astype(np.int32)

# Deep gb
gb1,gb1_score = test_gb_model(4,{'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 5})
gb1_train= gb1.predict(X_train).astype(np.int32)
gb1_pred = gb1.predict(X_test).astype(np.int32)

# Shallow gb
gb2,gb2_score = test_gb_model(4,{'n_estimators': 100, 'learning_rate': 0.05, 'max_depth': 4})
gb2_train= gb2.predict(X_train).astype(np.int32)
gb2_pred = gb2.predict(X_test).astype(np.int32)

On training set:  0.9112312717044015
On testing set:  0.39396004643213345
On training set:  0.8871532180675413
On testing set:  0.37903767396095833
On training set:  0.8868436928284754
On testing set:  0.35026455091873643
On training set:  0.883950103010763
On testing set:  0.35101371682966476
The final score:  0.3685689970353732
On training set:  0.625671038661847
On testing set:  0.3356467054416472
On training set:  0.6367623526312529
On testing set:  0.34050834538607155
On training set:  0.6358911999166121
On testing set:  0.36765826004113233
On training set:  0.6284422674812976
On testing set:  0.3331156198967068
The final score:  0.3442322326913895
On training set:  0.7554347742344407
On testing set:  0.36642784883972235
On training set:  0.7514555683154787
On testing set:  0.3874620276396701


In [None]:
# Deep ada
ada1,ada1_score = test_ada_model(4,{'base_estimator':DecisionTreeClassifier(max_depth=4),'n_estimators': 150, 'learning_rate':0.05})
ada1_train= ada1.predict(X_train).astype(np.int32)
ada1_pred = ada1.predict(X_test).astype(np.int32)

# Shallow ada
ada2,ada2_score = test_ada_model(4,{'base_estimator':DecisionTreeClassifier(max_depth=3),'n_estimators': 150, 'learning_rate':0.05})
ada2_train= ada2.predict(X_train).astype(np.int32)
ada2_pred = ada2.predict(X_test).astype(np.int32)

In [None]:
train_list= [rf1_train,rf2_train,gb1_train,gb2_train,ada1_train,ada2_train]
pred_list = [rf1_pred,rf2_pred,gb1_pred,gb2_pred,ada1_pred,ada2_pred]

In [None]:
prediction = pd.DataFrame({'PetID': test.index})

for item in pred_list:
    prediction = pd.concat([prediction,pd.DataFrame({'AdoptionSpeed': item})],axis=1,ignore_index=True)
prediction.set_index(0,inplace=True)

In [None]:
validation = pd.DataFrame({'PetID': train.index})

for item in train_list:
    validation = pd.concat([validation,pd.DataFrame({'AdoptionSpeed': item})],axis=1,ignore_index=True)
validation.set_index(0,inplace=True)
validation['AdoptionSpeed'] = train['AdoptionSpeed']

In [None]:
rf = RandomForestClassifier(random_state=42,max_depth=6,n_estimators=100)
rf.fit(validation[[x for x in range(1,7)]],validation['AdoptionSpeed'])
prediction['AdoptionSpeed'] = rf.predict(prediction).astype(np.int32)

In [None]:
submission = prediction.drop(columns=[x for x in range(1,7)])
submission['PetID'] = submission.index
submission.reset_index(inplace=True)
submission = submission[['PetID','AdoptionSpeed']]
submission.head()

In [None]:
submission.to_csv('submission.csv', index=False)