# Kaggle Competition: Predict how quickly a pet is adopted

In [None]:
# Import Packages

#Dataframe packages
import json
import glob
import pandas as pd
import numpy as np
import os
from PIL import Image
import requests
from io import BytesIO
import cv2
import numpy as np
from collections import Counter
from functools import partial
import scipy as sp
import torch
from torchvision import datasets, models, transforms

#Plot packages
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# LightGBM
import lightgbm as lgb
import scipy as sp

# Load scikit's classifier library
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold,RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,OrdinalEncoder, StandardScaler,KBinsDiscretizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import cohen_kappa_score,mean_squared_error, accuracy_score, confusion_matrix, f1_score,classification_report

# Evaluation
from sklearn.metrics import cohen_kappa_score,make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

import xgboost as xgb


#Oversampling
from imblearn.over_sampling import RandomOverSampler
from sklearn.decomposition import PCA

## Sentiment Data

In [None]:
# Collect the sentiment JSON files; sorting keeps the order deterministic.
sentimental_analysis_train = sorted(glob.glob('../input/petfinder-adoption-prediction/train_sentiment/*.json'))
sentimental_analysis_test = sorted(glob.glob('../input/petfinder-adoption-prediction/test_sentiment/*.json'))

print('num of train sentiment files: {}'.format(len(sentimental_analysis_train)))
# The second count belongs to the test split (it was previously labelled "train").
print('num of test sentiment files: {}'.format(len(sentimental_analysis_test)))

In [None]:
def _load_document_sentiment(sentiment_files):
    """Build one row per sentiment file with its document-level sentiment.

    Parameters
    ----------
    sentiment_files : iterable of str
        Paths of sentiment JSON files. Each file must contain a
        ``documentSentiment`` object with ``score`` and ``magnitude`` keys.

    Returns
    -------
    pandas.DataFrame
        Columns ``PetID``, ``sentiment_document_score`` and
        ``sentiment_document_magnitude``.
    """
    pet_ids = []
    scores = []
    magnitudes = []

    for filename in sentiment_files:
        with open(filename, 'r') as f:
            document_sentiment = json.load(f)['documentSentiment']

        # Derive the PetID from the file name itself instead of stripping a
        # hard-coded directory prefix: the test loop used a prefix that did
        # not match the globbed paths, so test PetIDs kept the full path.
        pet_ids.append(os.path.splitext(os.path.basename(filename))[0])
        scores.append(document_sentiment['score'])
        magnitudes.append(document_sentiment['magnitude'])

    return pd.DataFrame({'PetID': pet_ids,
                         'sentiment_document_score': scores,
                         'sentiment_document_magnitude': magnitudes})


# Replace the file lists by the sentiment table of each split.
sentimental_analysis_train = _load_document_sentiment(sentimental_analysis_train)
sentimental_analysis_test = _load_document_sentiment(sentimental_analysis_test)

## Image Metadata

In [None]:
# Collect the image metadata JSON files; sorting keeps the order deterministic.
image_metadata_train = sorted(glob.glob('../input/petfinder-adoption-prediction/train_metadata/*.json'))
image_metadata_test = sorted(glob.glob('../input/petfinder-adoption-prediction/test_metadata/*.json'))
print('num of train metadata: {}'.format(len(image_metadata_train)))
# The second count belongs to the test split (it was previously labelled "train").
print('num of test metadata: {}'.format(len(image_metadata_test)))

In [None]:
def _label_annotation_features(metadata_files):
    """Aggregate the label annotations of every image into one row per pet.

    Files without a ``labelAnnotations`` key are skipped. The PetID is the
    part of the file name before the first ``-``.

    Parameters
    ----------
    metadata_files : iterable of str
        Paths of image metadata JSON files.

    Returns
    -------
    pandas.DataFrame
        One row per PetID with the topicality statistics and the cat/dog
        label counts.
    """
    descriptions = []
    topicalities = []
    image_ids = []

    for filename in metadata_files:
        with open(filename, 'r') as f:
            metadata = json.load(f)
        if 'labelAnnotations' not in metadata:
            continue
        annotations = metadata['labelAnnotations']
        topicalities.append(np.asarray([x['topicality'] for x in annotations]))
        descriptions.append([x['description'] for x in annotations])
        image_ids.append(os.path.splitext(os.path.basename(filename))[0])

    topicality = pd.DataFrame(topicalities).add_prefix('metadata_topicality_')
    description = pd.DataFrame(descriptions).add_prefix('metadata_description_')

    # Remember the raw columns by name. The statistics below must cover
    # exactly the topicality columns, whatever their number; a fixed
    # positional slice silently drops or adds columns when the count changes.
    topicality_cols = list(topicality.columns)
    description_cols = list(description.columns)

    frame = pd.concat([pd.DataFrame(image_ids, columns=['ImageId']), topicality, description], axis=1)
    frame['PetID'] = frame['ImageId'].str.split('-').str[0]

    # ---- per-image values -------------------------------------------------
    frame['metadata_topicality_mean'] = frame[topicality_cols].mean(axis=1)
    frame['metadata_topicality_max'] = frame[topicality_cols].max(axis=1)
    frame['metadata_topicality_min'] = frame[topicality_cols].min(axis=1)

    # Does the first label mention a cat / a dog?
    first_label = frame['metadata_description_0']
    frame['L_metadata_0_cat'] = first_label.str.contains("cat", na=False).astype(int)
    frame['L_metadata_0_dog'] = first_label.str.contains("dog", na=False).astype(int)

    # Does any label mention a cat / a dog? Only the description columns are
    # searched (not ids or numeric columns); missing labels become the text
    # "nan"/"None", which matches neither word.
    labels_as_text = frame[description_cols].astype(str)
    frame['L_metadata_any_cat'] = labels_as_text.apply(lambda col: col.str.contains('cat')).any(axis=1)
    frame['L_metadata_any_dog'] = labels_as_text.apply(lambda col: col.str.contains('dog')).any(axis=1)

    # ---- per-pet aggregation ----------------------------------------------
    # Compute every aggregate before writing any of them back, so that no
    # aggregate reads an already overwritten column.
    grouped = frame.groupby('PetID')
    per_pet = {
        'metadata_topicality_mean': grouped['metadata_topicality_mean'].transform('mean'),
        'metadata_topicality_max': grouped['metadata_topicality_max'].transform('max'),
        'metadata_topicality_min': grouped['metadata_topicality_min'].transform('min'),
        'metadata_topicality_0_mean': grouped['metadata_topicality_0'].transform('mean'),
        'metadata_topicality_0_max': grouped['metadata_topicality_0'].transform('max'),
        'metadata_topicality_0_min': grouped['metadata_topicality_0'].transform('min'),
        'L_metadata_0_cat_sum': grouped['L_metadata_0_cat'].transform('sum'),
        'L_metadata_0_dog_sum': grouped['L_metadata_0_dog'].transform('sum'),
        'L_metadata_any_cat_sum': grouped['L_metadata_any_cat'].transform('sum'),
        'L_metadata_any_dog_sum': grouped['L_metadata_any_dog'].transform('sum'),
    }
    for column, values in per_pet.items():
        frame[column] = values

    output_columns = ['PetID', 'metadata_topicality_max', 'metadata_topicality_mean', 'metadata_topicality_min',
                      'metadata_topicality_0_mean', 'metadata_topicality_0_max', 'metadata_topicality_0_min',
                      'L_metadata_0_cat_sum', 'L_metadata_0_dog_sum',
                      'L_metadata_any_cat_sum', 'L_metadata_any_dog_sum']
    # Every row of a pet now carries the same aggregates: keep the first one.
    return frame[output_columns].drop_duplicates('PetID')


image_labelannot_train = _label_annotation_features(image_metadata_train)
image_labelannot_test = _label_annotation_features(image_metadata_test)

In [None]:
def _color_features(metadata_files):
    """Aggregate the dominant-colour statistics of every image per pet.

    For each file the mean/min/max of the ``score`` and ``pixelFraction``
    values of its dominant colours are computed, then aggregated over all
    images of a pet (mean of means, min of mins, max of maxes). The PetID is
    the part of the file name before the first ``-``.

    Parameters
    ----------
    metadata_files : iterable of str
        Paths of image metadata JSON files.

    Returns
    -------
    pandas.DataFrame
        One row per PetID with six ``metadata_color_*`` columns.
    """
    columns = ['ImageId',
               'metadata_color_pixelfrac_mean', 'metadata_color_pixelfrac_min', 'metadata_color_pixelfrac_max',
               'metadata_color_score_mean', 'metadata_color_score_min', 'metadata_color_score_max']
    rows = []

    for filename in metadata_files:
        with open(filename, 'r') as f:
            metadata = json.load(f)

        colors = []
        if 'imagePropertiesAnnotation' in metadata:
            colors = metadata['imagePropertiesAnnotation']['dominantColors']['colors']

        if colors:
            scores = np.asarray([x['score'] for x in colors])
            fractions = np.asarray([x['pixelFraction'] for x in colors])
            stats = (fractions.mean(), fractions.min(), fractions.max(),
                     scores.mean(), scores.min(), scores.max())
        else:
            # No colour information for this image: record NaN rather than
            # re-using the values left over from the previous file.
            stats = (np.nan,) * 6

        # Use the bare file name as image id; stripping a hard-coded
        # directory prefix left the full path in the test ids.
        image_id = os.path.splitext(os.path.basename(filename))[0]
        rows.append((image_id,) + stats)

    frame = pd.DataFrame(rows, columns=columns)
    frame['PetID'] = frame['ImageId'].str.split('-').str[0]

    # Compute all per-pet aggregates first, then write them back.
    grouped = frame.groupby('PetID')
    per_pet = {}
    for kind in ('pixelfrac', 'score'):
        for stat in ('mean', 'min', 'max'):
            column = 'metadata_color_{}_{}'.format(kind, stat)
            per_pet[column] = grouped[column].transform(stat)
    for column, values in per_pet.items():
        frame[column] = values

    # Every row of a pet now carries the same aggregates: keep the first one.
    # ``columns=`` is used because the positional axis argument of ``drop``
    # is not accepted by recent pandas versions.
    return frame.drop_duplicates('PetID').drop(columns=['ImageId'])


image_properties_train = _color_features(image_metadata_train)
image_properties_test = _color_features(image_metadata_test)

## Image Quality 

In [None]:
import glob
import random

import pandas as pd
import numpy as np
import cv2
from tqdm import tqdm_notebook as tqdm

import torch
import torch.nn as nn
import torchvision
from torchvision.transforms import functional as F
from sklearn.decomposition import TruncatedSVD

import matplotlib.pyplot as plt

In [None]:
def get_profile_path(category):
    """Return a DataFrame listing the first image (``*-1.jpg``) of every pet.

    ``category`` selects the ``<category>_images`` directory. The result has
    one row per matching file, in sorted path order, with the columns
    ``PetID`` (file name up to the first ``-``) and ``path``.
    """
    pattern = '../input/petfinder-adoption-prediction/%s_images/*-1.jpg' % category
    records = [
        {
            'PetID': image_path.rsplit('/', 1)[-1].partition('-')[0],
            'path': image_path,
        }
        for image_path in sorted(glob.glob(pattern))
    ]
    return pd.DataFrame(records)

train = get_profile_path('train')
test = get_profile_path('test')

In [None]:
# Side length (in pixels) of the images fed to the networks.
size = 224

def resize_to_square(image, size):
    """Scale ``image`` so that its longer side becomes ``size`` pixels.

    The aspect ratio is kept, so the result is generally not square yet.

    Parameters
    ----------
    image : numpy.ndarray
        Image array whose first two axes are height and width.
    size : int
        Target length of the longer side.

    Returns
    -------
    numpy.ndarray
        The resized image.
    """
    h, w = image.shape[:2]
    ratio = size / max(h, w)
    # The interpolation flag must be passed by keyword: the third positional
    # parameter of cv2.resize is ``dst``, so a positional flag is not used
    # as the interpolation method.
    return cv2.resize(image, (int(w * ratio), int(h * ratio)), interpolation=cv2.INTER_AREA)

def image_to_tensor(image, normalize=None):
    """Convert an image array with channels on the last axis to a float32 tensor.

    ``uint8`` input is divided by 255; any other dtype is used as is. The
    last axis is moved to the front. When ``normalize`` is given, it is
    passed as keyword arguments to ``F.normalize``.
    """
    divisor = 255. if image.dtype == np.uint8 else 1
    channels_first = np.moveaxis(image / divisor, -1, 0)
    tensor = torch.from_numpy(channels_first.astype(np.float32))
    if normalize is None:
        return tensor
    return F.normalize(tensor, **normalize)

def pad(image, min_height, min_width):
    """Add a black border so the image is at least ``min_height`` x ``min_width``.

    The missing pixels of each dimension are split between both sides; when
    the amount is odd, the bottom/right side receives the extra pixel.
    Dimensions that are already large enough are left untouched.
    """
    def _margins(current, target):
        # (before, after) padding needed to grow ``current`` to ``target``.
        if not current < target:
            return 0, 0
        before = int((target - current) / 2.0)
        return before, target - current - before

    h, w, d = image.shape
    top, bottom = _margins(h, min_height)
    left, right = _margins(w, min_width)

    return cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0,0,0))


class Dataset(torch.utils.data.Dataset):
    """Dataset yielding one normalised image tensor per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``path`` column holding the image file paths.
    size : int
        Side length of the padded image returned by ``__getitem__``.
    """

    def __init__(self, df, size):
        self.df = df
        self.size = size

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load, resize, pad and normalise the image at position ``idx``.

        Raises
        ------
        OSError
            If the image file cannot be read.
        """
        row = self.df.iloc[idx]

        image = cv2.imread(row.path)
        if image is None:
            # cv2.imread signals failure by returning None; fail here with
            # the offending path instead of deep inside cvtColor.
            raise OSError('could not read image: {}'.format(row.path))

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = resize_to_square(image, self.size)
        image = pad(image, self.size, self.size)
        tensor = image_to_tensor(image, normalize={'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225]})

        return tensor

In [None]:
def extract_features_dense(df):
    """Extract DenseNet-121 image features for every row of ``df``.

    A forward hook on the model's ``features`` module records, for each
    batch, the spatial mean of that module's output.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``path`` (image file) and ``PetID``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` with ``IMAGE_<i>`` feature columns and a
        ``PetID`` column.
    """
    model = models.densenet121(pretrained=False)
    model.load_state_dict(torch.load('../input/pytorch-pretrained-image-models/densenet121.pth'))
    model = model.cuda()
    model.eval()

    # register hook to access to features in forward pass
    features = []

    def hook(module, input, output):
        N, C, H, W = output.shape
        output = output.reshape(N, C, -1)
        features.append(output.mean(dim=2).cpu().detach().numpy())

    handle = model._modules.get('features').register_forward_hook(hook)

    dataset = Dataset(df, size)
    loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=4)

    try:
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            for inputs in tqdm(loader, total=len(loader)):
                model(inputs.cuda())
    finally:
        # Always detach the hook, even if a batch fails.
        handle.remove()

    features = pd.DataFrame(np.concatenate(features)).add_prefix('IMAGE_')
    # The loader is not shuffled, so rows are in ``df`` order; assign by
    # position so a non-default index on ``df`` cannot misalign the ids.
    features['PetID'] = df['PetID'].values

    del model

    return features

In [None]:
def extract_features(df):
    """Extract ResNet-50 image features for every row of ``df``.

    A forward hook on the model's ``avgpool`` module records, for each
    batch, the spatial mean of that module's output.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``path`` (image file) and ``PetID``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` with ``IMAGE_<i>`` feature columns and a
        ``PetID`` column.
    """
    model = models.resnet50(pretrained=False)
    model.load_state_dict(torch.load('../input/pytorch-pretrained-image-models/resnet50.pth'))
    model = model.cuda()
    model.eval()

    # register hook to access to features in forward pass
    features = []

    def hook(module, input, output):
        N, C, H, W = output.shape
        output = output.reshape(N, C, -1)
        features.append(output.mean(dim=2).cpu().detach().numpy())

    handle = model._modules.get('avgpool').register_forward_hook(hook)

    dataset = Dataset(df, size)
    loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=4)

    try:
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            for inputs in tqdm(loader, total=len(loader)):
                model(inputs.cuda())
    finally:
        # Always detach the hook, even if a batch fails.
        handle.remove()

    features = pd.DataFrame(np.concatenate(features)).add_prefix('IMAGE_')
    # The loader is not shuffled, so rows are in ``df`` order; assign by
    # position so a non-default index on ``df`` cannot misalign the ids.
    features['PetID'] = df['PetID'].values

    del model

    return features

In [None]:
# Features from extract_features for the profile image of each pet.
features_train = extract_features(df=train)
features_test = extract_features(df=test)

# Features from extract_features_dense for the same images.
features_train_dense = extract_features_dense(df=train)
features_test_dense = extract_features_dense(df=test)

In [None]:
# Reduce both sets of image features to a few SVD components. Each SVD is
# fitted on the train features only and then applied to the test features.
n_components = 5

# PetID columns re-attached to the reduced features. The test ids must come
# from the test features (they were previously taken from the train frame).
petID_train = features_train[['PetID']]
petID_test = features_test[['PetID']]

# ---- features from extract_features ----
svd_ = TruncatedSVD(n_components=n_components)
svd_col = svd_.fit_transform(features_train.drop('PetID', axis=1))
svd_col = pd.DataFrame(svd_col).add_prefix('IMG_')
img_features = pd.concat([svd_col, petID_train], axis=1)

svd_col = svd_.transform(features_test.drop('PetID', axis=1))
svd_col = pd.DataFrame(svd_col).add_prefix('IMG_')
test_img_features = pd.concat([svd_col, petID_test], axis=1)

# ---- features from extract_features_dense ----
# Train rows get the train ids and test rows the test ids (the two id
# columns were previously swapped here).
petID = features_train_dense[['PetID']]
petID_test_dense = features_test_dense[['PetID']]

svd_ = TruncatedSVD(n_components=n_components)
svd_col = svd_.fit_transform(features_train_dense.drop('PetID', axis=1))
svd_col = pd.DataFrame(svd_col).add_prefix('IMG_DENSE_')
img_features_dense = pd.concat([svd_col, petID], axis=1)

svd_col = svd_.transform(features_test_dense.drop('PetID', axis=1))
svd_col = pd.DataFrame(svd_col).add_prefix('IMG_DENSE_')
test_img_features_dense = pd.concat([svd_col, petID_test_dense], axis=1)

# One table per split holding both sets of components.
X_train_temp = img_features.merge(img_features_dense, left_on='PetID', right_on='PetID', how='left')
X_test_temp = test_img_features.merge(test_img_features_dense, left_on='PetID', right_on='PetID', how='left')

In [None]:
def _image_quality_features(image_files):
    """Compute per-pet blur and pixel-count statistics from image files.

    For every image the variance of the Laplacian of its grayscale version
    (``blur``) and its pixel count, width * height (``pixel``), are
    measured. Both are then aggregated per pet with mean, min and max. The
    PetID is the part of the file name before the first ``-``.

    Parameters
    ----------
    image_files : iterable of str
        Paths of the image files.

    Returns
    -------
    pandas.DataFrame
        One row per PetID with the columns ``PetID``, ``pixel_mean``,
        ``blur_mean``, ``pixel_min``, ``blur_min``, ``pixel_max`` and
        ``blur_max``.
    """
    blur = []
    image_pixel = []
    image_ids = []

    for filename in image_files:
        # Blur
        image = cv2.imread(filename)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blur.append(cv2.Laplacian(gray, cv2.CV_64F).var())

        # Pixels
        with Image.open(filename) as picture:
            width, height = picture.size
        image_pixel.append(width * height)

        image_ids.append(os.path.splitext(os.path.basename(filename))[0])

    frame = pd.DataFrame({'ImageId': image_ids, 'blur': blur, 'pixel': image_pixel})
    frame['PetID'] = frame['ImageId'].str.split('-').str[0]

    for stat in ('mean', 'min', 'max'):
        frame['pixel_' + stat] = frame.groupby(['PetID'])['pixel'].transform(stat)
        frame['blur_' + stat] = frame.groupby(['PetID'])['blur'].transform(stat)

    # ``columns=`` is used because the positional axis argument of ``drop``
    # is not accepted by recent pandas versions.
    frame = frame.drop(columns=['blur', 'pixel', 'ImageId'])
    # Every row of a pet now carries the same aggregates: keep the first one.
    return frame.drop_duplicates('PetID')


image_quality_train = sorted(glob.glob('../input/petfinder-adoption-prediction/train_images/*.jpg'))
image_quality_test = sorted(glob.glob('../input/petfinder-adoption-prediction/test_images/*.jpg'))

# Replace the file lists by the quality table of each split.
image_quality_train = _image_quality_features(image_quality_train)
image_quality_test = _image_quality_features(image_quality_test)

## Quality Information: HU Moments

In [None]:
from math import copysign, log10

image_info_train = sorted(glob.glob('../input/petfinder-adoption-prediction/train_images/*.jpg'))
image_info_test = sorted(glob.glob('../input/petfinder-adoption-prediction/test_images/*.jpg'))


def _hu_moment_features(filenames, image_dir):
    """Return one row of log-scaled Hu moments per first picture ('*-1.jpg').

    Parameters:
        filenames: iterable of image paths; only paths ending in '-1.jpg' are used.
        image_dir: directory prefix stripped from each path to form the image id.

    Returns:
        DataFrame with columns ImageId, huMoments0..huMoments6 and PetID, where
        PetID is the part of ImageId before the first '-'.
    """
    rows = []
    for filename in filenames:
        if not filename.endswith("-1.jpg"):  # Take only the moments of picture 1
            continue
        image = cv2.imread(filename)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        hu = cv2.HuMoments(cv2.moments(gray)).flatten()
        # Log scale hu moments: -sign(h) * log10(|h|), rounded to 2 decimals
        scaled = [round(-1 * copysign(1.0, value) * log10(abs(value)), 2) for value in hu]
        image_id = filename.replace('.jpg', '').replace(image_dir, '')
        rows.append([image_id] + scaled)

    columns = ['ImageId'] + ['huMoments{}'.format(i) for i in range(7)]
    features = pd.DataFrame(rows, columns=columns)
    features['PetID'] = features['ImageId'].str.split('-').str[0]
    return features


# The train frame keeps ImageId here; it is dropped later together with the other
# non-feature columns when train_analysis is built.
image_moments_train = _hu_moment_features(
    image_info_train, '../input/petfinder-adoption-prediction/train_images/')

image_moments_test = _hu_moment_features(
    image_info_test, '../input/petfinder-adoption-prediction/test_images/')
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally.
image_moments_test = image_moments_test.drop(['ImageId'], axis=1)

In [None]:
# Raw competition tables
train = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv')
test = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')
sample_submission = pd.read_csv('../input/petfinder-adoption-prediction/test/sample_submission.csv')

# Label lookups (a pet can reference several breeds and several colors)
breed = pd.read_csv('../input/petfinder-adoption-prediction/breed_labels.csv', usecols=["BreedID", "BreedName"])
color = pd.read_csv('../input/petfinder-adoption-prediction/color_labels.csv')
state = pd.read_csv('../input/petfinder-adoption-prediction/state_labels.csv')


def _enrich(frame, per_pet_tables):
    """Attach breed/color/state labels, then left-join each per-pet table on PetID."""
    # Breed names for both breed slots
    for name_col, key_col in (('BreedName1', 'Breed1'), ('BreedName2', 'Breed2')):
        labels = breed.rename(columns={"BreedName": name_col})
        frame = frame.merge(labels, how='left', left_on=[key_col], right_on=['BreedID']).drop('BreedID', axis=1)

    # Color names for the three color slots
    for slot in (1, 2, 3):
        labels = color.rename(columns={"ColorName": "ColorName{}".format(slot)})
        frame = frame.merge(
            labels, how='left', left_on=['Color{}'.format(slot)], right_on=['ColorID']
        ).drop('ColorID', axis=1)

    # State labels (inner join, as in the original cell)
    frame = frame.merge(state, how='inner', left_on=['State'], right_on=['StateID']).drop('StateID', axis=1)

    # Sentiment, image metadata, Hu moments, image quality and pytorch features
    for table in per_pet_tables:
        frame = frame.merge(table, how='left', left_on=['PetID'], right_on=['PetID'])
    return frame


train = _enrich(train, [
    sentimental_analysis_train,
    image_properties_train,
    image_labelannot_train,
    image_moments_train,
    image_quality_train,
    X_train_temp,
])
test = _enrich(test, [
    sentimental_analysis_test,
    image_properties_test,
    image_labelannot_test,
    image_moments_test,
    image_quality_test,
    X_test_temp,
])

In [None]:
# Show the column labels of `train` after the merges above.
train.columns

## Open Source (State Information)

In [None]:
## Using the Kernel: https://www.kaggle.com/bibek777/stacking-kernels

# All lookups below are keyed by the numeric state id stored in the `State` column.

# state GDP: https://en.wikipedia.org/wiki/List_of_Malaysian_states_by_GDP
state_gdp = {
    41336: 116.679,
    41325: 40.596,
    41367: 23.02,
    41401: 190.075,
    41415: 5.984,
    41324: 37.274,
    41332: 42.389,
    41335: 52.452,
    41330: 67.629,
    41380: 5.642,
    41327: 81.284,
    41345: 80.167,
    41342: 121.414,
    41326: 280.698,
    41361: 32.270,
}

# state population: https://en.wikipedia.org/wiki/Malaysia
state_population = {
    41336: 33.48283,
    41325: 19.47651,
    41367: 15.39601,
    41401: 16.74621,
    41415: 0.86908,
    41324: 8.21110,
    41332: 10.21064,
    41335: 15.00817,
    41330: 23.52743,
    41380: 2.31541,
    41327: 15.61383,
    41345: 32.06742,
    41342: 24.71140,
    41326: 54.62141,
    41361: 10.35977,
}

state_area = {
    41336: 19102,
    41325: 9500,
    41367: 15099,
    41401: 243,
    41415: 91,
    41324: 1664,
    41332: 6686,
    41335: 36137,
    41330: 21035,
    41380: 821,
    41327: 1048,
    41345: 73631,
    41342: 124450,
    41326: 8104,
    41361: 13035,
}

# https://www.dosm.gov.my/
# Unemployment Rate in 2017
state_unemployment = {
    41336: 3.6,
    41325: 2.9,
    41367: 3.8,
    41324: 0.9,
    41332: 2.7,
    41335: 2.6,
    41330: 3.4,
    41380: 2.9,
    41327: 2.1,
    41345: 5.4,
    41342: 3.3,
    41326: 3.2,
    41361: 4.2,
    41415: 7.8,
    41401: 3.3,
}

# https://www.dosm.gov.my/
# per 1000 population in 2016
state_birth_rate = {
    41336: 16.3,
    41325: 17.0,
    41367: 21.4,
    41401: 14.4,
    41415: 18.1,
    41324: 16.0,
    41332: 16.4,
    41335: 17.0,
    41330: 14.4,
    41380: 17.5,
    41327: 12.7,
    41345: 13.7,
    41342: 13.9,
    41326: 16.6,
    41361: 23.3,
}

# Add one column per lookup to both frames, in the same order as before.
state_lookups = [
    ("state_gdp", state_gdp),
    ("state_population", state_population),
    ("state_area", state_area),
    ("state_unemployment", state_unemployment),
    ("state_birth_rate", state_birth_rate),
]
for frame in (train, test):
    for feature_name, lookup in state_lookups:
        frame[feature_name] = frame['State'].map(lookup)

In [None]:
# Show the column labels of `train`, now including the state_* columns.
train.columns

## Create Features based on Statistical Analysis

In [None]:
# Color flags: exactly one, exactly two, or all three color names present
has_color1 = train['ColorName1'].notna()
has_color2 = train['ColorName2'].notna()
has_color3 = train['ColorName3'].notna()
train['L_Color1'] = (~has_color3 & ~has_color2 & has_color1).astype(int)
train['L_Color2'] = (~has_color3 & has_color2 & has_color1).astype(int)
train['L_Color3'] = (has_color3 & has_color2 & has_color1).astype(int)

# Breed flags: only the first breed named, or both breeds named
has_breed1 = train['BreedName1'].notna()
has_breed2 = train['BreedName2'].notna()
train['L_Breed1'] = (~has_breed2 & has_breed1).astype(int)
train['L_Breed2'] = (has_breed2 & has_breed1).astype(int)

# Name: length and missing flag
train['Name_Length'] = train['Name'].str.len()
train['L_Name_missing'] = train['Name'].isna().astype(int)

# One indicator column per selected primary breed
# (the trailing space in the Golden Retriever column name is kept as-is)
breed_flags = [
    ('L_Breed1_Siamese', 'Siamese'),
    ('L_Breed1_Persian', 'Persian'),
    ('L_Breed1_Labrador_Retriever', 'Labrador Retriever'),
    ('L_Breed1_Terrier', 'Terrier'),
    ('L_Breed1_Golden_Retriever ', 'Golden Retriever'),
]
for flag_name, breed_name in breed_flags:
    train[flag_name] = train['BreedName1'].eq(breed_name).astype(int)

# Description length
train['Description_Length'] = train['Description'].str.len()

# Free adoption flag
train['L_Fee_Free'] = train['Fee'].eq(0).astype(int)

# Number of pets listed by each rescuer
pets_total = train.groupby(['RescuerID']).size().reset_index(name='N_pets_total')
train = train.merge(pets_total, on='RescuerID', how='inner')

# No photo / no video flags
train['L_NoPhoto'] = train['PhotoAmt'].eq(0).astype(int)
train['L_NoVideo'] = train['VideoAmt'].eq(0).astype(int)

# Log of (age + 1)
train['Log_Age'] = np.log(train['Age'] + 1)

# Negative document sentiment flag
train['L_scoreneg'] = train['sentiment_document_score'].lt(0).astype(int)

# Cap Quantity at 5
train['Quantity'] = train['Quantity'].clip(upper=5)

In [None]:
# Color (Create a Flag pet has 1 color, 2 colors, 3 colors)
test['L_Color1'] = (pd.isnull(test['ColorName3']) & pd.isnull(test['ColorName2']) & pd.notnull(test['ColorName1'])).astype(int)
test['L_Color2'] = (pd.isnull(test['ColorName3']) & pd.notnull(test['ColorName2']) & pd.notnull(test['ColorName1'])).astype(int)
test['L_Color3'] = (pd.notnull(test['ColorName3']) & pd.notnull(test['ColorName2']) & pd.notnull(test['ColorName1'])).astype(int)

# Breed (create a flag if the pet has 1 breed or 2)
test['L_Breed1'] = (pd.isnull(test['BreedName2']) & pd.notnull(test['BreedName1'])).astype(int)
test['L_Breed2'] = (pd.notnull(test['BreedName2']) & pd.notnull(test['BreedName1'])).astype(int)

# Name (length of the name and a flag for a missing name)
test['Name_Length'] = test['Name'].str.len()
test['L_Name_missing'] = (pd.isnull(test['Name'])).astype(int)

# Breed create columns (the trailing space in the Golden Retriever column name
# is kept so the test columns match the train columns)
test['L_Breed1_Siamese'] = (test['BreedName1'] == 'Siamese').astype(int)
test['L_Breed1_Persian'] = (test['BreedName1'] == 'Persian').astype(int)
test['L_Breed1_Labrador_Retriever'] = (test['BreedName1'] == 'Labrador Retriever').astype(int)
test['L_Breed1_Terrier'] = (test['BreedName1'] == 'Terrier').astype(int)
test['L_Breed1_Golden_Retriever '] = (test['BreedName1'] == 'Golden Retriever').astype(int)

# Description
test['Description_Length'] = test['Description'].str.len()

# Fee Amount
test['L_Fee_Free'] = (test['Fee'] == 0).astype(int)

# Add the Number of Pets per Rescuer
pets_total = test.groupby(['RescuerID']).size().reset_index(name='N_pets_total')
test = pd.merge(test, pets_total, left_on='RescuerID', right_on='RescuerID', how='inner')

# No photo
test['L_NoPhoto'] = (test['PhotoAmt'] == 0).astype(int)

# No Video
test['L_NoVideo'] = (test['VideoAmt'] == 0).astype(int)

# Log Age
test['Log_Age'] = np.log(test.Age + 1)

# Negative Score
test['L_scoreneg'] = (test['sentiment_document_score'] < 0).astype(int)

# Quantity Amount >5: cap at 5.
# The mask must come from `test` itself. The previous version used
# train['Quantity'], which is already capped in the train cell, so the mask was
# all False and the test quantities were never capped.
test.loc[test['Quantity'] > 5, 'Quantity'] = 5

In [None]:
# Lookup tables: pet age in months -> human-equivalent age, one table per
# animal type / dog size.
cat_human_age = {
    1: 0.5, 2: 3, 3: 4, 4: 6, 5: 8, 6: 10, 7: 12, 8: 14, 9: 15, 10: 16,
    11: 17, 12: 18, 24: 24, 48: 35, 72: 42, 96: 50, 120: 60, 144: 70,
    168: 80, 192: 84,
}
small_dog_human_age = {
    1: 1, 2: 2, 3: 2.5, 4: 3.5, 5: 4.3, 6: 5, 7: 6.3, 8: 7, 9: 9, 10: 11,
    11: 13, 12: 15, 24: 23, 48: 32, 72: 40, 96: 48, 120: 56, 144: 64,
    168: 72, 192: 80,
}
normal_dog_human_age = {
    1: 1, 2: 2, 3: 2.5, 4: 3.5, 5: 4.3, 6: 5, 7: 6.3, 8: 7, 9: 9, 10: 11,
    11: 13, 12: 15, 24: 24, 48: 34, 72: 42, 96: 51, 120: 60, 144: 69,
    168: 78, 192: 87,
}
big_dog_human_age = {
    1: 1, 2: 2, 3: 2.5, 4: 3.5, 5: 4.3, 6: 5, 7: 6.3, 8: 7, 9: 9, 10: 11,
    11: 13, 12: 14, 24: 22, 48: 34, 72: 45, 96: 55, 120: 66, 144: 77,
    168: 88, 192: 99,
}


def human_age(row):
    """Convert a pet's age in months into a human-equivalent age.

    `row` must provide 'Age' and 'Type'; 'MaturitySize' is read only when
    Type == 1. Type 2 uses the cat table, Type 1 with MaturitySize 1 or 3
    uses the small / big dog table, and everything else uses the normal dog
    table. Ages missing from the selected table fall back to a formula.
    """
    months = row['Age']
    if months == 0:
        return 0

    # Pick the lookup table for this animal
    is_cat = row['Type'] == 2
    if is_cat:
        table = cat_human_age
    elif row['Type'] == 1 and row['MaturitySize'] == 1:
        table = small_dog_human_age
    elif row['Type'] == 1 and row['MaturitySize'] == 3:
        table = big_dog_human_age
    else:
        table = normal_dog_human_age

    tabulated = table.get(months)
    if tabulated is not None:
        return tabulated

    # Fallback formulas for ages that are not tabulated
    if is_cat:
        if months < 25:
            return 25
        return (25 + ((months/12) - 2) * 4)
    if months < 24:
        return (months/12) * 11
    return (22 + ((months/12) - 2) * 4)

def lifestage(row):
    """Return the life-stage label for row['human_age'].

    Each stage covers ages strictly below its upper bound; anything at or
    above the last bound (and any value that compares False everywhere, such
    as NaN) is labelled 'Geriatic'.
    """
    age = row['human_age']
    stages = (
        (10, 'Kitten/Puppy'),
        (25, 'Junior'),
        (40, 'Prime'),
        (60, 'Mature'),
        (74, 'Senior'),
    )
    for upper_bound, label in stages:
        if age < upper_bound:
            return label
    return 'Geriatic'

In [None]:
# lifestage() compares human_age against thresholds expressed in years (10, 25,
# 40, ...), so it must see the raw value. Previously human_age was log1p-scaled
# first, which kept every value below 10 and labelled every pet 'Kitten/Puppy'.
raw_human_age = train.apply(human_age, axis=1)
train['human_age'] = raw_human_age
train['lifestage'] = train.apply(lifestage, axis=1)
# Store the log-scaled age as the model feature once the life stage is derived
train['human_age'] = raw_human_age.apply(np.log1p)

# Ordinal code per life stage. The result is assigned back instead of using a
# chained in-place replace, which may not update the frame in newer pandas.
mapper = {'Kitten/Puppy':1, 'Junior':2, 'Prime':3, 'Mature':4,'Senior':5,'Geriatic':6}
train['lifestage'] = train['lifestage'].map(mapper)

In [None]:
# lifestage() compares human_age against thresholds expressed in years (10, 25,
# 40, ...), so it must see the raw value. Previously human_age was log1p-scaled
# first, which kept every value below 10 and labelled every pet 'Kitten/Puppy'.
raw_human_age = test.apply(human_age, axis=1)
test['human_age'] = raw_human_age
test['lifestage'] = test.apply(lifestage, axis=1)
# Store the log-scaled age as the model feature once the life stage is derived
test['human_age'] = raw_human_age.apply(np.log1p)

# Ordinal code per life stage. The result is assigned back instead of using a
# chained in-place replace, which may not update the frame in newer pandas.
mapper = {'Kitten/Puppy':1, 'Junior':2, 'Prime':3, 'Mature':4,'Senior':5,'Geriatic':6}
test['lifestage'] = test['lifestage'].map(mapper)

In [None]:
# defining a function which returns a list of top names
def top_names(df, top_percent):
    """Return the most frequent names among rows with has_name != 0.

    Names are taken in descending frequency. A name is included as long as the
    share of rows covered by the names before it does not exceed `top_percent`
    (so the most frequent name is always included for a non-negative
    threshold).
    """
    named = df[df.has_name != 0]
    total = named.shape[0]
    counts = named.Name.value_counts()
    # Rows already covered by the more frequent names that precede each name
    covered_before = counts.cumsum() - counts
    selected = (covered_before / total) <= top_percent
    return list(counts.index[selected.values])

# 'No Name' / 'Unnamed' are placeholders; every other value (including a
# missing name) is flagged as has_name = 1.
placeholder_names = ['No Name', 'Unnamed']

for frame in (train, test):
    frame['has_name'] = (~frame['Name'].isin(placeholder_names)).astype(int)
    # Top names are computed separately on each frame
    topnames = top_names(frame, 0.2)
    frame['has_topname'] = frame['Name'].isin(topnames).astype(int)

In [None]:
def mixed_breed(row):
    """Return 1 when the pet counts as mixed breed, else 0.

    A pet is mixed when Breed1 is 307, or when Breed2 is set (non-zero) and
    differs from Breed1.
    """
    first = row['Breed1']
    if first == 307:
        return 1
    second = row['Breed2']
    return int(second != 0 and second != first)

# Row-wise mixed-breed flag for both frames
for frame in (train, test):
    frame['mixed_breed'] = frame.apply(mixed_breed, axis=1)

In [None]:
# --- train: pets per rescuer, its log, and a 10-bin k-means discretization ---
rescuer_count = (
    train.groupby('RescuerID')['PetID']
    .count()
    .rename('RescuerID_COUNT')
    .reset_index()
)
train = train.merge(rescuer_count, how='left', on='RescuerID')
train['RescuerID_COUNT_log'] = np.log1p(train['RescuerID_COUNT'])

binner = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')
train_bins = binner.fit_transform(train[['RescuerID_COUNT_log']])
train['rescuer_bin_kmeans'] = train_bins[:, 0]


# --- test: same steps; counts and bins are fitted on the test frame itself ---
rescuer_count = (
    test.groupby('RescuerID')['PetID']
    .count()
    .rename('RescuerID_COUNT')
    .reset_index()
)
test = test.merge(rescuer_count, how='left', on='RescuerID')
test['RescuerID_COUNT_log'] = np.log1p(test['RescuerID_COUNT'])

binner = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')
test_bins = binner.fit_transform(test[['RescuerID_COUNT_log']])
test['rescuer_bin_kmeans'] = test_bins[:, 0]

In [None]:
# from: https://medium.com/@pouryaayria/k-fold-target-encoding-dfe9a594874b
from sklearn import base

class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
    """Out-of-fold target encoder for one categorical column of the train frame.

    For each stratified fold, rows in the validation part receive the mean of
    `targetName` per category computed on the remaining folds. The result is
    stored in a new column named '<colnames>_Kfold_Target_Enc'.

    Parameters:
        colnames: name (str) of the categorical column to encode.
        targetName: name (str) of the target column.
        n_fold: number of stratified folds.
        verbosity: when True, print the correlation between the encoded
            column and the target.
        discardOriginal_col: when True, drop the original categorical column
            (`colnames`) from the returned frame.
    """

    def __init__(self, colnames,targetName,n_fold=5,verbosity=True,discardOriginal_col=False):

        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        """No fitting is needed; everything happens in transform()."""
        return self

    def transform(self, X):
        """Add the out-of-fold encoded column to X and return X.

        NOTE: X is modified in place (the encoded column is added to it). The
        fold split is shuffled without a fixed seed, so results vary per run.
        """
        assert isinstance(self.targetName, str)
        assert isinstance(self.colnames, str)
        assert self.colnames in X.columns
        assert self.targetName in X.columns

        mean_of_target = X[self.targetName].mean()
        kf = StratifiedKFold(n_splits=self.n_fold, shuffle=True)

        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
        X[col_mean_name] = np.nan

        for tr_ind, val_ind in kf.split(X, X[self.targetName]):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            # Category means come from the training part of the fold only
            fold_means = X_tr.groupby(self.colnames)[self.targetName].mean()
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(fold_means)

        # Categories unseen in a fold's training part fall back to the global mean.
        # Assigned back rather than filled in place on a column selection, which
        # may not update X in newer pandas.
        X[col_mean_name] = X[col_mean_name].fillna(mean_of_target)

        if self.verbosity:

            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,
                                                                                      self.targetName,
                                                                                      np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))
        if self.discardOriginal_col:
            # Drop the encoded source column; the previous version dropped the
            # target column instead, contradicting the option's name.
            X = X.drop(self.colnames, axis=1)

        return X
    
    
    
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    """Apply a train-side target encoding to another frame.

    Parameters:
        train: frame that already contains the encoded column.
        colNames: name of the categorical column to encode.
        encodedName: name of the encoded column in `train`; the same name is
            used for the column added to the transformed frame.
    """

    def __init__(self, train, colNames, encodedName):
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        """No fitting is needed; everything happens in transform()."""
        return self

    def transform(self, X):
        """Add `encodedName` to X (in place) and return X.

        Each category receives the mean of the train-side encoded values for
        that category; categories absent from `train` become NaN.
        """
        category_means = self.train.groupby(self.colNames)[self.encodedName].mean()
        X[self.encodedName] = X[self.colNames].map(category_means.to_dict())
        return X

In [None]:
def target_encode(X_train, X_test, columns):
    """Target-encode `columns` against 'AdoptionSpeed' in both frames.

    For every column, an out-of-fold encoded column
    '<column>_Kfold_Target_Enc' is added to X_train, and the per-category mean
    of that encoding is mapped onto X_test.

    Returns:
        The (X_train, X_test) pair with the encoded columns added.
    """
    global_mean = X_train.AdoptionSpeed.mean()
    for column in columns:
        print(column)
        encoded_name = column + '_Kfold_Target_Enc'

        # target encode
        targetc = KFoldTargetEncoderTrain(column, 'AdoptionSpeed', n_fold=5)
        X_train = targetc.fit_transform(X_train)

        test_targetc = KFoldTargetEncoderTest(X_train, column, encoded_name)
        X_test = test_targetc.fit_transform(X_test)

        # Categories unseen in train have no encoding; fall back to the global
        # target mean. The previous version filled the raw categorical column
        # instead of the encoded one, leaving these NaNs in place.
        X_test[encoded_name] = X_test[encoded_name].fillna(global_mean)
    return X_train, X_test

In [None]:
# Categorical columns that receive a '<column>_Kfold_Target_Enc' companion column
target_encoded_columns = [
    'Breed1',
    'Breed2',
    'rescuer_bin_kmeans',
    'State',
    'Color1',
    'Color2',
    'Color3',
]
train, test = target_encode(train, test, target_encoded_columns)

## Features Text Mining

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD, NMF
# Matrix Factorization for dimensionality reduction
from sklearn.feature_extraction.text import TfidfVectorizer

# Stop Words
stop = stopwords.words('english')
pat = r'\b(?:{})\b'.format('|'.join(stop))
porter_stemmer = PorterStemmer()


def _normalize_description(descriptions):
    """Return the cleaned, stemmed version of a Description column.

    Steps: fill missing values with "<MISSING>", remove digits, lowercase,
    remove punctuation, remove English stop words, collapse whitespace, then
    Porter-stem every token and join the tokens with single spaces.
    """
    # regex=True is explicit: newer pandas treats the pattern as a literal
    # string by default, which would silently skip these substitutions.
    cleaned = descriptions.fillna("<MISSING>")
    cleaned = cleaned.str.replace(r'\d+', '', regex=True)
    cleaned = cleaned.str.lower()
    cleaned = cleaned.str.replace(r'[^\w\s]', '', regex=True)
    cleaned = cleaned.str.replace(pat, '', regex=True)
    cleaned = cleaned.str.replace(r'\s+', ' ', regex=True)

    # Stem Words
    tokens = cleaned.astype(str).str.split()
    return tokens.apply(lambda words: " ".join(porter_stemmer.stem(word) for word in words))


def get_top_n_words(corpus, n=None):
    """Draw a bar plot of the `n` most frequent words in `corpus`."""
    from sklearn.feature_extraction.text import CountVectorizer

    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    top_words = pd.DataFrame(words_freq[:n], columns=['words', 'freq'])

    # A new figure per call so the train and test charts are not drawn on the same axes
    plt.figure()
    sns.barplot(x='words', y='freq', data=top_words)


# ---- Train: normalize, plot, fit the text decompositions ----
train['Description'] = _normalize_description(train['Description'])

get_top_n_words(train['Description'], 10)

svd_ = TruncatedSVD(
    n_components=5, random_state=1337)
nmf_ = NMF(
    n_components=5, random_state=1337)

# Boolean flags are passed as real booleans; newer scikit-learn validates
# parameter types and rejects the integer 1.
tfidf = TfidfVectorizer(min_df=2, max_features=None,
                        strip_accents='unicode', analyzer='word', token_pattern=r'(?u)\b\w+\b',
                        ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True)

tfidf_col = tfidf.fit_transform(train['Description'])
svd_col = pd.DataFrame(svd_.fit_transform(tfidf_col)).add_prefix('SVD_')
nmf_col = pd.DataFrame(nmf_.fit_transform(tfidf_col)).add_prefix('NMF_')

# Concatenate all dataframes
train = pd.concat([train, nmf_col, svd_col], axis=1)


# ---- Test: same normalization, then reuse the transformers fitted on train ----
test['Description'] = _normalize_description(test['Description'])

# Plot the test descriptions (the previous version plotted train a second time)
get_top_n_words(test['Description'], 10)

tfidf_col = tfidf.transform(test['Description'])
svd_col = pd.DataFrame(svd_.transform(tfidf_col)).add_prefix('SVD_')
nmf_col = pd.DataFrame(nmf_.transform(tfidf_col)).add_prefix('NMF_')

# Concatenate all dataframes
test = pd.concat([test, nmf_col, svd_col], axis=1)


## Impute Missing Values

In [None]:
# Show the column labels of `train` as an array before choosing which columns to drop.
train.columns.values

In [None]:
# Cannot be used for this analysis (IDs, Texts...)
dropped_columns = ['Name', 'BreedName2', 'ColorName3', 'Breed1', 'Breed2', 'RescuerID', 'Description',
                   'BreedName1', 'Color1', 'ColorName1', 'Color2', 'ColorName2', 'Color3', 'Age', 'State']

# 'ImageId' is dropped from train only, as in the original cell
train_analysis = train.drop(dropped_columns + ['ImageId'], axis=1)
test_analysis = test.drop(dropped_columns, axis=1)

# Impute both frames with the train medians. numeric_only=True is required by
# newer pandas because the frames still hold string columns (e.g. StateName).
train_medians = train_analysis.median(numeric_only=True)
train_analysis = train_analysis.fillna(train_medians)
test_analysis = test_analysis.fillna(train_medians)

## Categorical Encoding

In [None]:
def _one_hot(frame, column, prefix):
    """Replace `column` with its dummy columns, appended at the end of the frame."""
    dummies = pd.get_dummies(frame[column], prefix=prefix)
    return pd.concat([frame.drop(column, axis=1), dummies], axis=1)


# One-hot encode StateName (prefix 'State') and the categorical pet attributes.
# Train and test are encoded independently of each other.
col = ['Health', 'Gender', 'Dewormed', 'Type', 'MaturitySize', 'Sterilized', 'Vaccinated', 'FurLength']

train_analysis = _one_hot(train_analysis, 'StateName', 'State')
for i in col:
    train_analysis = _one_hot(train_analysis, i, i)

test_analysis = _one_hot(test_analysis, 'StateName', 'State')
for i in col:
    test_analysis = _one_hot(test_analysis, i, i)

# Modeling

In [None]:
# FROM: https://www.kaggle.com/myltykritik/simple-lgbm-image-features

# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings

    The result is a list of lists where entry [a - min_rating][b - min_rating]
    counts the items rated `a` by rater_a and `b` by rater_b. When min_rating
    or max_rating is None it is taken from the ratings themselves.

    NOTE: this definition replaces the `confusion_matrix` imported from
    sklearn.metrics at the top of the notebook.
    """
    assert(len(rater_a) == len(rater_b))
    # Take the bounds per rater. `rater_a + rater_b` concatenates lists but
    # adds NumPy arrays elementwise, which produced wrong bounds for arrays.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Count how often each rating value occurs.

    Position k of the returned list holds the count of rating
    `min_rating + k`; missing bounds are taken from `ratings`.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating
    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        counts[rating - lowest] += 1
    return counts


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa between two sets of ratings.

    The kappa measures inter-rater agreement between two raters that provide
    discrete numeric ratings. Potential values range from -1 (complete
    disagreement) to 1 (complete agreement); 0 is expected if all agreement
    is due to chance.

    `y` and `y_pred` must have the same length and are converted to integer
    arrays. The rating range is taken from the smallest and largest value
    found in either input.
    """
    ratings_true = np.array(y, dtype=int)
    ratings_pred = np.array(y_pred, dtype=int)
    assert(len(ratings_true) == len(ratings_pred))

    lowest = min(min(ratings_true), min(ratings_pred))
    highest = max(max(ratings_true), max(ratings_pred))

    observed = confusion_matrix(ratings_true, ratings_pred, lowest, highest)
    n_levels = len(observed)
    n_items = float(len(ratings_true))

    hist_true = histogram(ratings_true, lowest, highest)
    hist_pred = histogram(ratings_pred, lowest, highest)

    # Normalizer for the squared rating distance
    max_distance = pow(n_levels - 1, 2.0)

    observed_disagreement = 0.0
    expected_disagreement = 0.0
    for i, observed_row in enumerate(observed):
        for j, observed_count in enumerate(observed_row):
            expected_count = (hist_true[i] * hist_pred[j]
                              / n_items)
            weight = pow(i - j, 2.0) / max_distance
            observed_disagreement += weight * observed_count / n_items
            expected_disagreement += weight * expected_count / n_items

    return (1.0 - observed_disagreement / expected_disagreement)


In [None]:
def to_bins(x, borders):
    """Return the index of the first border that is >= x.

    Borders are scanned in the order given; if x is larger than every
    border, len(borders) is returned.
    """
    for position, border in enumerate(borders):
        if x <= border:
            return position
    return len(borders)

class OptimizedRounder(object):
    """Search for four cut points that bin continuous predictions.

    The cut points are tuned so that the binned predictions maximise
    quadratic_weighted_kappa against the true labels.
    """

    def __init__(self):
        # Placeholder until fit() stores {'x': [four thresholds]}.
        self.coef_ = 0

    def _loss(self, coef, X, y, idx):
        """Return the negated kappa obtained when binning X with `coef`.

        `idx` is accepted for call compatibility but is not used.
        """
        binned = np.array([to_bins(value, coef) for value in X])
        return -quadratic_weighted_kappa(y, binned)

    def fit(self, X, y):
        """Tune the thresholds one at a time and store them in coef_."""
        thresholds = [1.5, 2.0, 2.5, 3.0]
        ratio = 0.618
        co_ratio = 1 - ratio
        search_ranges = [(1, 2), (1.5, 2.5), (2, 3), (2.5, 3.5)]
        for _sweep in range(10):
            for idx, (lo, hi) in enumerate(search_ranges):
                # golden section search on threshold `idx`, others held fixed
                thresholds[idx] = lo
                loss_lo = self._loss(thresholds, X, y, idx)
                thresholds[idx] = hi
                loss_hi = self._loss(thresholds, X, y, idx)
                for _step in range(20):
                    if loss_lo > loss_hi:
                        # the lower end is worse: pull it towards the upper end
                        lo = hi - (hi - lo) * ratio
                        thresholds[idx] = lo
                        loss_lo = self._loss(thresholds, X, y, idx)
                    else:
                        # otherwise shrink the interval from the upper end
                        hi = hi - (hi - lo) * co_ratio
                        thresholds[idx] = hi
                        loss_hi = self._loss(thresholds, X, y, idx)
        self.coef_ = {'x': thresholds}

    def predict(self, X, coef):
        """Bin every value of X with the thresholds in `coef`."""
        return np.array([to_bins(value, coef) for value in X])

    def coefficients(self):
        """Return the thresholds stored by fit()."""
        return self.coef_['x']
    
def rmse(actual, predicted):
    """Return the square root of mean_squared_error(actual, predicted)."""
    # Imported here because the notebook's import cell does not bring
    # `sqrt` into scope; without it this function raises NameError.
    from math import sqrt
    return sqrt(mean_squared_error(actual, predicted))

## Performance of the model

In [None]:
def evaluate(y_pred, y_true):
    """Plot the confusion matrix and print summary metrics.

    Note the argument order: predictions first, true labels second.

    Parameters
    ----------
    y_pred : predicted class labels
    y_true : ground-truth class labels
    """
    cohen_kappa = cohen_kappa_score(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='micro')

    # Confusion matrix drawn as a single heat map. No second plt.figure()
    # call before show(), which would only open an empty extra figure.
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(20, 6))
    sns.heatmap(cm, annot=True)
    plt.title('Confusion matrix')
    plt.show()

    # Evaluation metrics. Accuracy and F1 are fractions in [0, 1], so they
    # are scaled by 100 to match the '%' sign in the message.
    print('Cohen Kappa: {:0.2f}.'.format(cohen_kappa))
    print('Accuracy Score: {:0.2f}%.'.format(accuracy * 100))
    print('F1 Score: {:0.2f}%.'.format(f1 * 100))
    

In [None]:
# Keep the pet identifiers aside, then remove them from the modelling frame
ids = train_analysis[['PetID']]
train_analysis = train_analysis.drop(columns=['PetID'])

In [None]:
# Predictors are every column except the target; the target is AdoptionSpeed
X = train_analysis.loc[:, train_analysis.columns != 'AdoptionSpeed']
y = train_analysis['AdoptionSpeed']

# Hold out 20% of the rows, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y
)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [None]:
# Fit a default LightGBM regressor to obtain per-feature importances.
model = lgb.LGBMRegressor()
model.fit(X_train, y_train)

# Pair each importance with its column name; sorted() orders the pairs
# ascending by importance (ties broken by feature name).
feature_imp = pd.DataFrame(sorted(zip(model.feature_importances_,X_train.columns)), columns=['Value','Feature'])

# Horizontal bar chart of the importances, largest first.
plt.figure(figsize=(10, 17))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features')
plt.tight_layout()
plt.show()

# Select features with a threshold of 1.25 times the median importance.
features_selection = SelectFromModel(model, threshold='1.25*median') # The Threshold is the median of features importance*1.25 
features_selection.fit(X_train, y_train)

# NOTE: `features_selection` is rebound below, from the selector object to
# the list of retained column names that later cells index with.
features_selection_support = features_selection.get_support()
features_selection = X_train.loc[:,features_selection_support].columns.tolist()
# Last expression of the cell: shows how many features were kept.
len(features_selection)

In [None]:
# Restrict both splits to the selected feature columns
X_train = X_train[features_selection]
X_test = X_test[features_selection]

## Oversampling

In [None]:
from sklearn.impute import SimpleImputer

# Remember the column labels so they can be re-attached after imputation
train_columns = X_train.columns
test_columns = X_test.columns

# Fit the imputer on the training split only, then reuse it on the test split
my_imputer = SimpleImputer()
X_train = pd.DataFrame(my_imputer.fit_transform(X_train), columns=train_columns)
X_test = pd.DataFrame(my_imputer.transform(X_test), columns=test_columns)

In [None]:
from collections import Counter

# Let us try a sampling technique to reduce class imbalance
from imblearn.over_sampling import SMOTE
# Over-sampling: SMOTE
# SMOTE (Synthetic Minority Oversampling TEchnique) synthesizes new elements
# for the minority class based on those that already exist. It randomly picks
# a point from the minority class and computes the k-nearest neighbors for
# that point. The synthetic points are added between the chosen point and its
# neighbors.
# sampling_strategy='minority' resamples only the minority class. It is passed
# by keyword because recent imbalanced-learn releases no longer accept it
# positionally.
smote = SMOTE(sampling_strategy='minority')

print('Original dataset shape %s' % Counter(y_train))
# fit_resample is the current name of the former fit_sample alias.
X_res, y_res = smote.fit_resample(X_train, y_train)
# Later cells index X_res / y_res positionally, so keep them as plain arrays
# even when the library hands back pandas objects.
X_res, y_res = np.asarray(X_res), np.asarray(y_res)
print('Resampled dataset shape %s' % Counter(y_res))

# DataFrame versions (used by the XGBoost cell, which needs .iloc and .columns)
X_res_df = pd.DataFrame(X_res)
X_res_df.columns = train_columns

y_res_df = pd.DataFrame(y_res)

## Cross Validation

In [None]:
def cross_val(model,X_train,y_train):
    """Run 5-fold stratified CV and average the optimised rounding thresholds.

    For every fold the model is fitted with early stopping on the validation
    part, an OptimizedRounder is fitted on the validation predictions, and the
    fold's quadratic weighted kappa and thresholds are printed.

    Parameters
    ----------
    model : estimator whose fit() accepts eval_set / verbose /
        early_stopping_rounds and which exposes best_iteration_
    X_train : array-like of features (ndarray or DataFrame)
    y_train : array-like of integer labels

    Returns
    -------
    numpy.ndarray with the per-threshold mean over the folds. The same value
    is also stored in the module-level `coefficient_mean`, which later cells
    read.
    """
    global coefficient_mean

    # Work on plain arrays so the fold indices select rows positionally;
    # indexing a DataFrame with X[train_index] would look up columns instead.
    X = np.asarray(X_train)
    y = np.asarray(y_train).ravel()

    fold_coefficients = []
    skf = StratifiedKFold(n_splits=5,shuffle=True)
    for train_index, val_index in skf.split(X, y):
        xtrain, xvalid = X[train_index], X[val_index]
        ytrain, yvalid = y[train_index], y[val_index]

        model.fit(
            xtrain, ytrain,
            eval_set=[(xvalid, yvalid)],
            verbose=100,
            early_stopping_rounds=100
        )

        valid_preds = model.predict(xvalid, num_iteration=model.best_iteration_)
        yvalid = yvalid.tolist()

        # Tune the bin thresholds on this fold's validation predictions.
        optR = OptimizedRounder()
        optR.fit(valid_preds, yvalid)

        coefficients = optR.coefficients()
        valid_p = optR.predict(valid_preds, coefficients)

        scr = quadratic_weighted_kappa(yvalid, valid_p)
        print("QWK = {}. Coef = {}".format(scr, coefficients))

        # Collect the thresholds in a list instead of stacking onto an
        # uninitialised np.empty row that has to be deleted afterwards.
        fold_coefficients.append(list(coefficients))

    coefficient_mean = np.asarray(fold_coefficients, dtype=float).mean(axis=0)
    print("Coef Mean ={}".format(coefficient_mean))
    return coefficient_mean

In [None]:
# Hyper-parameters of the first LightGBM regressor (seed 2018)
lgb_params = dict(
    boosting_type='gbdt',
    objective='regression',
    learning_rate=0.005,
    subsample=.8,
    colsample_bytree=0.8,
    min_split_gain=0.006,
    min_child_samples=150,
    min_child_weight=0.1,
    n_estimators=1000,
    num_leaves=80,
    silent=-1,
    verbose=-1,
    max_depth=11,
    random_state=2018,
)

lgb_model = lgb.LGBMRegressor(**lgb_params)

# Cross-validate on the oversampled data; this also sets the global
# `coefficient_mean` used for binning below
cross_val(lgb_model, X_res, y_res)

#Prediction
#best iteration is used
y_pred = lgb_model.predict(X_train)

In [None]:
# Kappa of the un-binned regression output (the metric casts inputs to int)
qwk = quadratic_weighted_kappa(y_train, y_pred)
print("QWK = ", qwk)

# Kappa after binning with the thresholds averaged over the CV folds
optR = OptimizedRounder()
predictions = optR.predict(y_pred, coefficient_mean).astype(int)
qwk = quadratic_weighted_kappa(y_train, predictions)
print("QWK = ", qwk)

#predict test set
# NOTE: `ids` is rebound here to the test-set PetIDs
ids = test[['PetID']]
test_features = test.drop(columns=['PetID'])
test_features = test_features[features_selection]
pred1 = lgb_model.predict(test_features.values)
pred1 = optR.predict(pred1, coefficient_mean).astype(int)

## LightGBM: Optimize the boundaries

## LightGBM 2

In [None]:
# Hyper-parameters of the second LightGBM regressor (seed 2001)
lgb_params = dict(
    boosting_type='gbdt',
    objective='regression',
    learning_rate=0.005,
    subsample=.8,
    colsample_bytree=0.8,
    min_split_gain=0.006,
    min_child_samples=150,
    min_child_weight=0.1,
    n_estimators=1000,
    num_leaves=80,
    silent=-1,
    verbose=-1,
    max_depth=11,
    random_state=2001,
)
lgb_model2 = lgb.LGBMRegressor(**lgb_params)

# Cross-validate on the oversampled data; overwrites the global
# `coefficient_mean`
cross_val(lgb_model2, X_res, y_res)

#Prediction
y_pred = lgb_model2.predict(X_train)

In [None]:
#cohen_kappa_score(y_train, y_pred)
# Kappa of the un-binned regression output of the second model
# (quadratic_weighted_kappa casts its inputs to int).
qwk = quadratic_weighted_kappa(y_train, y_pred)
print("QWK = ", qwk)

# Kappa after binning with the thresholds averaged by the latest cross_val run.
optR=OptimizedRounder()
predictions = optR.predict(y_pred, coefficient_mean).astype(int)
qwk = quadratic_weighted_kappa(y_train, predictions)
print("QWK = ", qwk)

#predict test set
# pred2: binned test-set predictions of the second LightGBM model.
pred2 = lgb_model2.predict(test_features.values)
pred2 = optR.predict(pred2, coefficient_mean).astype(int)

## LightGBM 3

In [None]:
# Hyper-parameters of the third LightGBM regressor (seed 2000)
lgb_params = dict(
    boosting_type='gbdt',
    objective='regression',
    learning_rate=0.005,
    subsample=.8,
    colsample_bytree=0.8,
    min_split_gain=0.006,
    min_child_samples=150,
    min_child_weight=0.1,
    n_estimators=1000,
    num_leaves=80,
    silent=-1,
    verbose=-1,
    max_depth=11,
    random_state=2000,
)

lgb_model3 = lgb.LGBMRegressor(**lgb_params)

# Cross-validate on the oversampled data; overwrites the global
# `coefficient_mean`
cross_val(lgb_model3, X_res, y_res)

#Prediction
y_pred = lgb_model3.predict(X_train.values)

In [None]:
# Kappa of the un-binned regression output of the third model
# (quadratic_weighted_kappa casts its inputs to int).
qwk = quadratic_weighted_kappa(y_train, y_pred)
print("QWK = ", qwk)

# Kappa after binning with the thresholds averaged by the latest cross_val run.
optR=OptimizedRounder()
predictions = optR.predict(y_pred, coefficient_mean).astype(int)
qwk = quadratic_weighted_kappa(y_train, predictions)
print("QWK = ", qwk)

#predict test set
# pred3: binned test-set predictions of the third LightGBM model.
pred3 = lgb_model3.predict(test_features.values)
pred3 = optR.predict(pred3, coefficient_mean).astype(int)

In [None]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

# Parameters handed to run_xgb, which forwards them to xgb.train(params=...).
xgb_params = {
    'eval_metric': 'rmse',
    'silent': 1,
}

def run_xgb(params, X_train, y_train, X_test):
    """Train XGBoost with stratified K-fold CV and predict the test set per fold.

    For every fold a booster is trained with early stopping, an
    OptimizedRounder is fitted on the validation predictions, and the fold's
    quadratic weighted kappa and thresholds are printed.

    Parameters
    ----------
    params : dict passed to xgb.train
    X_train : DataFrame of training features (.iloc and .columns are used)
    y_train : DataFrame holding the labels (indexed with .iloc[rows, :])
    X_test : feature table to predict; X_test.shape[0] sizes the output

    Returns
    -------
    numpy.ndarray of shape (n_test_rows, n_splits): one column of raw
    test-set predictions per fold.

    Side effect: stores the per-threshold mean over the folds in the
    module-level `coefficient_mean`.
    """
    global coefficient_mean

    n_splits = 5
    verbose_eval = 1000
    num_rounds = 50000
    early_stop = 1000

    oof_test = np.zeros((X_test.shape[0], n_splits))

    X = X_train
    y = y_train
    fold_coefficients = []
    # Use n_splits here as well, so the splitter can never disagree with the
    # number of columns allocated in oof_test.
    skf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

    for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
        xtrain, xvalid = X.iloc[train_index, :], X.iloc[val_index,:]
        ytrain, yvalid = y.iloc[train_index,:], y.iloc[val_index,:]

        d_train = xgb.DMatrix(data=xtrain, label=ytrain, feature_names=X.columns)
        d_valid = xgb.DMatrix(data=xvalid, label=yvalid, feature_names=X.columns)

        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        model = xgb.train(dtrain=d_train, num_boost_round=num_rounds, evals=watchlist,
                         early_stopping_rounds=early_stop, verbose_eval=verbose_eval, params=params)

        valid_pred = model.predict(xgb.DMatrix(xvalid, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
        yvalid = yvalid.values.reshape([-1])

        # Tune the bin thresholds on this fold's validation predictions.
        optR = OptimizedRounder()
        optR.fit(valid_pred, yvalid)

        coefficients = optR.coefficients()
        valid_p = optR.predict(valid_pred, coefficients)

        scr = quadratic_weighted_kappa(yvalid, valid_p)
        print("QWK = {}. Coef = {}".format(scr, coefficients))

        test_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns), ntree_limit=model.best_ntree_limit)
        oof_test[:, fold] = test_pred

        # Collect the thresholds in a list instead of stacking onto an
        # uninitialised np.empty row that has to be deleted afterwards.
        fold_coefficients.append(list(coefficients))

    coefficient_mean = np.asarray(fold_coefficients, dtype=float).mean(axis=0)
    print("Coef Mean ={}".format(coefficient_mean))
    return oof_test

In [None]:
# Cross-validated XGBoost on the oversampled training data. The result holds
# one column of test-set predictions per fold; the call also overwrites the
# global `coefficient_mean`.
oof_test = run_xgb(xgb_params,X_res_df,y_res_df, test_features)

In [None]:
# Average the per-fold XGBoost test predictions row-wise, then bin them with
# the thresholds averaged inside run_xgb.
optR=OptimizedRounder()
predictions = optR.predict(oof_test.mean(axis=1), coefficient_mean).astype(int)

In [None]:
# Create our own voting classifier from the fitted models
# (inspired by the VotingClassifier with voting='soft', weights=[3,1,1,1])

#score with mean from all 4 values: 0.35
#score with weights 0.26
#score with majority vote 0.29
def vote(row):
    """Return row[1] when entries 1, 2 and 3 all agree, otherwise row[0]."""
    if row[1] != row[2] or row[2] != row[3]:
        return row[0]
    return row[1]

In [None]:
# majority vote
from collections import Counter

def majority_vote(row):
    """Return the most frequent value in `row`.

    The winner is the first entry of Counter(row).most_common().
    """
    tally = Counter(row)
    winner = tally.most_common()[0]
    return winner[0]

In [None]:
# One column per model: XGBoost first, then the three LightGBM models
combination = pd.concat(
    [pd.DataFrame(model_pred) for model_pred in (predictions, pred1, pred2, pred3)],
    axis=1,
    ignore_index=True,
)
# Row-wise majority vote across the four models
combination['predn'] = combination.apply(majority_vote, axis=1)

predictions = combination['predn']

In [None]:
# Build the submission table from the PetIDs in `ids` and the voted
# predictions (cast to int32), then write it without the index column.
submission = pd.DataFrame({'PetID': ids['PetID'].values, 'AdoptionSpeed': predictions.astype(np.int32)})
submission.to_csv('submission.csv', index=False)

# Last expression of the cell: preview the first 20 rows.
submission.head(20)