We have 3 feature tables and one from public kernel, and just merging them together.

# imports

In [1]:
import gc
import os
import json
import re
import glob
from joblib import Parallel, delayed

import scipy as sp
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

pd.set_option('max_columns', 500)
pd.set_option('max_rows', 500)

from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix
from sklearn.model_selection import StratifiedKFold, train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MultiLabelBinarizer

from collections import Counter

import xgboost as xgb
import lightgbm as lgb

np.random.seed(1029)

from tqdm import tqdm, tqdm_notebook

import cv2
from keras.applications.densenet import preprocess_input, DenseNet121
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D
import keras.backend as K

from scipy import stats

from PIL import Image

Using TensorFlow backend.


# nfolds

In [2]:
N_FOLDS = 4
FOLDS = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# metric functions and OptimizedRounder(s)

In [3]:
def get_chi2(obs, exp):
    diff = set(exp) - set(obs)
    f_obs = obs.value_counts()
    f_exp = exp.value_counts()
    if diff:
        for i in diff:
            f_obs[i] = 0
    f_obs = f_obs.sort_index()
    f_exp = f_exp.sort_index()
    chi2, _ = stats.chisquare(f_obs.values,f_exp.values)
    return chi2

def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat

def histogram(ratings, min_rating=None, max_rating=None):
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings

def quadratic_weighted_kappa(y_true, y_pred):
    rater_a = y_true
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

def get_class_bounds(y, y_pred, N=5, class0_fraction=-1):
    ysort = np.sort(y)
    predsort = np.sort(y_pred)
    bounds = []
    for ibound in range(N-1):
        iy = len(ysort[ysort <= ibound])
        if (ibound == 0) and (class0_fraction >= 0.0) :
            iy = int(class0_fraction * iy)
        bounds.append(predsort[iy])
    return bounds

def assign_class(y_pred, boundaries):
    y_classes = np.zeros(len(y_pred))
    for iclass, bound in enumerate(boundaries):
        y_classes[y_pred >= bound] = iclass + 1
    return y_classes.astype(int)

def get_init_coefs(y_test_pred, y_test):
    kappas = []
    coefs = []
    cl0fracs = np.array(np.arange(0.01,30,0.01))
    for cl0frac in cl0fracs:
        coef = get_class_bounds(y_test, y_test_pred, class0_fraction=cl0frac)
        coefs.append(coef)
        y_test_k = assign_class(y_test_pred, coef)
        kappa = cohen_kappa_score(y_test, y_test_k, weights='quadratic')
        kappas.append(kappa)
    ifmax = np.array(kappas).argmax()
    best_frac = cl0fracs[ifmax]
    best_coef = coefs[ifmax] 
    return best_coef

def rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))

In [4]:
class OptimizedRounder(object):
    def __init__(self,initial_coefs = None):
        if(initial_coefs == None):
            self.initial_coefs = [1.775, 2.1057, 2.4438, 2.7892]
        else:
            self.initial_coefs = initial_coefs.copy()
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        ll = quadratic_weighted_kappa(y, X_p)
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        self.coef_ = sp.optimize.minimize(loss_partial, self.initial_coefs, method='nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        return X_p

    def coefficients(self):
        return self.coef_['x']

class OptimizedRounder_v2(object):
    def __init__(self, initial_coefs = None):
        if(initial_coefs == None):
            self.initial_coefs = [1.775, 2.1057, 2.4438, 2.7892]
        else:
            self.initial_coefs = initial_coefs.copy()
        self.coef_ = 0
    
    def _kappa_loss(self, coef, X, y):
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])
        ll = cohen_kappa_score(y, X_p, weights = 'quadratic')    
        chi2 =  get_chi2(X_p, y)
        ll = ll - chi2 * (1.0 / 25000)
        return -ll
    
    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X = X, y = y)
        self.coef_ = sp.optimize.minimize(loss_partial, self.initial_coefs, method = 'nelder-mead')
    
    def predict(self, X, coef):
        preds = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])
        return preds
    
    def coefficients(self):
        return self.coef_['x']


class OptimizedRounder_v3(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4

        ll = quadratic_weighted_kappa(y, X_p)
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef, len_0=410):
        X_p = np.copy(X)
        temp = sorted(list(X_p))
        threshold = temp[int(0.9*len_0)-1]
        for i, pred in enumerate(X_p):
            if pred < threshold:
                X_p[i] = 0
            elif pred >= threshold and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        return X_p

    def coefficients(self):
        return self.coef_['x']

# image processing functions

In [5]:
def resize_to_square(im):
    old_size = im.shape[:2]
    ratio = float(img_size)/max(old_size)
    new_size = tuple([int(x*ratio) for x in old_size])
    im = cv2.resize(im, (new_size[1], new_size[0]))
    delta_w = img_size - new_size[1]
    delta_h = img_size - new_size[0]
    top, bottom = delta_h//2, delta_h-(delta_h//2)
    left, right = delta_w//2, delta_w-(delta_w//2)
    color = [0, 0, 0]
    new_im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT,value=color)
    return new_im

def load_image(path, pet_id):
    image = cv2.imread(f'{path}{pet_id}-1.jpg')
    new_image = resize_to_square(image)
    new_image = preprocess_input(new_image)
    return new_image

# Feature table 1

## load data

In [6]:
train = pd.read_csv("../input/petfinder-adoption-prediction/train/train.csv")
test = pd.read_csv("../input/petfinder-adoption-prediction/test/test.csv")

breeds = pd.read_csv("../input/petfinder-adoption-prediction/breed_labels.csv")
colors = pd.read_csv("../input/petfinder-adoption-prediction/color_labels.csv")
states = pd.read_csv("../input/petfinder-adoption-prediction/state_labels.csv")

In [7]:
origin_train = train[list(train.columns)]
origin_test = test[list(test.columns)]

In [8]:
breedid_map = dict(zip(breeds['BreedID'], breeds['BreedName'].map(lambda x:x.lower())))
color_map = dict(zip(colors['ColorID'], colors['ColorName'].map(lambda x:x)))
state_map = dict(zip(states['StateID'], states['StateName'].map(lambda x:x)))

In [9]:
train_id = train['PetID']
test_id = test['PetID']

## common feature

In [10]:
def sentiment_feature(data, ids, path):
    doc_sent_mag = []
    doc_sent_score = []
    doc_sent_len = []
    doc_sent_mags = []
    doc_sent_scores = []

    doc_entity_len = []
    doc_entity_sali = []

    nf_count = 0

    for pet in ids:
        try:
            with open('../input/petfinder-adoption-prediction/%s/' % path + pet + '.json', 'r') as f:
                sentiment = json.load(f)
            doc_sent_mag.append(sentiment['documentSentiment']['magnitude'])
            doc_sent_score.append(sentiment['documentSentiment']['score'])
            
            doc_sent_len.append(len(sentiment['sentences']))
            if len(sentiment['sentences']) == 0:
                doc_sent_mags.append([-999])
                doc_sent_scores.append([-999])
            else:
                doc_sent_mags.append([sent['sentiment']['magnitude'] for sent in sentiment['sentences']])
                doc_sent_scores.append([sent['sentiment']['score'] for sent in sentiment['sentences']])
            
            doc_entity_len.append(len(sentiment['entities']))
            if len(sentiment['entities']) == 0:
                doc_entity_sali.append([-999])
            else:
                doc_entity_sali.append([entity['salience'] for entity in sentiment['entities']])
        except FileNotFoundError:
            nf_count += 1
            doc_sent_mag.append(-1)
            doc_sent_score.append(-1)
            doc_sent_len.append(-1)
            doc_sent_mags.append([-1000])
            doc_sent_scores.append([-1000])
            doc_entity_len.append(-1)
            doc_entity_sali.append([-1000])

    data.loc[:, 'doc_sent_mag'] = doc_sent_mag
    data.loc[:, 'doc_sent_score'] = doc_sent_score

    return data

train = sentiment_feature(train, train_id, 'train_sentiment')
test = sentiment_feature(test, test_id, 'test_sentiment')

In [11]:
def gen_meta_f(df, ids, meta_path):
    vertex_xs = []
    vertex_ys = []
    bounding_confidences = []
    bounding_importance_fracs = []
    dominant_blues = []
    dominant_greens = []
    dominant_reds = []
    dominant_pixel_fracs = []
    dominant_scores = []
    
    dominant_blues1 = []
    dominant_greens1 = []
    dominant_reds1 = []
    dominant_pixel_fracs1 = []
    dominant_scores1 = []

    label_descriptions = []
    label_descriptions1 = []
    label_descriptions2 = []
    label_descriptions3 = []
    
    label_scores = []
    label_scores1 = []
    label_scores2 = []
    label_scores3 = []
    
    nf_count = 0
    nl_count = 0
    label_data = {}
    for idx, pet in enumerate(ids):
        try:
            with open('../input/petfinder-adoption-prediction/%s/' % meta_path + pet + '-1.json', 'r') as f:
                data = json.load(f)
            vertex_x = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['x']
            vertex_xs.append(vertex_x)
            vertex_y = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['y']
            vertex_ys.append(vertex_y)
            bounding_confidence = data['cropHintsAnnotation']['cropHints'][0]['confidence']
            bounding_confidences.append(bounding_confidence)
            bounding_importance_frac = data['cropHintsAnnotation']['cropHints'][0].get('importanceFraction', -1)
            bounding_importance_fracs.append(bounding_importance_frac)
            # 0
            dominant_blue = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['blue']
            dominant_blues.append(dominant_blue)
            dominant_green = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['green']
            dominant_greens.append(dominant_green)
            dominant_red = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['red']
            dominant_reds.append(dominant_red)
            dominant_pixel_frac = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['pixelFraction']
            dominant_pixel_fracs.append(dominant_pixel_frac)
            dominant_score = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['score']
            dominant_scores.append(dominant_score)
            # 1
            if len(data['imagePropertiesAnnotation']['dominantColors']['colors']) > 1 and len(data['imagePropertiesAnnotation']['dominantColors']['colors'][1]['color']) == 3:
                dominant_blue1 = data['imagePropertiesAnnotation']['dominantColors']['colors'][1]['color']['blue']
                dominant_blues1.append(dominant_blue1)
                dominant_green1 = data['imagePropertiesAnnotation']['dominantColors']['colors'][1]['color']['green']
                dominant_greens1.append(dominant_green1)
                dominant_red1 = data['imagePropertiesAnnotation']['dominantColors']['colors'][1]['color']['red']
                dominant_reds1.append(dominant_red1)
                dominant_pixel_frac1 = data['imagePropertiesAnnotation']['dominantColors']['colors'][1]['pixelFraction']
                dominant_pixel_fracs1.append(dominant_pixel_frac1)
                dominant_score1 = data['imagePropertiesAnnotation']['dominantColors']['colors'][1]['score']
                dominant_scores1.append(dominant_score1)
        
            else:
                dominant_blues1.append(-1)
                dominant_greens1.append(-1)
                dominant_reds1.append(-1)
                dominant_pixel_fracs1.append(-1)
                dominant_scores1.append(-1)
                
            if data.get('labelAnnotations'):
                label_description = data['labelAnnotations'][0]['description']
                label_descriptions.append(label_description)
                label_score = data['labelAnnotations'][0]['score']
                label_scores.append(label_score)

                if len(data['labelAnnotations']) > 1:
                    label_description1 = data['labelAnnotations'][1]['description']
                    label_descriptions1.append(label_description1)
                    label_score1 = data['labelAnnotations'][1]['score']
                    label_scores1.append(label_score1)
                else:
                    label_descriptions1.append('nothing')
                    label_scores1.append(-1)
                
                if len(data['labelAnnotations']) > 2:
                    label_description2 = data['labelAnnotations'][2]['description']
                    label_descriptions2.append(label_description2)
                    label_score2 = data['labelAnnotations'][2]['score']
                    label_scores2.append(label_score2)
                else:
                    label_descriptions2.append('nothing')
                    label_scores2.append(-1)

                if len(data['labelAnnotations']) > 3:
                    label_description3 = data['labelAnnotations'][3]['description']
                    label_descriptions3.append(label_description3)
                    label_score3 = data['labelAnnotations'][3]['score']
                    label_scores3.append(label_score3)
                else:
                    label_descriptions3.append('nothing')
                    label_scores3.append(-1)

            else:
                nl_count += 1
                label_descriptions.append('nothing')
                label_descriptions1.append(label_description1)
                label_descriptions2.append(label_description2)
                label_descriptions3.append(label_description3)
                
                label_scores.append(-1)
                label_scores1.append(-1)
                label_scores2.append(-1)
                label_scores3.append(-1)
                                                            
        except FileNotFoundError:
            nf_count += 1
            vertex_xs.append(-1)
            vertex_ys.append(-1)
            bounding_confidences.append(-1)
            bounding_importance_fracs.append(-1)
            dominant_blues.append(-1)
            dominant_greens.append(-1)
            dominant_reds.append(-1)
            dominant_pixel_fracs.append(-1)
            dominant_scores.append(-1)
            
            dominant_blues1.append(-1)
            dominant_greens1.append(-1)
            dominant_reds1.append(-1)
            dominant_pixel_fracs1.append(-1)
            dominant_scores1.append(-1)

            label_descriptions.append('nothing')
            label_descriptions1.append('nothing')
            label_descriptions2.append('nothing')
            label_descriptions3.append('nothing')
            label_scores.append(-1)
            label_scores1.append(-1)
            label_scores2.append(-1)
            label_scores3.append(-1)

    prefix = 'meta_'
    df.loc[:, prefix+'vertex_x'] = vertex_xs
    df.loc[:, prefix+'vertex_y'] = vertex_ys
    df.loc[:, prefix+'bounding_confidence'] = bounding_confidences
    df.loc[:, prefix+'bounding_importance'] = bounding_importance_fracs
    df.loc[:, prefix+'dominant_blue'] = dominant_blues
    df.loc[:, prefix+'dominant_green'] = dominant_greens
    df.loc[:, prefix+'dominant_red'] = dominant_reds
    df.loc[:, prefix+'dominant_pixel_frac'] = dominant_pixel_fracs
    df.loc[:, prefix+'dominant_score'] = dominant_scores
    
    df.loc[:, prefix+'label_description'] = label_descriptions
    df.loc[:, prefix+'label_description1'] = label_descriptions1
    df.loc[:, prefix+'label_description2'] = label_descriptions2

    df.loc[:, prefix+'label_score'] = label_scores
    df.loc[:, prefix+'label_score1'] = label_scores1
    df.loc[:, prefix+'label_score2'] = label_scores1
    cate_cols = [prefix+col for col in ['label_description','label_description1','label_description2']]
    df.loc[:, cate_cols] = df[cate_cols].astype('category')

gen_meta_f(train, train_id, 'train_metadata')
gen_meta_f(test, test_id, 'test_metadata')

## origin feature

In [12]:
def rescue_feature(df):
    rescue_count = df.groupby('RescuerID')['Quantity'].count()
    rescue_count.name = 'rescue_count'
    rescue_num = df.groupby('RescuerID')['Quantity'].sum()
    rescue_num.name = 'rescue_num'
    rescue_unique_type = df.drop_duplicates(['RescuerID', 'Type']).groupby('RescuerID')['RescuerID'].count()
    rescue_unique_type.name = 'rescue_unique_type'
    df = df.join(rescue_count, on='RescuerID')
    df = df.join(rescue_num, on='RescuerID') 
    df['rescue_rank'] = df.RescuerID.map(df.RescuerID.value_counts().rank()/df.RescuerID.unique().shape[0])
    return df

def pure_breed_encode(data):
    data['pure_breed1'] = np.where((data['Breed1'] != 307) , '0', '1')
    data['pure_breed2'] = np.where((data['Breed2'] == 0) , '0', np.where(data['Breed2'] != 307, '1', '2'))
    data['pure_breed3'] = (data['pure_breed1'] + data['pure_breed2'])
    data['pure_animal_pure_breed4'] = np.where((data['Type'].astype(np.str)=='1') & (data['pure_breed3']=='00'), '100', np.where((data['Type'].astype(np.str)=='2') & (data['pure_breed3']=='00'), '200', '333'))
    for col in ['pure_breed1', 'pure_breed2', 'pure_breed3', 'pure_animal_pure_breed4']:
        data[col] = data[col].astype('category')
    del data['pure_animal_pure_breed4']
    return data

def call_name_f(data):
    is_call_name = []
    for name, desc in zip(data['Name'], data['Description']):
        clean_desc = str(desc).lower()
        clean_name = str(name).lower()
        if clean_name == 'nan':
            is_call_name.append(0)
        else:
            num = len(clean_desc.split(clean_name))
            is_call_name.append(num)
    data['call_name_num'] = is_call_name
    return data

train = rescue_feature(train)
test = rescue_feature(test)

train = pure_breed_encode(train)
test = pure_breed_encode(test)

## description feature

In [13]:
def language_type(desc):
    desc = str(desc)
    if desc=='nan':
        return 0
    zhmodel = re.compile(u'[\u4e00-\u9fa5]')
    enmodel = re.compile(u'[a-zA-Z]')
    zhmatch = zhmodel.search(desc)
    enmatch = enmodel.search(desc)
    if zhmatch and enmatch:
        return 3
    elif zhmatch:
        return 3
    elif enmatch:
        return 2
    else:
        return 1

def malaiyu_type(desc):
    desc = str(desc)
    malai = [' la x ' , ' nk ',' nie ', ' umur ', ' di ', 'teruk', ' satu ',' dh ', ' ni ',' tp ', ' yg ', 'mmg', 'msj', ' utk ' ,'neh' ]
    for ma_tag in malai:
        if desc.find(ma_tag) > -1:
            return ma_tag,1
    
    return "", 0

lang_prefix = 'lang_'
train[lang_prefix+'language_type'] = train.Description.map(lambda x:language_type(x))
train[lang_prefix+'malaiyu_type'] = train.Description.map(lambda x:malaiyu_type(x)[1])

test[lang_prefix+'language_type'] = test.Description.map(lambda x:language_type(x))
test[lang_prefix+'malaiyu_type'] = test.Description.map(lambda x:malaiyu_type(x)[1])

In [14]:
def obtain_text(df):
    breed1_text = df['Breed1'].map(lambda x:breedid_map.get(x, 'unknown_breed'))
    breed2_text = df['Breed2'].map(lambda x:breedid_map.get(x, 'unknown_breed'))
    color1_text = df['Color1'].map(lambda x:color_map.get(x, 'unknown_color'))
    color2_text = df['Color2'].map(lambda x:color_map.get(x, 'unknown_color'))
    color3_text = df['Color3'].map(lambda x:color_map.get(x, 'unknown_color'))

    text = df['Name'].fillna("none") + " " \
           + breed1_text  + " " \
           + breed2_text + " " \
           + color1_text + " " \
           + color2_text + " " \
           + color3_text + " " \
           + df['Description'].fillna("none")
    
    return text

train_desc = train.Description.fillna("none").values
test_desc = test.Description.fillna("none").values

tfv = TfidfVectorizer(min_df=3,  max_features=10000,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words = 'english')
    
# Fit TFIDF
tfv.fit(list(train_desc))
X =  tfv.transform(train_desc)
X_test = tfv.transform(test_desc)

components = 120
svd = TruncatedSVD(n_components=components)
svd.fit(X)

X = svd.transform(X)
X = pd.DataFrame(X, columns=['svd_{}'.format(i) for i in range(components)])
train = pd.concat((train, X), axis=1)
X_test = svd.transform(X_test)
X_test = pd.DataFrame(X_test, columns=['svd_{}'.format(i) for i in range(components)])
test = pd.concat((test, X_test), axis=1)

## NMF LDA

In [15]:
def nmf_lda_feature(train, test, train_text, test_text):
    tfv = TfidfVectorizer(min_df=3,  max_features=10000,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words = 'english')

    tfv.fit(list(train_text)+list(test_text))
    X =  tfv.transform(train_text)
    X_test = tfv.transform(test_text)

    components = 20
    nmf = NMF(n_components=components, random_state=100).fit(X)
    nmf_x = nmf.transform(X)
    nmf_x = pd.DataFrame(nmf_x, columns=['nmf_{}'.format(i) for i in range(components)])
    train = pd.concat((train, nmf_x), axis=1)
    nmf_x_test = nmf.transform(X_test)
    nmf_x_test = pd.DataFrame(nmf_x_test, columns=['nmf_{}'.format(i) for i in range(components)])
    test = pd.concat((test, nmf_x_test), axis=1)

    components = 12
    lda = LatentDirichletAllocation(n_components=components, max_iter=120, n_jobs=-1)
    lda.fit(X)
    lda_x = lda.transform(X)
    lda_x = pd.DataFrame(lda_x, columns=['lda_{}'.format(i) for i in range(components)])
    train = pd.concat((train, lda_x), axis=1)
    lda_x_test = lda.transform(X_test)
    lda_x_test = pd.DataFrame(lda_x_test, columns=['lda_{}'.format(i) for i in range(components)])
    test = pd.concat((test, lda_x_test), axis=1)
    
    return train, test

train_text = obtain_text(train)
test_text = obtain_text(test)

train, test = nmf_lda_feature(train, test, train_text, test_text)

## image feature

In [16]:
train_df = pd.read_csv("../input/petfinder-adoption-prediction/train/train.csv")
test_df = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')
test_pet_ids = test_df['PetID'].values
train_pet_ids = train_df['PetID'].values
target = train_df['AdoptionSpeed'].values

In [17]:
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D,Dense,Dropout
import keras.backend as K
from keras.optimizers import Adam
from keras.applications.densenet import preprocess_input, DenseNet121
from keras.applications.resnet50 import preprocess_input as res_preprocess, ResNet50

### ResNet50 meta feature

In [18]:
batch_size = 128
def BASE_MODEL():
    inp = Input((128,128,3))
    backbone = ResNet50(input_tensor = inp, 
                           weights="../input/resnet50/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5",
                           include_top = False)
    x = backbone.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(512)(x)
    x = Dropout(0.5)(x)
    output = Dense(1,activation='linear')(x)
    return Model(inp,output)

def new_load_image(path, pet_id):
    image = cv2.imread(f'{path}{pet_id}-1.jpg')
    try:
        new_image = cv2.resize(image,(128,128))
    except:
        new_image = np.zeros((128,128,3))
    new_image = res_preprocess(new_image)
    return new_image


def train_gen(batch_size=128, shuffle=True, pet_list=None, pet_labels=None, use_labels=True):
    images_df = pd.DataFrame({'img_id':pet_list,'label':pet_labels})
    while True:
        if shuffle:
            images_df = images_df.sample(frac=1.0).reset_index(drop=True)
        for start in range(0, len(images_df), batch_size):
            x_batch = []
            y_batch = []
            end = min(start + batch_size,len(images_df))
            for _id in range(start,end):
                image_row = images_df.iloc[_id]
                image_id = image_row['img_id']
                img = new_load_image("../input/petfinder-adoption-prediction/train_images/", image_id)
                if use_labels:
                    img_label = image_row['label']
                    y_batch.append(img_label)
                else:
                    y_batch.append(-1.0)
                x_batch.append(img)
            yield np.array(x_batch),np.array(y_batch)
            
def test_gen(batch_size=128,shuffle=True,pet_list=None,pet_labels=None,use_labels=True):
    images_df = pd.DataFrame({'img_id':pet_list,'label':pet_labels})
    while True:
        if shuffle:
            images_df = images_df.sample(frac=1.0).reset_index(drop=True)
        for start in range(0, len(images_df), batch_size):
            x_batch = []
            y_batch = []
            end = min(start + batch_size,len(images_df))
            for _id in range(start,end):
                image_row = images_df.iloc[_id]
                image_id = image_row['img_id']
                img = new_load_image("../input/petfinder-adoption-prediction/test_images/", image_id)
                if use_labels:
                    img_label = image_row['label']
                    y_batch.append(img_label)
                else:
                    y_batch.append(-1.0)
                x_batch.append(img)
            yield np.array(x_batch),np.array(y_batch)

In [19]:
test_img_prob = np.zeros(shape=(test_df.shape[0],1))
train_img_prob = np.zeros(shape=(train_df.shape[0],1))

for tr_idx,te_idx in FOLDS.split(train_pet_ids, target):
    gen_tr = train_gen(batch_size=batch_size,
                       shuffle=True,
                       pet_list=train_pet_ids[tr_idx],
                       pet_labels=target[tr_idx])
    gen_te = train_gen(batch_size=batch_size,
                       shuffle=False,
                       pet_list=train_pet_ids[te_idx],
                       pet_labels=target[te_idx])
    gen_test = test_gen(batch_size=batch_size,
                        shuffle=False,
                        pet_list=test_pet_ids,
                        pet_labels=None,
                        use_labels=False)
    
    model = BASE_MODEL()
    model.compile(optimizer='adam', loss='mse')
    
    model.fit_generator(gen_tr,
                        steps_per_epoch=int(np.ceil(len(tr_idx)*1.0/batch_size)),
                        epochs=3,verbose=1,
                        validation_data=gen_te,
                        validation_steps=int(np.ceil(len(te_idx)*1.0/batch_size)))
    _test_prob = model.predict_generator(gen_test,
                                         steps=int(np.ceil(len(test_df)*1.0/(batch_size))))
    _val_prob = model.predict_generator(gen_te,
                                        steps=int(np.ceil(len(te_idx)*1.0/(batch_size))))
    
    train_img_prob[te_idx,:] = _val_prob 
    test_img_prob += _test_prob

test_img_prob /= N_FOLDS

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


### DenseNet121 extracted 256 dim image features

In [20]:
img_size = 256
batch_size = 16

train_df = pd.read_csv("../input/petfinder-adoption-prediction/train/train.csv")
pet_ids = train_df['PetID'].values
n_batches = len(pet_ids) // batch_size + 1

from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D
import keras.backend as K
inp = Input((256,256,3))
backbone = DenseNet121(input_tensor = inp, 
                       weights="../input/densenet-keras/DenseNet-BC-121-32-no-top.h5",
                       include_top = False)
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
x = AveragePooling1D(4)(x)
out = Lambda(lambda x: x[:,:,0])(x)

m = Model(inp,out)

features = {}
for b in tqdm_notebook(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size
    batch_pets = pet_ids[start:end]
    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
    for i,pet_id in enumerate(batch_pets):
        try:
            batch_images[i] = load_image("../input/petfinder-adoption-prediction/train_images/", pet_id)
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_pets):
        features[pet_id] = batch_preds[i]

train_feats = pd.DataFrame.from_dict(features, orient='index')
train_feats.columns = ['pic_'+str(i) for i in range(train_feats.shape[1])]

HBox(children=(IntProgress(value=0, max=938), HTML(value='')))




In [21]:
test_df = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')

pet_ids = test_df['PetID'].values
n_batches = len(pet_ids) // batch_size + 1

features = {}
for b in tqdm_notebook(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size
    batch_pets = pet_ids[start:end]
    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
    for i,pet_id in enumerate(batch_pets):
        try:
            batch_images[i] = load_image("../input/petfinder-adoption-prediction/test_images/", pet_id)
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_pets):
        features[pet_id] = batch_preds[i]
        
test_feats = pd.DataFrame.from_dict(features, orient='index')
test_feats.columns = ['pic_'+str(i) for i in range(test_feats.shape[1])]

test_feats = test_feats.reset_index()
test_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)

train_feats = train_feats.reset_index()
train_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)

test_feats.head()
train = pd.merge(train, train_feats, how='left', on='PetID')
test = pd.merge(test, test_feats, how='left', on='PetID')

HBox(children=(IntProgress(value=0, max=249), HTML(value='')))




In [22]:
train_feats_256 = train_feats.copy()
test_feats_256 = test_feats.copy()

# Feature table 2

In [23]:
breed_id_map = dict(zip(breeds.BreedID.values,breeds.BreedName.values))
breed_type_map = dict(zip(breeds.BreedID.values,breeds.Type.values))
color_id_map = dict(zip(colors.ColorID.values,colors.ColorName.values))

## colors

In [24]:
train['Breed1_text'] = train['Breed1'].map(lambda x:breed_id_map.get(x,'UNK_Breed1'))
train['Breed2_text'] = train['Breed2'].map(lambda x:breed_id_map.get(x,'UNK_Breed2'))
train['Color1_text'] = train['Color1'].map(lambda x:color_id_map.get(x,'UNK_Color1'))
train['Color2_text'] = train['Color2'].map(lambda x:color_id_map.get(x,'UNK_Color2'))
train['Color3_text'] = train['Color3'].map(lambda x:color_id_map.get(x,'UNK_Color3'))

test['Breed1_text'] = test['Breed1'].map(lambda x:breed_id_map.get(x,'UNK_Breed1'))
test['Breed2_text'] = test['Breed2'].map(lambda x:breed_id_map.get(x,'UNK_Breed2'))
test['Color1_text'] = test['Color1'].map(lambda x:color_id_map.get(x,'UNK_Color1'))
test['Color2_text'] = test['Color2'].map(lambda x:color_id_map.get(x,'UNK_Color2'))
test['Color3_text'] = test['Color3'].map(lambda x:color_id_map.get(x,'UNK_Color3'))

## all raw text

In [25]:
train['raw_text'] =  train['Name'] + ' ' \
                    + train['Breed1_text'] + ' ' + train['Breed2_text'] + ' ' \
                    + train['Color1_text'] + ' ' + train['Color2_text'] + ' ' \
                    + train['Color3_text'] + ' ' \
                    + train['Description']

test['raw_text'] =  test['Name'] + ' ' \
                    + test['Breed1_text'] + ' ' + test['Breed2_text'] + ' ' \
                    + test['Color1_text'] + ' ' + test['Color2_text'] + ' ' \
                    + test['Color3_text'] + ' ' \
                    + test['Description']

## rescuer, breeds and description

In [26]:
gzf_prefix = 'gzf_'

train[gzf_prefix+'RescureID_rank'] = train.RescuerID.map(train.RescuerID.value_counts().rank()/train.RescuerID.unique().shape[0])
train[gzf_prefix+'Description_len'] = train.Description.map(lambda x:len(x) if type(x)!=float else 0)
train[gzf_prefix+'Description_word_len'] = train.Description.map(lambda x:len(x.strip().split()) if type(x)!=float else 0)
train[gzf_prefix+'Description_distinct_word_len'] = train.Description.map(lambda x:len(set(x.lower().strip().split())) if type(x)!=float else 0)
train[gzf_prefix+'Description_distinct_word_ratio'] = train[gzf_prefix+'Description_distinct_word_len'] / (train[gzf_prefix+'Description_word_len'] + 1.0)

test[gzf_prefix+'RescureID_rank'] = test.RescuerID.map(test.RescuerID.value_counts().rank()/test.RescuerID.unique().shape[0])
test[gzf_prefix+'Description_len'] = test.Description.map(lambda x:len(x) if type(x)!=float else 0)
test[gzf_prefix+'Description_word_len'] = test.Description.map(lambda x:len(x.strip().split()) if type(x)!=float else 0)
test[gzf_prefix+'Description_distinct_word_len'] = test.Description.map(lambda x:len(set(x.lower().strip().split())) if type(x)!=float else 0)
test[gzf_prefix+'Description_distinct_word_ratio'] = test[gzf_prefix+'Description_distinct_word_len'] / (test[gzf_prefix+'Description_word_len'] + 1.0)

In [27]:
X = pd.concat([train,test],axis=0,ignore_index=True)
len_train = len(train)

In [28]:
X[gzf_prefix+'is_pure'] = ((X.Breed1!=307) & (X.Breed2!=307) & (X.Breed2!=0)).astype(float)
X[gzf_prefix+'is_pure_breed1'] = (X.Breed1!=307).astype(float)
X[gzf_prefix+'is_pure_breed2'] = ((X.Breed2!=307) & (X.Breed2!=0)).astype(float)

In [29]:
agg_num_feature = ['Age','Health','PhotoAmt','Quantity',
                   'doc_sent_mag', 'doc_sent_score', 
                   'meta_dominant_score', 'meta_label_score',gzf_prefix+'Description_len']
agg_rescureid_1 = X.groupby(['RescuerID'])[agg_num_feature].mean()
agg_rescureid_1.columns = ['Age_id','Health_id','PhotoAmt_id','Quantity_id',
                   'doc_sent_mag_id', 'doc_sent_score_id', 
                   'dominant_score_id', 'label_score_id','Description_len_id']
agg_rescureid_2 = X.groupby(['RescuerID'])['Breed1'].aggregate({'307_ratio':lambda x:(x==307).mean()})
agg_rescureid = pd.concat([agg_rescureid_1,agg_rescureid_2],axis=1)
agg_rescureid.columns = [gzf_prefix+x for x in agg_rescureid.columns ]
X = pd.merge(X,agg_rescureid,left_on='RescuerID',right_index=True,how='left')

## TFIDF NMF LDA

In [30]:
SVD_FEATURES = 120
NMF_FEATURES = 20
LDA_FEATURES = 12

desc = X.raw_text.fillna("none").values
tfidf = TfidfVectorizer(min_df=3,  max_features=10000,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words = 'english')
    
# Fit TFIDF
X_tfidf = tfidf.fit_transform(list(desc))

svd = TruncatedSVD(n_components=SVD_FEATURES)
svd.fit(X_tfidf)
X_svd = svd.fit_transform(X_tfidf)

X_svd = pd.DataFrame(X_svd, columns=[gzf_prefix+'sdv_{}'.format(i) for i in range(SVD_FEATURES)])
X = pd.concat((X, X_svd), axis=1)

nmf = NMF(n_components=NMF_FEATURES)
nmf.fit(X_tfidf)
X_nmf = nmf.fit_transform(X_tfidf)

X_nmf = pd.DataFrame(X_nmf, columns=[gzf_prefix+'mnf_{}'.format(i) for i in range(NMF_FEATURES)])
X = pd.concat((X, X_nmf), axis=1)

lda = LatentDirichletAllocation(n_components=LDA_FEATURES, n_jobs=-1,max_iter=120)
lda.fit(X_tfidf)
X_lda = lda.fit_transform(X_tfidf)

X_lda = pd.DataFrame(X_lda, columns=[gzf_prefix+'lad_{}'.format(i) for i in range(LDA_FEATURES)])
X = pd.concat((X, X_lda), axis=1)


In [31]:
cat_cols = ['Health',
 'Breed1', 'Breed2',
 'Type', 'Gender',
 'Color3', 'Color2', 'Color1',
 'Vaccinated','Sterilized',  'Dewormed',
 'MaturitySize', 'FurLength',
 'State','meta_label_description','meta_label_description1','meta_label_description2']
X.loc[:, cat_cols] = X[cat_cols].astype('category')

In [32]:
foo = train.dtypes
cat_feature_names = foo[foo == "category"].index.values
cat_features = [i for i in range(X.shape[1]) if X.columns[i] in cat_feature_names]

In [33]:
train = X[:len_train]
test = X[len_train:]
train.index = range(len_train)
test.index = range(test.shape[0])

target = train['AdoptionSpeed']
rescue_id = train['RescuerID']

train.shape, target.shape

((14993, 633), (14993,))

# training functions for FT 1 and 2

In [34]:
def obtain_train_mse_and_kappa(train_predictions, target):
    optR = OptimizedRounder()
    optR.fit(train_predictions, target)
    coefficients_ = optR.coefficients()
    rmse_score1 = rmse(target, train_predictions)
    train_predictions = optR.predict(train_predictions, optR.coefficients()).astype(int)
    qwk_score = quadratic_weighted_kappa(target, train_predictions)
    rmse_score2 = rmse(target, train_predictions)
    
    return rmse_score1, rmse_score2, qwk_score

def run_cv_model(train, test, target, weight, model_fn, params={}, eval_fn=None, label='model'):
    kf = FOLDS
    n_splits = N_FOLDS
    
    fold_splits = kf.split(train, target)
    cv_scores = []
    qwk_scores = []
    pred_full_test = 0
    pred_train = np.zeros((train.shape[0], n_splits))
    pred_test = np.zeros((origin_test.shape[0], n_splits))
    
    all_coefficients = np.zeros((n_splits, 4))
    i = 1
    for dev_index, val_index in fold_splits:
        print('Started ' + label + ' fold ' + str(i) + '/{}'.format(n_splits))
        if isinstance(train, pd.DataFrame):
            dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
            dev_weight, val_weight = weight[dev_index], weight[val_index]
        else:
            dev_X, val_X = train[dev_index], train[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
            dev_weight, val_weight = weight[dev_index], weight[val_index]
            
        params2 = params.copy()
        pred_val_y, pred_test_y, importances, coefficients, qwk = model_fn(dev_X, dev_y, val_X, val_y, dev_weight, val_weight, test, params2)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        pred_test[:, i-1] = pred_test_y.reshape(-1)
        
        all_coefficients[i-1, :] = coefficients
        if eval_fn is not None:
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            qwk_scores.append(qwk)
            print(label + ' cv score {}: RMSE {} QWK {}'.format(i, cv_score, qwk))
        i += 1
    train_rmse1,  train_rmse2, train_qwk = obtain_train_mse_and_kappa([r[0] for r in pred_train], target)
    print('{} cv RMSE scores : {}'.format(label, cv_scores))
    print('{} cv mean        RMSE score : {}'.format(label, np.mean(cv_scores)))
    print('{} cv recalculate RMSE1 score : {}'.format(label, train_rmse1))
    print('{} cv recalculate RMSE2 score : {}'.format(label, train_rmse2))
    print('{} cv std RMSE score : {}'.format(label, np.std(cv_scores)))
    print('{} cv QWK scores : {}'.format(label, qwk_scores))
    print('{} cv mean        QWK score : {}'.format(label, np.mean(qwk_scores)))
    print('{} cv recalculate QWK score : {}'.format(label, train_qwk))
    print('{} cv std QWK score : {}'.format(label, np.std(qwk_scores)))
    pred_full_test = pred_full_test / float(n_splits)
    results = {'label': label,
               'train': pred_train, 'test': pred_full_test, 'test_value':pred_test,
                'cv': cv_scores, 'qwk': qwk_scores,
               'coefficients': all_coefficients}
    return results

def runLGB(train_X, train_y, test_X, test_y, dev_weight, val_weight, test_X2, params):
    print('Prep LGB')

    d_train = lgb.Dataset(train_X, label=train_y, weight=dev_weight)
    d_valid = lgb.Dataset(test_X, label=test_y, weight=val_weight)
    watchlist = [d_train, d_valid]
    print('Train LGB')
    num_rounds = params.pop('num_rounds')
    verbose_eval = params.pop('verbose_eval')
    early_stop = None
    if params.get('early_stop'):
        early_stop = params.pop('early_stop')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stop)
    print('Predict 1/2')
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
    importances = model.feature_importance()
    optR = OptimizedRounder_v3()
    len_0 = test_y[test_y==0].shape[0]
    optR.fit(pred_test_y, test_y)
    coefficients = optR.coefficients()
    pred_test_y_k = optR.predict(pred_test_y, coefficients, len_0)
    print("Valid Counts = ", Counter(test_y))
    print("Predicted Counts = ", Counter(pred_test_y_k))
    print("Coefficients = ", coefficients)
    qwk = quadratic_weighted_kappa(test_y, pred_test_y_k)
    print("QWK = ", qwk)
    print('Predict 2/2')
    return np.array(pred_test_y).reshape(-1, 1), np.array(pred_test_y2).reshape(-1, 1), importances, coefficients, qwk

In [35]:
def get_cols(totals, prefixs):
    if isinstance(prefixs, list):
        cols = []
        for prefix in prefixs:
            cols += [col for col in totals if col.find(prefix) > -1]
        return cols
    else:
        return [col for col in totals if col.find(prefixs) > -1]

origin_cols = [
    "Type","Age",
    "Breed1","Breed2","Gender",
    "Color1","Color2","Color3",
    "MaturitySize","FurLength",
    "Vaccinated","Dewormed","Sterilized","Health",
    "Quantity","Fee","State",
    "VideoAmt","PhotoAmt"
]

doc_cols = get_cols(train.columns, 'doc_')
meta_cols = get_cols(train.columns, 'meta_')
pure_cols = get_cols(train.columns, 'pure_')
rescue_cols = get_cols(train.columns, 'rescue_')
lang_cols = get_cols(train.columns, 'lang_')
sml_cols = get_cols(train.columns, ['svd_', 'lda_', 'nmf_'])
pic_cols = get_cols(train.columns, 'pic_')

In [36]:
train['ResNet_meta'] = train_img_prob.flatten()
test['ResNet_meta'] = test_img_prob.flatten()

In [37]:
gzf_cols = doc_cols + lang_cols + origin_cols + pic_cols + [
                'meta_dominant_blue', 'meta_dominant_green','meta_dominant_pixel_frac', 
                'meta_dominant_red', 'meta_dominant_score', 'meta_label_score', 
                'meta_vertex_x', 'meta_vertex_y'] + [
                gzf_prefix+'RescureID_rank',gzf_prefix+'Description_len',
                gzf_prefix+'Description_word_len',gzf_prefix+'Description_distinct_word_len',
                gzf_prefix+'Description_distinct_word_ratio',
                gzf_prefix+'is_pure',gzf_prefix+'is_pure_breed1',gzf_prefix+'is_pure_breed2',
                gzf_prefix+'Quantity_id',gzf_prefix+'307_ratio'
                ] + [gzf_prefix+'sdv_{}'.format(i) for i in range(SVD_FEATURES)] \
                  + [gzf_prefix+'mnf_{}'.format(i) for i in range(NMF_FEATURES)] \
                  + [gzf_prefix+'lad_{}'.format(i) for i in range(LDA_FEATURES)] \
                  + ['ResNet_meta']

In [38]:
zkr_cols = origin_cols + doc_cols + meta_cols + pure_cols + rescue_cols + lang_cols + sml_cols + pic_cols + ['ResNet_meta']

# Training FT 1 and 2

In [39]:
train_gzf = train[gzf_cols]
test_gzf = test[gzf_cols]

train_zkr = train[zkr_cols]
test_zkr = test[zkr_cols]

print(train_gzf.shape, test_gzf.shape, train_zkr.shape, test_zkr.shape)

(14993, 452) (3972, 452) (14993, 457) (3972, 457)


## LGB

In [40]:
params = {'application': 'regression',
          'boosting': 'gbdt',
          'metric': 'rmse',
          'num_leaves': 80,
          'max_depth': 9,
          'learning_rate': 0.01,
          'bagging_fraction': 0.9,
          'bagging_freq': 3,
          'feature_fraction': 0.85,
          'min_split_gain': 0.01,
          'min_child_samples': 150,
          'min_child_weight': 0.1,
          'verbosity': -1,
          'data_random_seed': 3,
          'early_stop': 100,
          'verbose_eval': 500,
          'num_rounds': 5000
         }

weight = pd.Series(np.where(train['Type']==2, 1.0, 1.0))
lgb_gzf = run_cv_model(train[gzf_cols], test[gzf_cols], target, weight, runLGB, params, rmse, 'lgb')

Started lgb fold 1/4
Prep LGB
Train LGB
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.832749	valid_1's rmse: 1.04387
[1000]	training's rmse: 0.707013	valid_1's rmse: 1.03652
Early stopping, best iteration is:
[1272]	training's rmse: 0.651996	valid_1's rmse: 1.03519
Predict 1/2
Valid Counts =  Counter({4.0: 1050, 2.0: 1010, 3.0: 815, 1.0: 773, 0.0: 103})
Predicted Counts =  Counter({2.0: 1270, 4.0: 1019, 1.0: 714, 3.0: 657, 0.0: 91})
Coefficients =  [0.49110651 2.05002231 2.56081551 2.86398469]
QWK =  0.4617867479103991
Predict 2/2
lgb cv score 1: RMSE 1.0351866548278965 QWK 0.4617867479103991
Started lgb fold 2/4
Prep LGB
Train LGB
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.83197	valid_1's rmse: 1.04014
[1000]	training's rmse: 0.703062	valid_1's rmse: 1.03191
[1500]	training's rmse: 0.602803	valid_1's rmse: 1.02984
Early stopping, best iteration is:
[1511]	training's rmse: 0.600867	valid_1's rmse: 1.

In [None]:
params = {'application': 'regression',
          'boosting': 'gbdt',
          'metric': 'rmse',
          'num_leaves': 80,
          'max_depth': 9,
          'learning_rate': 0.01,
          'bagging_fraction': 0.9,
          'bagging_freq': 3,
          'feature_fraction': 0.84,
          'min_split_gain': 0.01,
          'min_child_samples': 150,
          'min_child_weight': 0.1,
          'verbosity': -1,
          'data_random_seed': 3,
          'verbose_eval': 500,
          'num_rounds': 1500,
         }

weight = pd.Series(np.where(train['Type']==2, 1.0, 1.0))
lgb_zkr = run_cv_model(train[zkr_cols], test[zkr_cols], target, weight, runLGB, params, rmse, 'lgb')

Started lgb fold 1/4
Prep LGB
Train LGB
[500]	training's rmse: 0.828353	valid_1's rmse: 1.04354
[1000]	training's rmse: 0.700263	valid_1's rmse: 1.03543
[1500]	training's rmse: 0.597154	valid_1's rmse: 1.033
Predict 1/2
Valid Counts =  Counter({4.0: 1050, 2.0: 1010, 3.0: 815, 1.0: 773, 0.0: 103})
Predicted Counts =  Counter({2.0: 1748, 4.0: 1246, 1.0: 666, 0.0: 91})
Coefficients =  [0.52215962 2.0265292  2.75981032 1.58504158]
QWK =  0.4576383478618269
Predict 2/2
lgb cv score 1: RMSE 1.0330003567066113 QWK 0.4576383478618269
Started lgb fold 2/4
Prep LGB
Train LGB
[500]	training's rmse: 0.827086	valid_1's rmse: 1.03639
[1000]	training's rmse: 0.693852	valid_1's rmse: 1.02738
[1500]	training's rmse: 0.591903	valid_1's rmse: 1.02481
Predict 1/2
Valid Counts =  Counter({4.0: 1049, 2.0: 1009, 3.0: 815, 1.0: 773, 0.0: 103})
Predicted Counts =  Counter({2.0: 1699, 4.0: 948, 3.0: 751, 1.0: 260, 0.0: 91})
Coefficients =  [0.52093116 1.79473986 2.56482092 2.93222952]
QWK =  0.46864051063799317

# Feature table 3

## reload

In [None]:
del train, test
gc.collect()

train = pd.read_csv("../input/petfinder-adoption-prediction/train/train.csv")
test = pd.read_csv("../input/petfinder-adoption-prediction/test/test.csv")

## color

In [None]:
train['Color'] = train.Color1 * 100 + train.Color2 * 10 + train.Color3
train.drop(['Color1', 'Color2', 'Color3'], axis=1, inplace=True)

test['Color'] = test.Color1 * 100 + test.Color2 * 10 + test.Color3
test.drop(['Color1', 'Color2', 'Color3'], axis=1, inplace=True)

In [None]:
target = train['AdoptionSpeed']
train_id = train['PetID']
test_id = test['PetID']

## sentiment data

In [None]:
doc_sent_mag = []
doc_sent_score = []
nf_count = 0
for pet in train_id:
    try:
        with open('../input/petfinder-adoption-prediction/train_sentiment/' + pet + '.json', 'r') as f:
            sentiment = json.load(f)
        doc_sent_mag.append(sentiment['documentSentiment']['magnitude'])
        doc_sent_score.append(sentiment['documentSentiment']['score'])
    except FileNotFoundError:
        nf_count += 1
        doc_sent_mag.append(-1)
        doc_sent_score.append(-1)

train.loc[:, 'doc_sent_mag'] = doc_sent_mag
train.loc[:, 'doc_sent_score'] = doc_sent_score
train["doc_sentiment"] = train.doc_sent_mag * train.doc_sent_score

doc_sent_mag = []
doc_sent_score = []
nf_count = 0
for pet in test_id:
    try:
        with open('../input/petfinder-adoption-prediction/test_sentiment/' + pet + '.json', 'r') as f:
            sentiment = json.load(f)
        doc_sent_mag.append(sentiment['documentSentiment']['magnitude'])
        doc_sent_score.append(sentiment['documentSentiment']['score'])
    except FileNotFoundError:
        nf_count += 1
        doc_sent_mag.append(-1)
        doc_sent_score.append(-1)

test.loc[:, 'doc_sent_mag'] = doc_sent_mag
test.loc[:, 'doc_sent_score'] = doc_sent_score
test["doc_sentiment"] = test.doc_sent_mag * test.doc_sent_score

## TFIDF

In [None]:
n_components = 150

train_desc = train.Description.fillna("none").values
test_desc = test.Description.fillna("none").values

tfv = TfidfVectorizer(min_df=3,  max_features=None,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words='english')

tfv.fit(list(train_desc))
X = tfv.transform(train_desc)
X_test = tfv.transform(test_desc)

svd = TruncatedSVD(n_components=n_components)
svd.fit(X)
X = svd.transform(X)

X = pd.DataFrame(X, columns=['svd_{}'.format(i) for i in range(n_components)])
train = pd.concat((train, X), axis=1)
X_test = svd.transform(X_test)
X_test = pd.DataFrame(X_test, columns=['svd_{}'.format(i) for i in range(n_components)])
test = pd.concat((test, X_test), axis=1)

## image metadata

In [None]:
img_xs = []
img_ys = []
vertex_xs = []
vertex_ys = []
bounding_confidences = []
bounding_importance_fracs = []
dominant_blues = []
dominant_greens = []
dominant_reds = []
dominant_pixel_fracs = []
dominant_scores = []
label_descriptions = []
label_scores = []
nf_count = 0
nl_count = 0
for pet in train_id:
    try:
        im = Image.open('../input/petfinder-adoption-prediction/train_images/%s-1.jpg' % pet)
        width, height = im.size
        img_xs.append(width)
        img_ys.append(height)
        with open('../input/petfinder-adoption-prediction/train_metadata/' + pet + '-1.json', 'r') as f:
            data = json.load(f)
        vertex_x = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['x']
        vertex_xs.append(vertex_x)
        vertex_y = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['y']
        vertex_ys.append(vertex_y)
        bounding_confidence = data['cropHintsAnnotation']['cropHints'][0]['confidence']
        bounding_confidences.append(bounding_confidence)
        bounding_importance_frac = data['cropHintsAnnotation']['cropHints'][0].get('importanceFraction', -1)
        bounding_importance_fracs.append(bounding_importance_frac)
        dominant_blue = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['blue']
        dominant_blues.append(dominant_blue)
        dominant_green = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['green']
        dominant_greens.append(dominant_green)
        dominant_red = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['red']
        dominant_reds.append(dominant_red)
        dominant_pixel_frac = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['pixelFraction']
        dominant_pixel_fracs.append(dominant_pixel_frac)
        dominant_score = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['score']
        dominant_scores.append(dominant_score)
        if data.get('labelAnnotations'):
            label_description = data['labelAnnotations'][0]['description']
            label_descriptions.append(label_description)
            label_score = data['labelAnnotations'][0]['score']
            label_scores.append(label_score)
        else:
            nl_count += 1
            label_descriptions.append('nothing')
            label_scores.append(-1)
    except FileNotFoundError:
        nf_count += 1
        img_xs.append(-1)
        img_ys.append(-1)
        vertex_xs.append(-1)
        vertex_ys.append(-1)
        bounding_confidences.append(-1)
        bounding_importance_fracs.append(-1)
        dominant_blues.append(-1)
        dominant_greens.append(-1)
        dominant_reds.append(-1)
        dominant_pixel_fracs.append(-1)
        dominant_scores.append(-1)
        label_descriptions.append('nothing')
        label_scores.append(-1)

train.loc[:, 'img_x'] = img_xs
train.loc[:, 'img_y'] = img_ys
train.loc[:, 'vertex_x'] = vertex_xs
train.loc[:, 'vertex_y'] = vertex_ys
train.loc[:, 'bounding_confidence'] = bounding_confidences
train.loc[:, 'bounding_importance'] = bounding_importance_fracs
train.loc[:, 'dominant_blue'] = dominant_blues
train.loc[:, 'dominant_green'] = dominant_greens
train.loc[:, 'dominant_red'] = dominant_reds
train.loc[:, 'dominant_pixel_frac'] = dominant_pixel_fracs
train.loc[:, 'dominant_score'] = dominant_scores
train.loc[:, 'label_description'] = label_descriptions
train.loc[:, 'label_score'] = label_scores

img_xs = []
img_ys = []
vertex_xs = []
vertex_ys = []
bounding_confidences = []
bounding_importance_fracs = []
dominant_blues = []
dominant_greens = []
dominant_reds = []
dominant_pixel_fracs = []
dominant_scores = []
label_descriptions = []
label_scores = []
nf_count = 0
nl_count = 0
for pet in test_id:
    try:
        im = Image.open('../input/petfinder-adoption-prediction/test_images/%s-1.jpg' % pet)
        width, height = im.size
        img_xs.append(width)
        img_ys.append(height)
        with open('../input/petfinder-adoption-prediction/test_metadata/' + pet + '-1.json', 'r') as f:
            data = json.load(f)
        vertex_x = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['x']
        vertex_xs.append(vertex_x)
        vertex_y = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['y']
        vertex_ys.append(vertex_y)
        bounding_confidence = data['cropHintsAnnotation']['cropHints'][0]['confidence']
        bounding_confidences.append(bounding_confidence)
        bounding_importance_frac = data['cropHintsAnnotation']['cropHints'][0].get('importanceFraction', -1)
        bounding_importance_fracs.append(bounding_importance_frac)
        dominant_blue = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['blue']
        dominant_blues.append(dominant_blue)
        dominant_green = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['green']
        dominant_greens.append(dominant_green)
        dominant_red = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['red']
        dominant_reds.append(dominant_red)
        dominant_pixel_frac = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['pixelFraction']
        dominant_pixel_fracs.append(dominant_pixel_frac)
        dominant_score = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['score']
        dominant_scores.append(dominant_score)
        if data.get('labelAnnotations'):
            label_description = data['labelAnnotations'][0]['description']
            label_descriptions.append(label_description)
            label_score = data['labelAnnotations'][0]['score']
            label_scores.append(label_score)
        else:
            nl_count += 1
            label_descriptions.append('nothing')
            label_scores.append(-1)
    except FileNotFoundError:
        nf_count += 1
        img_xs.append(-1)
        img_ys.append(-1)
        vertex_xs.append(-1)
        vertex_ys.append(-1)
        bounding_confidences.append(-1)
        bounding_importance_fracs.append(-1)
        dominant_blues.append(-1)
        dominant_greens.append(-1)
        dominant_reds.append(-1)
        dominant_pixel_fracs.append(-1)
        dominant_scores.append(-1)
        label_descriptions.append('nothing')
        label_scores.append(-1)

test.loc[:, 'img_x'] = img_xs
test.loc[:, 'img_y'] = img_ys
test.loc[:, 'vertex_x'] = vertex_xs
test.loc[:, 'vertex_y'] = vertex_ys
test.loc[:, 'bounding_confidence'] = bounding_confidences
test.loc[:, 'bounding_importance'] = bounding_importance_fracs
test.loc[:, 'dominant_blue'] = dominant_blues
test.loc[:, 'dominant_green'] = dominant_greens
test.loc[:, 'dominant_red'] = dominant_reds
test.loc[:, 'dominant_pixel_frac'] = dominant_pixel_fracs
test.loc[:, 'dominant_score'] = dominant_scores
test.loc[:, 'label_description'] = label_descriptions
test.loc[:, 'label_score'] = label_scores

In [None]:
train["vertex_x_ratio"] = train.vertex_x / train.img_x
train["vertex_y_ratio"] = train.vertex_y / train.img_y

test["vertex_x_ratio"] = test.vertex_x / test.img_x
test["vertex_y_ratio"] = test.vertex_y / test.img_y

## name

In [None]:
train.Name = train.Name.fillna('')
test.Name = test.Name.fillna('')
train["Name"] = train.Name.apply(lambda x: str(x).lower())
test["Name"] = test.Name.apply(lambda x: str(x).lower())

train["name_length"] = train.Name.apply(lambda x: len(str(x)))
test["name_length"] = test.Name.apply(lambda x: len(str(x)))

In [None]:
all_data = pd.concat((train, test))

name_idx, name_val = all_data.Name.value_counts().index, all_data.Name.value_counts().values
name_map = dict()
for idx, val in zip(name_idx, name_val):
    name_map.update({idx: val})

train["name_cnt"] = train.Name.map(name_map)
test["name_cnt"] = test.Name.map(name_map)

## description

In [None]:
train['Description'] = train['Description'].fillna('')
test['Description'] = test['Description'].fillna('')

train['desc_length'] = train['Description'].apply(lambda x: len(x))
train['desc_words'] = train['Description'].apply(lambda x: len(x.split()))

test['desc_length'] = test['Description'].apply(lambda x: len(x))
test['desc_words'] = test['Description'].apply(lambda x: len(x.split()))

In [None]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '^']

def lexical_density(x):
    for punct in puncts:
        x = x.replace(punct, "")
    li = x.split(" ")
    return len(set(li)) / len(li) if len(li) != 0 else 0

train["desc_lexical_density"] = train.Description.apply(lambda x: lexical_density(x))
test["desc_lexical_density"] = test.Description.apply(lambda x: lexical_density(x))

In [None]:
def sentences_count(x):
    return len(re.split(r'[.!?]+', x))

train["sentences_count"] = train.Description.apply(lambda x: sentences_count(x))
test["sentences_count"] = test.Description.apply(lambda x: sentences_count(x))

In [None]:
def find_capitals(x):
    return len(re.findall('[A-Z]', x))

train["desc_capitals"] = train.Description.apply(lambda x: find_capitals(x))
test["desc_capitals"] = test.Description.apply(lambda x: find_capitals(x))

## rescuer

In [None]:
rescuer_idx, rescuer_val = all_data.RescuerID.value_counts().index, all_data.RescuerID.value_counts().values
rescuer_map = dict()
for idx, val in zip(rescuer_idx, rescuer_val):
    rescuer_map.update({idx: val})

train["rescuer_cnt"] = train.RescuerID.map(rescuer_map)
test["rescuer_cnt"] = test.RescuerID.map(rescuer_map)

## state

In [None]:
# state GDP: https://en.wikipedia.org/wiki/List_of_Malaysian_states_by_GDP
state_gdp = {
    41336: 116.679,
    41325: 40.596,
    41367: 23.02,
    41401: 190.075,
    41415: 5.984,
    41324: 37.274,
    41332: 42.389,
    41335: 52.452,
    41330: 67.629,
    41380: 5.642,
    41327: 81.284,
    41345: 80.167,
    41342: 121.414,
    41326: 280.698,
    41361: 32.270
}

# state population: https://zh.wikipedia.org/wiki/%E9%A9%AC%E6%9D%A5%E8%A5%BF%E4%BA%9A
state_population = {
    41336: 33.48283,
    41325: 19.47651,
    41367: 15.39601,
    41401: 16.74621,
    41415: 0.86908,
    41324: 8.21110,
    41332: 10.21064,
    41335: 15.00817,
    41330: 23.52743,
    41380: 2.31541,
    41327: 15.61383,
    41345: 32.06742,
    41342: 24.71140,
    41326: 54.62141,
    41361: 10.35977
}

# state area
state_area = {
    41336: 19.210,
    41325: 9.500,
    41367: 15.099,
    41401: 0.243,
    41415: 0.091,
    41324: 1.664,
    41332: 6.686,
    41335: 36.137,
    41330: 21.035,
    41380: 2.31541,
    41327: 0.821,
    41345: 73.631,
    41342: 124.450,
    41326: 8.104,
    41361: 13.035
}

train["state_gdp"] = train.State.map(state_gdp)
train["state_population"] = train.State.map(state_population)
train["state_area"] = train.State.map(state_area)
test["state_gdp"] = test.State.map(state_gdp)
test["state_population"] = test.State.map(state_population)
test["state_area"] = test.State.map(state_area)

## pure breed

In [None]:
# {"Domestic Long Hair": 264, "Domestic Medium Hair": 265, "Domestic Short Hair": 266, "Mixed Breed": 307}

train['Pure_breed'] = 1
train.loc[train['Breed2'] != 0, 'Pure_breed'] = 0
train.loc[train['Breed1'] == 264, 'Pure_breed'] = 0
train.loc[train['Breed1'] == 265, 'Pure_breed'] = 0
train.loc[train['Breed1'] == 266, 'Pure_breed'] = 0
train.loc[train['Breed1'] == 307, 'Pure_breed'] = 0

test['Pure_breed'] = 1
test.loc[test['Breed2'] != 0, 'Pure_breed'] = 0
test.loc[test['Breed1'] == 264, 'Pure_breed'] = 0
test.loc[test['Breed1'] == 265, 'Pure_breed'] = 0
test.loc[test['Breed1'] == 266, 'Pure_breed'] = 0
test.loc[test['Breed1'] == 307, 'Pure_breed'] = 0

In [None]:
# drop some not so impantance features

train.drop(['vertex_x', 'vertex_y', 'bounding_confidence'], axis=1, inplace=True)
test.drop(['vertex_x', 'vertex_y', 'bounding_confidence'], axis=1, inplace=True)

## DenseNet121 extracted 128 dim image features

In [None]:
n_img_features = 128

img_size = 256
batch_size = 16

inp = Input((img_size, img_size, 3))
backbone = DenseNet121(input_tensor=inp, 
                       weights="../input/densenet-keras/DenseNet-BC-121-32-no-top.h5",
                       include_top = False)
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
x = AveragePooling1D(1024//n_img_features)(x)
out = Lambda(lambda x: x[:,:,0])(x)

m = Model(inp,out)

In [None]:
pet_ids = train_id.values
n_batches = len(pet_ids) // batch_size + 1

features = {}
for b in tqdm_notebook(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size
    batch_pets = pet_ids[start:end]
    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
    for i,pet_id in enumerate(batch_pets):
        try:
            batch_images[i] = load_image("../input/petfinder-adoption-prediction/train_images/", pet_id)
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_pets):
        features[pet_id] = batch_preds[i]
        
train_feats = pd.DataFrame.from_dict(features, orient='index')
train_feats.head()

In [None]:
pet_ids = test_id.values
n_batches = len(pet_ids) // batch_size + 1

features = {}
for b in tqdm_notebook(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size
    batch_pets = pet_ids[start:end]
    batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
    for i,pet_id in enumerate(batch_pets):
        try:
            batch_images[i] = load_image("../input/petfinder-adoption-prediction/test_images/", pet_id)
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_pets):
        features[pet_id] = batch_preds[i]
        
test_feats = pd.DataFrame.from_dict(features, orient='index')
test_feats.head()

In [None]:
train_feats.columns = ["img_feat{}".format(i) for i in range(n_img_features)]
test_feats.columns = ["img_feat{}".format(i) for i in range(n_img_features)]

train_feats["PetID"] = train_feats.index
test_feats["PetID"] = test_feats.index

train = pd.merge(train, train_feats, on="PetID")
test = pd.merge(test, test_feats, on="PetID")

print(train.shape, test.shape)

In [None]:
train.drop(['AdoptionSpeed', 'PetID'], axis=1, inplace=True)
test.drop(['PetID'], axis=1, inplace=True)

## LGB

In [None]:
train.drop(['Name', 'RescuerID', 'Description'], axis=1, inplace=True)
test.drop(['Name', 'RescuerID', 'Description'], axis=1, inplace=True)

# rearrange columns again
c = ['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'VideoAmt', 'PhotoAmt', 'Color'] +  ["img_feat{}".format(i) for i in range(n_img_features)] + ['doc_sent_mag', 'doc_sent_score'] + ['svd_{}'.format(i) for i in range(n_components)] + ['img_x', 'img_y', 'bounding_importance', 'dominant_blue', 'dominant_green', 'dominant_red', 'dominant_pixel_frac', 'dominant_score','label_description', 'label_score', 'vertex_x_ratio', 'vertex_y_ratio', 'name_length', 'name_cnt', 'desc_length', 'desc_words', 'desc_lexical_density', 'sentences_count', 'desc_capitals', 'rescuer_cnt', 'state_gdp', 'state_population', 'Pure_breed']
train = train[c]
test = test[c]

numeric_cols = ['Age', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt', 'doc_sent_mag', 'doc_sent_score', 'dominant_score', 'dominant_pixel_frac', 'dominant_red', 'dominant_green', 'dominant_blue', 'bounding_importance', 'img_x', 'img_y', 'vertex_x_ratio', 'vertex_y_ratio', 'label_score', 'desc_length', 'desc_words', 'desc_lexical_density', 'sentences_count', 'desc_capitals', 'rescuer_cnt', 'state_gdp', 'state_population', 'Pure_breed', 'name_length', 'name_cnt'] + ['svd_{}'.format(i) for i in range(n_components)] + ["img_feat{}".format(i) for i in range(n_img_features)]
cat_cols = list(set(train.columns) - set(numeric_cols))

train.loc[:, cat_cols] = train[cat_cols].astype('category')
test.loc[:, cat_cols] = test[cat_cols].astype('category')

foo = train.dtypes
cat_feature_names = foo[foo == "category"]
cat_features = [train.columns.get_loc(c) for c in train.columns if c in cat_feature_names]

In [None]:
del run_cv_model
gc.collect()

def run_cv_model(train, test, target, model_fn, params={}, eval_fn=None, label='model'):
    kf = FOLDS
    n_splits = N_FOLDS
    
    fold_splits = kf.split(train, target)
    cv_scores = []
    qwk_scores = []
    pred_full_test = 0
    pred_train = np.zeros((train.shape[0], n_splits))
    all_coefficients = np.zeros((n_splits, 4))
    feature_importance_df = pd.DataFrame()
    i = 1
    for dev_index, val_index in fold_splits:
        print('Started ' + label + ' fold ' + str(i) + '/' + str(n_splits))
        if isinstance(train, pd.DataFrame):
            dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
        else:
            dev_X, val_X = train[dev_index], train[val_index]
            dev_y, val_y = target[dev_index], target[val_index]
        params2 = params.copy()
        pred_val_y, pred_test_y, importances, coefficients, qwk = model_fn(dev_X, dev_y, val_X, val_y, test, params2)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        all_coefficients[i-1, :] = coefficients
        if eval_fn is not None:
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            qwk_scores.append(qwk)
            print(label + ' cv score {}: RMSE {} QWK {}'.format(i, cv_score, qwk))
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = train.columns.values
        fold_importance_df['importance'] = importances
        fold_importance_df['fold'] = i
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        i += 1
    print('{} cv RMSE scores : {}'.format(label, cv_scores))
    print('{} cv mean RMSE score : {}'.format(label, np.mean(cv_scores)))
    print('{} cv std RMSE score : {}'.format(label, np.mean(cv_scores)))
    print('{} cv QWK scores : {}'.format(label, qwk_scores))
    print('{} cv mean QWK score : {}'.format(label, np.mean(qwk_scores)))
    print('{} cv std QWK score : {}'.format(label, np.std(qwk_scores)))
    pred_full_test = pred_full_test / float(n_splits)
    results = {'label': label,
               'train': pred_train, 'test': pred_full_test,
                'cv': cv_scores, 'qwk': qwk_scores,
               'importance': feature_importance_df,
               'coefficients': all_coefficients}
    return results

In [None]:
del runLGB
gc.collect()

def runLGB(train_X, train_y, test_X, test_y, test_X2, params):
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(test_X, label=test_y)
    watchlist = [d_train, d_valid]
    print('Train LGB')
    try:
        num_rounds = params.pop('num_rounds')
    except:
        pass
    verbose_eval = params.pop('verbose_eval')
    early_stop = None
    if params.get('early_stop'):
        early_stop = params.pop('early_stop')
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=10000,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
                      callbacks=[lgb.reset_parameter(learning_rate=[0.005]*1000+[0.003]*1000+[0.001]*8000)],
                      early_stopping_rounds=early_stop)

    print('Predict 1/2')
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    init_coef = get_init_coefs(pred_test_y, test_y)
    optR = OptimizedRounder_v2(initial_coefs=init_coef)
    optR.fit(pred_test_y, test_y)
    coefficients = optR.coefficients()
    pred_test_y_k = optR.predict(pred_test_y, coefficients)
    chi2 = get_chi2(pred_test_y_k, test_y)
    print("Valid Counts = {}".format(Counter(test_y)))
    print("Predicted Counts = {}".format(Counter(pred_test_y_k)))
    print("Coefficients = {}".format(coefficients))
    print("Chi2 = {}".format(chi2))
    qwk = quadratic_weighted_kappa(test_y, pred_test_y_k)
    print("QWK = {}".format(qwk))
    print('Predict 2/2')
    pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
    return pred_test_y.reshape(-1, 1), pred_test_y2.reshape(-1, 1), model.feature_importance(), coefficients, qwk

In [None]:
param = {'application': 'regression',
         'boosting': 'gbdt', 
         'metric': 'rmse', 
         'num_leaves': 149, 
         'max_depth': 11, 
         'max_bin': 37, 
         'bagging_fraction': 0.975419815153193, 
         'bagging_freq': 1, 
         'feature_fraction': 0.2705570927694394, 
         'min_split_gain': 0.7636472013417633, 
         'min_child_samples': 29, 
         'min_child_weight': 0.13126728393897313, 
         'lambda_l2': 0.841358003322472, 
         'verbosity': -1, 
         'data_random_seed': 1029, 
         'early_stop': 100, 
         'verbose_eval': 2000, 
         'num_rounds': 10000}

In [None]:
lgb_zyl = run_cv_model(train, test, target, runLGB, param, rmse, 'lgb')

# Feature table 3

https://www.kaggle.com/ranjoranjan/single-xgboost-model

## reload

In [None]:
del train, test
gc.collect()

train = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv')
test = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')

labels_breed = pd.read_csv('../input/petfinder-adoption-prediction/breed_labels.csv')

## features

In [None]:
all_ids = pd.concat([train, test], axis=0, ignore_index=True, sort=False)[['PetID']]

In [None]:
n_components = 32
svd_ = TruncatedSVD(n_components=n_components, random_state=1337)

features_df = pd.concat([train_feats_256, test_feats_256], axis=0)
features = features_df[[f'pic_{i}' for i in range(256)]].values

svd_col = svd_.fit_transform(features)
svd_col = pd.DataFrame(svd_col)
svd_col = svd_col.add_prefix('IMG_SVD_')

img_features = pd.concat([all_ids, svd_col], axis=1)

In [None]:
train_image_files = sorted(glob.glob('../input/petfinder-adoption-prediction/train_images/*.jpg'))
train_metadata_files = sorted(glob.glob('../input/petfinder-adoption-prediction/train_metadata/*.json'))
train_sentiment_files = sorted(glob.glob('../input/petfinder-adoption-prediction/train_sentiment/*.json'))

test_image_files = sorted(glob.glob('../input/petfinder-adoption-prediction/test_images/*.jpg'))
test_metadata_files = sorted(glob.glob('../input/petfinder-adoption-prediction/test_metadata/*.json'))
test_sentiment_files = sorted(glob.glob('../input/petfinder-adoption-prediction/test_sentiment/*.json'))

In [None]:
split_char = '/'

In [None]:
train_df_ids = train[['PetID']]

train_df_ids = train[['PetID']]
train_df_metadata = pd.DataFrame(train_metadata_files)
train_df_metadata.columns = ['metadata_filename']
train_metadata_pets = train_df_metadata['metadata_filename'].apply(lambda x: x.split(split_char)[-1].split('-')[0])
train_df_metadata = train_df_metadata.assign(PetID=train_metadata_pets)

train_df_ids = train[['PetID']]
train_df_sentiment = pd.DataFrame(train_sentiment_files)
train_df_sentiment.columns = ['sentiment_filename']
train_sentiment_pets = train_df_sentiment['sentiment_filename'].apply(lambda x: x.split(split_char)[-1].split('.')[0])
train_df_sentiment = train_df_sentiment.assign(PetID=train_sentiment_pets)

In [None]:
test_df_ids = test[['PetID']]

test_df_metadata = pd.DataFrame(test_metadata_files)
test_df_metadata.columns = ['metadata_filename']
test_metadata_pets = test_df_metadata['metadata_filename'].apply(lambda x: x.split(split_char)[-1].split('-')[0])
test_df_metadata = test_df_metadata.assign(PetID=test_metadata_pets)

test_df_sentiment = pd.DataFrame(test_sentiment_files)
test_df_sentiment.columns = ['sentiment_filename']
test_sentiment_pets = test_df_sentiment['sentiment_filename'].apply(lambda x: x.split(split_char)[-1].split('.')[0])
test_df_sentiment = test_df_sentiment.assign(PetID=test_sentiment_pets)

In [None]:
class PetFinderParser(object):
    
    def __init__(self, debug=False):        
        self.debug = debug
        self.sentence_sep = ' '        
        self.extract_sentiment_text = False
    
    def open_json_file(self, filename):
        with open(filename, 'r', encoding='utf-8') as f:
            json_file = json.load(f)
        return json_file
        
    def parse_sentiment_file(self, file):
        file_sentiment = file['documentSentiment']
        file_entities = [x['name'] for x in file['entities']]
        file_entities = self.sentence_sep.join(file_entities)       
        file_sentences_sentiment = [x['sentiment'] for x in file['sentences']]        
        file_sentences_sentiment = pd.DataFrame.from_dict(
            file_sentences_sentiment, orient='columns')
        file_sentences_sentiment_df = pd.DataFrame(
            {
                'magnitude_sum': file_sentences_sentiment['magnitude'].sum(axis=0),
                'score_sum': file_sentences_sentiment['score'].sum(axis=0),
                'magnitude_mean': file_sentences_sentiment['magnitude'].mean(axis=0),
                'score_mean': file_sentences_sentiment['score'].mean(axis=0),
                'magnitude_var': file_sentences_sentiment['magnitude'].var(axis=0),
                'score_var': file_sentences_sentiment['score'].var(axis=0),
            }, index=[0]
        )        
        df_sentiment = pd.DataFrame.from_dict(file_sentiment, orient='index').T
        df_sentiment = pd.concat([df_sentiment, file_sentences_sentiment_df], axis=1)            
        df_sentiment['entities'] = file_entities
        df_sentiment = df_sentiment.add_prefix('sentiment_')        
        return df_sentiment
    
    def parse_metadata_file(self, file):
        file_keys = list(file.keys())        
        if 'labelAnnotations' in file_keys:
            file_annots = file['labelAnnotations']
            file_top_score = np.asarray([x['score'] for x in file_annots]).mean()
            file_top_desc = [x['description'] for x in file_annots]
        else:
            file_top_score = np.nan
            file_top_desc = ['']        
        file_colors = file['imagePropertiesAnnotation']['dominantColors']['colors']
        file_crops = file['cropHintsAnnotation']['cropHints']
        file_color_score = np.asarray([x['score'] for x in file_colors]).mean()
        file_color_pixelfrac = np.asarray([x['pixelFraction'] for x in file_colors]).mean()
        file_crop_conf = np.asarray([x['confidence'] for x in file_crops]).mean()        
        if 'importanceFraction' in file_crops[0].keys():
            file_crop_importance = np.asarray([x['importanceFraction'] for x in file_crops]).mean()
        else:
            file_crop_importance = np.nan
        df_metadata = {
            'annots_score': file_top_score,
            'color_score': file_color_score,
            'color_pixelfrac': file_color_pixelfrac,
            'crop_conf': file_crop_conf,
            'crop_importance': file_crop_importance,
            'annots_top_desc': self.sentence_sep.join(file_top_desc)
        }        
        df_metadata = pd.DataFrame.from_dict(df_metadata, orient='index').T
        df_metadata = df_metadata.add_prefix('metadata_')        
        return df_metadata
    
def extract_additional_features(pet_id, mode='train'):
    sentiment_filename = f'../input/petfinder-adoption-prediction/{mode}_sentiment/{pet_id}.json'
    try:
        sentiment_file = pet_parser.open_json_file(sentiment_filename)
        df_sentiment = pet_parser.parse_sentiment_file(sentiment_file)
        df_sentiment['PetID'] = pet_id
    except FileNotFoundError:
        df_sentiment = []
    dfs_metadata = []
    metadata_filenames = sorted(glob.glob(f'../input/petfinder-adoption-prediction/{mode}_metadata/{pet_id}*.json'))
    if len(metadata_filenames) > 0:
        for f in metadata_filenames:
            metadata_file = pet_parser.open_json_file(f)
            df_metadata = pet_parser.parse_metadata_file(metadata_file)
            df_metadata['PetID'] = pet_id
            dfs_metadata.append(df_metadata)
        dfs_metadata = pd.concat(dfs_metadata, ignore_index=True, sort=False)
    dfs = [df_sentiment, dfs_metadata]    
    return dfs

pet_parser = PetFinderParser()

In [None]:
train_pet_ids = train.PetID.unique()
test_pet_ids = test.PetID.unique()

dfs_train = Parallel(n_jobs=-1, verbose=1)(
    delayed(extract_additional_features)(i, mode='train') for i in train_pet_ids)
train_dfs_sentiment = [x[0] for x in dfs_train if isinstance(x[0], pd.DataFrame)]
train_dfs_metadata = [x[1] for x in dfs_train if isinstance(x[1], pd.DataFrame)]
train_dfs_sentiment = pd.concat(train_dfs_sentiment, ignore_index=True, sort=False)
train_dfs_metadata = pd.concat(train_dfs_metadata, ignore_index=True, sort=False)
print(train_dfs_sentiment.shape, train_dfs_metadata.shape)

dfs_test = Parallel(n_jobs=-1, verbose=1)(
    delayed(extract_additional_features)(i, mode='test') for i in test_pet_ids)
test_dfs_sentiment = [x[0] for x in dfs_test if isinstance(x[0], pd.DataFrame)]
test_dfs_metadata = [x[1] for x in dfs_test if isinstance(x[1], pd.DataFrame)]
test_dfs_sentiment = pd.concat(test_dfs_sentiment, ignore_index=True, sort=False)
test_dfs_metadata = pd.concat(test_dfs_metadata, ignore_index=True, sort=False)
print(test_dfs_sentiment.shape, test_dfs_metadata.shape)

In [None]:
aggregates = ['sum', 'mean', 'var']
sent_agg = ['sum']

train_metadata_desc = train_dfs_metadata.groupby(['PetID'])['metadata_annots_top_desc'].unique()
train_metadata_desc = train_metadata_desc.reset_index()
train_metadata_desc[
    'metadata_annots_top_desc'] = train_metadata_desc[
    'metadata_annots_top_desc'].apply(lambda x: ' '.join(x))

prefix = 'metadata'
train_metadata_gr = train_dfs_metadata.drop(['metadata_annots_top_desc'], axis=1)
for i in train_metadata_gr.columns:
    if 'PetID' not in i:
        train_metadata_gr[i] = train_metadata_gr[i].astype(float)
train_metadata_gr = train_metadata_gr.groupby(['PetID']).agg(aggregates)
train_metadata_gr.columns = pd.Index([f'{c[0]}_{c[1].upper()}' for c in train_metadata_gr.columns.tolist()])
train_metadata_gr = train_metadata_gr.reset_index()

train_sentiment_desc = train_dfs_sentiment.groupby(['PetID'])['sentiment_entities'].unique()
train_sentiment_desc = train_sentiment_desc.reset_index()
train_sentiment_desc[
    'sentiment_entities'] = train_sentiment_desc[
    'sentiment_entities'].apply(lambda x: ' '.join(x))

prefix = 'sentiment'
train_sentiment_gr = train_dfs_sentiment.drop(['sentiment_entities'], axis=1)
for i in train_sentiment_gr.columns:
    if 'PetID' not in i:
        train_sentiment_gr[i] = train_sentiment_gr[i].astype(float)
train_sentiment_gr = train_sentiment_gr.groupby(['PetID']).agg(sent_agg)
train_sentiment_gr.columns = pd.Index([f'{c[0]}' for c in train_sentiment_gr.columns.tolist()])
train_sentiment_gr = train_sentiment_gr.reset_index()


test_metadata_desc = test_dfs_metadata.groupby(['PetID'])['metadata_annots_top_desc'].unique()
test_metadata_desc = test_metadata_desc.reset_index()
test_metadata_desc[
    'metadata_annots_top_desc'] = test_metadata_desc[
    'metadata_annots_top_desc'].apply(lambda x: ' '.join(x))

prefix = 'metadata'
test_metadata_gr = test_dfs_metadata.drop(['metadata_annots_top_desc'], axis=1)
for i in test_metadata_gr.columns:
    if 'PetID' not in i:
        test_metadata_gr[i] = test_metadata_gr[i].astype(float)
test_metadata_gr = test_metadata_gr.groupby(['PetID']).agg(aggregates)
test_metadata_gr.columns = pd.Index([f'{c[0]}_{c[1].upper()}' for c in test_metadata_gr.columns.tolist()])
test_metadata_gr = test_metadata_gr.reset_index()

test_sentiment_desc = test_dfs_sentiment.groupby(['PetID'])['sentiment_entities'].unique()
test_sentiment_desc = test_sentiment_desc.reset_index()
test_sentiment_desc[
    'sentiment_entities'] = test_sentiment_desc[
    'sentiment_entities'].apply(lambda x: ' '.join(x))

prefix = 'sentiment'
test_sentiment_gr = test_dfs_sentiment.drop(['sentiment_entities'], axis=1)
for i in test_sentiment_gr.columns:
    if 'PetID' not in i:
        test_sentiment_gr[i] = test_sentiment_gr[i].astype(float)
test_sentiment_gr = test_sentiment_gr.groupby(['PetID']).agg(sent_agg)
test_sentiment_gr.columns = pd.Index([f'{c[0]}' for c in test_sentiment_gr.columns.tolist()])
test_sentiment_gr = test_sentiment_gr.reset_index()

In [None]:
train_proc = train.copy()
train_proc = train_proc.merge(
    train_sentiment_gr, how='left', on='PetID')
train_proc = train_proc.merge(
    train_metadata_gr, how='left', on='PetID')
train_proc = train_proc.merge(
    train_metadata_desc, how='left', on='PetID')
train_proc = train_proc.merge(
    train_sentiment_desc, how='left', on='PetID')

test_proc = test.copy()
test_proc = test_proc.merge(
    test_sentiment_gr, how='left', on='PetID')
test_proc = test_proc.merge(
    test_metadata_gr, how='left', on='PetID')
test_proc = test_proc.merge(
    test_metadata_desc, how='left', on='PetID')
test_proc = test_proc.merge(
    test_sentiment_desc, how='left', on='PetID')

assert train_proc.shape[0] == train.shape[0]
assert test_proc.shape[0] == test.shape[0]

In [None]:
train_breed_main = train_proc[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))
train_breed_main = train_breed_main.iloc[:, 2:]
train_breed_main = train_breed_main.add_prefix('main_breed_')
train_breed_second = train_proc[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))
train_breed_second = train_breed_second.iloc[:, 2:]
train_breed_second = train_breed_second.add_prefix('second_breed_')
train_proc = pd.concat(
    [train_proc, train_breed_main, train_breed_second], axis=1)

test_breed_main = test_proc[['Breed1']].merge(
    labels_breed, how='left',
    left_on='Breed1', right_on='BreedID',
    suffixes=('', '_main_breed'))
test_breed_main = test_breed_main.iloc[:, 2:]
test_breed_main = test_breed_main.add_prefix('main_breed_')
test_breed_second = test_proc[['Breed2']].merge(
    labels_breed, how='left',
    left_on='Breed2', right_on='BreedID',
    suffixes=('', '_second_breed'))
test_breed_second = test_breed_second.iloc[:, 2:]
test_breed_second = test_breed_second.add_prefix('second_breed_')
test_proc = pd.concat(
    [test_proc, test_breed_main, test_breed_second], axis=1)

In [None]:
X = pd.concat([train_proc, test_proc], ignore_index=True, sort=False)

In [None]:
X_temp = X.copy()

text_columns = ['Description', 'metadata_annots_top_desc', 'sentiment_entities']
categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']

to_drop_columns = ['PetID', 'Name', 'RescuerID']

In [None]:
rescuer_count = X.groupby(['RescuerID'])['PetID'].count().reset_index()
rescuer_count.columns = ['RescuerID', 'RescuerID_COUNT']

X_temp = X_temp.merge(rescuer_count, how='left', on='RescuerID')

In [None]:
for i in categorical_columns:
    X_temp.loc[:, i] = pd.factorize(X_temp.loc[:, i])[0]

In [None]:
X_text = X_temp[text_columns]

for i in X_text.columns:
    X_text.loc[:, i] = X_text.loc[:, i].fillna('none')

In [None]:
X_temp['Length_Description'] = X_text['Description'].map(len)
X_temp['Length_metadata_annots_top_desc'] = X_text['metadata_annots_top_desc'].map(len)
X_temp['Lengths_sentiment_entities'] = X_text['sentiment_entities'].map(len)

In [None]:
n_components = 16
text_features = []

for i in X_text.columns:
    print(f'generating features from: {i}')
    tfv = TfidfVectorizer(min_df=2,  max_features=None,
                          strip_accents='unicode', analyzer='word', token_pattern=r'(?u)\b\w+\b',
                          ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1)
    svd_ = TruncatedSVD(
        n_components=n_components, random_state=1337)
    tfidf_col = tfv.fit_transform(X_text.loc[:, i].values)    
    svd_col = svd_.fit_transform(tfidf_col)
    svd_col = pd.DataFrame(svd_col)
    svd_col = svd_col.add_prefix('TFIDF_{}_'.format(i))    
    text_features.append(svd_col)
    
text_features = pd.concat(text_features, axis=1)

X_temp = pd.concat([X_temp, text_features], axis=1)

for i in X_text.columns:
    X_temp = X_temp.drop(i, axis=1)

In [None]:
X_temp = X_temp.merge(img_features, how='left', on='PetID')

In [None]:
train_df_ids = train[['PetID']]
test_df_ids = test[['PetID']]

train_df_imgs = pd.DataFrame(train_image_files)
train_df_imgs.columns = ['image_filename']
train_imgs_pets = train_df_imgs['image_filename'].apply(lambda x: x.split(split_char)[-1].split('-')[0])

test_df_imgs = pd.DataFrame(test_image_files)
test_df_imgs.columns = ['image_filename']
test_imgs_pets = test_df_imgs['image_filename'].apply(lambda x: x.split(split_char)[-1].split('-')[0])

train_df_imgs = train_df_imgs.assign(PetID=train_imgs_pets)
test_df_imgs = test_df_imgs.assign(PetID=test_imgs_pets)

def getSize(filename):
    st = os.stat(filename)
    return st.st_size

def getDimensions(filename):
    img_size = Image.open(filename).size
    return img_size 

train_df_imgs['image_size'] = train_df_imgs['image_filename'].apply(getSize)
train_df_imgs['temp_size'] = train_df_imgs['image_filename'].apply(getDimensions)
train_df_imgs['width'] = train_df_imgs['temp_size'].apply(lambda x : x[0])
train_df_imgs['height'] = train_df_imgs['temp_size'].apply(lambda x : x[1])
train_df_imgs = train_df_imgs.drop(['temp_size'], axis=1)

test_df_imgs['image_size'] = test_df_imgs['image_filename'].apply(getSize)
test_df_imgs['temp_size'] = test_df_imgs['image_filename'].apply(getDimensions)
test_df_imgs['width'] = test_df_imgs['temp_size'].apply(lambda x : x[0])
test_df_imgs['height'] = test_df_imgs['temp_size'].apply(lambda x : x[1])
test_df_imgs = test_df_imgs.drop(['temp_size'], axis=1)

aggs = {
    'image_size': ['sum', 'mean', 'var'],
    'width': ['sum', 'mean', 'var'],
    'height': ['sum', 'mean', 'var'],
}
agg_train_imgs = train_df_imgs.groupby('PetID').agg(aggs)
new_columns = [
    k + '_' + agg for k in aggs.keys() for agg in aggs[k]
]
agg_train_imgs.columns = new_columns
agg_train_imgs = agg_train_imgs.reset_index()

agg_test_imgs = test_df_imgs.groupby('PetID').agg(aggs)
new_columns = [
    k + '_' + agg for k in aggs.keys() for agg in aggs[k]
]
agg_test_imgs.columns = new_columns
agg_test_imgs = agg_test_imgs.reset_index()

agg_imgs = pd.concat([agg_train_imgs, agg_test_imgs], axis=0).reset_index(drop=True)

In [None]:
X_temp = X_temp.merge(agg_imgs, how='left', on='PetID')

X_temp = X_temp.drop(to_drop_columns, axis=1)

In [None]:
X_train = X_temp.loc[np.isfinite(X_temp.AdoptionSpeed), :]
X_test = X_temp.loc[~np.isfinite(X_temp.AdoptionSpeed), :]

X_test = X_test.drop(['AdoptionSpeed'], axis=1)

assert X_train.shape[0] == train.shape[0]
assert X_test.shape[0] == test.shape[0]

train_cols = X_train.columns.tolist()
train_cols.remove('AdoptionSpeed')

test_cols = X_test.columns.tolist()

assert np.all(train_cols == test_cols)

In [None]:
X_train_non_null = X_train.fillna(-1)
X_test_non_null = X_test.fillna(-1)
X_train_non_null['ResNet_meta'] = train_img_prob.flatten()         # ADD IMG ResNet50 metafeature
X_test_non_null['ResNet_meta'] = test_img_prob.flatten()           # ADD IMG ResNet50 metafeature

In [None]:
X_train_non_null.isnull().any().any(), X_test_non_null.isnull().any().any()
X_train_non_null.shape, X_test_non_null.shape

## XGB

In [None]:
xgb_params = {
    'eval_metric': 'rmse',
    'seed': 1337,
    'eta': 0.0123,
    'subsample': 0.8,
    'colsample_bytree': 0.85,
    'tree_method': 'gpu_hist',
    'device': 'gpu',
    'silent': 1,
}

In [None]:
def run_xgb(params, X_train, X_test):
    kf = FOLDS
    n_splits = N_FOLDS
    
    verbose_eval = 1000
    num_rounds = 60000
    early_stop = 500

    oof_train = np.zeros((X_train.shape[0]))
    oof_test = np.zeros((X_test.shape[0], n_splits))

    i = 0

    for train_idx, valid_idx in kf.split(X_train, X_train['AdoptionSpeed'].values):

        X_tr = X_train.iloc[train_idx, :]
        X_val = X_train.iloc[valid_idx, :]

        y_tr = X_tr['AdoptionSpeed'].values
        X_tr = X_tr.drop(['AdoptionSpeed'], axis=1)

        y_val = X_val['AdoptionSpeed'].values
        X_val = X_val.drop(['AdoptionSpeed'], axis=1)

        d_train = xgb.DMatrix(data=X_tr, label=y_tr, feature_names=X_tr.columns)
        d_valid = xgb.DMatrix(data=X_val, label=y_val, feature_names=X_val.columns)

        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        model = xgb.train(dtrain=d_train, num_boost_round=num_rounds, evals=watchlist,
                         early_stopping_rounds=early_stop, verbose_eval=verbose_eval, params=params)

        valid_pred = model.predict(xgb.DMatrix(X_val, feature_names=X_val.columns), ntree_limit=model.best_ntree_limit)
        test_pred = model.predict(xgb.DMatrix(X_test, feature_names=X_test.columns), ntree_limit=model.best_ntree_limit)

        oof_train[valid_idx] = valid_pred
        oof_test[:, i] = test_pred

        i += 1
    return model, oof_train, oof_test

In [None]:
model, oof_train, oof_test = run_xgb(xgb_params, X_train_non_null, X_test_non_null)

In [None]:
xgb_453_train_pred = oof_train
xgb_453_test_pred = np.mean(oof_test, axis=1)
xgb_453_train_pred.shape, xgb_453_test_pred.shape

# Corr

In [None]:
gzf_lgb = lgb_gzf["test"].reshape(-1)
zkr_lgb = lgb_zkr["test"].reshape(-1)
zyl_lgb = lgb_zyl["test"].reshape(-1)

dfa = pd.DataFrame({"gzf_lgb":gzf_lgb, 
                    "zkr_lgb":zkr_lgb, 
                    "zyl_lgb":zyl_lgb, 
                    "453_xgb":xgb_453_test_pred})
dfa.corr()

# Stacking

In [None]:
gzf_lgb_train_pred = np.mean(lgb_gzf['train'], axis=1)
gzf_lgb_test_pred = np.mean(lgb_gzf['test'], axis=1)

zkr_lgb_train_pred = np.mean(lgb_zkr['train'], axis=1)
zkr_lgb_test_pred = np.mean(lgb_zkr['test'], axis=1)

zyl_lgb_train_pred = np.mean(lgb_zyl['train'], axis=1)
zyl_lgb_test_pred = np.mean(lgb_zyl['test'], axis=1)


train_meta = np.concatenate([gzf_lgb_train_pred.reshape(-1,1),
                             zkr_lgb_train_pred.reshape(-1,1),
                             zyl_lgb_train_pred.reshape(-1,1),
                             xgb_453_train_pred.reshape(-1,1)
                            ], axis=1)
test_meta = np.concatenate([gzf_lgb_test_pred.reshape(-1,1),
                            zkr_lgb_test_pred.reshape(-1,1),
                            zyl_lgb_test_pred.reshape(-1,1),
                            xgb_453_test_pred.reshape(-1,1)
                           ], axis=1)

In [None]:
from sklearn.linear_model import Ridge

In [None]:
clf = Ridge(alpha=0.1)

clf.fit(train_meta, target)
train_pred = clf.predict(train_meta)

In [None]:
print(clf.coef_)

In [None]:
init_coef = get_init_coefs(train_pred,  target)
optR = OptimizedRounder_v2(initial_coefs=init_coef)
optR.fit(train_pred, target)
coefficients = optR.coefficients()
print("coefficients: ", coefficients, "\n")

print("True Counter: ", Counter(target))

optR = OptimizedRounder_v2()
train_predictions = optR.predict(train_pred, coefficients).astype(int)
print("Train Counter: ", Counter(train_predictions))

print("\nTrain QWK: ", quadratic_weighted_kappa(target, train_predictions))
print("Train RMSE: ", rmse(target, train_pred))

In [None]:
predictions = clf.predict(test_meta)

optR = OptimizedRounder_v3()
test_predictions = optR.predict(predictions, coefficients, 110).astype(int)
print("Test Counter: ", Counter(test_predictions), "\n")

print("True Distribution:")
print(pd.value_counts(target, normalize=True).sort_index())
print("Train Predicted Distribution:")
print(pd.value_counts(train_predictions, normalize=True).sort_index())
print("Test Predicted Distribution:")
print(pd.value_counts(test_predictions, normalize=True).sort_index())

In [None]:
submission = pd.DataFrame({'PetID': test_id, 'AdoptionSpeed': test_predictions})
submission.head(10)

In [None]:
submission.to_csv('submission.csv', index=False)

In [None]:
!head submission.csv