In [None]:
from itertools import chain
import json
from functools import partial
import random
import os
import warnings

from joblib import Parallel, delayed
import pandas as pd
import numpy as np
from sklearn.decomposition import SparsePCA, TruncatedSVD, LatentDirichletAllocation, NMF
from sklearn.linear_model import Lasso
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import scipy as sp
from tqdm.auto import tqdm
import xgboost as xgb

# Notebook-wide configuration: silence warnings, seed the RNGs, widen DataFrame display.
warnings.filterwarnings('ignore')
random.seed(3665)
np.random.seed(3665)  # seed NumPy's global RNG as well as the stdlib one
# NOTE: the hash seed of the running interpreter is fixed at startup; this assignment
# only reaches processes started from here on.
os.environ['PYTHONHASHSEED'] = '3665'
pd.options.display.max_columns = 300
# Registers DataFrame.progress_apply, which later cells rely on.
tqdm.pandas()

# Feature Extraction

In [None]:
def read_sentiment_data(pet_id, pet_name, type_name='train'):
    """Build sentiment features for one pet from its sentiment JSON file.

    Args:
        pet_id: id used to locate ``<type_name>_sentiment/<pet_id>.json``.
        pet_name: pet name; a null name is replaced by a placeholder string
            before it is searched for inside the entity names.
        type_name: dataset split used in the directory name.

    Returns:
        A dict whose keys all start with ``sentiment_``; an empty dict when the
        file does not exist or contains JSON ``null``.
    """
    if pd.isnull(pet_name):
        pet_name = 'it is no name, how you?????'
    filename = "../input/petfinder-adoption-prediction/%s_sentiment/%s.json" % (type_name, pet_id)
    try:
        with open(filename, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {}
    if data is None:
        return {}

    sentences = data['sentences']
    magnitudes = [sentence['sentiment']['magnitude'] for sentence in sentences]
    scores = [sentence['sentiment']['score'] for sentence in sentences]
    entities = data['entities']
    # Salience of every entity whose name contains the pet's name.
    name_saliences = [entity['salience'] for entity in entities if pet_name in entity['name']]

    features = {
        'pet_id': pet_id,
        'entities': ' '.join(entity['name'] for entity in entities),
        'doc_sentiment_mag': data['documentSentiment']['magnitude'],
        'doc_sentiment_score': data['documentSentiment']['score'],
        'language': data['language'],
        'len_sentences': len(sentences),
        'len_entities': len(entities),
        'len_has_name_entities': len(name_saliences),
    }

    reducers = (('max', np.max), ('mean', np.mean), ('min', np.min), ('sum', sum), ('std', np.std))
    for series_name, values in (('sentiment_mag', magnitudes), ('sentiment_score', scores)):
        if values:
            # Per-sentence aggregates are only emitted when there is at least one sentence.
            for reducer_name, reducer in reducers:
                features['%s_%s' % (reducer_name, series_name)] = reducer(values)
    return {'sentiment_%s' % key: val for key, val in features.items()}

In [None]:
def read_metadata(pet_id, order_id=1, type_name="train", threshold=0.05):
    """Build image-metadata features for one photo of a pet.

    Args:
        pet_id: id used in the file name ``<pet_id>-<order_id>.json``.
        order_id: 1-based photo number.
        type_name: dataset split used in the directory name.
        threshold: minimum ``pixelFraction`` for a dominant colour to count as
            a "main" colour.

    Returns:
        A dict whose keys all start with ``meta_``; an empty dict when the file
        does not exist or contains JSON ``null``.
    """
    filename = "../input/petfinder-adoption-prediction/%s_metadata/%s-%s.json" % (type_name, pet_id, order_id)
    try:
        with open(filename, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {}
    if data is None:
        return {}

    if 'labelAnnotations' in data:
        labels = data['labelAnnotations']
        category = labels[0]['description']
        annotation = [label['score'] for label in labels]
        # No cut-off is applied, so the "top" annotations are all annotations.
        top_annotation = annotation[:]
        top_desc = ' '.join([label['description'] for label in labels])
    else:
        category = np.nan
        annotation = []
        top_annotation = []
        top_desc = ''

    palette = data['imagePropertiesAnnotation']['dominantColors']['colors']
    main_colors = [colour['score'] for colour in palette if colour['pixelFraction'] >= threshold]
    colors = [colour['score'] for colour in palette]

    hints = data['cropHintsAnnotation']['cropHints']
    if hints and 'importanceFraction' in hints[0]:
        importance = np.mean([hint['importanceFraction'] for hint in hints])
    else:
        importance = np.nan

    features = {
        'pet_id': pet_id,
        'category': category,
        'len_main_colors': len(main_colors),
        'mean_annotation': np.mean(annotation),
        'mean_top_annotation': np.mean(top_annotation),
        'mean_pixel': np.mean([colour['pixelFraction'] for colour in palette]),
        'mean_crop_confidence': np.mean([hint['confidence'] for hint in hints]),
        'mean_crop_importance': importance,
        'top_desc': top_desc,
    }

    reducers = (('max', np.max), ('mean', np.mean), ('min', np.min), ('sum', sum), ('std', np.std))
    for series_name, values in (('main_colors', main_colors), ('colors', colors)):
        if values:
            # Colour-score aggregates are skipped for an empty list.
            for reducer_name, reducer in reducers:
                features['%s_%s' % (reducer_name, series_name)] = reducer(values)
    return {'meta_%s' % key: val for key, val in features.items()}


def read_multi_metadata(pet_id, image_num, type_name="train", threshold=0.1):
    """Yield the metadata feature dict of each photo of a pet, in photo order.

    This is a generator: callers iterate it (or wrap it in ``list``).

    Args:
        pet_id: id of the pet.
        image_num: number of photos; ``0``, ``None`` and NaN all mean "no photos".
        type_name: dataset split, forwarded to ``read_metadata``.
        threshold: main-colour pixel-fraction threshold, forwarded to ``read_metadata``.

    Yields:
        One ``read_metadata`` result per photo number ``1..int(image_num)``.
    """
    # A bare return ends the generator. A `return {}` here would never reach a caller
    # that iterates, and a NaN count would otherwise crash int() below.
    if not image_num or pd.isnull(image_num):
        return
    for order_id in range(1, int(image_num) + 1):
        yield read_metadata(pet_id, order_id, type_name=type_name, threshold=threshold)

In [None]:
import cv2
import os
from keras.applications.densenet import preprocess_input, DenseNet121

In [None]:
def resize_to_square(im):
    """Scale an image so its longer side equals the global ``img_size``, then pad it to a square.

    The aspect ratio is kept; the missing rows/columns are filled with a
    constant black ([0, 0, 0]) border split between both sides.
    """
    height, width = im.shape[:2]
    scale = float(img_size) / max(height, width)
    scaled_h = int(height * scale)
    scaled_w = int(width * scale)
    # cv2.resize expects the target size as (width, height).
    im = cv2.resize(im, (scaled_w, scaled_h))
    pad_w = img_size - scaled_w
    pad_h = img_size - scaled_h
    top = pad_h // 2
    left = pad_w // 2
    return cv2.copyMakeBorder(im, top, pad_h - top, left, pad_w - left,
                              cv2.BORDER_CONSTANT, value=[0, 0, 0])

def load_image(path, pet_id):
    """Load the first photo of a pet, square it and apply the DenseNet preprocessing.

    Args:
        path: image directory, including the trailing separator.
        pet_id: id of the pet; the file read is ``<path><pet_id>-1.jpg``.

    Returns:
        The preprocessed square image array.

    Raises:
        FileNotFoundError: when the image is missing or cannot be decoded.
    """
    filename = f'{path}{pet_id}-1.jpg'
    image = cv2.imread(filename)
    if image is None:
        # cv2.imread reports an unreadable file by returning None instead of raising.
        raise FileNotFoundError('cannot read image: %s' % filename)
    # NOTE(review): cv2.imread returns channels in BGR order; confirm that is the
    # order the pretrained weights / preprocess_input expect.
    return preprocess_input(resize_to_square(image))

In [None]:
img_size = 256  # side length of the square images produced by resize_to_square
batch_size = 32  # number of pets per prediction batch in get_image_feature

In [None]:
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D
import keras.backend as K
# Feature extractor: DenseNet121 backbone -> global average pooling -> mean of every
# 4 adjacent channels (AveragePooling1D over the channel axis).
inp = Input((img_size, img_size, 3))  # same size get_image_feature allocates its batches with
backbone = DenseNet121(input_tensor=inp,
                       weights="../input/densenet-keras/DenseNet-BC-121-32-no-top.h5",
                       include_top=False)
x = backbone.output
x = GlobalAveragePooling2D()(x)
# Add a trailing axis so the channel axis can be pooled as a 1-D sequence.
x = Lambda(lambda x: K.expand_dims(x, axis=-1))(x)
x = AveragePooling1D(4)(x)
# Drop the trailing axis again.
out = Lambda(lambda x: x[:, :, 0])(x)

m = Model(inp, out)

In [None]:
def get_image_feature(df_org, type_name="train"):
    """Extract CNN features from the first photo of every pet in ``df_org``.

    Args:
        df_org: frame with a ``PetID`` column.
        type_name: dataset split used in the image directory name.

    Returns:
        DataFrame indexed by pet id with columns ``pic_0 .. pic_{k-1}``, one per
        output unit of the global model ``m``.
    """
    pet_ids = df_org['PetID'].values
    image_dir = "../input/petfinder-adoption-prediction/%s_images/" % type_name
    features = {}
    # Step by batch_size: `len // batch_size + 1` would hand an empty batch to
    # m.predict whenever the row count is an exact multiple of batch_size.
    for start in tqdm(range(0, len(pet_ids), batch_size)):
        batch_pets = pet_ids[start:start + batch_size]
        batch_images = np.zeros((len(batch_pets), img_size, img_size, 3))
        for i, pet_id in enumerate(batch_pets):
            try:
                batch_images[i] = load_image(image_dir, pet_id)
            except Exception:
                # Best effort: a pet without a readable first photo keeps an all-zero
                # image. KeyboardInterrupt / SystemExit are no longer swallowed.
                pass
        batch_preds = m.predict(batch_images)
        for pet_id, pred in zip(batch_pets, batch_preds):
            features[pet_id] = pred
    train_feats = pd.DataFrame.from_dict(features, orient='index')
    train_feats.columns = [f'pic_{i}' for i in range(train_feats.shape[1])]
    return train_feats

In [None]:
# Sentiment columns that are aggregated per pet and merged into the main frame.
# (An earlier, shorter list bound to the same name was dead code and has been removed.)
# sentiment_language, sentiment_pet_id
sentiment_feature_keys = [
    'sentiment_doc_sentiment_mag', 'sentiment_doc_sentiment_score', 'sentiment_mean_sentiment_mag',
    'sentiment_std_sentiment_mag', 'sentiment_std_sentiment_score', 'sentiment_sum_sentiment_mag',
    'sentiment_sum_sentiment_score', 'sentiment_mean_sentiment_score'
]
# Image-metadata columns that are aggregated per pet across its photos.
meta_feature_keys = [
    'meta_mean_colors', 'meta_mean_top_annotation', 'meta_mean_pixel', 'meta_mean_crop_confidence',
    'meta_mean_crop_importance'
]

In [None]:
# Load the raw train / test tables and show their shapes.
train_org = pd.read_csv("../input/petfinder-adoption-prediction/train/train.csv")
print(train_org.shape)
display(train_org.head(1))
test_org = pd.read_csv("../input/petfinder-adoption-prediction/test/test.csv")
print(test_org.shape)
# display(test_org.head(1))

In [None]:
# CNN feature vectors for the first photo of every train / test pet (indexed by pet id).
train_feats = get_image_feature(train_org, type_name="train")
test_feats = get_image_feature(test_org, type_name="test")

In [None]:
# Turn the pet-id index into a regular 'PetID' column.
train_feats = train_feats.reset_index().rename(columns={'index': 'PetID'})
test_feats = test_feats.reset_index().rename(columns={'index': 'PetID'})

In [None]:
# Train ids followed by test ids, re-indexed 0..n-1.
all_ids = pd.concat([train_org[['PetID']], test_org[['PetID']]], axis=0, ignore_index=True, sort=False)
all_ids.shape

In [None]:
# Compress the CNN features of train + test into 32 SVD components.
n_components = 32
svd_ = TruncatedSVD(n_components=n_components, random_state=1337)

features_df = pd.concat([train_feats, test_feats], axis=0)
# Select the CNN columns by prefix instead of assuming there are exactly 256 of them.
pic_columns = [c for c in features_df.columns if str(c).startswith('pic_')]
features = features_df[pic_columns].values

svd_col = svd_.fit_transform(features)
svd_col = pd.DataFrame(svd_col)
svd_col = svd_col.add_prefix('IMG_SVD_')

# Take the ids from the feature rows themselves so ids and components cannot drift
# apart (a separately built id frame only lines up when both have identical rows).
img_features = pd.concat([features_df[['PetID']].reset_index(drop=True), svd_col], axis=1)

In [None]:
# Guard keeps the cell idempotent: merge the image SVD columns only once.
if 'IMG_SVD_1' not in train_org.columns:
    print("merge image features")
    train_org = pd.merge(train_org, img_features, how='left', on='PetID')
    test_org = pd.merge(test_org, img_features, how='left', on='PetID')

In [None]:
def read_metadata_train(x):
    """Return the list of metadata feature dicts for all photos of one train row."""
    return list(read_multi_metadata(x['PetID'], x['PhotoAmt'], type_name="train", threshold=0.1))
# raw=True is not passed: it hands each row over as a bare ndarray, on which the
# x['PetID'] label lookups above are invalid.
train_metadata = train_org.progress_apply(read_metadata_train, axis=1)
# Flatten the per-pet lists into one row per photo.
train_metadata = pd.DataFrame(list(chain.from_iterable(list(train_metadata))))
display(train_metadata.head(1))

In [None]:
def read_metadata_test(x):
    """Return the list of metadata feature dicts for all photos of one test row."""
    return list(read_multi_metadata(x['PetID'], x['PhotoAmt'], type_name="test", threshold=0.1))
# raw=True is not passed: it hands each row over as a bare ndarray, on which the
# x['PetID'] label lookups above are invalid.
test_metadata = test_org.progress_apply(read_metadata_test, axis=1)
# Flatten the per-pet lists into one row per photo.
test_metadata = pd.DataFrame(list(chain.from_iterable(list(test_metadata))))
display(test_metadata.head(1))

In [None]:
# Aggregate the per-photo metadata features to one row per pet.
aggfuncs = ["mean", "sum", "var"]
train_metadata_gb = train_metadata[meta_feature_keys + ['meta_pet_id']].groupby("meta_pet_id").agg(aggfuncs)
columns = train_metadata_gb.columns.tolist()
# Flatten the (feature, aggregation) column pairs into names like '<feature>_MEAN'.
train_metadata_gb.columns = pd.Index(['{}_{}'.format(c[0], c[1].upper()) for c in columns])
train_metadata_gb = train_metadata_gb.reset_index()
train_metadata_gb.head(2)

In [None]:
# Same per-pet aggregation as for train, applied to the test metadata.
test_metadata_gb = test_metadata[meta_feature_keys + ['meta_pet_id']].groupby("meta_pet_id").agg(aggfuncs)
columns = test_metadata_gb.columns.tolist()
# Flatten the (feature, aggregation) column pairs into names like '<feature>_MEAN'.
test_metadata_gb.columns = pd.Index(['{}_{}'.format(c[0], c[1].upper()) for c in columns])
test_metadata_gb = test_metadata_gb.reset_index()
test_metadata_gb.head(2)

In [None]:
%%time
# One sentiment feature dict per train row. raw=True is not passed: it hands each row
# over as a bare ndarray, on which the x['PetID'] / x['Name'] label lookups are invalid.
train_sentiment = train_org.progress_apply(
    lambda x: read_sentiment_data(x['PetID'], x['Name'], type_name="train"), axis=1)
train_sentiment = pd.DataFrame(list(train_sentiment))
display(train_sentiment.head(2))

In [None]:
# Reduce the selected sentiment features to one row per pet (mean per pet id).
train_keys = sentiment_feature_keys + ['sentiment_pet_id']
train_sentiment_gb = train_sentiment[train_keys].groupby("sentiment_pet_id").agg(["mean"])
columns = train_sentiment_gb.columns.tolist()
# Flatten the (feature, aggregation) column pairs into names like '<feature>_MEAN'.
train_sentiment_gb.columns = pd.Index(['{}_{}'.format(c[0], c[1].upper()) for c in columns])
train_sentiment_gb = train_sentiment_gb.reset_index()
train_sentiment_gb.head(2)

In [None]:
%%time
# One sentiment feature dict per test row. raw=True is not passed: it hands each row
# over as a bare ndarray, on which the x['PetID'] / x['Name'] label lookups are invalid.
test_sentiment = test_org.progress_apply(
    lambda x: read_sentiment_data(x['PetID'], x['Name'], type_name="test"), axis=1)
test_sentiment = pd.DataFrame(list(test_sentiment))
display(test_sentiment.head(2))

In [None]:
# Same per-pet reduction for test; `train_keys` is just the shared column list.
test_sentiment_gb = test_sentiment[train_keys].groupby("sentiment_pet_id").agg(["mean"])
columns = test_sentiment_gb.columns.tolist()
# Flatten the (feature, aggregation) column pairs into names like '<feature>_MEAN'.
test_sentiment_gb.columns = pd.Index(['{}_{}'.format(c[0], c[1].upper()) for c in columns])
test_sentiment_gb = test_sentiment_gb.reset_index()
test_sentiment_gb.head(2)

## Merge

In [None]:
%%time
# Attach the per-pet metadata and sentiment aggregates to the train table,
# dropping each helper key column right after its merge.
train_joined = (
    pd.merge(train_org, train_metadata_gb, how='left', left_on='PetID', right_on='meta_pet_id')
    .drop(columns='meta_pet_id')
)
train_joined = (
    pd.merge(train_joined, train_sentiment_gb, how='left', left_on='PetID',
             right_on='sentiment_pet_id')
    .drop(columns='sentiment_pet_id')
)
display(train_joined.head(1))

In [None]:
%%time
# Attach the per-pet metadata and sentiment aggregates to the test table,
# dropping each helper key column right after its merge.
test_joined = (
    pd.merge(test_org, test_metadata_gb, how='left', left_on='PetID', right_on='meta_pet_id')
    .drop(columns='meta_pet_id')
)
test_joined = (
    pd.merge(test_joined, test_sentiment_gb, how='left', left_on='PetID',
             right_on='sentiment_pet_id')
    .drop(columns='sentiment_pet_id')
)
display(test_joined.head(1))

In [None]:
import scipy as sp

from collections import Counter
from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix

def quadratic_weighted_kappa(y, y_pred):
    """Return Cohen's kappa between ``y`` and ``y_pred`` with quadratic weights."""
    kappa = cohen_kappa_score(y, y_pred, weights='quadratic')
    return kappa

In [None]:
def to_bins(x, borders):
    """Return the index of the first border that is >= ``x``.

    Borders are scanned in the given order (they are not required to be sorted);
    when no border is >= ``x`` the result is ``len(borders)``.
    """
    return next((position for position, edge in enumerate(borders) if x <= edge), len(borders))

class OptimizedRounder(object):
    """Search for four thresholds that turn continuous predictions into classes.

    ``fit`` tunes one threshold at a time, minimising the negative quadratic
    weighted kappa of the binned predictions.
    """

    def __init__(self, init_coef=None):
        # init_coef is accepted but not used; coef_ stays 0 until fit() is called.
        self.coef_ = 0

    def _loss(self, coef, X, y, idx):
        """Negative quadratic weighted kappa of ``X`` binned with ``coef`` (``idx`` is unused)."""
        binned = np.array([to_bins(value, coef) for value in X])
        return -quadratic_weighted_kappa(y, binned)

    def fit(self, X, y):
        """Tune the thresholds on predictions ``X`` against labels ``y``; stores them in ``coef_``."""
        thresholds = [1.5, 2.0, 2.5, 3.0]
        shrink_low = 0.618
        shrink_high = 1 - shrink_low
        brackets = [(1, 2), (1.5, 2.5), (2, 3), (2.5, 3.5)]

        def loss_at(position, value):
            # Move one threshold and score the resulting binning.
            thresholds[position] = value
            return self._loss(thresholds, X, y, position)

        for _sweep in range(10):
            for position, (low, high) in enumerate(brackets):
                # Each threshold restarts from its fixed bracket on every sweep.
                loss_low = loss_at(position, low)
                loss_high = loss_at(position, high)
                for _step in range(20):
                    # Shrink the bracket from whichever end currently has the larger loss.
                    if loss_low > loss_high:
                        low = high - (high - low) * shrink_low
                        loss_low = loss_at(position, low)
                    else:
                        high = high - (high - low) * shrink_high
                        loss_high = loss_at(position, high)
        self.coef_ = {'x': thresholds}

    def predict(self, X, coef):
        """Bin every value of ``X`` with the explicitly supplied thresholds ``coef``."""
        return np.array([to_bins(value, coef) for value in X])

    def coefficients(self):
        """Return the fitted thresholds (requires a previous ``fit``)."""
        return self.coef_['x']

## Extra Features

In [None]:
train_org = train_joined  # pd.concat([train_joined, train_metadata_gb[['meta_pet_id']]], axis=1)
test_org = test_joined  # pd.concat([test_joined, test_metadata_gb[['meta_pet_id']]], axis=1)
print(train_org.shape, test_org.shape)

# 'Color1_Color2_Color3' key per test row. raw=True is not passed: it hands each row
# over as a bare ndarray, on which the x['Color1'] label lookups are invalid.
colors = test_org.apply(lambda x: '%s_%s_%s' % (x['Color1'], x['Color2'], x['Color3']), axis=1)
# Number of test rows per colour combination (reuses `colors` instead of recomputing it).
color_dict = dict(colors.value_counts())
def color_type(x):
    """Map a colour-combination key to a small integer code via its count in ``color_dict``.

    Returns -1 for a key missing from ``color_dict``, 0-5 when its count equals
    one of six specific values, and 6 for any other count.
    """
    # NOTE(review): the exact counts below are magic numbers whose origin is not
    # visible in this notebook; confirm they match the data color_dict is built from.
    count_to_code = {1460: 0, 1417: 1, 1375: 2, 1159: 3, 1002: 4, 929: 5}
    if x not in color_dict:
        return -1
    return count_to_code.get(color_dict[x], 6)

def age_scale(x):
    """Bucket an age: values up to 6 are kept, larger ones map to codes 7-11.

    Upper bounds (inclusive): 12 -> 7, 24 -> 8, 60 -> 9, 120 -> 10, else 11.
    """
    if x <= 6:
        return x
    for code, upper in ((7, 12), (8, 24), (9, 60), (10, 120)):
        if x <= upper:
            return code
    return 11

# Number of rows per rescuer.
# NOTE(review): the counts come from test_org only, so rescuers that appear solely in
# train get code 0 below; confirm this is intended.
rescuer_dict = dict(test_org.RescuerID.value_counts())
def great_rescuer(x):
    """Map a rescuer id to a tier code based on its row count in ``rescuer_dict``.

    0 = unknown rescuer, 1 = at least 200 rows, 2 = at least 100, 3 = at least 80,
    4 = at least 40, 5 = more than one row, 6 = exactly one row.
    """
    if x not in rescuer_dict:
        return 0
    pets = rescuer_dict[x]
    for tier, minimum in ((1, 200), (2, 100), (3, 80), (4, 40)):
        if pets >= minimum:
            return tier
    return 5 if pets > 1 else 6

def breed_type(x):
    """Map a breed id to a group code: 307 -> 1, 266 -> 2, 265 -> 3, 299/264/292 -> 4, others -> 5."""
    breed_groups = {307: 1, 266: 2, 265: 3, 299: 4, 264: 4, 292: 4}
    return breed_groups.get(x, 5)


def fur_scale(x):
    """Bucket a fur-length value: >=60 -> 1, >=30 -> 2, >=10 -> 3, >=3 -> 4, otherwise 5."""
    for code, minimum in ((1, 60), (2, 30), (3, 10), (4, 3)):
        if x >= minimum:
            return code
    return 5


# NOTE(review): this repeats the identical age_scale defined earlier in this cell;
# being the later definition, it is the one in effect. One of the two can be deleted.
def age_scale(x):
    """Bucket an age: values up to 6 are kept, larger ones map to codes 7-11."""
    if x <= 6:
        return x
    if x <= 12:
        return 7
    if x <= 24:
        return 8
    if x <= 60:
        return 9
    if x <= 120:
        return 10
    return 11


# Apply the bucketing helpers defined above to both tables.
# train_org['AgeScale'] = train_org.Age.apply(age_scale)
train_org['GreatRescuer'] = train_org.RescuerID.apply(great_rescuer).values
train_org['BreedType'] = train_org.Breed1.apply(breed_type).values
train_org['AgeScale'] = train_org.Age.apply(age_scale).values
train_org['FurScale'] = train_org.FurLength.apply(fur_scale).values

# test_org['AgeScale'] = test_org.Age.apply(age_scale)
test_org['GreatRescuer'] = test_org.RescuerID.apply(great_rescuer).values
test_org['BreedType'] = test_org.Breed1.apply(breed_type).values
test_org['AgeScale'] = test_org.Age.apply(age_scale).values
test_org['FurScale'] = test_org.FurLength.apply(fur_scale).values

# Number of pets per rescuer, counted over train and test together.
X = pd.concat([train_org, test_org], ignore_index=True, sort=False)
rescuer_count = X.groupby(['RescuerID'])['PetID'].count().reset_index()
rescuer_count.columns = ['RescuerID', 'RescuerIDCOUNT']
train_org = pd.merge(train_org, rescuer_count, how='left', left_on='RescuerID', right_on='RescuerID')
test_org = pd.merge(test_org, rescuer_count, how='left', left_on='RescuerID', right_on='RescuerID')

# train_org['sentiment_doc_sentiment_score_MEAN2'] = train_org['sentiment_doc_sentiment_score_MEAN']
# test_org['sentiment_doc_sentiment_score_MEAN2'] = test_org['sentiment_doc_sentiment_score_MEAN']

### Text Features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# SVD component count for each entry of text_columns (paired by position).
# Rebinds the name n_components, which held an int in the image SVD cell.
n_components = [16, 16, 16]
text_columns = ['Description', 'meta_top_desc', 'sentiment_entities']
# categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']
to_drop_columns = ['PetID', 'Name', 'RescuerID']

def extract_text_feature(df):
    """Turn every column listed in ``text_columns`` into dense TF-IDF + SVD features.

    Args:
        df: frame containing the text columns; missing texts are replaced by
            the literal string "missing".

    Returns:
        DataFrame with ``n_components[i]`` columns named ``SVD_<column>_<k>`` for
        the i-th text column, concatenated side by side.
    """
    text_features = []
    for i, col in enumerate(text_columns):
        print('generating features from: {}'.format(col))
        svd_ = TruncatedSVD(n_components=n_components[i], random_state=1337)
        # use_idf / smooth_idf / sublinear_tf are boolean flags; pass real booleans
        # rather than the integer 1 so type-validating estimators accept them.
        vectorizer = TfidfVectorizer(
            min_df=2, max_features=None,
            strip_accents='unicode', analyzer='word', token_pattern=r'(?u)\b\w+\b',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True)
        tfidf_col = vectorizer.fit_transform(df[col].fillna("missing").values)
        svd_col = svd_.fit_transform(tfidf_col)
        print("SVD:", np.sum(svd_.explained_variance_ratio_))
        svd_col = pd.DataFrame(svd_col).add_prefix('SVD_{}_'.format(col))
        text_features.append(svd_col)
    return pd.concat(text_features, axis=1)

# Build one text row per train pet: joined unique photo label descriptions,
# sentiment entity names and the listing description.
train_metadata_desc = train_metadata.groupby("meta_pet_id")["meta_top_desc"].unique().apply(
    lambda x: ' '.join(x)).reset_index()
train_metadata_desc.columns = ['PetID', 'meta_top_desc']
# Left-merge onto the train ids so pets without metadata keep a row.
train_metadata_desc = pd.merge(train_org[['PetID']], train_metadata_desc, how='left', on='PetID')
train_metadata_desc = pd.merge(train_metadata_desc, 
                               train_sentiment[['sentiment_entities', 'sentiment_pet_id']], 
                               how='left', left_on='PetID', right_on='sentiment_pet_id')
train_metadata_desc = train_metadata_desc.drop('sentiment_pet_id', axis=1)
# train_metadata_desc['sentiment_entities'] = train_sentiment['sentiment_entities']
train_metadata_desc = pd.merge(train_metadata_desc, 
                               train_org[['Description', 'PetID']], 
                               how='left', on='PetID')

# train_metadata_desc['Description'] = train_org['Description']

# Same construction for the test pets.
test_metadata_desc = test_metadata.groupby("meta_pet_id")["meta_top_desc"].unique().apply(
    lambda x: ' '.join(x)).reset_index()
test_metadata_desc.columns = ['PetID', 'meta_top_desc']
test_metadata_desc = pd.merge(test_org[['PetID']], test_metadata_desc, how='left', on='PetID')
# test_metadata_desc['sentiment_entities'] = test_sentiment['sentiment_entities']
# test_metadata_desc['Description'] = test_org['Description']
test_metadata_desc = pd.merge(test_metadata_desc, 
                               test_sentiment[['sentiment_entities', 'sentiment_pet_id']], 
                               how='left', left_on='PetID', right_on='sentiment_pet_id')
test_metadata_desc = test_metadata_desc.drop('sentiment_pet_id', axis=1)
# train_metadata_desc['sentiment_entities'] = train_sentiment['sentiment_entities']
test_metadata_desc = pd.merge(test_metadata_desc, 
                               test_org[['Description', 'PetID']], 
                               how='left', on='PetID')
# Train rows first, then test rows; later cells split text_features by position.
metadata_desc = pd.concat([train_metadata_desc, test_metadata_desc], axis=0)

text_features = extract_text_feature(metadata_desc)

In [None]:
# _train_desc = train_metadata.groupby("meta_pet_id")["meta_top_desc"].unique()
# _train_desc.loc['86e1089a3']
# # train_metadata_desc.meta_top_desc.values[0]

In [None]:
# text_features has a 0..n-1 index with the train rows first, so split it by position:
# the first train_joined.shape[0] rows are train, the rest test.
train_text_features = text_features.iloc[:train_joined.shape[0], :]
test_text_features = text_features.iloc[train_joined.shape[0]:, :].reset_index(drop=True)
print(text_features.shape[0], train_joined.shape[0], test_joined.shape[0])
print(train_text_features.shape, test_text_features.shape)
train_text_features.head(2)

In [None]:
# Guard keeps the cell idempotent: append the text SVD columns only once.
# The column-wise concat aligns rows by index.
if 'SVD_Description_0' not in train_org.columns:
    print("concat")
    train_org = pd.concat([train_org, train_text_features], axis=1)
    test_org = pd.concat([test_org, test_text_features], axis=1)
print(train_org.shape, test_org.shape)

In [None]:
from PIL import Image

def get_size(filename):
    """Return the size of ``filename`` in bytes."""
    return os.path.getsize(filename)

def get_dimensions(filename):
    """Return the ``(width, height)`` of the image stored in ``filename``."""
    # The context manager closes the underlying file handle; this function is called
    # once per photo, so an unclosed handle per call adds up.
    with Image.open(filename) as img:
        return img.size

def get_image_size_feature(df_org, type_name="train"):
    """Collect file size and pixel dimensions of every photo listed in ``df_org``.

    Args:
        df_org: frame with ``PetID`` and ``PhotoAmt`` columns.
        type_name: dataset split used in the image directory name.

    Returns:
        DataFrame with one row per photo found on disk and the columns
        ``PetID``, ``image_size``, ``width`` and ``height``.
    """
    path_template = '../input/petfinder-adoption-prediction/%s_images/%s-%s.jpg'
    records = []
    for pet_id, photo_num in tqdm(df_org[['PetID', 'PhotoAmt']].values):
        if not photo_num:
            continue
        for order in range(1, int(photo_num) + 1):
            filename = path_template % (type_name, pet_id, order)
            try:
                file_bytes = get_size(filename)
                width, height = get_dimensions(filename)
            except FileNotFoundError:
                # Photos missing on disk are skipped.
                continue
            records.append({'PetID': pet_id, 'image_size': file_bytes, 'width': width, 'height': height})
    return pd.DataFrame(records)

In [None]:
%%time
# Per-photo size records for both splits.
train_df_imgs = get_image_size_feature(train_org, type_name="train")
test_df_imgs = get_image_size_feature(test_org, type_name="test")

# Statistics computed per pet over its photos.
aggs = {
    'image_size': ['sum', 'mean', 'var'],
    'width': ['sum', 'mean', 'var'],
    'height': ['sum', 'mean', 'var'],
}
# Flat names '<column>_<stat>' in the same order as the dict above.
new_columns = ['%s_%s' % (column, stat) for column, stats in aggs.items() for stat in stats]

agg_train_imgs = train_df_imgs.groupby('PetID').agg(aggs)
agg_train_imgs.columns = new_columns
agg_train_imgs = agg_train_imgs.reset_index()

agg_test_imgs = test_df_imgs.groupby('PetID').agg(aggs)
agg_test_imgs.columns = new_columns
agg_test_imgs = agg_test_imgs.reset_index()

In [None]:
# Guard keeps the cell idempotent: merge the image-size aggregates only once.
if 'image_size_sum' not in train_org.columns:
    print("merge")
    # Missing aggregate values (e.g. the variance of a single photo) become -1 before merging.
    agg_train_imgs = agg_train_imgs.fillna(-1)
    agg_test_imgs = agg_test_imgs.fillna(-1)
    train_org = pd.merge(train_org, agg_train_imgs, how='left', on='PetID')
    test_org = pd.merge(test_org, agg_test_imgs, how='left', on='PetID')

In [None]:
import re
import string

from nltk.corpus import stopwords
# Lookup sets used by get_features: English stop words and ASCII punctuation characters.
eng_stop_words = set(stopwords.words("english"))
puncs = set(string.punctuation)

def get_features(df_org):
    """Compute simple text statistics of the ``Description`` column.

    Returns a DataFrame with the columns ``total_puncs``, ``total_words``,
    ``total_unique_words``, ``total_stop_words``, ``unique_rate`` and
    ``stop_words_rate``. Missing descriptions are treated as empty strings.
    """
    strip_puncs = re.compile(r"[@\|\};<\\\?\(\"\-\]:\^\)\.\!`\~>%=\{\[\,/\+\$&\'_\*#]")
    raw_desc = df_org.Description.fillna('')
    # Words are counted after the punctuation characters have been removed.
    tokens = raw_desc.apply(lambda text: strip_puncs.sub("", text)).apply(lambda text: text.split())

    stats = pd.DataFrame()
    stats['total_puncs'] = raw_desc.apply(lambda text: sum(1 for ch in text if ch in puncs))
    stats['total_words'] = tokens.apply(len)
    stats['total_unique_words'] = tokens.apply(lambda words: len(set(words)))
    stats['total_stop_words'] = tokens.apply(
        lambda words: len([word for word in words if word in eng_stop_words]))
    stats['unique_rate'] = stats['total_unique_words'] / stats['total_words']
    stats['stop_words_rate'] = stats['total_stop_words'] / stats['total_words']
    return stats

In [None]:
# Description statistics for both tables; the guard appends them only once.
train_text_stat_features = get_features(train_org)
test_text_stat_features = get_features(test_org)
if 'total_puncs' not in train_org.columns:
    print('concat')
    train_org = pd.concat([train_org, train_text_stat_features], axis=1)
    test_org = pd.concat([test_org, test_text_stat_features], axis=1)

In [None]:
# not_sentiment_keys = list(set(train_sentiment.columns) - set(sentiment_feature_keys))
# rm_cols = ['sentiment_entities', 'sentiment_mean_sentiment_score', 'sentiment_std_sentiment_mag', 
#             'sentiment_mean_sentiment_mag', 'sentiment_std_sentiment_score']
# rm_cols = ['sentiment_entities']
# for key in rm_cols:
#     not_sentiment_keys.remove(key)

In [None]:
# if 'sentiment_len_entities' not in train_org.columns:
#     print('merge')
#     train_org = pd.merge(train_org, train_sentiment[not_sentiment_keys], how='left', left_on='PetID', 
#                          right_on='sentiment_pet_id')
#     test_org = pd.merge(test_org, test_sentiment[not_sentiment_keys], how='left', left_on='PetID', 
#                          right_on='sentiment_pet_id')
#     train_org = train_org.drop('sentiment_pet_id', axis=1)
#     test_org = test_org.drop('sentiment_pet_id', axis=1)
#     mapping_dict = {'en': 1, 'zh': 2, 'zh-Hant': 3, 'de': 4}
#     train_org['sentiment_language'] = train_org.sentiment_language.apply(lambda x: mapping_dict.get(x, 0))
#     test_org['sentiment_language'] = test_org.sentiment_language.apply(lambda x: mapping_dict.get(x, 0))

In [None]:
# Character-length features of the three text sources (missing texts count as "").
print("create_feature")
train_metadata_desc = train_metadata_desc.fillna("")
test_metadata_desc = test_metadata_desc.fillna("")
# meta_top_desc is already one space-joined string per pet, so its length is plain
# len(). The former len(' '.join(x)) inserted a space between every character and
# reported 2*len - 1 instead.
top_desc_func = len
train_org['Length_Description'] = train_metadata_desc['Description'].map(len)
train_org['Length_metadata_annots_top_desc'] = train_metadata_desc['meta_top_desc'].map(top_desc_func)
train_org['Lengths_sentiment_entities'] = train_metadata_desc['sentiment_entities'].map(len)

test_org['Length_Description'] = test_metadata_desc['Description'].map(len)
test_org['Length_metadata_annots_top_desc'] = test_metadata_desc['meta_top_desc'].map(top_desc_func)
test_org['Lengths_sentiment_entities'] = test_metadata_desc['sentiment_entities'].map(len)

## update features

In [None]:
# train_org['main_breed_Type'] = train_org['Type']
# test_org['main_breed_Type'] = test_org['Type']

In [None]:
# df_concat = pd.concat([train_org[['RescuerID', 'PetID']], test_org[['RescuerID', 'PetID']]], axis=0)
# rescuer_count = df_concat.groupby(['RescuerID'])['PetID'].count().reset_index()
# rescuer_count.columns = ['RescuerID', 'RescuerID_COUNT']
# if 'RescuerID_COUNT' not in train_org:
#     train_org = pd.merge(train_org, rescuer_count, how='left', on='RescuerID')
#     test_org = pd.merge(test_org, rescuer_count, how='left', on='RescuerID')

### Breed Features

In [None]:
def get_breed_name_feature(df):
    """Look up the breed-label columns for ``Breed1`` and ``Breed2`` of every row.

    The label table is read from ``breed_labels.csv`` and left-joined on
    ``BreedID``. The first two columns of each join result are dropped and the
    remaining ones are prefixed with ``main_breed_`` / ``second_breed_``.

    Returns:
        DataFrame with the main-breed columns followed by the second-breed columns.
    """
    labels_breed = pd.read_csv("../input/petfinder-adoption-prediction/breed_labels.csv")

    def lookup(breed_col, suffix, prefix):
        # Left join keeps one output row per input row, in input order.
        joined = df[[breed_col]].merge(
            labels_breed, how='left',
            left_on=breed_col, right_on='BreedID',
            suffixes=('', suffix))
        return joined.iloc[:, 2:].add_prefix(prefix)

    main_part = lookup('Breed1', '_main_breed', 'main_breed_')
    second_part = lookup('Breed2', '_second_breed', 'second_breed_')
    return pd.concat([main_part, second_part], axis=1)

In [None]:
train_breed_feature = get_breed_name_feature(train_org)
test_breed_feature = get_breed_name_feature(test_org)
categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']
for col in categorical_columns:
    # Factorize train and test together: separate pd.factorize calls number the names
    # by order of first appearance, so the same breed name would get different codes
    # in the two tables.
    combined_names = pd.concat([train_breed_feature[col], test_breed_feature[col]], ignore_index=True)
    breed_codes = pd.factorize(combined_names)[0]
    train_rows = len(train_breed_feature)
    train_breed_feature[col] = breed_codes[:train_rows]
    test_breed_feature[col] = breed_codes[train_rows:]
train_breed_feature = train_breed_feature.fillna(-1)
test_breed_feature = test_breed_feature.fillna(-1)
train_breed_feature.head(2)

In [None]:
# Guard keeps the cell idempotent: append the breed columns only once.
if 'main_breed_BreedName' not in train_org.columns:
    print('concat')
    train_org = pd.concat([train_org, train_breed_feature], axis=1)
    test_org = pd.concat([test_org, test_breed_feature], axis=1)

In [None]:
def get_rescuer_feature(df, aggfunc=None):
    """Aggregate per-pet columns into one row of statistics per ``RescuerID``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``RescuerID`` and every column named in ``aggfunc``.
    aggfunc : dict[str, list[str]], optional
        Maps a column name to the aggregation names to compute for it.  When
        omitted, a small default covering ``Age``, ``PhotoAmt``, ``VideoAmt``
        and ``Type`` is used.

    Returns
    -------
    pandas.DataFrame
        ``RescuerID`` plus one column per (column, statistic) pair, named
        ``rescuer_<column>_<STATISTIC>`` with the statistic upper-cased.
    """
    spec = aggfunc
    if spec is None:
        spec = {
            'Age': ['mean', 'sum', 'max', 'min', 'std'],
            'PhotoAmt': ['mean'],
            'VideoAmt': ['mean'],
            'Type': ['mean'],
        }
    per_rescuer = df.pivot_table(values=list(spec.keys()), index='RescuerID', aggfunc=spec)
    # The pivot yields (column, statistic) pairs; flatten them into single names.
    per_rescuer.columns = [
        'rescuer_%s_%s' % (column, statistic.upper())
        for column, statistic in per_rescuer.columns
    ]
    return per_rescuer.reset_index()

In [None]:
# Aggregation spec for the per-rescuer statistics: Age gets the full set of
# statistics, Type only the mean, and every other column mean / sum / std.
aggfunc = {'Age': ['mean', 'sum', 'max', 'min', 'std']}
aggfunc.update({name: ['mean', 'sum', 'std'] for name in ('PhotoAmt', 'VideoAmt')})
aggfunc['Type'] = ['mean']
aggfunc.update({
    name: ['mean', 'sum', 'std']
    for name in (
        'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
        'Sterilized', 'Quantity', 'Fee',
    )
})
# The statistics are computed separately on each split.
train_rescuer_info, test_rescuer_info = (
    get_rescuer_feature(frame, aggfunc=aggfunc) for frame in (train_org, test_org)
)

In [None]:
# Join the per-rescuer statistics back onto every pet row.  The column check
# keeps a second run of this cell from merging the same columns again.
if 'rescuer_Age_MEAN' not in train_org.columns:
    print('merge rescuer info')
    train_org = train_org.merge(train_rescuer_info, how='left', on='RescuerID')
    test_org = test_org.merge(test_rescuer_info, how='left', on='RescuerID')

In [None]:
# def get_color_columns(df_org):
#     df_color = pd.get_dummies(df_org[['Color1', 'Color2', 'Color3']], columns=['Color1', 'Color2', 'Color3'])
#     df = pd.DataFrame(np.zeros((df_color.shape[0], 7)), columns=['Color_%s' % i for i in range(1, 8)])
#     for i in range(1, 8):
#         for j in range(1, 4):
#             if 'Color%s_%s' % (j, i) not in df_color:
#                 continue
#             df['Color_%s' % i] = df['Color_%s' % i] + df_color['Color%s_%s' % (j, i)]
#     return df

# train_colors = get_color_columns(train_org)
# test_colors = get_color_columns(test_org)
# if 'Color_1' not in train_org.columns:
#     print("concat")
#     train_org = pd.concat([train_org, train_colors], axis=1)
#     test_org = pd.concat([test_org, test_colors], axis=1)

# Text Features

In [None]:
# from keras.preprocessing.text import Tokenizer
# tokenizer = Tokenizer(num_words=None, lower=False)
# texts = train_org.Description.fillna("").values.tolist() + test_org.Description.fillna("").values.tolist()
# tokenizer.fit_on_texts(texts)

In [None]:
# import io

# def load_embedding(word_index):
#     fin = io.open('../input/fatsttext-common-crawl/crawl-300d-2M/crawl-300d-2M.vec', 'r', encoding='utf-8', 
#                   newline='\n', errors='ignore')
#     n, d = map(int, fin.readline().split())
#     embedding = np.zeros((len(word_index) + 1, 300))
#     for line in fin:
#         tokens = line.rstrip().split(' ')
#         word = tokens[0]
#         if word not in word_index:
#             continue
#         embedding[word_index[word]] = np.array(tokens[1:]).astype(float)
#     return embedding

In [None]:
# embedding_matrix = load_embedding(tokenizer.word_index)

In [None]:
# train_words = tokenizer.texts_to_sequences(train_org.Description.fillna(""))
# test_words = tokenizer.texts_to_sequences(test_org.Description.fillna(""))

In [None]:
# def get_vec(x):
#     if not x:
#         return np.zeros((300, ))
#     return np.mean([embedding_matrix[i] for i in x], axis=0)

In [None]:
# train_vecs = np.array([get_vec(i) for i in train_words])
# test_vecs = np.array([get_vec(i) for i in test_words])
# test_vecs.shape

In [None]:
# svd_ = TruncatedSVD(n_components=32, random_state=42)
# svd_.fit(train_vecs)
# train_vecs = svd_.transform(train_vecs)
# test_vecs = svd_.transform(test_vecs)
# train_vecs = pd.DataFrame(train_vecs)
# test_vecs = pd.DataFrame(test_vecs)
# train_vecs = train_vecs.add_prefix("fasttext_")
# test_vecs = test_vecs.add_prefix("fasttext_")

In [None]:
# for col in train_org.columns:
#     if col.startswith('fasttext_'):
#         train_org = train_org.drop(col, axis=1)
#         test_org = test_org.drop(col, axis=1)
# if 'fasttext_1' not in train_org.columns:
#     print("concat fasttext features")
#     train_org = pd.concat([train_org, train_vecs], axis=1)
#     test_org = pd.concat([test_org, test_vecs], axis=1)

In [None]:
# Load the state label table shipped with the competition data and display it.
state_labels = pd.read_csv("../input/petfinder-adoption-prediction/state_labels.csv")
state_labels

In [None]:
# Per-state lookup tables keyed by the ``State`` code used in train/test.
# NOTE(review): the source and units of the GDP, area and HDI figures are not
# recorded in this notebook -- confirm before reusing them.
state_gdp = {
    41336: 116.679, 41325: 40.596, 41367: 23.02, 41401: 190.075, 41415: 5.984,
    41324: 37.274, 41332: 42.389, 41335: 52.452, 41330: 67.629, 41380: 5.642,
    41327: 81.284, 41345: 80.167, 41342: 121.414, 41326: 280.698, 41361: 32.270,
}

# state population: https://en.wikipedia.org/wiki/Malaysia
state_population = {
    41336: 33.48283, 41325: 19.47651, 41367: 15.39601, 41401: 16.74621, 41415: 0.86908,
    41324: 8.21110, 41332: 10.21064, 41335: 15.00817, 41330: 23.52743, 41380: 2.31541,
    41327: 15.61383, 41345: 32.06742, 41342: 24.71140, 41326: 54.62141, 41361: 10.35977,
}

state_area = {
    41336: 19102, 41325: 9500, 41367: 15099, 41401: 243, 41415: 91,
    41324: 1664, 41332: 6686, 41335: 36137, 41330: 21035, 41380: 821,
    41327: 1048, 41345: 73631, 41342: 124450, 41326: 8104, 41361: 13035,
}

state_hdi = {
    41336: 0.785, 41325: 0.769, 41367: 0.741, 41401: 0.822, 41415: 0.742,
    41324: 0.794, 41332: 0.789, 41335: 0.766, 41330: 0.778, 41380: 0.767,
    41327: 0.803, 41345: 0.674, 41342: 0.709, 41326: 0.819, 41361: 0.762,
}

# Add one column per lookup table to both splits; a State code missing from a
# table maps to NaN.
for frame in (train_org, test_org):
    for feature_name, lookup in (
        ("state_gdp", state_gdp),
        ("state_population", state_population),
        ("state_area", state_area),
        ("state_hdi", state_hdi),
    ):
        frame[feature_name] = frame.State.map(lookup)

# Modeling

In [None]:
# Preview one row to check which feature columns are present before modeling.
train_org.head(1)

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
from collections import Counter

class TotalKFold(object):
    """Group-wise K-fold splitter.

    Every sample sharing the same label in ``y`` is placed in the same fold, so
    a group never appears on both sides of a train/validation split.  Groups
    are ordered from largest to smallest and dealt to the folds in a
    back-and-forth ("serpentine") order to keep fold sizes roughly balanced.

    Parameters
    ----------
    kfold : int
        Number of folds to produce.
    stratify, random_state
        Stored on the instance but not used by ``split``; the assignment is
        fully deterministic.
    """

    def __init__(self, kfold=5, stratify=False, random_state=None):
        self.kfold = kfold
        self.stratify = stratify
        self.random_state = random_state

    def split(self, X, y=None):
        """Yield ``(train_idx, valid_idx)`` integer index arrays, one pair per fold.

        Parameters
        ----------
        X : any
            Ignored; present to mirror the scikit-learn splitter signature.
        y : indexable sequence of hashable group labels
            One label per sample, indexable by position ``0 .. len(y) - 1``.

        Raises
        ------
        TypeError
            If ``y`` is not provided.
        """
        if y is None:
            raise TypeError("TotalKFold.split needs the group labels in `y`")

        # Largest groups first; ``sorted`` is stable, so ties keep first-seen order.
        groups_by_size = sorted(Counter(y).items(), key=lambda item: item[1], reverse=True)

        fold_of_group = {}
        for rank, (group, _) in enumerate(groups_by_size):
            position = rank % self.kfold
            # Reverse direction on every other pass over the folds.
            if (rank // self.kfold) % 2 == 1:
                position = self.kfold - 1 - position
            fold_of_group[group] = position

        # One pass over the samples instead of one full scan per fold.
        fold_indices = [[] for _ in range(self.kfold)]
        for i in range(len(y)):
            position = fold_of_group.get(y[i])
            if position is not None:
                fold_indices[position].append(i)

        for i in range(self.kfold):
            # ``dtype=int`` keeps empty folds usable as index arrays
            # (``np.array([])`` would otherwise default to float64).
            train_idx = np.array(
                list(chain.from_iterable(fold_indices[:i] + fold_indices[i + 1:])),
                dtype=int,
            )
            yield train_idx, np.array(fold_indices[i], dtype=int)

In [None]:
def train_xgboost_model(params, X, y, cv=None, cv_col='target', random_state=42,
                        early_stopping_rounds=500, verbose=100, stratify=False,
                        X_test=None):
    """Cross-validate an ``XGBRegressor`` and collect out-of-fold and test predictions.

    Parameters
    ----------
    params : dict or None
        Keyword arguments for ``xgb.XGBRegressor``; a built-in default is used
        when ``None``.
    X, y : numpy.ndarray
        Training features and targets, indexed by row position.
    cv : None, int or str
        ``None`` -> 5 shuffled folds; an int -> that many shuffled folds;
        a string of the form ``'total-<n>'`` -> ``TotalKFold`` with ``n`` folds.
    cv_col : 'target' or array-like
        Values handed to the splitter as ``y``.  ``'target'`` uses ``y`` itself.
    random_state : int
        Seed passed to the splitter.
    early_stopping_rounds, verbose
        Forwarded to ``model.fit``.
    stratify : bool
        Use ``StratifiedKFold`` instead of ``KFold`` (ignored for ``'total-<n>'``).
    X_test : numpy.ndarray, optional
        Test features.  When omitted, the module-level ``test_org`` with the
        columns in ``cols`` dropped and NaNs filled with -1 is used, as before.

    Returns
    -------
    tuple
        ``(oof_preds, test_preds, mean_score)``: out-of-fold predictions for
        every row of ``X``, a list with one test prediction array per fold,
        and the mean per-fold quadratic weighted kappa.

    Raises
    ------
    ValueError
        If ``cv`` is not one of the supported forms.

    Notes
    -----
    Relies on ``OptimizedRounder`` and ``quadratic_weighted_kappa`` defined
    elsewhere in this notebook.
    NOTE(review): ``eval_metric`` / ``early_stopping_rounds`` in ``fit`` and
    ``ntree_limit`` in ``predict`` depend on the installed xgboost version --
    confirm before upgrading.
    """
    # Size the buffer from X itself rather than the global training frame.
    oof_preds = np.zeros((X.shape[0], ))
    test_preds = []
    if params is None:
        params = dict(max_depth=7, learning_rate=0.01, n_estimators=10000,
                      n_jobs=-1, min_child_weight=1, subsample=.9, colsample_bytree=.9,
                      random_state=42, seed=428, reg_alpha=0, reg_lambda=0.0475, gamma=0,
                      num_leaves=70)
    if isinstance(cv_col, str) and cv_col == 'target':
        y_cv = y.copy()
    else:
        y_cv = cv_col.copy()

    if isinstance(cv, str) and cv.startswith('total'):
        n = int(cv.split('-')[1])
        kfold = TotalKFold(n, False, random_state=random_state)
    elif cv is None or isinstance(cv, int):
        cv_func = StratifiedKFold if stratify else KFold
        # ``shuffle`` is passed by keyword: recent scikit-learn rejects it positionally.
        kfold = cv_func(n_splits=5 if cv is None else cv, shuffle=True,
                        random_state=random_state)
    else:
        # Previously this fell through and failed later with an unbound ``kfold``.
        raise ValueError("cv must be None, an int or a 'total-<n>' string, got %r" % (cv,))

    if X_test is None:
        # Same matrix for every fold, so build it once.
        X_test = test_org.drop(cols, axis=1).fillna(-1).values

    scores = []
    for i, (train_idx, val_idx) in enumerate(kfold.split(X, y_cv), 1):
        X_train, X_val, y_train, y_val = X[train_idx], X[val_idx], y[train_idx], y[val_idx]
        print(X_val.shape, y_val.shape)
        print('=== Fold %s ===' % i)
        model = xgb.XGBRegressor(**params)

        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],
                  eval_metric='rmse',
                  early_stopping_rounds=early_stopping_rounds, verbose=verbose)
        x_pred = model.predict(X_val, ntree_limit=model.best_ntree_limit).reshape(-1, )

        # Fit fold-specific thresholds to turn the regression output into classes.
        opt_rounder = OptimizedRounder()
        opt_rounder.fit(x_pred.reshape(-1, ), y_val.reshape(-1, ).astype(float))
        corr = opt_rounder.coefficients()
        x_pred_binary = opt_rounder.predict(x_pred.reshape(-1, ), corr)
        score = quadratic_weighted_kappa(y_val.reshape(-1, ), x_pred_binary)
        print("QWK:", score)
        scores.append(score)

        oof_preds[val_idx] = x_pred
        test_preds.append(model.predict(X_test, ntree_limit=model.best_ntree_limit))
    print("mean:", np.mean(scores), "std:", np.std(scores))
    return oof_preds, test_preds, np.mean(scores)

In [None]:
# text_columns = [
#     'Name', 'RescuerID', 'Description', 'PetID'
# ]
# org_columns = [
#     'Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'MaturitySize', 
#     'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 
#     'VideoAmt', 'PhotoAmt', 'AdoptionSpeed'
# ]

In [None]:
# ``cols`` is the list of columns to drop before training.  It is assembled
# from the groups below, after which every name in ``choose_cols`` is taken
# back out so those raw columns stay in the model matrix.
# 'len_has_name_entities', 'len_entities', 'len_sentences'
# , 'meta_category', 'sentiment_language'
# RescuerIDCOUNT
choose_cols = [
    'Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'MaturitySize',
    'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Quantity', 'State',
    'PhotoAmt', 'Fee', 'VideoAmt', 'Color1', 'Color2', 'Color3'
]
org_cols = [
    'Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'MaturitySize',
    'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Quantity', 'Fee', 'State',
    'VideoAmt', 'PhotoAmt'
]
# Earlier, wider exclusion list (superseded by the assignment below):
# del_columns = [
#     'meta_mean_top_annotation_MEAN', 'meta_mean_top_annotation_SUM', 'meta_mean_top_annotation_VAR',
#     'sentiment_std_sentiment_mag_MEAN', 'sentiment_std_sentiment_score_MEAN', 'AgeScale', 'GreatRescuer',
#     'BreedType', 'FurScale'
# ]
# del_columns = ['GreatRescuer', 'BreedType', 'FurScale']
del_columns = ['AgeScale', 'GreatRescuer', 'BreedType', 'FurScale']
# stat_features = []
stat_features = [
    'total_puncs', 'total_words', 'total_unique_words', 'total_stop_words', 'unique_rate',
    'stop_words_rate'
]
# Previously: ['Length_Description', 'Length_metadata_annots_top_desc', 'Lengths_sentiment_entities']
text_len_feature = []

cols = (
    ['Name', 'RescuerID', 'Description', 'PetID', 'GreatRescuer']
    + org_cols
    + del_columns
    + stat_features
    + text_len_feature
)
# ``list.remove`` drops the first occurrence and raises if the name is absent.
for col in choose_cols:
    cols.remove(col)

y = train_org.AdoptionSpeed.values.reshape(-1, 1)
X = train_org.drop(["AdoptionSpeed"] + cols, axis=1).fillna(-1).values

In [None]:
# Preview the frame after dropping the excluded columns (AdoptionSpeed is still present here).
train_org.drop(cols, axis=1).head(3)

In [None]:
# %%time
# # each fold ensemble?
# # params = dict(max_depth=7, learning_rate=0.01, n_estimators=10000, 
# #               n_jobs=-1, min_child_weight=1, subsample=.9, colsample_bytree=.9, 
# #               random_state=42, seed=428, reg_alpha=0, reg_lambda=0.0475, gamma=0, 
# #               num_leaves=70, objective='reg:linear', booster='gbtree')
# # Fold 1 h, 2 l
# # max_depth = 4, 0.438
# # max_depth = 6, 0.44029
# params = dict(max_depth=6, learning_rate=0.01, n_estimators=10000, 
#               n_jobs=-1, min_child_weight=1, subsample=.9, colsample_bytree=.9, 
#               random_state=42, seed=428, reg_alpha=0, reg_lambda=0.0475, gamma=0, 
#               num_leaves=70, objective='reg:linear', booster='gbtree')
# # Fold 1 l, 2 h
# # params = dict(max_depth=5, learning_rate=0.015, n_estimators=10000, 
# #               n_jobs=-1, min_child_weight=5, subsample=.9, colsample_bytree=.9, 
# #               random_state=42, seed=428, reg_alpha=0, reg_lambda=0.0475, gamma=0, 
# #               num_leaves=70, objective='reg:linear', booster='gbtree')
# # train_org.Gender.values
# # train_org.RescuerID.values.reshape(-1, )
# oof_preds, test_preds = train_xgboost_model(params, X, y, cv_col=train_org.RescuerID.values.reshape(-1, ), 
#                                             cv='total-5', verbose=1000, stratify=True, 
#                                             early_stopping_rounds=250)

In [None]:
def get_qwk(oof_preds):
    """Fit rounding thresholds on out-of-fold predictions and print diagnostics.

    The thresholds are fitted against ``train_org['AdoptionSpeed']``.  Printed
    in order: the mean prediction and mean target, the class counts of the
    rounded predictions, the class counts of the target, and the quadratic
    weighted kappa of the rounded predictions against the module-level ``y``.

    Parameters
    ----------
    oof_preds : numpy.ndarray
        Continuous out-of-fold predictions, one per training row.

    Returns
    -------
    The coefficients reported by the fitted ``OptimizedRounder``.
    """
    target = train_org['AdoptionSpeed'].values
    rounder = OptimizedRounder()
    rounder.fit(oof_preds.reshape(-1, ), target.reshape(-1, ).astype(float))
    coefficients = rounder.coefficients()
    train_preds = rounder.predict(oof_preds, coefficients)

    print(np.mean(oof_preds), np.mean(target))
    for labels in (train_preds, target):
        print(Counter(labels))
    print(quadratic_weighted_kappa(y.reshape(-1, ), train_preds))
    return coefficients

In [None]:
# get_qwk(oof_preds)

In [None]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.preprocessing import MinMaxScaler

In [None]:
def run_xgb(params, X_train, X_test, groups=None, n_splits=5):
    """Train one XGBoost booster per ``TotalKFold`` fold and score each fold.

    Parameters
    ----------
    params : dict
        Parameters forwarded to ``xgb.train``.
    X_train : pandas.DataFrame
        Training frame; must contain the ``AdoptionSpeed`` target column,
        which is split off inside each fold.
    X_test : pandas.DataFrame
        Test frame, predicted as-is by every fold's model.
    groups : array-like
        Group labels passed to ``TotalKFold.split``.
    n_splits : int
        Number of folds.

    Returns
    -------
    tuple
        ``(model, oof_train, oof_test, mean_score)`` where ``model`` is the
        booster of the last fold, ``oof_train`` holds out-of-fold predictions,
        ``oof_test`` has one column of test predictions per fold and
        ``mean_score`` is the mean per-fold quadratic weighted kappa.
    """
    verbose_eval = 1000
    num_rounds = 30000
    early_stop = 500

    splitter = TotalKFold(n_splits, random_state=1337)

    oof_train = np.zeros((X_train.shape[0]))
    oof_test = np.zeros((X_test.shape[0], n_splits))
    scores = []

    for fold_no, (train_idx, valid_idx) in enumerate(splitter.split(X_train, groups)):
        fold_train = X_train.iloc[train_idx, :]
        fold_valid = X_train.iloc[valid_idx, :]

        # Separate the target from the features for both halves of the fold.
        y_tr = fold_train['AdoptionSpeed'].values
        y_val = fold_valid['AdoptionSpeed'].values
        X_tr = fold_train.drop(['AdoptionSpeed'], axis=1)
        X_val = fold_valid.drop(['AdoptionSpeed'], axis=1)

        d_train = xgb.DMatrix(data=X_tr, label=y_tr, feature_names=X_tr.columns)
        d_valid = xgb.DMatrix(data=X_val, label=y_val, feature_names=X_val.columns)

        model = xgb.train(
            dtrain=d_train,
            num_boost_round=num_rounds,
            evals=[(d_train, 'train'), (d_valid, 'valid')],
            early_stopping_rounds=early_stop,
            verbose_eval=verbose_eval,
            params=params,
        )

        # Predict with the best iteration found by early stopping.
        best_limit = model.best_ntree_limit
        valid_pred = model.predict(
            xgb.DMatrix(X_val, feature_names=X_val.columns), ntree_limit=best_limit)
        test_pred = model.predict(
            xgb.DMatrix(X_test, feature_names=X_test.columns), ntree_limit=best_limit)

        oof_train[valid_idx] = valid_pred
        oof_test[:, fold_no] = test_pred

        # Fold-level thresholds are only used for the reported score.
        rounder = OptimizedRounder()
        rounder.fit(valid_pred.reshape(-1, ), y_val.reshape(-1, ))
        thresholds = rounder.coefficients()
        rounded = rounder.predict(valid_pred.reshape(-1, ), thresholds)
        score = quadratic_weighted_kappa(y_val.reshape(-1, ), rounded)
        print("Fold:", fold_no, "QWK:", score)
        scores.append(score)

    print("MEAN:", np.mean(scores), "STD:", np.std(scores))
    return model, oof_train, oof_test, np.mean(scores)

In [None]:
def run_lgb(params, X_train, X_test, groups=None, n_splits=5):
    """Train one LightGBM booster per ``TotalKFold`` fold and score each fold.

    Parameters
    ----------
    params : dict
        Parameters forwarded to ``lgb.train``.
    X_train : pandas.DataFrame
        Training frame; must contain the ``AdoptionSpeed`` target column,
        which is split off inside each fold.
    X_test : pandas.DataFrame
        Test frame, predicted as-is by every fold's model.
    groups : array-like
        Group labels passed to ``TotalKFold.split``.
    n_splits : int
        Number of folds.

    Returns
    -------
    tuple
        ``(model, oof_train, oof_test, mean_score)`` where ``model`` is the
        booster of the last fold, ``oof_train`` holds out-of-fold predictions,
        ``oof_test`` has one column of test predictions per fold and
        ``mean_score`` is the mean per-fold quadratic weighted kappa.
    """
    verbose_eval = 1000
    num_rounds = 30000
    early_stop = 500

    splitter = TotalKFold(n_splits, random_state=1337)

    oof_train = np.zeros((X_train.shape[0]))
    oof_test = np.zeros((X_test.shape[0], n_splits))
    scores = []

    for fold_no, (train_idx, valid_idx) in enumerate(splitter.split(X_train, groups)):
        fold_train = X_train.iloc[train_idx, :]
        fold_valid = X_train.iloc[valid_idx, :]

        # Separate the target from the features for both halves of the fold.
        y_tr = fold_train['AdoptionSpeed'].values
        y_val = fold_valid['AdoptionSpeed'].values
        X_tr = fold_train.drop(['AdoptionSpeed'], axis=1)
        X_val = fold_valid.drop(['AdoptionSpeed'], axis=1)

        d_train = lgb.Dataset(data=X_tr, label=y_tr)
        d_valid = lgb.Dataset(data=X_val, label=y_val)

        model = lgb.train(
            params,
            train_set=d_train,
            num_boost_round=num_rounds,
            valid_sets=d_valid,
            early_stopping_rounds=early_stop,
            verbose_eval=verbose_eval,
        )

        valid_pred = model.predict(X_val)
        test_pred = model.predict(X_test)

        oof_train[valid_idx] = valid_pred
        oof_test[:, fold_no] = test_pred

        # Fold-level thresholds are only used for the reported score.
        rounder = OptimizedRounder()
        rounder.fit(valid_pred.reshape(-1, ), y_val.reshape(-1, ))
        thresholds = rounder.coefficients()
        rounded = rounder.predict(valid_pred.reshape(-1, ), thresholds)
        score = quadratic_weighted_kappa(y_val.reshape(-1, ), rounded)
        print("Fold:", fold_no, "QWK:", score)
        scores.append(score)

    print("MEAN:", np.mean(scores), "STD:", np.std(scores))
    return model, oof_train, oof_test, np.mean(scores)

In [None]:
# params = {
#     'metric': 'rmse',  # rmse
#     "max_depth": 6,
#     "num_leaves": 64,
#     'seed': 1337,
#     'eta': 0.01,
#     'subsample': 0.7,
#     'colsample_bytree': 0.8,
#     'silent': 1,
# }
# run_lgb(params, train_org.drop(cols, axis=1), test_org.drop(cols, axis=1), groups=train_org.RescuerID.values)

In [None]:
# %load_ext autoreload
# %autoreload 2

In [None]:
# !pip install optuna==0.8.0
import optuna

# Display the installed Optuna version (the pip line above pins 0.8.0).
optuna.__version__

In [None]:
def objective_xgboost(trial: optuna.Trial):
    """Optuna objective: cross-validated XGBoost score for one sampled configuration.

    Samples the learning rate, row/column subsampling and L1/L2 regularisation
    from ``trial``, runs ``run_xgb`` on the module-level train/test frames
    (with ``cols`` dropped, grouped by ``RescuerID``) and returns the negated
    mean fold score so that a minimising study maximises the score.
    """
    # Draw the hyper-parameters in a fixed order.
    eta = trial.suggest_uniform("eta", 1e-4, 3e-2)
    subsample = trial.suggest_loguniform("subsample", .4, 1.)
    colsample_bytree = trial.suggest_loguniform("colsample_bytree", .4, 1.)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-7, 0.1)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-7, 0.1)

    params = dict(
        eval_metric='rmse',  # rmse
        seed=1337,
        eta=eta,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        tree_method='gpu_hist',
        device='gpu',
        silent=1,
    )
    fold_outputs = run_xgb(
        params,
        train_org.drop(cols, axis=1),
        test_org.drop(cols, axis=1),
        groups=train_org.RescuerID.values,
    )
    # The mean score is the last element returned by ``run_xgb``.
    return -fold_outputs[-1]

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective_xgboost, n_trials=20, n_jobs=1)

In [None]:
# Log marker for the hyper-parameter search section; the study cells around it are commented out.
print("start search parameter")

In [None]:
import lightgbm as lgb


def objective_lgb(trial: optuna.Trial):
    """Optuna objective: cross-validated LightGBM score for one sampled configuration.

    Samples tree shape, learning rate, subsampling and regularisation values
    from ``trial``, runs ``run_lgb`` on the module-level train/test frames
    (with ``cols`` dropped, grouped by ``RescuerID``) and returns the negated
    mean fold score so that a minimising study maximises the score.

    NOTE(review): LightGBM documents that row subsampling only takes effect
    when a bagging frequency is also set; none is set here -- confirm whether
    ``subsample`` is meant to influence training.
    """
    # Draw the hyper-parameters in a fixed order.
    max_depth = trial.suggest_int("max_depth", 4, 12)
    num_leaves = trial.suggest_int("num_leaves", 16, 512)
    eta = trial.suggest_uniform("eta", 1e-4, 2e-2)
    subsample = trial.suggest_loguniform("subsample", .4, 1.)
    colsample_bytree = trial.suggest_loguniform("colsample_bytree", .4, 1.)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-7, 0.1)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-7, 0.1)

    params = dict(
        metric='rmse',
        eval_metric='rmse',  # rmse
        seed=1337,
        max_depth=max_depth,
        num_leaves=num_leaves,
        eta=eta,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        silent=1,
    )
    fold_outputs = run_lgb(
        params,
        train_org.drop(cols, axis=1),
        test_org.drop(cols, axis=1),
        groups=train_org.RescuerID.values,
    )
    # ``run_lgb`` returns (model, oof_train, oof_test, score).
    score = fold_outputs[3]
    return -score

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective_lgb, n_trials=35, n_jobs=1)

In [None]:
# xgb_params = {
#     'eval_metric': 'rmse',  # rmse
#     'seed': 1337,
# #     'max_depth': 7,
#     'eta': 0.01,
# #     'alpha': 0.0001,
#     'subsample': 0.8,
#     'colsample_bytree': 0.85,
#     'tree_method': 'gpu_hist',
#     'device': 'gpu',
#     'silent': 1,
# }
# xgb_params.update({
#     'eta': 0.0005899673187673149, 
#     'subsample': 0.7976729057454204, 
#     'colsample_bytree': 0.6887114635213195, 
#     'reg_alpha': 2.23534845452903e-05, 
#     'reg_lambda': 1.5691613063314457e-06
# })
# model, oof_train, oof_test, score = run_xgb(xgb_params, train_org.drop(cols, axis=1), 
#                                             test_org.drop(cols, axis=1), groups=train_org.RescuerID.values)
# # MEAN: 0.45203938601439786 STD: 0.008409033190494897 0.446778850593023


In [None]:
# Final LightGBM configuration: fixed settings followed by the tuned values.
parameters = dict(
    metric='rmse',
    eval_metric='rmse',  # rmse
    seed=1337,
    silent=1,
    max_depth=9,
    num_leaves=62,
    eta=0.005627994462834422,
    subsample=0.7312372298663574,
    colsample_bytree=0.4735271112037372,
    reg_alpha=0.0007809743616060336,
    reg_lambda=0.010986243903351383,
)
# Grouping by RescuerID keeps all pets of one rescuer in the same fold.
model, oof_train, oof_test, score = run_lgb(
    parameters,
    train_org.drop(cols, axis=1),
    test_org.drop(cols, axis=1),
    groups=train_org.RescuerID.values,
)

In [None]:
# Print the out-of-fold diagnostics and display the fitted rounding coefficients.
get_qwk(oof_train)

In [None]:
from sklearn.linear_model import ElasticNet

In [None]:
# Fit the rounding thresholds on all out-of-fold predictions; ``opt_rounder``
# and ``corr`` are reused below to round the test predictions.
oof_preds = oof_train
opt_rounder = OptimizedRounder()
opt_rounder.fit(
    oof_preds.reshape(-1, ),
    train_org['AdoptionSpeed'].values.reshape(-1, ).astype(float),
)
corr = opt_rounder.coefficients()
# corr[0] = 1.65
# corr[1] = 2.12
# corr[3] = 2.84
for report in (
    lambda: Counter(opt_rounder.predict(oof_preds, corr)),
    lambda: quadratic_weighted_kappa(y.reshape(-1, ), opt_rounder.predict(oof_preds, corr)),
):
    print(report())

In [None]:
# # cv
# # 3 0.46963526329272265 0.01506093289406102
# X_oof = np.concatenate((oof_preds.reshape(-1, 1), 
#                         oof_train.reshape(-1, 1),
# #                         oof_preds_2.reshape(-1, 1), 
# #                         oof_preds_3.reshape(-1, 1), 
# #                         oof_preds_4.reshape(-1, 1), 
# #                         oof_preds_5.reshape(-1, 1)
#                        ), axis=1)#oof_preds_4.reshape(-1, 1)
# y_oof = y.reshape(-1, )
# y_oof_onehot = OneHotEncoder().fit_transform(y_oof.reshape(-1, 1)).toarray()
# kfold = TotalKFold(5, random_state=42).split(X_oof, train_org.RescuerID.values)
# # kfold = KFold(5, True, random_state=42).split(X_oof, y_oof_onehot)

# oof_oof_preds = np.zeros((X_oof.shape[0], ))
# oof_test_preds = []
# x_test_oof = np.concatenate((np.mean(test_preds, axis=0).reshape(-1, 1), 
#                              np.mean(oof_test, axis=1).reshape(-1, 1),
# #                              np.mean(test_preds_2, axis=0).reshape(-1, 1), 
# #                              np.mean(test_preds_3, axis=0).reshape(-1, 1), 
# #                              np.mean(test_preds_4, axis=0).reshape(-1, 1), 
# #                              np.mean(test_preds_5, axis=0).reshape(-1, 1)
#                             ), axis=1)#np.mean(test_preds_4, axis=0).reshape(-1, 1)
# print(x_test_oof.shape)

# scores = []
# for i, (train_idx, val_idx) in enumerate(kfold, 1):
#     print("=== fold: ", i, "===")
#     x_train, x_val, y_train, y_val = X_oof[train_idx], X_oof[val_idx], y_oof[train_idx], y_oof[val_idx]
#     model = ElasticNet(alpha=.1, l1_ratio=0.2, random_state=42)
# #     model = xgb.XGBRegressor(max_depth=7, learning_rate=0.01, n_estimators=10000, n_jobs=-1, 
# #                              subsample=0.9, colsample_bytree=0.9, min_child_weight=10, reg_alpha=0.1, 
# #                              reg_lambda=0.4, objective='reg:linear',
# #                              random_state=42, seed=428, num_leaves=70)
# #     model = xgb.XGBClassifier(max_depth=7, n_estimators=10000, learning_rate=0.01, n_jobs=-1, 
# #                               subsample=0.9, colsample_bytree=0.9, random_state=42, seed=428, 
# #                               objective='multi:softprob', num_class=5)
#     model.fit(x_train, y_train)
# #     model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_val, y_val)], eval_metric='rmse', 
# #               early_stopping_rounds=200, verbose=100)
# #     model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_val, y_val)], eval_metric='mlogloss', 
# #           early_stopping_rounds=200, verbose=100)

#     pr = model.predict(x_val)
#     score = quadratic_weighted_kappa(y_val.reshape(-1, ), opt_rounder.predict(pr, corr))
#     print(score)
#     scores.append(score)
#     oof_oof_preds[val_idx] = pr
#     oof_test_preds.append(model.predict(x_test_oof))
# print(np.mean(scores), np.std(scores))

In [None]:
# test_sub_preds = np.mean(oof_test_preds, axis=0)

In [None]:
# Average the per-fold test predictions, round them with the thresholds fitted
# on the out-of-fold predictions, and write the submission file.
sub = pd.read_csv("../input/petfinder-adoption-prediction/test/sample_submission.csv")
sub['AdoptionSpeed'] = (
    opt_rounder.predict(np.mean(oof_test, axis=1), corr)
    .astype(int)
    .reshape(-1, )
)
# sub.AdoptionSpeed = opt_rounder.predict(np.mean(test_preds, axis=0), corr).astype(int).reshape(-1, )
# sub.AdoptionSpeed = opt_rounder.predict(test_sub_preds, corr).astype(int).reshape(-1, )
sub.to_csv("submission.csv", index=False)
sub.head()

In [None]:
# Persist the out-of-fold predictions.  NOTE: this rebinds ``oof_train`` from the
# array returned by run_lgb to a one-column DataFrame, so the cell is not safe to re-run.
oof_train = pd.DataFrame({"oof_lgb": oof_train})
oof_train.to_csv("lgb_train_oof.csv", index=False)

In [None]:
import pickle

# Save the fitted rounding coefficients so they can be reloaded without refitting.
with open("lgb_corr.pkl", "wb") as f:
    pickle.dump(corr, f)

In [None]:
# ``oof_preds`` was aliased to the run_lgb out-of-fold array before ``oof_train`` was
# rebound, so this writes the same values as lgb_train_oof.csv under another column name.
o = pd.DataFrame({"oof_lgb_preds": oof_preds})
o.to_csv("oof_lgb_preds.csv", index=False)

In [None]:
# Average the per-fold test predictions into one column and persist them.
# NOTE: this rebinds ``oof_test`` from the per-fold array to a one-column DataFrame.
oof_test = pd.DataFrame({"oof_lgb": np.mean(oof_test, axis=1)})
oof_test.to_csv("lgb_test_pred.csv", index=False)