## Libraries

In [None]:
import os
import cv2
import json
import pickle
import random
import scipy as sp
import numpy as np
import pandas as pd
import seaborn as sns

import xgboost as xgb
import lightgbm as lgb

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

import torchvision.models as models
import torchvision.transforms as transforms

from pathlib import Path
from datetime import datetime as dt
from functools import partial
from collections import Counter, defaultdict

from PIL import Image

from joblib import Parallel, delayed

from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import StandardScaler

from tqdm import tqdm
from fastprogress import master_bar, progress_bar

# Show up to 128 columns when a DataFrame is displayed.
pd.options.display.max_columns = 128
# force=True keeps this cell re-runnable: without it a second call raises
# RuntimeError because the start method has already been set.
torch.multiprocessing.set_start_method("spawn", force=True)

## Data Loading

In [None]:
# Read the three competition tables from the input directory.
input_dir = Path("../input/petfinder-adoption-prediction/")
train, test, sample_submission = (
    pd.read_csv(input_dir / relative_path)
    for relative_path in (
        "train/train.csv",
        "test/test.csv",
        "test/sample_submission.csv",
    )
)

# Distinct pet identifiers of each split.
train_pet_ids, test_pet_ids = train.PetID.unique(), test.PetID.unique()

In [None]:
# Display the mean of the PhotoAmt column for the train and test tables.
train.PhotoAmt.mean(), test.PhotoAmt.mean()

## Image model loading

In [None]:
!cp ../input/pytorch-pretrained-image-models/* ./
!ls 

## Metadata and Sentiment data

In [None]:
def jopen(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)


def parse_sentiment_file(path):
    """Flatten one sentiment JSON file into a dict of features.

    Reads the ``language``, ``documentSentiment``, ``entities`` and
    ``sentences`` keys of the document. Returns the sum/mean/variance of the
    per-sentence magnitude and score, the language, the entity names joined
    with spaces, and the document-level magnitude and score.
    """
    doc = jopen(path)
    language = doc["language"]
    doc_sentiment = doc["documentSentiment"]
    entity = " ".join(item["name"] for item in doc["entities"])

    per_sentence = [sentence["sentiment"] for sentence in doc["sentences"]]
    magnitude = np.array([entry["magnitude"] for entry in per_sentence])
    score = np.array([entry["score"] for entry in per_sentence])

    return {
        "magnitude_sum": magnitude.sum(),
        "magnitude_mean": magnitude.mean(),
        "magnitude_var": magnitude.var(),
        "score_sum": score.sum(),
        "score_mean": score.mean(),
        "score_var": score.var(),
        "language": language,
        "entity": entity,
        "document_magnitude": doc_sentiment["magnitude"],
        "document_score": doc_sentiment["score"],
    }


def parse_metadata(path):
    """Flatten one image-metadata JSON file into a dict of features.

    Returns the mean label-annotation score, mean dominant-colour score and
    pixel fraction, mean crop-hint confidence and importance fraction, the
    label descriptions joined with spaces (``desc``), and ``specified``
    (1 when "cat" or "dog" occurs in the joined descriptions).

    Sections that are absent or empty produce ``np.nan`` (or an empty
    description) instead of raising KeyError / IndexError.
    """
    def _mean_of(items, key):
        # Mean of ``key`` over ``items``; NaN for an empty list.
        values = [item[key] for item in items]
        return np.asarray(values).mean() if values else np.nan

    file: dict = jopen(path)

    name_specified = 0
    if "labelAnnotations" in file:
        file_annots = file["labelAnnotations"]
        file_mean_score = _mean_of(file_annots, "score")
        file_desc = " ".join([x["description"] for x in file_annots])
        # Substring test, so any description containing "cat"/"dog" counts.
        if "cat" in file_desc or "dog" in file_desc:
            name_specified = 1
    else:
        file_mean_score = np.nan
        file_desc = ""

    # NOTE(review): assumes these sections may be missing from some files;
    # they were previously accessed unconditionally.
    file_colors: list = (
        file.get("imagePropertiesAnnotation", {})
        .get("dominantColors", {})
        .get("colors", []))
    file_crops: list = file.get("cropHintsAnnotation", {}).get("cropHints", [])

    color_score = _mean_of(file_colors, "score")
    pixel_frac = _mean_of(file_colors, "pixelFraction")
    crop_conf = _mean_of(file_crops, "confidence")

    # The first crop hint decides whether importance fractions are present;
    # guard the lookup so an empty hint list no longer raises IndexError.
    if file_crops and "importanceFraction" in file_crops[0]:
        crop_importance = _mean_of(file_crops, "importanceFraction")
    else:
        crop_importance = np.nan

    return {
        "annot_score": file_mean_score,
        "color_score": color_score,
        "pixel_frac": pixel_frac,
        "crop_conf": crop_conf,
        "crop_importance": crop_importance,
        "desc": file_desc,
        "specified": name_specified,
    }


def additinal_features_per_id(pet_id, sentiment_path: Path, meta_path: Path):
    """Collect the sentiment record and all metadata records of one pet.

    Returns ``(sentiment, metadata_list)``. ``sentiment`` is an empty dict
    when ``<pet_id>.json`` does not exist under ``sentiment_path``;
    ``metadata_list`` holds one dict per file matching ``<pet_id>*.json``
    under ``meta_path``. Every record carries a ``pet_id`` key.
    """
    try:
        sentiment = parse_sentiment_file(sentiment_path / f"{pet_id}.json")
    except FileNotFoundError:
        sentiment = {}
    else:
        sentiment["pet_id"] = pet_id

    metadata_list = []
    for meta_file in sorted(meta_path.glob(f"{pet_id}*.json")):
        record = parse_metadata(meta_file)
        record["pet_id"] = pet_id
        metadata_list.append(record)
    return sentiment, metadata_list


def load_additional_features(ped_ids: list, sentiment_path: Path,
                             meta_path: Path):
    """Parse sentiment and metadata files for all ids in parallel.

    Returns ``(sentiment_df, metadata_df)``: one row per pet that has a
    sentiment file, and one row per metadata file. Column names are taken
    from the first non-empty sentiment / metadata record.
    """
    jobs = (
        delayed(additinal_features_per_id)(pet_id, sentiment_path, meta_path)
        for pet_id in ped_ids)
    features = Parallel(n_jobs=-1, verbose=1)(jobs)

    # Keep only pets that produced a record of the respective kind.
    sentiments = [pair[0] for pair in features if len(pair[0]) > 0]
    metadatas = [pair[1] for pair in features if len(pair[1]) > 0]

    sentiment_dict = {
        key: [record[key] for record in sentiments]
        for key in sentiments[0].keys()
    }
    # Flatten the per-pet lists so each metadata file becomes one row.
    metadata_dict = {
        key: [meta[key] for per_pet in metadatas for meta in per_pet]
        for key in metadatas[0][0].keys()
    }
    return pd.DataFrame(sentiment_dict), pd.DataFrame(metadata_dict)


def aggregate_metadata(metadata_df: pd.DataFrame, aggregates=None):
    """Aggregate per-image metadata rows to one row per ``pet_id``.

    Parameters
    ----------
    metadata_df : DataFrame with a ``pet_id`` column, a text ``desc`` column
        and otherwise numeric columns.
    aggregates : list of aggregation names passed to ``groupby().agg``;
        defaults to ``["sum", "mean", "var"]``.

    Returns
    -------
    (meta_gr, meta_desc) : the numeric aggregates with columns named
        ``<column>_<AGGREGATE>``, and the unique descriptions of each pet
        joined with spaces.
    """
    if aggregates is None:
        # Built per call rather than shared as a mutable default argument.
        aggregates = ["sum", "mean", "var"]

    meta_desc = metadata_df.groupby(["pet_id"])["desc"].unique().reset_index()
    meta_desc["desc"] = meta_desc["desc"].apply(lambda x: " ".join(x))

    # drop() returns a new frame, so the casts below leave the input alone.
    meta_gr = metadata_df.drop(["desc"], axis=1)
    for col in meta_gr.columns:
        if "pet_id" not in col:
            meta_gr[col] = meta_gr[col].astype(float)
    meta_gr = meta_gr.groupby(["pet_id"]).agg(aggregates)
    meta_gr.columns = pd.Index(
        [f"{c[0]}_{c[1].upper()}" for c in meta_gr.columns.tolist()])
    meta_gr = meta_gr.reset_index()
    return meta_gr, meta_desc


def aggregate_sentiment(sentiment_df: pd.DataFrame, aggregates=None):
    """Aggregate sentiment rows to one row per ``pet_id``.

    Parameters
    ----------
    sentiment_df : DataFrame with ``pet_id``, ``entity``, ``language`` and
        otherwise numeric columns.
    aggregates : list of aggregation names passed to ``groupby().agg``;
        defaults to ``["sum"]``.

    Returns
    -------
    (sentiment_gr, sentiment_desc) : the numeric aggregates, and a frame
        with the unique ``entity`` and ``language`` values of each pet
        joined with spaces.

    With a single aggregate the numeric columns keep their original names.
    With several aggregates each column is suffixed with ``_<AGGREGATE>`` so
    the resulting names stay unique.
    """
    if aggregates is None:
        # Built per call rather than shared as a mutable default argument.
        aggregates = ["sum"]

    def _joined_unique(column):
        # Unique values of ``column`` per pet, joined into one string.
        joined = sentiment_df.groupby(["pet_id"])[column].unique()
        joined = joined.reset_index()
        joined[column] = joined[column].apply(lambda x: " ".join(x))
        return joined

    sentiment_desc = _joined_unique("entity").merge(
        _joined_unique("language"), how="left", on="pet_id")

    sentiment_gr = sentiment_df.drop(["entity", "language"], axis=1)
    for col in sentiment_gr.columns:
        if "pet_id" not in col:
            sentiment_gr[col] = sentiment_gr[col].astype(float)
    sentiment_gr = sentiment_gr.groupby(["pet_id"]).agg(aggregates)

    keep_suffix = len(aggregates) > 1
    sentiment_gr.columns = pd.Index([
        f"{c[0]}_{str(c[1]).upper()}" if keep_suffix else f"{c[0]}"
        for c in sentiment_gr.columns.tolist()])
    sentiment_gr = sentiment_gr.reset_index()
    return sentiment_gr, sentiment_desc

## Load data

In [None]:
# Reads the same three CSV files as the "Data Loading" cell above and rebinds
# input_dir, train, test and sample_submission.
input_dir = Path("../input/petfinder-adoption-prediction/")
train = pd.read_csv(input_dir / "train/train.csv")
test = pd.read_csv(input_dir / "test/test.csv")
sample_submission = pd.read_csv(input_dir / "test/sample_submission.csv")

In [None]:
# Directories holding the per-pet sentiment (sp_*) and metadata (mp_*) JSON files.
sp_train, mp_train = input_dir / "train_sentiment", input_dir / "train_metadata"
sp_test, mp_test = input_dir / "test_sentiment", input_dir / "test_metadata"

In [None]:
# Distinct PetID values of each split; these drive the JSON file lookups below.
train_pet_ids = train.PetID.unique()
test_pet_ids = test.PetID.unique()

In [None]:
# Parse the sentiment and metadata JSON files of every train pet ...
train_sentiment_df, train_metadata_df = load_additional_features(
    train_pet_ids, sp_train, mp_train)

# ... and of every test pet.
test_sentiment_df, test_metadata_df = load_additional_features(
    test_pet_ids, sp_test, mp_test)

## Aggregate sentiment data and metadata

In [None]:
# Collapse the per-file rows to one row per pet_id: *_gr hold the numeric
# aggregates, *_desc hold the joined text columns.
train_meta_gr, train_meta_desc = aggregate_metadata(train_metadata_df)
test_meta_gr, test_meta_desc = aggregate_metadata(test_metadata_df)
train_sentiment_gr, train_sentiment_desc = \
    aggregate_sentiment(train_sentiment_df)
test_sentiment_gr, test_sentiment_desc = \
    aggregate_sentiment(test_sentiment_df)

## Merge processed DataFrames with base train/test DataFrame

In [None]:
def _left_join_by_pet_id(base, extras):
    """Left-join each frame in ``extras`` onto a copy of ``base``.

    ``base`` is keyed by ``PetID`` and every extra frame by ``pet_id``. The
    helper key is dropped right after each join: leaving it in place makes
    successive merges produce ``pet_id``, ``pet_id_x`` and ``pet_id_y``, and
    the fourth merge then generates a second ``pet_id_x`` column, which
    pandas rejects as a duplicate.
    """
    merged = base.copy()
    for extra in extras:
        merged = merged.merge(
            extra, how="left", left_on="PetID", right_on="pet_id")
        merged = merged.drop(columns="pet_id")
    return merged


train_proc = _left_join_by_pet_id(
    train,
    [train_sentiment_gr, train_meta_gr, train_sentiment_desc, train_meta_desc])

test_proc = _left_join_by_pet_id(
    test,
    [test_sentiment_gr, test_meta_gr, test_sentiment_desc, test_meta_desc])

In [None]:
# The merged frames must have exactly as many rows as the base tables.
print(train_proc.shape, test_proc.shape)
assert train_proc.shape[0] == train.shape[0]
assert test_proc.shape[0] == test.shape[0]

In [None]:
# Remove every column whose name matches "pet_id" (the helper join keys).
train_key_cols = train_proc.filter(regex="pet_id", axis=1).columns.tolist()
train_proc.drop(train_key_cols, axis=1, inplace=True)

test_key_cols = test_proc.filter(regex="pet_id", axis=1).columns.tolist()
test_proc.drop(test_key_cols, axis=1, inplace=True)

train_proc.head()

In [None]:
# Assign the filled column back instead of calling an in-place fillna through
# attribute access, which may act on a temporary and leave the frame unchanged.
train_proc["language"] = train_proc["language"].fillna("")
test_proc["language"] = test_proc["language"].fillna("")

# Train languages keep their original codes (order of first appearance);
# languages seen only in test get the following codes instead of NaN.
langs = pd.unique(
    pd.concat([train_proc["language"], test_proc["language"]],
              ignore_index=True))
encode_dict = {k: i for i, k in enumerate(langs)}

train_proc["language"] = train_proc["language"].map(encode_dict)
test_proc["language"] = test_proc["language"].map(encode_dict)

## Add Breed Mapping

In [None]:
# Lookup tables shipped with the competition data.
labels_breed, labels_state, labels_color = (
    pd.read_csv(f"../input/petfinder-adoption-prediction/{stem}_labels.csv")
    for stem in ("breed", "state", "color")
)

In [None]:
def _breed_label_columns(frame, breed_col, role):
    """Join ``labels_breed`` onto ``frame[breed_col]`` via ``BreedID``.

    The first two columns of the joined frame are discarded and the rest
    are prefixed with ``<role>_``.
    """
    joined = frame[[breed_col]].merge(
        labels_breed, how="left",
        left_on=breed_col, right_on="BreedID",
        suffixes=("", f"_{role}"))
    return joined.iloc[:, 2:].add_prefix(f"{role}_")


train_breed_main = _breed_label_columns(train_proc, "Breed1", "main_breed")
train_breed_second = _breed_label_columns(train_proc, "Breed2", "second_breed")
train_proc = pd.concat(
    [train_proc, train_breed_main, train_breed_second], axis=1)

test_breed_main = _breed_label_columns(test_proc, "Breed1", "main_breed")
test_breed_second = _breed_label_columns(test_proc, "Breed2", "second_breed")
test_proc = pd.concat(
    [test_proc, test_breed_main, test_breed_second], axis=1)

print(train_proc.shape, test_proc.shape)
train_proc.head()

## Feature engineering

In [None]:
# Stack train rows followed by test rows under a fresh 0..n-1 index; the
# feature engineering below works on the copy X_temp.
X = pd.concat([train_proc, test_proc], ignore_index=True, sort=False)
X_temp = X.copy()

# Free-text columns.
text_columns = [
    "Description",
    "entity",
    "desc"]
# Columns listed as categorical.
categorical_columns = [
    "Type", "Breed1", "Breed2", "Gender",
    "Color1", "Color2", "Color3", "MaturitySize",
    "FurLength", "Vaccinated", "Dewormed", "Sterilized",
    "State", "language", "main_breed_BreedName", "second_breed_BreedName"
]
# Breed-name columns that get factorized in the next cell.
cat_c = ["main_breed_BreedName", "second_breed_BreedName"]
# Columns removed from X_temp in the "Drop columns" section.
drop_columns = [
    "PetID", "Name", "RescuerID"
]

In [None]:
for i in cat_c:
    # Replace the whole column so it takes the integer dtype of the codes;
    # `.loc[:, i] = ...` writes into the existing column and can leave the
    # codes stored with object dtype.
    X_temp[i] = pd.factorize(X_temp[i])[0]

In [None]:
# fillna returns a new frame, so later column assignments on X_text never
# target a selection of X_temp (no SettingWithCopy ambiguity).
X_text = X_temp[text_columns].fillna("none")

In [None]:
# Character length of each text column.
for _length_col, _text_col in (
        ("len_description", "Description"),
        ("len_meta_desc", "desc"),
        ("len_entity", "entity")):
    X_temp[_length_col] = X_text[_text_col].map(len)

In [None]:
import re

from nltk.corpus import stopwords
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from nltk.tokenize import RegexpTokenizer
import nltk.stem as stm
from nltk import WordNetLemmatizer, word_tokenize

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Characters that clean_text() surrounds with spaces.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x):
    """Return ``str(x)`` with every character of ``puncts`` padded by spaces."""
    padded = {ord(mark): f' {mark} ' for mark in puncts}
    return str(x).translate(padded)

def clean_numbers(x):
    """Mask digit runs with ``#``.

    Runs of two to four digits become the same number of ``#``; runs of five
    or more digits collapse to ``#####``; single digits are kept.
    """
    def _mask(found):
        width = len(found.group(0))
        if width < 2:
            return found.group(0)
        return '#' * min(width, 5)

    return re.sub('[0-9]+', _mask, x)

# Contraction -> expansion table for lower-cased text.
# Each key is listed once; for the former duplicates the value that
# previously won (the later one) is kept.
mispell_dict = {"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
# "i'd" is ambiguous ("I would" / "I had"); "I had" was the effective value.
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
# Was " will", which dropped the pronoun.
"we'll": "we will",
"tryin'":"trying"}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

mispellings, mispellings_re = _get_mispell(mispell_dict)


def replace_typical_misspell(text):
    """Substitute every match of ``mispellings_re`` in ``text`` with its expansion."""
    return mispellings_re.sub(
        lambda found: mispellings[found.group(0)], text)

In [None]:
# Expand contractions BEFORE padding punctuation: clean_text() turns
# "don't" into "don ' t", after which no contraction key can match.
X_text["cleaned_text"] = (
    X_text["Description"]
    .map(lambda x: x.lower())
    .map(replace_typical_misspell)
    .map(clean_text)
    .map(clean_numbers)
)

In [None]:
# English stop-word list from NLTK, as a set for fast membership tests.
eng_stopwords = set(stopwords.words("english"))
import string

# Character lengths (same columns as computed in the earlier length cell).
X_temp["len_description"] = X_text["Description"].map(len)
X_temp["len_meta_desc"] = X_text["desc"].map(len)
X_temp["len_entity"] = X_text["entity"].map(len)

# Number of whitespace-separated tokens.
X_temp["num_description_words"] = X_text["Description"].map(lambda x: len(str(x).split()))
X_temp["num_desc_words"] = X_text["desc"].map(lambda x: len(str(x).split()))
X_temp["num_entity_words"] = X_text["entity"].map(lambda x: len(str(x).split()))

# Number of distinct tokens (case-sensitive).
X_temp["uniq_description_words"] = X_text["Description"].map(lambda x: len(set(str(x).split())))
X_temp["uniq_desc_words"] = X_text["desc"].map(lambda x: len(set(str(x).split())))
X_temp["uniq_entity_words"] = X_text["entity"].map(lambda x: len(set(str(x).split())))

# Number of lower-cased tokens that are stop words.
X_temp["num_description_stopwords"] = X_text["Description"].map(lambda x: len([
    w for w in str(x).lower().split() if w in eng_stopwords]))
X_temp["num_desc_stopwords"] = X_text["desc"].map(lambda x: len([
    w for w in str(x).lower().split() if w in eng_stopwords]))
X_temp["num_entity_stopwords"] = X_text["entity"].map(lambda x: len([
    w for w in str(x).lower().split() if w in eng_stopwords]))

# Number of characters of the description that are in string.punctuation.
X_temp["num_description_punctuation"] = X_text["Description"].map(lambda x: len([
    c for c in str(x) if c in string.punctuation]))

### State stats

In [None]:
# Per-state lookup tables keyed by the values of the State column.
# NOTE(review): units and data source are not stated here — confirm.
state_gdp = {
    41336: 116.679,
    41325: 40.596,
    41367: 23.02,
    41401: 190.075,
    41415: 5.984,
    41324: 37.274,
    41332: 42.389,
    41335: 52.452,
    41330: 67.629,
    41380: 5.642,
    41327: 81.284,
    41345: 80.167,
    41342: 121.414,
    41326: 280.698,
    41361: 32.270
}

state_population = {
    41336: 33.48283,
    41325: 19.47651,
    41367: 15.39601,
    41401: 16.74621,
    41415: 0.86908,
    41324: 8.21110,
    41332: 10.21064,
    41335: 15.00817,
    41330: 23.52743,
    41380: 2.31541,
    41327: 15.61383,
    41345: 32.06742,
    41342: 24.71140,
    41326: 54.62141,
    41361: 10.35977
}

state_area ={
    41336:19102,
    41325:9500,
    41367:15099,
    41401:243,
    41415:91,
    41324:1664,
    41332:6686,
    41335:36137,
    41330:21035,
    41380:821,
    41327:1048,
    41345:73631,
    41342:124450,
    41326:8104,
    41361:13035
}
# States missing from a table map to NaN.
X_temp["state_gdp"] = X_temp.State.map(state_gdp)
X_temp["state_population"] = X_temp.State.map(state_population)
X_temp["state_area"] = X_temp.State.map(state_area)

# GDP divided by population, scaled by 1e4, and the fee relative to that value.
X_temp["state_gdp_per_person"] = X_temp["state_gdp"] / X_temp["state_population"] * 1e4
X_temp["fee_per_gdp_per_person"] = X_temp.Fee / X_temp["state_gdp_per_person"]

In [None]:
# Preview the first rows of X_temp.
X_temp.head()

### Name features

In [None]:
import re
def has_name(x):
    """Return 0 for a float (missing) name or one containing "no name", else 1."""
    if isinstance(x, float):
        return 0
    return 0 if "no name" in x.lower() else 1


def num_name_words(x):
    """Return the number of pieces ``x`` splits into on single spaces.

    A float (missing) name counts as 0.
    """
    if isinstance(x, float):
        return 0
    # Splitting on " " always yields one more piece than there are spaces.
    return x.count(" ") + 1


def contains_amp(x):
    """Return 1 when the name joins several names with "&", "+" or "and".

    "and" is matched case-insensitively as a whole word, so names that
    merely contain those letters (e.g. "Sandy") are not flagged.
    A float (missing) name returns 0.
    """
    if isinstance(x, float):
        return 0
    if "&" in x or "+" in x:
        return 1
    return int(re.search(r"\band\b", x.lower()) is not None)


def contains_comma(x):
    """Return 1 when the name contains a comma, 0 otherwise or for a float."""
    if isinstance(x, float):
        return 0
    return int("," in x)


def start_with_number(x):
    """Return the leading digit of the name as an int, or 0.

    Only the first character is inspected, so "10 pups" gives 1. Names that
    do not start with a digit, and float (missing) names, give 0.
    """
    if isinstance(x, float):
        return 0
    # Raw string: "\d" in a normal (f-)string is an invalid escape sequence.
    match = re.match(r"\d", x)
    if match:
        return int(match.group())
    return 0


def contains_paren(x):
    """Return 1 when the name contains "(" or ")", 0 otherwise or for a float."""
    if isinstance(x, float):
        return 0
    return int(any(bracket in x for bracket in "()"))


def contains_number(x):
    """Return 1 when the name contains a digit anywhere, else 0.

    Uses ``re.search`` so digits after a line break are found too (the
    earlier ``.*\\d`` match stopped at the first newline).
    A float (missing) name returns 0.
    """
    if isinstance(x, float):
        return 0
    return 1 if re.search(r"\d", x) else 0


def safe_calc_len(x):
    """Return ``len(x)``, or 1 when ``x`` is a float (missing name)."""
    return 1 if isinstance(x, float) else len(x)


def num_unlike_letters(x):
    """Count the characters of ``x`` that belong to the symbol set below.

    Repeated symbols count every time they occur. A float (missing) name
    returns 0.
    """
    if isinstance(x, float):
        return 0
    letters = {
        ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', 
        '&', '/', '[', ']', '>', '%', '=', '#', '*', '+',  
        '•',  '~', '@', '£', '·', '_', '{', '}', '©', '^', '®', '`',
        '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 
        'Â', '█', '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', 
        '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', 
        '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', 
        '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', 
        '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', 
        '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', 
        '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', "ã", "ç", "å", "ä",
        "¶", "ð"}

    return sum(1 for symbol in x if symbol in letters)

# One feature column per helper, each computed from the Name column.
_name_feature_funcs = {
    "num_name_words": num_name_words,
    "contains_amp": contains_amp,
    "contains_comma": contains_comma,
    "start_with_number": start_with_number,
    "contains_paren": contains_paren,
    "contains_number": contains_number,
    "name_length": safe_calc_len,
    "num_unlike_letters": num_unlike_letters,
}
for _feature_name, _feature_func in _name_feature_funcs.items():
    X_temp[_feature_name] = X_temp.Name.map(_feature_func)

# Share of symbol characters in the name.
X_temp["rate_unlike_letters"] = X_temp.num_unlike_letters / X_temp.name_length
X_temp.head()

## Tfidf

In [None]:
# Number of SVD components kept per text column.
n_components = 16
text_features = []

for i in text_columns:
    print(f"generating features from: {i}")
    # The boolean options are passed as real booleans; scikit-learn's
    # parameter validation does not accept the integer 1 in their place.
    tfv = TfidfVectorizer(
        min_df=2,
        strip_accents="unicode",
        analyzer="word",
        token_pattern=r"(?u)\b\w+\b",
        ngram_range=(1, 3),
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True)
    svd = TruncatedSVD(
        n_components=n_components,
        random_state=1337)
    # Sparse TF-IDF matrix, reduced to n_components dense columns.
    tfidf_col = tfv.fit_transform(X_text.loc[:, i].values)
    svd_col = svd.fit_transform(tfidf_col)
    svd_col = pd.DataFrame(svd_col)
    svd_col = svd_col.add_prefix("Tfidf_{}_".format(i))

    text_features.append(svd_col)

text_features = pd.concat(text_features, axis=1)
X_temp = pd.concat([X_temp, text_features], axis=1)

# The raw text columns are no longer needed once they are vectorised.
X_temp.drop(text_columns, axis=1, inplace=True)

## Image size features

In [None]:
import os
import glob

def _pet_id_of(image_path):
    """Return the file-name part before the first "-" of ``image_path``."""
    return image_path.split("/")[-1].split("-")[0]


# Train images: sorted file list, one row per file, plus the owning PetID.
train_image_files = sorted(
    glob.glob("../input/petfinder-adoption-prediction/train_images/*.jpg"))
train_df_imgs = pd.DataFrame(train_image_files)
train_df_imgs.columns = ["image_file_name"]
train_imgs_pets = train_df_imgs["image_file_name"].apply(_pet_id_of)
train_df_imgs = train_df_imgs.assign(PetID=train_imgs_pets)

# Test images: same layout.
test_image_files = sorted(
    glob.glob("../input/petfinder-adoption-prediction/test_images/*.jpg"))
test_df_imgs = pd.DataFrame(test_image_files)
test_df_imgs.columns = ["image_file_name"]
test_imgs_pets = test_df_imgs["image_file_name"].apply(_pet_id_of)
test_df_imgs = test_df_imgs.assign(PetID=test_imgs_pets)

def get_size(filename):
    """Return the size of ``filename`` in bytes."""
    return os.stat(filename).st_size

def get_dimensions(filename):
    """Return the ``(width, height)`` of the image stored at ``filename``."""
    # The context manager closes the file handle; a bare Image.open() keeps
    # it open, which adds up when this runs over every image file.
    with Image.open(filename) as img:
        return img.size

# File size in bytes of every image.
train_df_imgs["image_size"] = train_df_imgs["image_file_name"].apply(get_size)
test_df_imgs["image_size"] = test_df_imgs["image_file_name"].apply(get_size)
# (width, height) tuples, split into two columns and then discarded.
train_df_imgs["temp_size"] = train_df_imgs["image_file_name"].apply(get_dimensions)
test_df_imgs["temp_size"] = test_df_imgs["image_file_name"].apply(get_dimensions)
train_df_imgs["width"] = train_df_imgs["temp_size"].apply(lambda x: x[0])
test_df_imgs["width"] = test_df_imgs["temp_size"].apply(lambda x: x[0])
train_df_imgs["height"] = train_df_imgs["temp_size"].apply(lambda x: x[1])
test_df_imgs["height"] = test_df_imgs["temp_size"].apply(lambda x: x[1])
train_df_imgs.drop(["temp_size"], axis=1, inplace=True)
test_df_imgs.drop(["temp_size"], axis=1, inplace=True)

# Per-pet sum / mean / variance of file size, width and height.
aggs = {
    "image_size": ["sum", "mean", "var"],
    "width": ["sum", "mean", "var"],
    "height": ["sum", "mean", "var"]
}
agg_train_imgs = train_df_imgs.groupby("PetID").agg(aggs)
# Flat names "<column>_<aggregate>", generated in the same order as `aggs`.
new_columns = [
    k + "_" + agg for k in aggs.keys() for agg in aggs[k]
]
agg_test_imgs = test_df_imgs.groupby("PetID").agg(aggs)
agg_train_imgs.columns = new_columns
agg_train_imgs = agg_train_imgs.reset_index()

agg_test_imgs.columns = new_columns
agg_test_imgs = agg_test_imgs.reset_index()


In [None]:
# Stack the train and test image aggregates and left-join them on PetID;
# pets without a row in agg_imgs get NaN in the image columns.
agg_imgs = pd.concat([agg_train_imgs, agg_test_imgs], axis=0).reset_index(drop=True)
X_temp = X_temp.merge(agg_imgs, how="left", on="PetID")
X_temp.head()

## Sanitize

In [None]:
# Names of all columns that contain at least one missing value.
hasnans = [c for c in X_temp.columns if X_temp[c].hasnans]

In [None]:
# Fill missing values with 0.0 everywhere except in Name and AdoptionSpeed.
sanitize_col = list(set(hasnans).difference(("Name", "AdoptionSpeed")))
X_temp[sanitize_col] = X_temp[sanitize_col].fillna(0.0)
X_temp.head()

In [None]:
# Columns that are replaced by log(1 + x).
big_num_cols = (
    ["len_description", "len_meta_desc", "len_entity"]
    + ["num_description_words", "num_desc_words", "num_entity_words"]
    + ["uniq_description_words", "uniq_desc_words", "uniq_entity_words"]
    + ["num_description_stopwords", "num_desc_stopwords",
       "num_entity_stopwords"]
    + ["num_description_punctuation"]
    + ["state_gdp", "state_population", "state_area", "state_gdp_per_person"]
    + ["image_size_sum", "image_size_mean", "image_size_var"]
    + ["width_sum", "width_mean", "width_var"]
    + ["height_sum", "height_mean", "height_var"]
)
for c in big_num_cols:
    X_temp[c] = X_temp[c].map(np.log1p)
X_temp.head()

## Drop columns

In [None]:
# Keep the RescuerID columns before RescuerID is dropped from X_temp.
train_resc = train.RescuerID
test_resc = test.RescuerID

In [None]:
# Remove the columns listed in drop_columns (PetID, Name, RescuerID).
X_temp.drop(drop_columns, axis=1, inplace=True)
X_temp.head()

In [None]:
# Number of training rows.
train.shape[0]

In [None]:
# X_temp holds the train rows first, followed by the test rows. Split by
# position and take independent copies, so dropping the target from the test
# part never operates in place on a slice of X_temp.
n_train = train.shape[0]
X_train = X_temp.iloc[:n_train].copy()
X_test = X_temp.iloc[n_train:].drop(columns=["AdoptionSpeed"])

assert X_train.shape[0] == train.shape[0]
assert X_test.shape[0] == test.shape[0]

In [None]:
# Apart from the target, train and test must expose the same columns in the
# same order.
test_cols = list(X_test.columns)
train_cols = list(X_train.columns)
train_cols.remove("AdoptionSpeed")
assert train_cols == test_cols

In [None]:
# Replace any remaining missing value with -1.
X_train_non_null = X_train.fillna(-1)
X_test_non_null = X_test.fillna(-1)

# Both flags should display False once nothing is missing.
X_train_non_null.isnull().any().any(), X_test_non_null.isnull().any().any()

In [None]:
# Columns treated as categorical from here on (target encoding below, label
# encoding further down).
cat_features = ["Type", "Breed1", "Breed2", "Gender", "Color1", "Color2", "Color3",
                "MaturitySize", "FurLength", "Vaccinated", "Dewormed", "Sterilized",
                "Health", "Quantity", "State", "language", "main_breed_BreedName", 
                "second_breed_BreedName"]

# Categorical part of each split.
X_train_cat = X_train_non_null.loc[:, cat_features]
X_test_cat = X_test_non_null.loc[:, cat_features]

## GroupStratifiedKFold

In [None]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """Yield ``k`` (train_indices, test_indices) splits that keep groups intact.

    Each group is assigned greedily to the fold where adding it gives the
    lowest mean (over labels) standard deviation of per-fold label shares.

    Parameters
    ----------
    X : unused; kept for signature compatibility.
    y : non-negative class labels, one per sample. Float-valued labels such
        as ``2.0`` are accepted and cast to int.
    groups : group id of every sample; a group never spans two folds.
    k : number of folds.
    seed : seed for the initial shuffle of the groups.

    Yields
    ------
    (train_indices, test_indices) : lists of positional indices.
    """
    # Integer labels are needed both to size and to index the count vectors.
    labels = np.asarray(y).astype(int).tolist()
    groups = list(groups)
    labels_num = max(labels) + 1

    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(labels, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    # A label value in 0..max that never occurs has a zero total; dividing by
    # it gives NaN, which makes every comparison below fail and sends all
    # groups to fold 0. Score only the labels that are present.
    present_labels = [
        label for label in range(labels_num) if y_distr[label] > 0]

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        # Tentatively add the group, score the balance, then undo.
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in present_labels:
            label_std = np.std(
                [y_counts_per_fold[i][label] / y_distr[label]
                 for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    # Groups with the most uneven label counts are placed first.
    for g, y_counts in sorted(groups_and_y_counts,
                              key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for fold_id in range(k):
        test_groups = groups_per_fold[fold_id]
        train_groups = all_groups - test_groups

        train_indices = [
            idx for idx, g in enumerate(groups) if g in train_groups]
        test_indices = [
            idx for idx, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

## Categorical Target Encoding

In [None]:
# Out-of-fold target encoding: every categorical column yields two features,
# the mean and the standard deviation of AdoptionSpeed per category.
y = X_train_non_null.AdoptionSpeed
X_train_cat_y = X_train_cat.copy()
X_train_cat_y["AdoptionSpeed"] = y

X_train_cat_encoded = np.zeros((X_train_cat.shape[0], len(cat_features) * 2))
X_test_cat_encoded = np.zeros((X_test_cat.shape[0], len(cat_features) * 2))
# Number of folds that supplied a value for each test cell.
_test_enc_hits = np.zeros_like(X_test_cat_encoded)
k = 10
fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=1213)

for trn_idx, val_idx in fold.split(X_train_cat, y.values.astype(int)):
    # split() yields positions, so select rows positionally.
    X_trn = X_train_cat_y.iloc[trn_idx]
    X_val = X_train_cat_y.iloc[val_idx]
    for j, c in enumerate(cat_features):
        X_trn_enc = X_trn.groupby(c).agg({
            "AdoptionSpeed": ["mean", "std"]
        })
        cte_columns = [f"{c}_{x}" for x in ["mean", "std"]]
        X_trn_enc.columns = cte_columns
        for offset, x in enumerate(cte_columns):
            col = 2 * j + offset
            # Validation rows are encoded with statistics from the other
            # folds; categories unseen there stay NaN.
            X_train_cat_encoded[val_idx, col] = (
                X_val[c].map(X_trn_enc[x]).to_numpy(dtype=float))

            # Test rows: add up only the folds that know the category, so a
            # single fold's NaN cannot wipe out the whole average.
            test_enc = X_test_cat[c].map(X_trn_enc[x]).to_numpy(dtype=float)
            seen = ~np.isnan(test_enc)
            X_test_cat_encoded[seen, col] += test_enc[seen]
            _test_enc_hits[seen, col] += 1

# Average over the contributing folds; cells no fold could encode are NaN.
X_test_cat_encoded = np.divide(
    X_test_cat_encoded, _test_enc_hits,
    out=np.full_like(X_test_cat_encoded, np.nan),
    where=_test_enc_hits > 0)

In [None]:
# Shapes of the encoded train and test matrices.
X_train_cat_encoded.shape, X_test_cat_encoded.shape

In [None]:
# Column names in the layout used to fill the arrays: for every categorical
# feature its "_mean" column followed by its "_std" column.
columns = [f"{c}_{x}" for c in cat_features for x in ["mean", "std"]]
X_train_cat_encoded_df = pd.DataFrame(data=X_train_cat_encoded, columns=columns)
X_test_cat_encoded_df = pd.DataFrame(data=X_test_cat_encoded, columns=columns)
X_test_cat_encoded_df.head()

## Numerical Features

In [None]:
# Numerical part: everything except the categorical columns (and the target).
X_train_num = X_train_non_null.drop(cat_features + ["AdoptionSpeed"], axis=1)
X_test_num = X_test_non_null.drop(cat_features, axis=1)

# Target column of the training rows.
target = X_train_non_null["AdoptionSpeed"]

In [None]:
# Append the target-encoded columns to the numerical features.
X_train_num = pd.concat([X_train_num, X_train_cat_encoded_df], axis=1)

# Give the encoded test frame the index of X_test_num so that the
# column-wise concat pairs the rows up.
X_test_cat_encoded_df.index = X_test_num.index
X_test_num = pd.concat([X_test_num, X_test_cat_encoded_df], axis=1)

In [None]:
# Replace missing values (e.g. encodings of unseen categories) with 0.0.
X_train_num.fillna(0.0, inplace=True)
X_test_num.fillna(0.0, inplace=True)

In [None]:
# Replace +/-inf with 0.0. The result is assigned back: replace() returns a
# new frame, so an in-place fillna chained onto it changes only a temporary.
X_train_num = X_train_num.replace([np.inf, -np.inf], np.nan).fillna(0.0)
X_test_num = X_test_num.replace([np.inf, -np.inf], np.nan).fillna(0.0)

In [None]:
# Show any infinite values left in the training features.
X_train_num.values[np.isinf(X_train_num)]

In [None]:
# Fit the scaler on train and test rows together, then scale each split.
X_all_num = pd.concat([X_train_num, X_test_num], axis=0)
ss = StandardScaler()
ss.fit(X_all_num)

X_train_ss = ss.transform(X_train_num)
X_test_ss = ss.transform(X_test_num)

In [None]:
# Set any NaN left in the scaled arrays to 0.0.
X_train_ss[np.isnan(X_train_ss)] = 0.0
X_test_ss[np.isnan(X_test_ss)] = 0.0

In [None]:
# Categorical columns of train and test stacked together.
cat_cat = pd.concat([X_train_cat, X_test_cat])

# Number of distinct values per column over both splits.
# NOTE(review): presumably used to size embedding layers later — confirm.
n_breed1 = cat_cat["Breed1"].nunique()
n_breed2 = cat_cat["Breed2"].nunique()
n_langs = cat_cat["language"].nunique()
n_color1 = cat_cat["Color1"].nunique()
n_color2 = cat_cat["Color2"].nunique()
n_color3 = cat_cat["Color3"].nunique()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode every categorical column with an encoder fitted on the values of
# both splits.
for c in X_train_cat.columns:
    le = LabelEncoder().fit(cat_cat[c])
    X_train_cat[c] = le.transform(X_train_cat[c])
    X_test_cat[c] = le.transform(X_test_cat[c])

## Sequences

In [None]:
import gc
def load_fasttext(word_index,
                  embedding_file='../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec',
                  max_words=100000):
    """Build an embedding matrix for ``word_index`` from a fastText ``.vec`` file.

    Args:
        word_index: Mapping ``word -> integer index`` (e.g. a tokenizer's
            ``word_index``). Entries whose index is ``>= max_words`` are ignored.
        embedding_file: Path of the text file holding one
            ``word v1 v2 ... vN`` record per line.
        max_words: Upper bound on the vocabulary rows of the returned matrix.

    Returns:
        A tuple ``(embedding_matrix, words_found)``. ``embedding_matrix`` has
        shape ``(min(max_words, len(word_index)) + 1, embed_size)``; rows of
        words without a pretrained vector keep a random draw from a normal
        distribution with the mean/std of all pretrained values.
        ``words_found`` is the set of words that did receive a pretrained
        vector.
    """
    embeddings_index = {}
    # Context manager closes the (very large) file deterministically.
    with open(embedding_file, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            parts = line.rstrip().rsplit(' ')
            # Skip empty lines and the optional "<n_words> <dim>" header of
            # the .vec format; either would break np.stack below.
            if len(parts) < 2 or (line_no == 0 and len(parts) == 2):
                continue
            embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')

    # np.stack needs a real sequence, not a dict view.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # +1 because row 0 is kept for the padding index.
    nb_words = min(max_words, len(word_index)) + 1
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    words_found = set()
    for word, i in word_index.items():
        if i >= max_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            words_found.add(word)
    return embedding_matrix, words_found

In [None]:
# NOTE(review): Tokenizer is imported outside this section - presumably the
# Keras text tokenizer, where num_words caps the vocabulary used when texts
# are converted to sequences; confirm.
tokenizer = Tokenizer(num_words=100000)
# Fit on the three text columns joined into one string per row.
tokenizer.fit_on_texts(X_text["cleaned_text"] + " " + X_text["desc"] + " " + X_text["entity"])
# Show the number of distinct tokens seen.
len(tokenizer.word_index)

In [None]:
# .loc slices by label and includes the end label, so labels 0..n_train-1 go to
# train and n_train.. to test.
# NOTE(review): assumes X_text has a default RangeIndex with the train rows
# first - confirm.
train_text, test_text = X_text.loc[:n_train-1, :], X_text.loc[n_train:, :]
train_text.shape, test_text.shape

In [None]:
# Turn the cleaned text into sequences of token ids.
x_train_text = tokenizer.texts_to_sequences(train_text["cleaned_text"])
x_test_text = tokenizer.texts_to_sequences(test_text["cleaned_text"])

# Bring every sequence to length 70 (the same value is passed to the network
# as "maxlen"). pad_sequences is imported outside this section - presumably
# the Keras helper; confirm.
x_train_text = pad_sequences(x_train_text, maxlen=70)
x_test_text = pad_sequences(x_test_text, maxlen=70)

In [None]:
%%time
# Load pretrained vectors for the tokenizer vocabulary; words_set holds the
# words that actually received a pretrained vector.
embedding_matrix, words_set = load_fasttext(tokenizer.word_index)

In [None]:
# Force a garbage-collection pass after loading the embedding file.
gc.collect()

In [None]:
def calc_vector(text, word_index, words_set, embedding_matrix):
    """Return the mean embedding of the distinct known words in ``text``.

    Args:
        text: Whitespace-separated string.
        word_index: Mapping ``word -> row index`` into ``embedding_matrix``.
        words_set: Words that have a usable row in ``embedding_matrix``;
            every other word is ignored.
        embedding_matrix: 2-D array of shape ``(n_words, embed_size)``.

    Returns:
        1-D array of length ``embed_size``. When ``text`` is empty or none of
        its words is known, a zero vector is returned (previously this case
        divided by zero and produced NaNs).
    """
    rows = []
    # Duplicates are collapsed: every distinct word counts once.
    for word in set(text.split()):
        if word not in words_set:
            continue
        idx = word_index.get(word)
        if idx is not None:
            rows.append(idx)
    if not rows:
        return np.zeros((embedding_matrix.shape[1],))
    return embedding_matrix[rows].mean(axis=0)

In [None]:
# Register tqdm with pandas so that Series.progress_apply is available below.
tqdm.pandas()

In [None]:
def _mean_word_vectors(texts):
    """Average word vectors per row of ``texts`` and stack them into a matrix.

    NaN entries (rows where no word had an embedding) are replaced by 0.0.
    """
    per_row = texts.progress_apply(lambda x: calc_vector(
        x, tokenizer.word_index, words_set, embedding_matrix))
    stacked = np.vstack(per_row.values.tolist())
    stacked[np.isnan(stacked)] = 0.0
    return stacked


# Sentence-level embeddings of the "entity" column.
train_entity = _mean_word_vectors(train_text.entity)
test_entity = _mean_word_vectors(test_text.entity)

# Sentence-level embeddings of the "desc" column.
train_desc = _mean_word_vectors(train_text.desc)
test_desc = _mean_word_vectors(test_text.desc)

## Images

In [None]:
# Per-channel normalisation constants.
# NOTE(review): these look like the usual ImageNet statistics expected by the
# pretrained torchvision weights - confirm.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225]
)
# Preprocessing pipeline: resize, centre-crop to 224x224, convert to a tensor,
# then normalise.
ds_trans = transforms.Compose([transforms.Resize(224),
                               transforms.CenterCrop(224),
                               transforms.ToTensor(),
                               normalize])

In [None]:
class ImageDataset(data.Dataset):
    """Dataset yielding the first four photos of each pet as one tensor.

    Photos are looked up as ``<root_dir>/<pet_id>-<k>.jpg`` for ``k`` in 1..4.
    A missing photo is replaced by an all-black 224x224 RGB image.

    Args:
        pet_ids: Indexable collection of pet identifiers.
        root_dir: Directory containing the images (``str`` or ``Path``).
        transform: Callable applied to each PIL image; it must return a
            tensor of shape ``(3, 224, 224)``.
    """

    def __init__(self, pet_ids, root_dir, transform):
        self.pet_ids = pet_ids
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.pet_ids)

    def __getitem__(self, idx):
        """Return ``[pet_id, imgs]`` where ``imgs`` has shape ``(4, 3, 224, 224)``."""
        pet_id = self.pet_ids[idx]
        # root_dir may be a plain string, so normalise it to a Path explicitly.
        root = Path(self.root_dir)
        imgs = torch.zeros((4, 3, 224, 224))
        for i in range(4):
            fullname = root / f"{pet_id}-{i+1}.jpg"
            try:
                # convert() returns an independent in-memory copy, so the file
                # handle can be closed right away instead of leaking.
                with Image.open(fullname) as handle:
                    image = handle.convert("RGB")
            except FileNotFoundError:
                # Placeholder for pets with fewer than four photos.
                image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))
            if self.transform:
                image = self.transform(image)
            imgs[i, :, :, :] = image
        return [pet_id, imgs]

In [None]:
class Classifier(nn.Module):
    """Parameter-free identity module.

    Used to replace a network's classification head so that the forward pass
    returns the features fed to that head.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged."""
        return x

class ImagePretrained(nn.Module):
    """Frozen DenseNet-121 feature extractor applied to four photos per pet.

    Args:
        path: File holding a DenseNet-121 ``state_dict``.
    """

    def __init__(self, path):
        super().__init__()
        backbone = models.densenet121()
        backbone.load_state_dict(torch.load(path))
        # Swap the classification layer for an identity so forward() yields
        # the features that used to feed it.
        backbone.classifier = Classifier()
        self.densenet121 = backbone
        # Freeze every child module except the last one (the identity head).
        for child in list(backbone.children())[:-1]:
            for param in child.parameters():
                param.requires_grad = False

    def forward(self, x):
        """Map ``(batch, 4, C, H, W)`` images to ``(batch, 4, 1024)`` features.

        The output buffer is allocated with ``torch.zeros`` without a device
        argument, independent of where ``x`` lives.
        """
        n_samples = x.size(0)
        converted = torch.zeros(n_samples, 4, 1024)
        for slot in range(4):
            converted[:, slot, :] = self.densenet121(x[:, slot, :, :, :])
        return converted

In [None]:
# Image datasets for both splits; the image directories are passed as strings.
train_img_dataset = ImageDataset(train_pet_ids,
                             "../input/petfinder-adoption-prediction/train_images/",
                             transform=ds_trans)
test_img_dataset = ImageDataset(test_pet_ids,
                            "../input/petfinder-adoption-prediction/test_images/",
                            transform=ds_trans)
# shuffle=False keeps the batches in pet-id order, which the feature matrices
# filled in the next cell rely on.
train_img_loader = data.DataLoader(train_img_dataset, batch_size=128, shuffle=False)
test_img_loader = data.DataLoader(test_img_dataset, batch_size=128, shuffle=False)

In [None]:
# Extract (4, 1024) DenseNet features for every pet of both splits.
model = ImagePretrained("densenet121.pth")
model.to("cuda:0")
# Inference mode: without eval() the BatchNorm layers normalise with per-batch
# statistics (and keep updating their running stats), so the extracted
# features would depend on which pets share a batch.
model.eval()

train_pids = []
train_img_matrix = np.zeros((len(train_pet_ids), 4, 1024))
offset = 0
with torch.no_grad():
    for i, (pid, tensor) in tqdm(enumerate(train_img_loader)):
        train_pids += [*pid]
        tensor = tensor.to("cuda:0")
        pred = model(tensor).detach().cpu().numpy()
        # Write by running offset so the loop does not depend on the
        # loader's batch size.
        train_img_matrix[offset:offset + len(pred), :, :] = pred
        offset += len(pred)

test_pids = []
test_img_matrix = np.zeros((len(test_pet_ids), 4, 1024))
offset = 0
with torch.no_grad():
    for i, (pid, tensor) in tqdm(enumerate(test_img_loader)):
        test_pids += [*pid]
        tensor = tensor.to("cuda:0")
        pred = model(tensor).detach().cpu().numpy()
        test_img_matrix[offset:offset + len(pred), :, :] = pred
        offset += len(pred)

In [None]:
# Force a garbage-collection pass after the image feature extraction.
gc.collect()

## Metrics

In [None]:
class OptimizedRounder(object):
    """Search cut points that turn continuous predictions into labels 0..4.

    The cut points are chosen to maximise the quadratic weighted kappa against
    the true labels (Nelder-Mead on the negated score).
    """

    def __init__(self):
        # Replaced by the scipy optimisation result once fit() has run.
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """Return the negative quadratic weighted kappa for cut points ``coef``."""
        preds = self.predict(X, coef)
        return -cohen_kappa_score(y, preds, weights='quadratic')

    def fit(self, X, y, initial_coef=None):
        """Optimise the cut points, starting from ``initial_coef``.

        Args:
            X: Continuous predictions.
            y: True labels.
            initial_coef: Starting cut points (four values are needed for the
                five labels). ``None`` behaves like the former ``[]`` default.
        """
        # ``import scipy`` alone does not guarantee that the ``optimize``
        # submodule is loaded, so import it explicitly.
        import scipy.optimize

        if initial_coef is None:
            initial_coef = []
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        self.coef_ = scipy.optimize.minimize(
            loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """Bin ``X`` into labels 0..4 using the (sorted) cut points ``coef``."""
        bins = [-np.inf] + list(np.sort(coef)) + [np.inf]
        return pd.cut(X, bins, labels=[0, 1, 2, 3, 4])

    def coefficients(self):
        """Return the optimised cut points (the ``'x'`` entry of the result)."""
        return self.coef_['x']
    
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """Return the confusion matrix between two raters as a nested list.

    Args:
        rater_a: Integer ratings of the first rater (rows of the matrix).
        rater_b: Integer ratings of the second rater (columns).
        min_rating: Smallest possible rating; inferred from the data if None.
        max_rating: Largest possible rating; inferred from the data if None.

    Returns:
        ``conf_mat`` such that ``conf_mat[a - min_rating][b - min_rating]``
        counts the items rated ``a`` by the first and ``b`` by the second
        rater.
    """
    assert len(rater_a) == len(rater_b)

    # Take the extrema of each rater separately: `rater_a + rater_b`
    # concatenates lists but adds NumPy arrays element-wise, which would give
    # a wrong range for array inputs.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_rating = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_rating for _ in range(num_rating)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """Count how often each rating between ``min_rating`` and ``max_rating`` occurs.

    Bounds that are None are taken from ``ratings``. Entry ``k`` of the
    returned list is the number of ratings equal to ``min_rating + k``.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating
    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        counts[rating - lowest] += 1
    return counts


def quadratic_weighted_kappa(y, y_pred):
    """Compute the quadratic weighted kappa between ``y`` and ``y_pred``.

    Both inputs are cast to integer arrays; the rating range is taken from the
    smallest and largest value appearing in either of them.
    """
    truth = np.array(y, dtype=int)
    guess = np.array(y_pred, dtype=int)

    assert len(truth) == len(guess)

    lowest = min(min(truth), min(guess))
    highest = max(max(truth), max(guess))

    conf_mat = confusion_matrix(truth, guess, lowest, highest)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(truth))

    hist_truth = histogram(truth, lowest, highest)
    hist_guess = histogram(guess, lowest, highest)

    # Weighted observed disagreement vs. disagreement expected by chance.
    observed = 0.0
    expected = 0.0
    for i, row in enumerate(conf_mat):
        for j, count in enumerate(row):
            expected_count = (
                hist_truth[i] * hist_guess[j]) / num_scored_items
            # Quadratic penalty, normalised by the widest possible gap.
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            observed += d * count / num_scored_items
            expected += d * expected_count / num_scored_items

    return (1.0 - observed / expected)

## DataLoader

In [None]:
class PetDataset(data.Dataset):
    """Dataset bundling the six per-pet inputs and, optionally, the label.

    Every argument is an indexable container with one entry per pet. Pass
    ``labels=None`` for unlabelled data; items then omit the label.
    """

    def __init__(self,
                 img_tensor,
                 cat_features,
                 num_features,
                 entity_tensor,
                 desc_tensor,
                 seq_tensor,
                 labels):
        self.img_tensor = img_tensor
        self.cat_features = cat_features
        self.num_features = num_features
        self.entity_tensor = entity_tensor
        self.desc_tensor = desc_tensor
        self.seq_tensor = seq_tensor
        self.labels = labels

    def __len__(self):
        # The categorical features define the number of samples.
        return len(self.cat_features)

    def __getitem__(self, idx):
        """Return ``[img, cat, num, entity, desc, seq]`` plus the label if any."""
        item = [
            self.img_tensor[idx],
            self.cat_features[idx],
            self.num_features[idx],
            self.entity_tensor[idx],
            self.desc_tensor[idx],
            self.seq_tensor[idx],
        ]
        if self.labels is None:
            return item
        item.append(self.labels[idx])
        return item

## Trainer

In [None]:
def get_n_params(model):
    """Return the total number of scalar parameters of ``model``."""
    return sum(param.numel() for param in model.parameters())

def seed_torch(seed=1029):
    """Seed Python, NumPy and PyTorch (CPU and all GPUs) with ``seed``.

    Also exports ``PYTHONHASHSEED`` and asks cuDNN for deterministic kernels.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed,
                   torch.cuda.manual_seed_all):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

def run_xgb(params,
            X,
            y,
            X_test,
            resc,
            n_splits=10,
            num_rounds=60000,
            early_stop=500,
            verbose_eval=1000):
    """Train XGBoost with stratified K-fold CV and collect predictions.

    Args:
        params: Parameter dict forwarded to ``xgb.train``.
        X: Training features (DataFrame).
        y: Training target, indexable by integer position arrays.
        X_test: Test features (DataFrame).
        resc: Not used by this function.
        n_splits: Number of CV folds.
        num_rounds: Maximum number of boosting rounds.
        early_stop: Early-stopping patience on the validation fold.
        verbose_eval: Logging interval passed to ``xgb.train``.

    Returns:
        ``(model, oof_train, oof_test)``: the booster of the last fold, the
        out-of-fold predictions for ``X`` and one column of test predictions
        per fold.
    """
    n_train_rows = X.shape[0]
    n_test_rows = X_test.shape[0]
    oof_train = np.zeros((n_train_rows))
    oof_test = np.zeros((n_test_rows, n_splits))
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1213)

    for fold_no, (trn_index, val_index) in enumerate(
            splitter.split(X, y.astype(int))):
        X_tr, X_val = X.iloc[trn_index, :], X.iloc[val_index, :]
        y_tr, y_val = y[trn_index], y[val_index]

        d_train = xgb.DMatrix(
            data=X_tr, label=y_tr, feature_names=X_tr.columns)
        d_valid = xgb.DMatrix(
            data=X_val, label=y_val, feature_names=X_val.columns)

        model = xgb.train(
            params=params,
            dtrain=d_train,
            num_boost_round=num_rounds,
            evals=[(d_train, "train"), (d_valid, "valid")],
            early_stopping_rounds=early_stop,
            verbose_eval=verbose_eval)

        # Predict with the best iteration found by early stopping.
        oof_train[val_index] = model.predict(
            xgb.DMatrix(X_val, feature_names=X_val.columns),
            ntree_limit=model.best_ntree_limit)
        oof_test[:, fold_no] = model.predict(
            xgb.DMatrix(X_test, feature_names=X_test.columns),
            ntree_limit=model.best_ntree_limit)
    return model, oof_train, oof_test


class Trainer:
    """Stratified K-fold trainer for the multi-input network.

    For every fold a fresh model is built from ``model(**kwargs)``, trained
    with Adam on an MSE loss, and the weights of the epoch with the lowest
    validation loss are stored as ``bin/<timestamp>/best<fold>.pt``.

    Args:
        model: Model class (not an instance); instantiated once per fold.
        resc: Stored on the instance as ``self.resc``; not used by the
            methods of this class.
        n_splits: Number of CV folds.
        seed: Seed for the fold split and for ``seed_torch``.
        device: Torch device string for tensors, model and loss.
        train_batch: Batch size of the training loader.
        val_batch: Batch size of the validation loader.
        kwargs: Keyword arguments used to build the model; ``None`` means
            no arguments.
    """

    def __init__(self,
                 model,
                 resc,
                 n_splits=5,
                 seed=42,
                 device="cuda:0",
                 train_batch=16,
                 val_batch=32,
                 kwargs=None):
        self.model = model
        self.n_splits = n_splits
        self.seed = seed
        self.device = device
        self.train_batch = train_batch
        self.val_batch = val_batch
        # A fresh dict per instance: a literal ``{}`` default would be one
        # object shared by every Trainer created without kwargs.
        self.kwargs = {} if kwargs is None else kwargs
        self.resc = resc

        self.best_score = None
        # Timestamp tag keeps the checkpoints of different runs apart.
        self.tag = dt.now().strftime("%Y-%m-%d-%H-%M-%S")

        self.loss_fn = nn.MSELoss(reduction="mean").to(self.device)
        path = Path(f"bin/{self.tag}")
        path.mkdir(exist_ok=True, parents=True)
        self.path = path

    def fit(self,
            img_feats,
            cat_feats,
            num_feats,
            entity_feats,
            desc_feats,
            seq_feats,
            answer,
            n_epochs=30):
        """Run the cross-validation and fill ``self.train_preds``.

        Args:
            img_feats, num_feats, entity_feats, desc_feats, seq_feats:
                Arrays indexable by integer position arrays, one row per
                sample.
            cat_feats: DataFrame of categorical codes (``.values`` is used).
            answer: Series of integer-valued targets.
            n_epochs: Number of epochs per fold.
        """
        answer = answer.values.astype(int)
        # Size the out-of-fold buffer from the data actually passed in rather
        # than from a notebook-level global.
        self.train_preds = np.zeros(len(answer))
        fold = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.seed)
        cat_feats = cat_feats.values
        inputs = (img_feats, cat_feats, num_feats,
                  entity_feats, desc_feats, seq_feats)
        for i, (trn_idx, val_idx) in enumerate(fold.split(img_feats, answer)):
            self.fold_num = i
            print(f"Fold: {i+1}")
            train_parts = [arr[trn_idx] for arr in inputs]
            val_parts = [arr[val_idx] for arr in inputs]
            # The target is divided by 4 before regression.
            y_train, y_val = answer[trn_idx] / 4, answer[val_idx] / 4

            valid_preds = self._fit(*train_parts, y_train, n_epochs,
                                    *val_parts, y_val)
            self.train_preds[val_idx] = valid_preds

    def _to_dataset(self, img, cat, num, ent, dsc, seq, y):
        """Move one split's arrays to ``self.device`` and wrap them in a PetDataset."""
        def as_float(arr):
            return torch.tensor(arr, dtype=torch.float32).to(self.device)

        def as_long(arr):
            return torch.tensor(arr, dtype=torch.long).to(self.device)

        return PetDataset(as_float(img),
                          as_long(cat),
                          as_float(num),
                          as_float(ent),
                          as_float(dsc),
                          as_long(seq),
                          # Targets become a column vector to match the
                          # (batch, 1) model output.
                          as_float(y[:, np.newaxis]))

    def _fit(self,
             img,
             cat,
             num,
             ent,
             dsc,
             seq,
             y,
             n_epochs,
             img_val,
             cat_val,
             num_val,
             ent_val,
             dsc_val,
             seq_val,
             y_val):
        """Train one fold and return the predictions of its best checkpoint.

        Returns:
            1-D array with the validation predictions obtained after
            reloading the weights with the lowest validation loss.
        """
        seed_torch(self.seed)
        train_set = self._to_dataset(img, cat, num, ent, dsc, seq, y)
        train_loader = data.DataLoader(train_set,
                                       batch_size=self.train_batch, shuffle=True)
        eval_set = self._to_dataset(img_val, cat_val, num_val,
                                    ent_val, dsc_val, seq_val, y_val)
        eval_loader = data.DataLoader(eval_set,
                                      batch_size=self.val_batch, shuffle=False)

        model = self.model(**self.kwargs)
        model = model.to(self.device)
        optimizer = optim.Adam(model.parameters())
        best_score = np.inf
        checkpoint = self.path / f"best{self.fold_num}.pt"
        mb = master_bar(range(n_epochs))

        for epoch in mb:
            model.train()
            avg_loss = 0.
            for i_batch, c_batch, n_batch, e_batch, d_batch, s_batch, y_batch in progress_bar(train_loader, parent=mb):
                y_pred = model(i_batch, c_batch, n_batch, e_batch, d_batch, s_batch)
                loss = self.loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.item() / len(train_loader)
            valid_preds, avg_val_loss = self._val(eval_loader, model)
            print(f"epoch {epoch+1}/{n_epochs}")
            print(f"avg_loss: {avg_loss:.4f}")
            print(f"avg_val_loss: {avg_val_loss:.4f}")
            # Keep only the weights of the best epoch so far.
            if best_score > avg_val_loss:
                torch.save(model.state_dict(), checkpoint)
                print(f"Save model on epoch {epoch + 1}")
                best_score = avg_val_loss
        model.load_state_dict(torch.load(checkpoint))
        valid_preds, avg_val_loss = self._val(eval_loader, model)
        print(f"Validation loss: {avg_val_loss}")
        return valid_preds

    def _val(self, loader, model):
        """Evaluate ``model`` on ``loader``.

        Returns:
            ``(valid_preds, avg_val_loss)``: predictions in loader order and
            the mean of the per-batch losses.
        """
        model.eval()
        valid_preds = np.zeros(loader.dataset.cat_features.size(0))
        avg_val_loss = 0.

        for i, (i_batch, c_batch, n_batch, e_batch, d_batch, s_batch, y_batch) in enumerate(loader):
            with torch.no_grad():
                y_pred = model(i_batch, c_batch, n_batch, e_batch, d_batch, s_batch).detach()
                avg_val_loss += self.loss_fn(y_pred,
                                             y_batch).item() / len(loader)
                valid_preds[i * self.val_batch:(i + 1) * self.val_batch] = y_pred.cpu().numpy()[:, 0]
        return valid_preds, avg_val_loss

In [None]:
class Attention(nn.Module):
    """Multi-head additive attention pooling over the step axis.

    Args:
        feature_dims: Size of the last axis of the input.
        step_dims: Number of steps (second axis) of the input.
        n_middle: Hidden size of the scoring network.
        n_attention: Number of attention heads.

    ``forward`` maps ``(batch, step_dims, feature_dims)`` to
    ``(batch, feature_dims)``: every head forms a weighted sum over the steps
    and the head outputs are added up.
    """

    def __init__(self, feature_dims, step_dims, n_middle, n_attention,
                 **kwargs):
        super(Attention, self).__init__(**kwargs)
        self.support_masking = True
        self.feature_dims = feature_dims
        self.step_dims = step_dims
        self.n_middle = n_middle
        self.n_attention = n_attention
        self.features_dim = 0

        self.lin1 = nn.Linear(feature_dims, n_middle, bias=False)
        self.lin2 = nn.Linear(n_middle, n_attention, bias=False)

    def forward(self, x, mask=None):
        """Pool ``x`` over its step axis.

        Args:
            x: Tensor of shape ``(batch, step_dims, feature_dims)``.
            mask: Optional multiplicative mask, broadcastable to
                ``(batch, n_attention, step_dims)``.
        """
        eij = self.lin1(x)
        eij = torch.tanh(eij)
        eij = self.lin2(eij)  # (batch, step_dims, n_attention)

        # Swap the step and head axes. A plain reshape would reinterpret the
        # memory layout and pair scores with the wrong steps whenever
        # n_attention > 1.
        a = torch.exp(eij).transpose(1, 2)  # (batch, n_attention, step_dims)

        if mask is not None:
            a = a * mask

        # The epsilon belongs in the denominator: it guards against a zero
        # sum (e.g. a fully masked row) without biasing the weights.
        a = a / (torch.sum(a, 2, keepdim=True) + 1e-10)

        weighted_input = torch.bmm(a, x)  # (batch, n_attention, feature_dims)
        return torch.sum(weighted_input, 1)

In [None]:
class NeuralNet(nn.Module):
    """Multi-input regression network for the adoption-speed target.

    Inputs are combined as: attention-pooled image features, categorical
    embeddings, raw numerical features, projected entity/description
    sentence vectors and an attention-pooled BiLSTM over the token sequence.

    Args:
        emb_dims: Iterable of ``(cardinality, embedding_size)`` pairs, one per
            categorical column, in column order.
        num_dims: Number of numerical features.
        img_linear: Output size of the per-image projection.
        ent_linear: Output size of the entity-vector projection.
        dsc_linear: Output size of the description-vector projection.
        seq_linear: Output size of the sequence branch.
        linear_size: Width of the hidden fully connected layer.
        embedding_matrix: Pretrained word embeddings ``(n_words, embed_size)``;
            loaded into a frozen embedding layer.
        hidden_size: LSTM hidden size per direction.
        maxlen: Length of the token sequences.
        n_attention: Hidden size and head count of the sequence attention.
    """

    def __init__(self,
                 emb_dims,
                 num_dims,
                 img_linear,
                 ent_linear,
                 dsc_linear,
                 seq_linear,
                 linear_size,
                 embedding_matrix,
                 hidden_size,
                 maxlen,
                 n_attention):
        super(NeuralNet, self).__init__()
        self.img_linear = img_linear
        n_features, embed_size = embedding_matrix.shape
        # Image branch: 1024-d features per photo, four photos per pet.
        self.img_lin = nn.Linear(1024, img_linear)
        self.img_attn = Attention(img_linear, 4, 2, 2)
        # Sentence-vector branches (300-d inputs).
        self.ent_lin = nn.Linear(300, ent_linear)
        self.dsc_lin = nn.Linear(300, dsc_linear)
        # Token-sequence branch with frozen pretrained embeddings.
        self.seq_emb = nn.Embedding(n_features, embed_size)
        self.seq_emb.weight = nn.Parameter(torch.tensor(
            embedding_matrix, dtype=torch.float32))
        self.seq_emb.weight.requires_grad = False
        self.seq_emb_dropout = nn.Dropout2d(0.2)
        self.lstm = nn.LSTM(
            embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.attn = Attention(hidden_size * 2, maxlen, n_attention,
                              n_attention)
        self.seq_lin = nn.Linear(hidden_size * 2, seq_linear)

        # One embedding table per categorical column.
        self.embeddings = nn.ModuleList(
            [nn.Embedding(x, y) for x, y in emb_dims])
        n_emb_out = sum([y for x, y in emb_dims])
        self.fc1 = nn.Linear(
            img_linear + n_emb_out + num_dims + ent_linear + dsc_linear + seq_linear,
            linear_size)
        self.bn1 = nn.BatchNorm1d(linear_size)
        self.fc2 = nn.Linear(linear_size, 1)
        self.drop = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, i, c, n, e, d, s):
        """Return predictions of shape ``(batch, 1)``.

        Args:
            i: Image features ``(batch, 4, 1024)``.
            c: Categorical codes ``(batch, n_categorical)``, long.
            n: Numerical features ``(batch, num_dims)``.
            e: Entity sentence vectors ``(batch, 300)``.
            d: Description sentence vectors ``(batch, 300)``.
            s: Token ids ``(batch, maxlen)``, long.
        """
        # nn.Linear acts on the last axis, so all four photos are projected
        # at once and the result stays on the input's device (no hard-coded
        # "cuda:0" buffer).
        imgs = self.img_lin(i)
        img_feats = self.drop(self.tanh(self.img_attn(imgs)))
        ent_feats = self.drop(self.tanh(self.ent_lin(e)))
        dsc_feats = self.drop(self.tanh(self.dsc_lin(d)))
        emb = [
            emb_layer(c[:, j]) for j, emb_layer in enumerate(self.embeddings)
        ]
        emb = self.drop(self.tanh(torch.cat(emb, 1)))

        h_seq = self.seq_emb(s)
        # Remove only the axis that was added for Dropout2d; an un-dimensioned
        # squeeze would also drop the batch axis for a batch of one.
        h_seq = self.seq_emb_dropout(torch.unsqueeze(h_seq, 0)).squeeze(0)
        h_lstm, _ = self.lstm(h_seq)
        h_attn = self.attn(h_lstm)
        h_lin = self.tanh(self.seq_lin(h_attn))
        data = torch.cat([img_feats, emb, n, ent_feats, dsc_feats, h_lin], 1)
        out = self.relu(self.fc1(data))
        out = self.bn1(out)
        out = self.fc2(out)
        return out

## Train

In [None]:
# (cardinality, embedding size) per categorical column. The network applies
# the k-th embedding to the k-th column of the categorical matrix, so the
# order here has to match the column order of X_train_cat.
# The embedding sizes add up to 20.
emb_dims = [(2, 1), (n_breed1, 3), (n_breed2, 3), (3, 1), (n_color1, 1),
            (n_color2, 1), (n_color3, 1), (4, 1), (3, 1), (3, 1),
            (3, 1), (3, 1), (3, 1), (19, 1), (14, 1),(n_langs, 1)]
# Number of standardised numerical features.
num_dims = X_test_ss.shape[1]
# 10-fold trainer; `kwargs` is forwarded to NeuralNet(...) for every fold.
trainer = Trainer(
    NeuralNet,
    resc=train_resc,
    n_splits=10,
    train_batch=64,
    val_batch=512,
    seed=328,
    kwargs={
        "emb_dims": emb_dims,
        "num_dims": num_dims,
        "img_linear": 48,
        "linear_size": 144,
        "ent_linear": 10,
        "dsc_linear": 10,
        "seq_linear": 20,
        "embedding_matrix": embedding_matrix,
        "hidden_size": 64,
        "maxlen": 70,
        "n_attention": 20
    })

In [None]:
# Cross-validated training; the trailing positional 10 is n_epochs.
trainer.fit(train_img_matrix, 
            X_train_cat, 
            X_train_ss, 
            train_entity, 
            train_desc, 
            x_train_text, target, 10)

In [None]:
# Average the test predictions of every fold checkpoint saved by the trainer.
bin_path = trainer.path
test_preds = np.zeros((X_test_cat.shape[0]))
i_tensor = torch.tensor(test_img_matrix, dtype=torch.float32).to(trainer.device)
c_tensor = torch.tensor(X_test_cat.values, dtype=torch.long).to(trainer.device)
n_tensor = torch.tensor(X_test_ss, dtype=torch.float32).to(trainer.device)
e_tensor = torch.tensor(test_entity, dtype=torch.float32).to(trainer.device)
d_tensor = torch.tensor(test_desc, dtype=torch.float32).to(trainer.device)
s_tensor = torch.tensor(x_test_text, dtype=torch.long).to(trainer.device)
test_dataset = PetDataset(i_tensor,
                          c_tensor,
                          n_tensor,
                          e_tensor,
                          d_tensor,
                          s_tensor,
                          labels=None)
test_batch = 512
test_loader = data.DataLoader(test_dataset, batch_size=test_batch, shuffle=False)

# Sorted for a reproducible order; the mean is taken over the checkpoints that
# are actually present instead of assuming exactly n_splits files.
checkpoints = sorted(bin_path.iterdir())
for path in checkpoints:
    print(f"using {str(path)}")
    model = NeuralNet(**trainer.kwargs)
    # Same device as the input tensors (they were moved to trainer.device).
    model.to(trainer.device)
    model.load_state_dict(torch.load(path))

    model.eval()
    temp = np.zeros((X_test_cat.shape[0]))
    with torch.no_grad():
        for i, (i_batch, c_batch, n_batch, e_batch, d_batch, s_batch) in enumerate(test_loader):
            y_pred = model(i_batch, c_batch, n_batch, e_batch, d_batch, s_batch).detach()
            temp[i * test_batch:(i + 1) * test_batch] = y_pred.cpu().numpy()[:, 0]
    test_preds += temp / len(checkpoints)

In [None]:
# Sanity check: mean of the averaged test predictions.
test_preds.mean()

## Image

In [None]:
class ImageDataset(data.Dataset):
    """Dataset over precomputed image features.

    Note: this definition replaces the earlier file-based ``ImageDataset``.
    Each item is a one-element list holding ``imat[idx]``.
    """

    def __init__(self, imat):
        self.imat = imat

    def __len__(self):
        return len(self.imat)

    def __getitem__(self, idx):
        return [self.imat[idx]]

In [None]:
# Pool the four photo features of every pet with the trained image branch
# (linear projection followed by attention), averaged over all checkpoints.
train_i_tensor = torch.tensor(train_img_matrix, dtype=torch.float32).to("cuda:0")
test_i_tensor = torch.tensor(test_img_matrix, dtype=torch.float32).to("cuda:0")
batch = 256
n_img_dim = 48

train_dataset = ImageDataset(train_i_tensor)
test_dataset = ImageDataset(test_i_tensor)
train_loader = data.DataLoader(train_dataset,
                               batch_size=batch,
                               shuffle=False)
test_loader = data.DataLoader(test_dataset,
                              batch_size=batch,
                              shuffle=False)
X_train_img = np.zeros((len(train_pet_ids), n_img_dim))
X_test_img = np.zeros((len(test_pet_ids), n_img_dim))


def _pooled_image_features(model, loader, n_rows):
    """Run the image branch of ``model`` over ``loader``; returns (n_rows, n_img_dim)."""
    pooled = np.zeros((n_rows, n_img_dim))
    for i, (i_batch, ) in tqdm(enumerate(loader)):
        with torch.no_grad():
            imgs = torch.zeros(i_batch.size(0), 4, n_img_dim).to("cuda:0")
            for j in range(4):
                imgs[:, j, :] = model.img_lin(i_batch[:, j, :])
            y_pred = model.img_attn(imgs).detach()
            pooled[i * batch:(i + 1) * batch, :] = y_pred.cpu().numpy()
    return pooled


bin_path = trainer.path
for path in bin_path.iterdir():
    model = NeuralNet(**trainer.kwargs)
    model.to("cuda:0")
    model.load_state_dict(torch.load(path))
    model.eval()

    X_train_img += _pooled_image_features(
        model, train_loader, len(train_pet_ids)) / trainer.n_splits
    X_test_img += _pooled_image_features(
        model, test_loader, len(test_pet_ids)) / trainer.n_splits

In [None]:
# Sanity check: shapes of the pooled image feature matrices.
X_train_img.shape, X_test_img.shape

In [None]:
# Wrap the pooled image features in DataFrames with columns img0, img1, ...
train_img = pd.DataFrame(data=X_train_img, columns=[
    f"img{i}" for i in range(X_train_img.shape[1])
])
test_img = pd.DataFrame(data=X_test_img, columns=[
    f"img{i}" for i in range(X_test_img.shape[1])
])

In [None]:
# Rebuild labelled frames from the standardised arrays.
num_columns = X_train_num.columns
X_train_num_df = pd.DataFrame(data=X_train_ss, columns=num_columns)
X_test_num_df = pd.DataFrame(data=X_test_ss, columns=num_columns)

# Align the freshly built test frames with X_test_cat so that the column-wise
# concat below matches rows one-to-one.
# NOTE(review): the train frames are concatenated on their default indices;
# presumably X_train_cat carries a RangeIndex - confirm.
X_test_num_df.index = X_test_cat.index
test_img.index = X_test_cat.index

# Feature table: numerical + label-encoded categorical + image features.
X_train_all = pd.concat([X_train_num_df, X_train_cat, train_img], axis=1)
X_test_all = pd.concat([X_test_num_df, X_test_cat, test_img], axis=1)

print(X_train_all.shape, X_test_all.shape)
X_train_all.head()

In [None]:
"AdoptionSpeed" in X_train_all.columns, "AdoptionSpeed" in X_test_all.columns

In [None]:
# Check that train and test have identical columns in identical order.
X_train_all.columns.tolist() == X_test_all.columns.tolist()

## Category Embedding

In [None]:
class CategoryDataset(data.Dataset):
    """Dataset over a 2-D matrix of categorical codes.

    Each item is a one-element list holding row ``idx`` of the matrix.
    """

    def __init__(self, category):
        self.category = category

    def __len__(self):
        return len(self.category)

    def __getitem__(self, idx):
        return [self.category[idx, :]]

In [None]:
# Look up the learned categorical embeddings for every row, averaged over all
# fold checkpoints. The 20 output columns are the concatenated embeddings.
c_train = torch.tensor(X_train_cat.values, dtype=torch.long).to("cuda:0")
c_test = torch.tensor(X_test_cat.values, dtype=torch.long).to("cuda:0")
train_dataset = CategoryDataset(c_train)
test_dataset = CategoryDataset(c_test)
train_loader = data.DataLoader(train_dataset, batch_size=128, shuffle=False)
test_loader = data.DataLoader(test_dataset, batch_size=128, shuffle=False)

X_train_cat_ = np.zeros((len(train_pet_ids), 20))
X_test_cat_ = np.zeros((len(test_pet_ids), 20))


def _category_embeddings(model, loader, n_rows):
    """Concatenate the outputs of all embedding tables of ``model`` per row."""
    looked_up = np.zeros((n_rows, 20))
    for i, (c_batch, ) in tqdm(enumerate(loader)):
        with torch.no_grad():
            pieces = [table(c_batch[:, col])
                      for col, table in enumerate(model.embeddings)]
            y_pred = torch.cat(pieces, 1).detach()
            looked_up[i * 128:(i + 1) * 128, :] = y_pred.cpu().numpy()
    return looked_up


bin_path = trainer.path
for path in bin_path.iterdir():
    model = NeuralNet(**trainer.kwargs)
    model.to("cuda:0")
    model.load_state_dict(torch.load(path))
    model.eval()
    X_train_cat_ += _category_embeddings(
        model, train_loader, len(train_pet_ids)) / trainer.n_splits
    X_test_cat_ += _category_embeddings(
        model, test_loader, len(test_pet_ids)) / trainer.n_splits

In [None]:
# Sanity check: shapes of the categorical embedding matrices.
X_train_cat_.shape, X_test_cat_.shape

In [None]:
# Wrap the categorical embeddings in DataFrames with columns emb0, emb1, ...
train_emb = pd.DataFrame(data=X_train_cat_, columns=[
    f"emb{i}" for i in range(X_train_cat_.shape[1])
])
test_emb = pd.DataFrame(data=X_test_cat_, columns=[
    f"emb{i}" for i in range(X_test_cat_.shape[1])
])

In [None]:
# Align the test embeddings with the feature table before concatenating.
test_emb.index = X_test_all.index

# Append the embedding columns to the feature tables.
X_train_all = pd.concat([X_train_all, train_emb], axis=1)
X_test_all = pd.concat([X_test_all, test_emb], axis=1)

print(X_train_all.shape, X_test_all.shape)
X_train_all.head()

## Entity and Desc

In [None]:
class WordVecDataset(data.Dataset):
    """Dataset that serves one row of a 2-D vector container per sample."""

    def __init__(self, wv):
        # `wv` must support len() and two-axis indexing (wv[idx, :]).
        self.wv = wv

    def __len__(self):
        """Return the number of rows in the wrapped container."""
        return len(self.wv)

    def __getitem__(self, idx):
        """Return row ``idx`` wrapped in a one-element list.

        The list wrapper lets consumers unpack each batch as ``(batch, )``.
        """
        return [self.wv[idx, :]]

In [None]:
# Pass the entity vectors through `model.ent_lin` of each saved model and
# accumulate 1 / trainer.n_splits of every model's output.
# NOTE(review): this is a true average only if trainer.path holds exactly
# trainer.n_splits checkpoint files -- confirm.
e_train = torch.tensor(train_entity, dtype=torch.float32).to("cuda:0")
e_test = torch.tensor(test_entity, dtype=torch.float32).to("cuda:0")
train_dataset = WordVecDataset(e_train)
test_dataset = WordVecDataset(e_test)
train_loader = data.DataLoader(train_dataset, batch_size=128, shuffle=False)
test_loader = data.DataLoader(test_dataset, batch_size=128, shuffle=False)

X_train_ent = np.zeros((len(train_pet_ids), 10))
X_test_ent = np.zeros((len(test_pet_ids), 10))
bin_path = trainer.path
for path in bin_path.iterdir():
    model = NeuralNet(**trainer.kwargs)
    model.to("cuda:0")
    model.load_state_dict(torch.load(path))
    model.eval()
    # Same procedure for both splits: train first, then test.
    for loader, n_rows, averaged in (
        (train_loader, len(train_pet_ids), X_train_ent),
        (test_loader, len(test_pet_ids), X_test_ent),
    ):
        fold_feats = np.zeros((n_rows, 10))
        for batch_idx, (e_batch, ) in tqdm(enumerate(loader), ascii=True):
            with torch.no_grad():
                projected = model.ent_lin(e_batch).detach()
            # The loaders are unshuffled with batch_size=128, so batch k
            # covers rows [128 * k, 128 * (k + 1)).
            start = batch_idx * 128
            fold_feats[start:start + 128, :] = projected.cpu().numpy()
        # In-place add on the aliased array updates X_train_ent / X_test_ent.
        averaged += fold_feats / trainer.n_splits

In [None]:
# Display the shapes of the accumulated entity feature matrices (train, test).
X_train_ent.shape, X_test_ent.shape

In [None]:
# Build the entity feature frames (columns ent0, ent1, ...) directly on the
# index of the frame they are appended to. pd.concat(axis=1) aligns rows by
# index label, so sharing the index pairs rows one-to-one for train as well
# as test instead of relying on X_train_all having a default 0..N-1 index.
# A row-count mismatch now raises instead of producing NaN-padded rows.
train_ent = pd.DataFrame(
    data=X_train_ent,
    index=X_train_all.index,
    columns=[f"ent{i}" for i in range(X_train_ent.shape[1])],
)
test_ent = pd.DataFrame(
    data=X_test_ent,
    index=X_test_all.index,
    columns=[f"ent{i}" for i in range(X_test_ent.shape[1])],
)

X_train_all = pd.concat([X_train_all, train_ent], axis=1)
X_test_all = pd.concat([X_test_all, test_ent], axis=1)

print(X_train_all.shape, X_test_all.shape)
X_train_all.head()

In [None]:
# Pass the description vectors through `model.dsc_lin` of each saved model
# and accumulate 1 / trainer.n_splits of every model's output.
# NOTE(review): this is a true average only if trainer.path holds exactly
# trainer.n_splits checkpoint files -- confirm.
d_train = torch.tensor(train_desc, dtype=torch.float32).to("cuda:0")
d_test = torch.tensor(test_desc, dtype=torch.float32).to("cuda:0")
train_dataset = WordVecDataset(d_train)
test_dataset = WordVecDataset(d_test)
train_loader = data.DataLoader(train_dataset, batch_size=128, shuffle=False)
test_loader = data.DataLoader(test_dataset, batch_size=128, shuffle=False)

X_train_dsc = np.zeros((len(train_pet_ids), 10))
X_test_dsc = np.zeros((len(test_pet_ids), 10))
bin_path = trainer.path
for path in bin_path.iterdir():
    model = NeuralNet(**trainer.kwargs)
    model.to("cuda:0")
    model.load_state_dict(torch.load(path))
    model.eval()
    # Same procedure for both splits: train first, then test.
    for loader, n_rows, averaged in (
        (train_loader, len(train_pet_ids), X_train_dsc),
        (test_loader, len(test_pet_ids), X_test_dsc),
    ):
        fold_feats = np.zeros((n_rows, 10))
        for batch_idx, (d_batch, ) in tqdm(enumerate(loader), ascii=True):
            with torch.no_grad():
                projected = model.dsc_lin(d_batch).detach()
            # The loaders are unshuffled with batch_size=128, so batch k
            # covers rows [128 * k, 128 * (k + 1)).
            start = batch_idx * 128
            fold_feats[start:start + 128, :] = projected.cpu().numpy()
        # In-place add on the aliased array updates X_train_dsc / X_test_dsc.
        averaged += fold_feats / trainer.n_splits

In [None]:
# Display the shapes of the accumulated description feature matrices (train, test).
X_train_dsc.shape, X_test_dsc.shape

In [None]:
# Build the description feature frames (columns dsc0, dsc1, ...) directly on
# the index of the frame they are appended to. pd.concat(axis=1) aligns rows
# by index label, so sharing the index pairs rows one-to-one for train as
# well as test instead of relying on X_train_all having a default 0..N-1
# index. A row-count mismatch now raises instead of producing NaN-padded rows.
train_dsc = pd.DataFrame(
    data=X_train_dsc,
    index=X_train_all.index,
    columns=[f"dsc{i}" for i in range(X_train_dsc.shape[1])],
)
test_dsc = pd.DataFrame(
    data=X_test_dsc,
    index=X_test_all.index,
    columns=[f"dsc{i}" for i in range(X_test_dsc.shape[1])],
)

X_train_all = pd.concat([X_train_all, train_dsc], axis=1)
X_test_all = pd.concat([X_test_all, test_dsc], axis=1)

print(X_train_all.shape, X_test_all.shape)
X_train_all.head()

## Sequences

In [None]:
class SeqDataset(data.Dataset):
    """Dataset that serves one row of a 2-D sequence container per sample."""

    def __init__(self, seq):
        # `seq` must support len() and two-axis indexing (seq[idx, :]).
        self.seq = seq

    def __len__(self):
        """Return the number of sequences in the wrapped container."""
        return len(self.seq)

    def __getitem__(self, idx):
        """Return sequence ``idx`` wrapped in a one-element list.

        The list wrapper lets consumers unpack each batch as ``(batch, )``.
        """
        return [self.seq[idx, :]]

In [None]:
# Run the token sequences through the text branch of each saved model
# (seq_emb -> seq_emb_dropout -> lstm -> attn -> seq_lin) and accumulate
# 1 / trainer.n_splits of every model's output.
# NOTE(review): this is a true average only if trainer.path holds exactly
# trainer.n_splits checkpoint files -- confirm.
s_train = torch.tensor(x_train_text, dtype=torch.long).to("cuda:0")
s_test = torch.tensor(x_test_text, dtype=torch.long).to("cuda:0")
train_dataset = SeqDataset(s_train)
test_dataset = SeqDataset(s_test)
train_loader = data.DataLoader(train_dataset, batch_size=128, shuffle=False)
test_loader = data.DataLoader(test_dataset, batch_size=128, shuffle=False)

X_train_seq = np.zeros((len(train_pet_ids), 20))
X_test_seq = np.zeros((len(test_pet_ids), 20))
bin_path = trainer.path
for path in bin_path.iterdir():
    model = NeuralNet(**trainer.kwargs)
    model.to("cuda:0")
    model.load_state_dict(torch.load(path))
    model.eval()
    # Same procedure for both splits: train first, then test.
    for loader, n_rows, averaged in (
        (train_loader, len(train_pet_ids), X_train_seq),
        (test_loader, len(test_pet_ids), X_test_seq),
    ):
        fold_feats = np.zeros((n_rows, 20))
        for batch_idx, (s_batch, ) in tqdm(enumerate(loader), ascii=True):
            with torch.no_grad():
                h_emb = model.seq_emb(s_batch)
                # Undo only the leading axis added by unsqueeze. A bare
                # torch.squeeze() removes every size-1 axis, so it would
                # also drop the batch axis whenever the final batch holds a
                # single row (dataset length % 128 == 1).
                h_emb = torch.squeeze(
                    model.seq_emb_dropout(torch.unsqueeze(h_emb, 0)), 0)
                h_lstm, _ = model.lstm(h_emb)
                h_attn = model.attn(h_lstm)
                y_pred = model.seq_lin(h_attn).detach()
            # The loaders are unshuffled with batch_size=128, so batch k
            # covers rows [128 * k, 128 * (k + 1)).
            start = batch_idx * 128
            fold_feats[start:start + 128, :] = y_pred.cpu().numpy()
        # In-place add on the aliased array updates X_train_seq / X_test_seq.
        averaged += fold_feats / trainer.n_splits

In [None]:
# Display the shapes of the accumulated sequence feature matrices (train, test).
X_train_seq.shape, X_test_seq.shape

In [None]:
# Build the sequence feature frames (columns seq0, seq1, ...) directly on the
# index of the frame they are appended to. pd.concat(axis=1) aligns rows by
# index label, so sharing the index pairs rows one-to-one for train as well
# as test instead of relying on X_train_all having a default 0..N-1 index.
# A row-count mismatch now raises instead of producing NaN-padded rows.
train_seq = pd.DataFrame(
    data=X_train_seq,
    index=X_train_all.index,
    columns=[f"seq{i}" for i in range(X_train_seq.shape[1])],
)
test_seq = pd.DataFrame(
    data=X_test_seq,
    index=X_test_all.index,
    columns=[f"seq{i}" for i in range(X_test_seq.shape[1])],
)

X_train_all = pd.concat([X_train_all, train_seq], axis=1)
X_test_all = pd.concat([X_test_all, test_seq], axis=1)

print(X_train_all.shape, X_test_all.shape)
X_train_all.head()

## XGBoost

In [None]:
# Parameter dictionary handed to run_xgb as its first argument.
xgb_params = dict(
    eval_metric="rmse",
    seed=1337,
    eta=0.0123,
    subsample=0.8,
    colsample_bytree=0.85,
    tree_method="gpu_hist",
    device="gpu",
    silent=1,
)

# Aliases (not copies) of the full feature frames and the target; the
# LightGBM cell further down reuses the same three names.
xgb_X, xgb_y, xgb_X_test = X_train_all, target, X_test_all

# NOTE: `model` is rebound by the later run_lgb call.
model, oof_train_xgb, oof_test_xgb = run_xgb(
    xgb_params,
    xgb_X,
    xgb_y,
    xgb_X_test,
    resc=train_resc,
    n_splits=10,
    num_rounds=10000,
)

## Run LGBM

In [None]:
def run_lgb(params,
            X,
            y,
            X_test,
            resc,
            cat_features,
            n_splits=10,
            early_stop=500):
    """Train one LightGBM regressor per stratified fold.

    Args:
        params: keyword arguments for ``lgb.LGBMRegressor``.
        X: training features; rows are selected with ``.iloc``.
        y: training targets, aligned row-for-row with ``X``. They are cast
            to int only to build the stratification labels.
        X_test: test features; every fold's model predicts on them.
        resc: not used by this function; kept so existing callers that pass
            it keep working.
        cat_features: forwarded to ``fit`` as ``categorical_feature``.
        n_splits: number of stratified folds.
        early_stop: forwarded to ``fit`` as ``early_stopping_rounds``.

    Returns:
        Tuple ``(model, oof_train, oof_test)``: the model fitted on the LAST
        fold only, the out-of-fold predictions with shape ``(len(X),)``, and
        the per-fold test predictions with shape ``(len(X_test), n_splits)``.
    """
    # Select targets by position, matching the .iloc selection used for X.
    # Plain y[idx] on a pandas Series is label-based and would pick the wrong
    # rows (or raise KeyError) when the Series index is not 0..N-1.
    y_values = np.asarray(y)

    oof_train = np.zeros((X.shape[0]))
    oof_test = np.zeros((X_test.shape[0], n_splits))
    fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=229)

    for i, (trn_index, val_index) in enumerate(
            fold.split(X, y_values.astype(int))):
        X_tr = X.iloc[trn_index, :]
        X_val = X.iloc[val_index, :]

        y_tr = y_values[trn_index]
        y_val = y_values[val_index]
        model = lgb.LGBMRegressor(**params)
        model.fit(X_tr,
                  y_tr,
                  eval_set=(X_val, y_val),
                  verbose=500,
                  early_stopping_rounds=early_stop,
                  categorical_feature=cat_features)
        # Predict with the best iteration found by early stopping.
        valid_pred = model.predict(X_val, num_iteration=model.best_iteration_)
        test_pred = model.predict(X_test, num_iteration=model.best_iteration_)
        oof_train[val_index] = valid_pred
        oof_test[:, i] = test_pred
    return model, oof_train, oof_test

In [None]:
# Keyword arguments that run_lgb unpacks into lgb.LGBMRegressor.
lgb_params = dict(
    boosting_type="gbdt",
    num_leaves=146,
    max_depth=12,
    max_bin=32,
    learning_rate=0.01,
    n_estimators=10000,
    subsample=0.9212945843023237,
    subsample_freq=2,
    colsample_bytree=0.6334740217238963,
    reg_lambda=1.543309192604612,
    min_child_samples=45,
    min_child_weight=0.5878240657385082,
    min_split_gain=0.004619759404679957,
    n_jobs=-1,
)

# Trains on the same frames as the XGBoost run, here with 6 folds.
# `model` is rebound to the LightGBM model fitted on the last fold.
model, oof_train, oof_test = run_lgb(
    params=lgb_params,
    X=xgb_X,
    y=xgb_y,
    X_test=xgb_X_test,
    resc=train_resc,
    cat_features=cat_features,
    n_splits=6,
    early_stop=500,
)

In [None]:
# Names of the features whose importance in `model` exceeds 2000.
important_positions = np.flatnonzero(model.feature_importances_ > 2000)
xgb_X.columns[important_positions]

## Post process

In [None]:
def plot_pred(pred):
    """Plot the distribution of ``pred`` with seaborn's ``distplot``.

    The KDE curve is enabled and the histogram range is fixed to [0, 5].
    """
    histogram_options = {"range": [0, 5]}
    sns.distplot(pred, kde=True, hist_kws=histogram_options)

In [None]:
# Distribution of the LightGBM out-of-fold predictions on the training rows.
plot_pred(oof_train)

In [None]:
# Distribution of the training-set predictions returned by run_xgb
# (presumably out-of-fold, like the LightGBM ones -- confirm in run_xgb).
plot_pred(oof_train_xgb)

In [None]:
# Display the shape of the LightGBM out-of-fold prediction vector.
oof_train.shape

In [None]:
# Distribution of the LightGBM test predictions averaged over the folds
# (oof_test holds one column per fold).
plot_pred(oof_test.mean(axis=1))

In [None]:
# Distribution of the XGBoost test predictions averaged along axis 1
# (presumably one column per fold -- confirm in run_xgb).
plot_pred(oof_test_xgb.mean(1))

In [None]:
# Distribution of trainer.train_preds multiplied by 4 (presumably rescaling
# [0, 1] network outputs onto the scale of the tree predictions -- confirm).
plot_pred(trainer.train_preds * 4)

In [None]:
# Distribution of test_preds multiplied by 4 (presumably the network's
# test-set outputs rescaled from [0, 1] -- confirm).
plot_pred(test_preds * 4)

In [None]:
# Equal-weight blend of the LightGBM and XGBoost predictions. The test-side
# predictions are averaged along axis 1 before blending.
tree_blend_weight = 0.5
lgb_test_mean = oof_test.mean(axis=1)
xgb_test_mean = oof_test_xgb.mean(axis=1)

lgb_xgb = tree_blend_weight * oof_train + tree_blend_weight * oof_train_xgb
lgb_xgb_test = (tree_blend_weight * lgb_test_mean
                + tree_blend_weight * xgb_test_mean)

# Train blend first, then test blend.
for blended in (lgb_xgb, lgb_xgb_test):
    plot_pred(blended)

In [None]:
# Clamp the neural-network predictions to [0, 1], then plot them scaled by 4.
nn_lower, nn_upper = 0.0, 1.0
nn_preds = np.clip(trainer.train_preds, nn_lower, nn_upper)
nn_preds_test = np.clip(test_preds, nn_lower, nn_upper)

# Train predictions first, then test predictions.
for clipped in (nn_preds, nn_preds_test):
    plot_pred(clipped * 4)

In [None]:
# Final blend: 90% tree ensemble, 10% neural network. The clipped network
# predictions are multiplied by 4 before being mixed in.
tree_share, nn_share = 0.9, 0.1
lgb_xgb_nn = tree_share * lgb_xgb + nn_share * (nn_preds * 4)
lgb_xgb_nn_test = tree_share * lgb_xgb_test + nn_share * nn_preds_test * 4

In [None]:
# Distribution of the tree + neural-network blend on the training rows.
plot_pred(lgb_xgb_nn)

In [None]:
# Distribution of the tree + neural-network blend on the test rows.
plot_pred(lgb_xgb_nn_test)

In [None]:
# Tree-only submission: convert the LightGBM/XGBoost blend into integer
# classes and write submission_tree.csv.
# NOTE(review): OptimizedRounder is defined elsewhere; the list passed to
# fit() is presumably the initial set of cut points -- confirm.
opt = OptimizedRounder()
opt.fit(lgb_xgb, target, [1.6, 2.1, 2.8, 3.5])
coeff = opt.coefficients()
# Score the fitted coefficients on the same training-side blend they were
# fitted on (xgb_y is an alias of target).
valid_pred = opt.predict(lgb_xgb, coeff)
qwk = quadratic_weighted_kappa(xgb_y, valid_pred)
print("QWK = ", qwk)
# Apply one copy of the coefficients to both the train and test blends.
coeffs = coeff.copy()
train_predictions = opt.predict(lgb_xgb, coeffs).astype(np.int8)
print(f"train_preds: {Counter(train_predictions)}")
test_predictions = opt.predict(lgb_xgb_test, coeffs).astype(np.int8)
print(f"test_preds: {Counter(test_predictions)}")
submission = pd.DataFrame({"PetID": test.PetID.values, "AdoptionSpeed": test_predictions})
submission.to_csv("submission_tree.csv", index=False)
submission.head()

In [None]:
# Main submission: convert the tree + neural-network blend into integer
# classes and write submission.csv.
# NOTE(review): OptimizedRounder is defined elsewhere; the list passed to
# fit() is presumably the initial set of cut points -- confirm.
opt = OptimizedRounder()
opt.fit(lgb_xgb_nn, target, [1.5, 2.0, 2.5, 3.5])
coeff = opt.coefficients()
# Score the fitted coefficients on the same training-side blend they were
# fitted on (xgb_y is an alias of target).
valid_pred = opt.predict(lgb_xgb_nn, coeff)
qwk = quadratic_weighted_kappa(xgb_y, valid_pred)
print("QWK = ", qwk)
# Apply one copy of the coefficients to both the train and test blends.
coeffs = coeff.copy()
train_predictions = opt.predict(lgb_xgb_nn, coeffs).astype(np.int8)
print(f"train_preds: {Counter(train_predictions)}")
test_predictions = opt.predict(lgb_xgb_nn_test, coeffs).astype(np.int8)
print(f"test_preds: {Counter(test_predictions)}")
submission = pd.DataFrame({"PetID": test.PetID.values, "AdoptionSpeed": test_predictions})
submission.to_csv("submission.csv", index=False)
submission.head()

In [None]:
# Network-only submission: convert the clipped neural-network predictions
# (multiplied by 4) into integer classes and write submission_nn.csv.
# NOTE(review): OptimizedRounder is defined elsewhere; the list passed to
# fit() is presumably the initial set of cut points -- confirm.
opt = OptimizedRounder()
opt.fit(nn_preds * 4, target, [1.5, 2.0, 2.5, 3.5])
coeff = opt.coefficients()
# Score the fitted coefficients on the same training-side predictions they
# were fitted on.
valid_pred = opt.predict(nn_preds * 4, coeff)
qwk = quadratic_weighted_kappa(target, valid_pred)
print("QWK = ", qwk)
# Apply one copy of the coefficients to both the train and test predictions.
coeffs = coeff.copy()
train_predictions = opt.predict(nn_preds * 4, coeffs).astype(np.int8)
print(f"train_preds: {Counter(train_predictions)}")
test_predictions = opt.predict(nn_preds_test * 4, coeffs).astype(np.int8)
print(f"test_preds: {Counter(test_predictions)}")
submission = pd.DataFrame({"PetID": test.PetID.values, "AdoptionSpeed": test_predictions})
submission.to_csv("submission_nn.csv", index=False)
submission.head()