# PetFinder.my Adoption Prediction: Hyperparameter tuning and Model evaluation

### David Mora Garrido, Bachelor Dissertation (3rd part)

In [None]:
# !pip install --upgrade language_tool_python
# import language_tool_python

In [None]:
# !cp -r ../input/tfg-pet-adoption-data/pycontractions-master/pycontractions-master/* ./
# !python setup.py install
# from pycontractions import Contractions

In [None]:
# !pip install emoji --upgrade
# !pip install googletrans==3.1.0a0

In [None]:
import ast
import category_encoders as ce
import cv2
import emoji
# import googletrans
import keras.backend as K
import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
# import rfpimp
import string
import time
import transformers_tfg_pet_adoption as transformers
import utils_tfg_pet_adoption_eda as utils_eda
import warnings
import xgboost as xgb

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from keras import Model
from keras.applications.densenet import preprocess_input as preprocess_input_densenet
from keras.models import load_model
# from rdc import rdc
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, cohen_kappa_score, f1_score
from sklearn.metrics import make_scorer, get_scorer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix, precision_recall_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer
from tqdm import tqdm

In [None]:
dir(transformers)

In [None]:
# Single seed shared by every random component of the notebook (splits,
# shuffles, estimators) so that results can be reproduced.
seed = 27912
np.random.seed(seed)
random.seed(seed)
# NOTE(review): exporting PYTHONHASHSEED here presumably only affects child
# processes, not the hash randomisation of this running interpreter - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
# Competition data: the training table plus the three ID -> label lookups.
competition_dir = '../input/petfinder-adoption-prediction'
train = pd.read_csv(competition_dir + '/train/train.csv')
breeds = pd.read_csv(competition_dir + '/PetFinder-BreedLabels.csv')
colors = pd.read_csv(competition_dir + '/PetFinder-ColorLabels.csv')
states = pd.read_csv(competition_dir + '/PetFinder-StateLabels.csv')

# ID 0 is mapped to NaN for breeds and colours (no such entry for states).
breeds_dict = {0: np.nan}
breeds_dict.update(zip(breeds["BreedID"], breeds["BreedName"]))

colors_dict = {0: np.nan}
colors_dict.update(zip(colors["ColorID"], colors["ColorName"]))

states_dict = dict(zip(states["StateID"], states["StateName"]))

In [None]:
# Separate the label column from the predictors.
target = "AdoptionSpeed"

y = train[target]
X = train.drop(columns=target)

In [None]:
# Stratified 80/20 split of the pet identifiers; only the set of training
# PetIDs is kept for the membership tests that follow.
adoption_speed = train["AdoptionSpeed"]
pet_ids_train, pet_ids_val, _, _ = train_test_split(
    train[["PetID", "AdoptionSpeed"]],
    adoption_speed,
    test_size=0.2,
    stratify=adoption_speed,
    random_state=seed,
)
set_pet_ids_train = set(pet_ids_train["PetID"])

In [None]:
# Rows whose PetID belongs to the training side of the split. X and train
# share the same index and PetID column, so one mask serves both.
in_train_split = X["PetID"].isin(set_pet_ids_train)

# Features and labels are shuffled with the same seed so that they stay
# row-aligned after sampling.
X_train_CNN = X.loc[in_train_split].copy().sample(frac=1, random_state=seed)
y_train_CNN = (train.loc[in_train_split, "AdoptionSpeed"]
               .copy().sample(frac=1, random_state=seed))
X_val_CNN = X.loc[~in_train_split].copy().sample(frac=1, random_state=seed)
y_val_CNN = (train.loc[~in_train_split, "AdoptionSpeed"]
             .copy().sample(frac=1, random_state=seed))

In [None]:
X_train_CNN.head(5)

In [None]:
y_train_CNN.head(5)

In [None]:
X_val_CNN.head(5)

In [None]:
y_val_CNN.head(5)

In [None]:
# State name -> GDP per capita, handed to transformers.ReplaceState below.
# NOTE(review): currency and reference year are not stated in this file.
gdp_per_capita = {
    "Kuala Lumpur": 129472,
    "Labuan": 77798,
    "Penang": 55243,
    "Selangor": 54995,
    "Sarawak": 53358,
    "Malacca": 49172,
    "Negeri Sembilan": 45373,
    "Johor": 37342,
    "Pahang": 36474,
    "Perak": 31668,
    "Terengganu": 30933,
    "Perlis": 25656,
    "Sabah": 25326,
    "Kedah": 22412,
    "Kelantan": 14300
}

# Explicit label -> integer order for the three ordinal variables; passed as
# `mapping` to transformers.OrdinalVariableEncoder in the pipeline.
ordinal_vars_mapping = [
    {"col": "MaturitySize", "mapping": {"Small": 0, "Medium": 1, "Large": 2, "Extra Large": 3}},
    {"col": "FurLength", "mapping": {"Short": 0, "Medium": 1, "Long": 2}},
    {"col": "Health", "mapping": {"Healthy": 0, "Minor Injury": 1, "Serious Injury": 2}},
]

# Columns dropped by the 'drop_columns' pipeline step.
columns_to_be_removed_desc_feats_eval = ["Name", "Breed1", "Breed2", "Gender", "Color1",
                                          "Color2", "Color3", "Vaccinated", "Dewormed",
                                          "Sterilized", "State", "RescuerID",
                                          "MaturitySize", "FurLength", "Health",
                                          "ImageMetadataDescription"]

# Numeric columns; in this notebook only referenced by the commented-out
# CustomStandardScaler step at the end of the pipeline.
numeric_columns_desc_feats_eval = ["Age", "Quantity", "Fee", "VideoAmt", "PhotoAmt",
                                    "StateGDP", "RescuerCount", "DescriptionLength"]

In [None]:
# Precomputed training-set feature tables; each one uses its first CSV column
# as the index.
train_description_metadata = pd.read_csv(
    "../input/tfg-pet-adoption-data/train_description_metadata.csv", index_col=0)
train_profile_image_metadata = pd.read_csv(
    "../input/tfg-pet-adoption-data/train_profile_images_metadata.csv", index_col=0)
train_all_images_metadata_agg = pd.read_csv(
    "../input/tfg-pet-adoption-data/all_train_images_metadata_agg.csv", index_col=0)
train_profile_image_properties = pd.read_csv(
    "../input/tfg-pet-adoption-data/train_profile_images_properties.csv", index_col=0)
train_all_images_properties_agg = pd.read_csv(
    "../input/tfg-pet-adoption-data/all_train_images_properties_agg.csv", index_col=0)
train_aggregated_image_features_regression_model_ensemble_2_layer_16_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/AGGREGATED_image_features_DenseNet121_regression__ensemble__2__layer-16_in-256.csv",
    index_col=0)
train_desc_transformations_df = pd.read_csv(
    "../input/tfg-pet-adoption-data/descriptions_transformations.csv", index_col=0)

# Missing texts become empty strings in all three description variants.
for text_column in ("Description", "translation", "expanded"):
    train_desc_transformations_df[text_column] = \
        train_desc_transformations_df[text_column].replace(np.nan, '')

In [None]:
# Image features are taken from the precomputed table loaded above instead of
# being extracted with a CNN backbone (construct_from_cnn_backbone=False).
ife = transformers.ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    loaded_features=train_aggregated_image_features_regression_model_ensemble_2_layer_16_features
)

tfidf_vectorizer = transformers.CustomTfidfVectorizer(svd_n_components=16,
                                                     seed=seed)

# Ordered (name, transformer) steps shared by every model pipeline below.
# The step names are the prefixes used in `pipeline_params` and in the best
# parameters passed to `set_params`. The transformers live in the project
# module `transformers_tfg_pet_adoption`, which is not visible here; the
# comments below only summarise what the step names and arguments suggest.
pipeline_6_transformers = [
    # --- ID columns replaced through the label lookup dicts ---
    ('replace_breeds',
     transformers.LeftJoinReplace(values_dict=breeds_dict,
                                  variables=["Breed1", "Breed2"])),
    ('replace_colors',
     transformers.LeftJoinReplace(values_dict=colors_dict,
                                  variables=["Color1", "Color2", "Color3"])),
    ('replace_states',
     transformers.LeftJoinReplace(values_dict=states_dict, variables=["State"])),
    ('replace_by_strings',
     FunctionTransformer(func=transformers.replace_integers_by_strings)),
    # --- name / breed derived features ---
    ('has_name', FunctionTransformer(func=transformers.has_significant_name)),
    ('pure_breed', FunctionTransformer(func=transformers.has_pure_breed)),
    ('breed_matches_fur_length',
     FunctionTransformer(func=transformers.breed_matches_fur_length)),
    ('impute_breed', transformers.BreedImputer()),
    ('include_prof_im_metadata',
     transformers.IncludeProfileImageMetadata(
         profile_image_metadata=train_profile_image_metadata,
         all_images_metadata_agg=train_all_images_metadata_agg)),
    ('correct_wrong_type', transformers.CorrectWrongType(breeds)),
    ('encode_breed', transformers.BreedEncoding()),
    ('ordinal_vars_encoder', 
     transformers.OrdinalVariableEncoder(
         columns=["MaturitySize", "FurLength", "Health"],
         enc_type="ordinal", mapping=ordinal_vars_mapping)),
    # NOTE(review): impute_nan_value=46450 is presumably the fallback for
    # states missing from gdp_per_capita - confirm in ReplaceState.
    ('state_gdp',
     transformers.ReplaceState(gdp_per_capita=gdp_per_capita,
                               impute_nan_value=46450)),
    ('rescuer_count', transformers.ReplaceRescuerID()),
    # --- description derived features ---
    ('description_length',
     FunctionTransformer(func=transformers.include_description_length)),
    ('include_desc_metadata',
     transformers.IncludeDescriptionMetadata(
         description_metadata=train_description_metadata)),
    ('correct_desc_language', transformers.CorrectDescriptionLanguage()),
    ('one_hot_encoder',
     transformers.CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                                     "Color3", "Vaccinated", "Dewormed",
                                     "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties',
     transformers.IncludeProfileImageProperties(
         profile_image_properties=train_profile_image_properties,
         aggregated_images_properties=train_all_images_properties_agg)),
    ('drop_columns',
     transformers.ColumnRemover(columns=columns_to_be_removed_desc_feats_eval)),
    ('round_im_dims_aspect_ratio',
     FunctionTransformer(func=transformers.include_aspect_ratio)),
    # --- image and text features ---
    ('image_features_extractor', ife),
    ('description_transformer',
     transformers.DescriptionTransformer(
         transformations_df=train_desc_transformations_df)),
    ('tfidf_vectorizer', tfidf_vectorizer),
    # --- final clean-up before the model ---
    ('drop_petid_desc',
     transformers.ColumnRemover(
         columns=["PetID", "DescriptionLanguage", "Description"])),
    ('useless_vars_remover',
     transformers.UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props',
     transformers.CustomIterativeImputer())
#     ('custom_standard_scaler', CustomStandardScaler(numeric_columns_desc_feats_eval))
]

In [None]:
# Search space over the preprocessing steps, keyed as `<step name>__<param>`.
# 2 * 2 * 2 * 3 * 2 * 2 = 96 combinations.
pipeline_params = dict(
    encode_breed__enc_type=["target_and_frequency", "one-hot_svd"],
    include_prof_im_metadata__aggregate_metadata=[False, True],
    include_prof_im_properties__aggregate_properties=[False, True],
    tfidf_vectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidf_vectorizer__sublinear_tf=[False, True],
    tfidf_vectorizer__max_df=[1.0, 0.9],
)

The search space over the transformer parameters alone already contains 96 combinations. Even if the number of hyperparameter combinations for the model itself is small, depending on the average fit time of the model on the data it receives, an exhaustive search may exceed Kaggle's 9-hour limit. Thus, we may have to use RandomizedSearchCV instead of an exhaustive search with GridSearchCV.

In [None]:
def qwk(y_test, y_pred):
    """Return the quadratic weighted kappa between true and predicted labels."""
    kappa = cohen_kappa_score(y_test, y_pred, weights='quadratic')
    return kappa


# Scorer wrapper so that QWK can be used in model selection (higher is better).
qwk_scorer = make_scorer(qwk, greater_is_better=True)

# Metrics reported for every candidate; QWK is the one used for selection.
scorers = dict(
    qwk=qwk_scorer,
    accuracy=get_scorer("accuracy"),
    f1=get_scorer("f1_macro"),
)

In [None]:
def get_best_qwk_params(cv_results_):
    """Return the index of the candidate with the highest mean test QWK.

    Used as the ``refit`` callable of the searches: it receives the
    ``cv_results_`` mapping and returns the position of the first candidate
    whose ``mean_test_qwk`` equals the maximum.
    """
    mean_qwk = pd.DataFrame.from_dict(cv_results_)["mean_test_qwk"]
    is_best = mean_qwk == mean_qwk.max()
    return mean_qwk[is_best].index[0]

In [None]:
def hyperparams_tuning(model, params, inner_cv, outer_cv, scorers, X, y,
                       randomized_search=True, n_iter=None, seed=seed):
    """Run a nested cross-validation hyperparameter search.

    For every split of ``outer_cv`` a search (randomized or exhaustive) is
    fitted on the outer-training part using ``inner_cv``; the refitted best
    estimator (highest mean inner QWK, see ``get_best_qwk_params``) is then
    scored on the outer-test part with every scorer in ``scorers``.

    Parameters
    ----------
    model : estimator
        Estimator (typically a Pipeline) to tune; it is cloned, not mutated.
    params : dict
        Parameter grid / distributions passed to the search.
    inner_cv, outer_cv : cross-validation splitters
        Splitters for the inner search and the outer evaluation.
    scorers : dict
        Name -> scorer callable; must contain the key ``"qwk"``, which is
        what the refit callable ranks by.
    X, y : pandas objects
        Features and labels; they are indexed positionally with ``.iloc``.
    randomized_search : bool
        Use RandomizedSearchCV when True, GridSearchCV otherwise.
    n_iter : int or None
        Number of sampled candidates for the randomized search. ``None``
        leaves scikit-learn's own default in place instead of forwarding an
        invalid ``None``.
    seed : int
        Random state of the randomized search.

    Returns
    -------
    tuple of three lists, one entry per outer split:
        the best parameters, the list of ``(scorer name, outer score)``
        pairs, and the inner ``cv_results_`` as a DataFrame without the
        per-split columns.
    """
    model = clone(model)

    # Arguments common to both search strategies.
    search_kwargs = dict(
        scoring=scorers,
        cv=inner_cv,
        refit=get_best_qwk_params,
        return_train_score=True,
        n_jobs=-1,
        verbose=1,
    )
    if randomized_search:
        if n_iter is not None:
            search_kwargs["n_iter"] = n_iter
        search = RandomizedSearchCV(model, params, random_state=seed,
                                    **search_kwargs)
    else:
        search = GridSearchCV(model, params, **search_kwargs)

    best_params_list = []
    best_params_outer_score = []
    search_inner_cv_results = []

    for train_index, test_index in outer_cv.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            search_result = search.fit(X_train, y_train)
        best_model = search_result.best_estimator_

        best_params_list.append(search_result.best_params_)

        # Keep only the aggregated columns of the inner results: every column
        # whose name contains "split" holds a per-fold value.
        df_search_inner_cv_results = pd.DataFrame.from_dict(
            search_result.cv_results_)
        split_columns = df_search_inner_cv_results.filter(regex="split").columns
        df_search_inner_cv_results = df_search_inner_cv_results.drop(
            columns=split_columns)
        search_inner_cv_results.append(df_search_inner_cv_results)

        # Each scorer predicts on its own, so no separate predict() call is
        # needed here.
        best_params_outer_score.append([
            (scorer_desc, scorer(best_model, X_test, y_test))
            for scorer_desc, scorer in scorers.items()
        ])

    return best_params_list, best_params_outer_score, search_inner_cv_results

In [None]:
# Splitters for the nested search: 3 inner folds for model selection and 5
# outer folds for evaluation, both unshuffled (the default).
inner_cv = StratifiedKFold(n_splits=3, shuffle=False)
outer_cv = StratifiedKFold(n_splits=5, shuffle=False)

In [None]:
# Shuffled, seeded 5-fold splitter used for the evaluations on the complete
# training dataset.
cv = StratifiedKFold(5, random_state=seed, shuffle=True)

In [None]:
# Show every column of wide result tables, truncating cells at 50 characters.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 50

In [None]:
def get_best_hyperparams(model, inner, outer,
                         directory="../input/tfg-pet-adoption-data"):
    """Select the best hyperparameters from saved nested-CV search results.

    Reads one CSV per outer iteration, named
    ``{model}_search_results_inner-{inner}-CV_outer-iter-{i}.csv`` for
    ``i = 1..outer``, and stacks them. The ``outer`` best-ranked rows (by
    ``rank_test_qwk``) give the candidate combinations; each candidate's rank
    is averaged over every inner search in which it appears, and the
    combination with the lowest mean rank wins.

    Parameters
    ----------
    model : str
        Model name used as file-name prefix.
    inner : int
        Number of inner folds (only used in file names and messages).
    outer : int
        Number of outer iterations, i.e. number of CSV files to read.
    directory : str
        Folder containing the CSV files.

    Returns
    -------
    dict
        Parameter name (without the ``param_`` prefix) -> value, ready for
        ``set_params``. ``tfidf_vectorizer__ngram_range`` is parsed back from
        its string form into a tuple.

    Raises
    ------
    ValueError
        If ``outer`` is smaller than 1.
    """
    if outer < 1:
        raise ValueError("outer must be at least 1")

    # DataFrame.append was removed in pandas 2.0; read all files and
    # concatenate them in a single call instead.
    search_dfs = [
        pd.read_csv(
            f"{directory}/{model}_search_results_inner-{inner}-CV_outer-iter-{i}.csv",
            index_col=0
        )
        for i in range(1, outer + 1)
    ]
    concatenation_search_dfs = pd.concat(search_dfs, ignore_index=True)

    rank_columns = [column for column in concatenation_search_dfs.columns
                    if "rank_test_" in column]
    concatenation_search_dfs[rank_columns] = \
        concatenation_search_dfs[rank_columns].astype('int64')

    sorted_by_qwk_rank = concatenation_search_dfs.sort_values(["rank_test_qwk"])

    print(f"Top ranked hyperparameters combinations in the {inner}-CV inner validations:")
    display(sorted_by_qwk_rank.head(outer*2))
    param_names = [column for column in concatenation_search_dfs.columns
                   if "param_" in column]
    top_1_params = sorted_by_qwk_rank.head(outer)[param_names].copy()
    # Bring back every row (from any outer iteration) that used one of the
    # top combinations, so their ranks can be averaged.
    rank_1_all_outer = top_1_params.merge(concatenation_search_dfs,
                                         on=param_names, how="left")

    print(f"\n\nMean ranking (QWK) in inner {inner}-CV validation:")
    mean_qwk_ranking = (rank_1_all_outer.groupby(param_names)["rank_test_qwk"]
                        .mean()
                        .reset_index().sort_values(["rank_test_qwk"]))
    display(mean_qwk_ranking)

    print("\n\nBest params:")
    best_row = mean_qwk_ranking.head(1)
    best_params = {}
    for column_name in param_names:
        # Read each value from its own column so that integer parameters are
        # not upcast to float by a row-wise extraction, and so that a single
        # parameter column works as well.
        value = best_row[column_name].iloc[0]
        if hasattr(value, "item"):
            # numpy scalar -> plain Python value
            value = value.item()
        # Drop the leading "param_" prefix.
        best_params[column_name[column_name.index("_")+1:]] = value
    if "tfidf_vectorizer__ngram_range" in best_params:
        # The tuple was stored as text in the CSV, e.g. "(1, 2)".
        best_params["tfidf_vectorizer__ngram_range"] = \
            ast.literal_eval(best_params["tfidf_vectorizer__ngram_range"])
    print(pd.Series(best_params))

    return best_params

## Random Forest Classifier

In [None]:
# Random forest search space ('model' is the name of the final pipeline
# step): 3 * 2 = 6 combinations.
rf_clf_params = dict(
    model__n_estimators=[80, 100, 120],
    model__min_samples_leaf=[1, 5],
)

Total number of combinations: 576 (96 pipeline combinations x 6 model combinations). With an average fit time of 15 seconds, an exhaustive search would take 15 s/combination x 576 combinations x 3 inner folds x 5 outer folds = 36 hours on a single thread, well above the 9-hour limit. Let's try 180 random combinations instead.

In [None]:
# rf_clf = RandomForestClassifier(random_state=seed, n_jobs=-1)

In [None]:
# model = Pipeline(steps=pipeline_6_transformers + [('model', rf_clf)])
# params = {**pipeline_params, **rf_clf_params}

# best_params, outer_scores, search_results_dfs = hyperparams_tuning(
#     model=model,
#     params=params,
#     inner_cv=inner_cv,
#     outer_cv=outer_cv,
#     n_iter=180,
#     scorers=scorers,
#     X = X_train_CNN,
#     y = y_train_CNN,
#     seed=seed
# )

In [None]:
# for i, (params, outer_score, search_df) in enumerate(zip(best_params, outer_scores, search_results_dfs)):
#     print(params)
#     print(outer_score)
#     display(search_df.sort_values(["rank_test_qwk"]).head(5))
#     print("\n\n")
#     search_df.to_csv(f"RandomForestClassifier_search_results_inner-3-CV_outer-iter-{i+1}.csv")

In [None]:
rf_outer_scores = [
    [('qwk', 0.4388425878601222), ('accuracy', 0.4514380992080033), ('f1', 0.3496026590576998)],
    [('qwk', 0.4638184271537107), ('accuracy', 0.4697790746144227), ('f1', 0.36926202308458944)],
    [('qwk', 0.4326586045283932), ('accuracy', 0.4493538974572739), ('f1', 0.3552094817434665)],
    [('qwk', 0.46125046914127155), ('accuracy', 0.46352646936223424), ('f1', 0.3588884817370581)],
    [('qwk', 0.4420106051569376), ('accuracy', 0.46663886572143454), ('f1', 0.36026779959502236)]
]

(outer validation results extracted from Version 4)

In [None]:
# Transpose the per-fold (metric, value) pairs into one list of values per
# metric; each fold stores them in the order qwk, accuracy, f1.
rf_outer_qwk_scores, rf_outer_accuracy_scores, rf_outer_f1_macro_scores = (
    [metric_value for _metric_name, metric_value in metric_column]
    for metric_column in zip(*rf_outer_scores)
)

print("Outer QWK scores:", rf_outer_qwk_scores)
print("Mean QWK score (5-CV):", np.mean(rf_outer_qwk_scores))
print("Outer accuracy scores:", rf_outer_accuracy_scores)
print("Mean accuracy score (5-CV):", np.mean(rf_outer_accuracy_scores))
print("Outer f1_macro scores:", rf_outer_f1_macro_scores)
print("Mean f1_macro score (5-CV):", np.mean(rf_outer_f1_macro_scores))

In [None]:
rf_clf_best_params = get_best_hyperparams(
    model="RandomForestClassifier",
    inner=3,
    outer=5
)

### 5-CV on complete training dataset (as preliminary evaluations)

In [None]:
# Tuned random forest pipeline, evaluated with the shuffled 5-fold splitter
# on the complete training data.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=seed)
rf_model = clone(
    Pipeline(steps=[*pipeline_6_transformers, ('model', rf_clf)])
).set_params(**rf_clf_best_params)
_ = transformers.evaluate_model(rf_model, cv, X, y, model_type="classification")

### Single split validation (the one used to train the CNNs)

In [None]:
# Same tuned random forest pipeline, evaluated on the single train/validation
# split used for the CNNs.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=seed)
rf_model = clone(
    Pipeline(steps=[*pipeline_6_transformers, ('model', rf_clf)])
).set_params(**rf_clf_best_params)
_ = transformers.evaluate_model_single_split(
    rf_model,
    X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
    display_results=True,
)

## XGBoost Classifier

In [None]:
# XGBoost search space: 3 * 3 * 2 * 2 * 3 = 108 model combinations.
xgb_clf_params = dict(
    model__n_estimators=[80, 100, 120],
    model__max_depth=[4, 5, 6],
    model__learning_rate=[0.1, 0.3],
    model__gamma=[0.0, 1.0],
    model__reg_lambda=[0.0, 1.0, 5.0],
)

In [None]:
# GPU histogram variant of the classifier; in this notebook it is only
# referenced by the (commented-out) hyperparameter search cell.
xgb_classifier = xgb.XGBClassifier(
    random_state=seed,
    tree_method="gpu_hist",
    eval_metric="mlogloss",
    use_label_encoder=False,
    verbosity=0,
)

We will search 180 out of 10368 possible combinations using the histogram method with GPU:

In [None]:
# model = Pipeline(steps=pipeline_6_transformers + [('model', xgb_classifier)])
# params = {**pipeline_params, **xgb_clf_params}

# best_params, outer_scores, search_results_dfs = hyperparams_tuning(
#     model=model,
#     params=params,
#     inner_cv=inner_cv,
#     outer_cv=outer_cv,
#     n_iter=180,
#     scorers=scorers,
#     X = X_train_CNN,
#     y = y_train_CNN,
#     seed=seed
# )

In [None]:
# for i, (params, outer_score, search_df) in enumerate(zip(best_params, outer_scores, search_results_dfs)):
#     print(params)
#     print(outer_score)
#     display(search_df.sort_values(["rank_test_qwk"]).head(5))
#     print("\n\n")
#     search_df.to_csv(f"XGBClassifier_search_results_inner-3-CV_outer-iter-{i+1}.csv")

In [None]:
xgb_outer_scores = [
    [('qwk', 0.43393140569387), ('accuracy', 0.45477282200917046), ('f1', 0.34854249172900187)],
    [('qwk', 0.46344678424469077), ('accuracy', 0.460608586911213), ('f1', 0.3645345856065142)],
    [('qwk', 0.43482719895584043), ('accuracy', 0.45727386411004584), ('f1', 0.3573660365203477)],
    [('qwk', 0.43799320113452567), ('accuracy', 0.4601917465610671), ('f1', 0.36463521563502105)],
    [('qwk', 0.43024713907175727), ('accuracy', 0.4549624687239366), ('f1', 0.34407247612782166)]
]

(outer validation results extracted from Version 4)

In [None]:
# Transpose the per-fold (metric, value) pairs into one list of values per
# metric; each fold stores them in the order qwk, accuracy, f1.
xgb_outer_qwk_scores, xgb_outer_accuracy_scores, xgb_outer_f1_macro_scores = (
    [metric_value for _metric_name, metric_value in metric_column]
    for metric_column in zip(*xgb_outer_scores)
)

print("Outer QWK scores:", xgb_outer_qwk_scores)
print("Mean QWK score (5-CV):", np.mean(xgb_outer_qwk_scores))
print("Outer accuracy scores:", xgb_outer_accuracy_scores)
print("Mean accuracy score (5-CV):", np.mean(xgb_outer_accuracy_scores))
print("Outer f1_macro scores:", xgb_outer_f1_macro_scores)
print("Mean f1_macro score (5-CV):", np.mean(xgb_outer_f1_macro_scores))

In [None]:
xgb_clf_best_params = get_best_hyperparams(
    model="XGBClassifier",
    inner=3,
    outer=5
)

### 5-CV on complete training dataset (as preliminary evaluations)

In [None]:
# CPU histogram variant of the classifier for the final evaluations.
xgb_clf = xgb.XGBClassifier(
    n_jobs=-1,
    random_state=seed,
    tree_method="hist",
    eval_metric="mlogloss",
    use_label_encoder=False,
)

# Tuned pipeline, evaluated with the shuffled 5-fold splitter on the complete
# training data.
xgb_model = clone(
    Pipeline(steps=[*pipeline_6_transformers, ('model', xgb_clf)])
).set_params(**xgb_clf_best_params)
_ = transformers.evaluate_model(xgb_model, cv, X, y, model_type="classification")

### Single split validation (the one used to train the CNNs)

In [None]:
# CPU histogram variant of the classifier for the final evaluations.
xgb_clf = xgb.XGBClassifier(
    n_jobs=-1,
    random_state=seed,
    tree_method="hist",
    eval_metric="mlogloss",
    use_label_encoder=False,
)

# Tuned pipeline, evaluated on the single train/validation split used for the
# CNNs.
xgb_model = clone(
    Pipeline(steps=[*pipeline_6_transformers, ('model', xgb_clf)])
).set_params(**xgb_clf_best_params)
_ = transformers.evaluate_model_single_split(
    xgb_model,
    X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
    display_results=True,
)

## LightGBM

In [None]:
# LightGBM search space: 3 * 3 * 2 * 3 * 2 = 108 model combinations.
lgbm_clf_params = dict(
    model__n_estimators=[80, 100, 120],
    model__num_leaves=[15, 31, 63],
    model__learning_rate=[0.05, 0.1],
    model__min_child_samples=[20, 30, 40],
    model__reg_lambda=[1.0, 5.0],
)

In [None]:
# Base LightGBM classifier; in this notebook it is only referenced by the
# (commented-out) hyperparameter search cell.
lgbm_classifier = lgbm.LGBMClassifier(
    objective="multiclass",
    boosting_type="goss",
    colsample_bytree=0.6,
    n_jobs=-1,
    random_state=seed,
)

In [None]:
# model = Pipeline(steps=pipeline_6_transformers + [('model', lgbm_classifier)])
# params = {**pipeline_params, **lgbm_clf_params}

# best_params, outer_scores, search_results_dfs = hyperparams_tuning(
#     model=model,
#     params=params,
#     inner_cv=inner_cv,
#     outer_cv=outer_cv,
#     n_iter=200,
#     scorers=scorers,
#     X = X_train_CNN,
#     y = y_train_CNN,
#     seed=seed
# )

In [None]:
# for i, (params, outer_score, search_df) in enumerate(zip(best_params, outer_scores, search_results_dfs)):
#     print(params)
#     print(outer_score)
#     display(search_df.sort_values(["rank_test_qwk"]).head(5))
#     print("\n\n")
#     search_df.to_csv(f"LGBMClassifier_search_results_inner-3-CV_outer-iter-{i+1}.csv")

In [None]:
lgbm_outer_scores = [
    [('qwk', 0.4249232067166937), ('accuracy', 0.44476865360566903), ('f1', 0.3392171738936955)],
    [('qwk', 0.4577509227782489), ('accuracy', 0.4531054606085869), ('f1', 0.34626250112638035)],
    [('qwk', 0.43361058640524364), ('accuracy', 0.44852021675698206), ('f1', 0.3522426210859976)],
    [('qwk', 0.428611432133983), ('accuracy', 0.4535223009587328), ('f1', 0.35858242542139934)],
    [('qwk', 0.43574521097747165), ('accuracy', 0.4549624687239366), ('f1', 0.3472511841879512)]
]

(outer 5-CV results extracted from Version 6)

In [None]:
# Transpose the per-fold (metric, value) pairs into one list of values per
# metric; each fold stores them in the order qwk, accuracy, f1.
lgbm_outer_qwk_scores, lgbm_outer_accuracy_scores, lgbm_outer_f1_macro_scores = (
    [metric_value for _metric_name, metric_value in metric_column]
    for metric_column in zip(*lgbm_outer_scores)
)

print("Outer QWK scores:", lgbm_outer_qwk_scores)
print("Mean QWK score (5-CV):", np.mean(lgbm_outer_qwk_scores))
print("Outer accuracy scores:", lgbm_outer_accuracy_scores)
print("Mean accuracy score (5-CV):", np.mean(lgbm_outer_accuracy_scores))
print("Outer f1_macro scores:", lgbm_outer_f1_macro_scores)
print("Mean f1_macro score (5-CV):", np.mean(lgbm_outer_f1_macro_scores))

In [None]:
lgbm_clf_best_params = get_best_hyperparams(
    model="LGBMClassifier",
    inner=3,
    outer=5
)

### 5-CV on complete training dataset (as preliminary evaluations)

In [None]:
lgbm_clf = lgbm.LGBMClassifier(
    objective="multiclass",
    boosting_type="goss",
    colsample_bytree=0.6,
    n_jobs=-1,
    random_state=seed,
)

# Tuned pipeline, evaluated with the shuffled 5-fold splitter on the complete
# training data.
lgbm_model = clone(
    Pipeline(steps=[*pipeline_6_transformers, ('model', lgbm_clf)])
).set_params(**lgbm_clf_best_params)
_ = transformers.evaluate_model(lgbm_model, cv, X, y, model_type="classification")

### Single split validation (the one used to train the CNNs)

In [None]:
lgbm_clf = lgbm.LGBMClassifier(
    objective="multiclass",
    boosting_type="goss",
    colsample_bytree=0.6,
    n_jobs=-1,
    random_state=seed,
)

# Tuned pipeline, evaluated on the single train/validation split used for the
# CNNs.
lgbm_model = clone(
    Pipeline(steps=[*pipeline_6_transformers, ('model', lgbm_clf)])
).set_params(**lgbm_clf_best_params)
_ = transformers.evaluate_model_single_split(
    lgbm_model,
    X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
    display_results=True,
)

## Ensembles

In [None]:
# Fresh tuned random forest pipeline to be used as an ensemble member.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=seed)
rf_model = clone(
    Pipeline(steps=[*pipeline_6_transformers, ('model', rf_clf)])
).set_params(**rf_clf_best_params)

In [None]:
# Fresh tuned XGBoost pipeline to be used as an ensemble member.
xgb_clf = xgb.XGBClassifier(
    n_jobs=-1,
    random_state=seed,
    tree_method="hist",
    eval_metric="mlogloss",
    use_label_encoder=False,
)

xgb_model = clone(
    Pipeline(steps=[*pipeline_6_transformers, ('model', xgb_clf)])
).set_params(**xgb_clf_best_params)

In [None]:
# Fresh tuned LightGBM pipeline to be used as an ensemble member.
lgbm_clf = lgbm.LGBMClassifier(
    objective="multiclass",
    boosting_type="goss",
    colsample_bytree=0.6,
    n_jobs=-1,
    random_state=seed,
)

lgbm_model = clone(
    Pipeline(steps=[*pipeline_6_transformers, ('model', lgbm_clf)])
).set_params(**lgbm_clf_best_params)

### Majority vote

In [None]:
# Hard (majority) vote over the three tuned pipelines, evaluated on the
# single train/validation split used for the CNNs.
estimators = list(zip(
    ('rf', 'xgboost', 'lightgbm'),
    (rf_model, xgb_model, lgbm_model),
))

voting_model = VotingClassifier(
    estimators=estimators,
    voting="hard",
    n_jobs=-1,
)

_ = transformers.evaluate_model_single_split(
    voting_model,
    X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
    display_results=True,
)

### Stacking

In [None]:
# Level-0 learners: the three tuned pipelines.
level0 = [
    ('rf', rf_model),
    ('xgboost', xgb_model),
    ('lightgbm', lgbm_model),
]

# Level-1 learner: a logistic regression fitted on the level-0 outputs.
level1 = LogisticRegression(random_state=seed, n_jobs=-1)

stacking_model = StackingClassifier(
    estimators=level0,
    final_estimator=level1,
    cv=outer_cv,
    n_jobs=-1,
)

_ = transformers.evaluate_model_single_split(
    stacking_model,
    X_train_CNN, X_val_CNN, y_train_CNN, y_val_CNN,
    display_results=True,
)

## Setting up the necessary test data

In [None]:
test = pd.read_csv("../input/petfinder-adoption-prediction/test/test.csv")

In [None]:
# _ = utils_eda.get_description_metadata(type_data="test")

In [None]:
# path = '../input/tfg-pet-adoption-data/test_description_metadata.json'
# test_description_metadata_json = utils_eda.load_json(path)
# test_description_metadata = pd.DataFrame.from_dict(test_description_metadata_json, orient='index')
# test_description_metadata.drop(["DescriptionNumEntities"], axis=1, inplace=True)
# test_description_metadata.rename(lambda x: x if x == "DescriptionLanguage" else x + "_num", axis=1, inplace=True)
# test_description_metadata.to_csv("test_description_metadata.csv")

test_description_metadata = pd.read_csv(
    "../input/tfg-pet-adoption-data/test_description_metadata.csv",
    index_col=0)
test_description_metadata

In [None]:
# _ = utils_eda.get_image_metadata(type_data="test", all_images=True)

In [None]:
# _ = utils_eda.get_image_metadata(type_data="test")

In [None]:
# path = '../input/tfg-pet-adoption-data/test_profile_images_metadata.json'
# test_profile_image_metadata_json = utils_eda.load_json(path)
# test_profile_image_metadata = pd.DataFrame.from_dict(test_profile_image_metadata_json, orient='index')
# test_profile_image_metadata.drop(["faces"], axis=1, inplace=True)
# test_profile_image_metadata.rename(columns={
#     'sum_pixelFraction':'ImageMetadataSumPixelFraction_num',
#     'max_pet_topicality':'ImageMetadataMaxPetTopicality_num',
#     'num_entities':'ImageMetadataNumEntities_num',
#     'desc_concatenation':'ImageMetadataDescription',
#     'has_text':'ImageMetadataHasText'}, inplace=True)
# test_profile_image_metadata.to_csv("test_profile_images_metadata.csv")

test_profile_image_metadata = pd.read_csv(
    "../input/tfg-pet-adoption-data/test_profile_images_metadata.csv",
    index_col=0)
test_profile_image_metadata

In [None]:
# path = "../input/tfg-pet-adoption-data/test_all_images_metadata.json"
# all_test_images_metadata_json = utils_eda.load_json(path)
# all_test_images_metadata = pd.DataFrame.from_dict(all_test_images_metadata_json, orient='index')
# all_test_images_metadata.drop(["faces", "PetID"], axis=1, inplace=True)
# all_test_images_metadata.rename(columns={
#     'sum_pixelFraction':'ImageMetadataSumPixelFraction',
#     'max_pet_topicality':'ImageMetadataMaxPetTopicality',
#     'num_entities':'ImageMetadataNumEntities',
#     'desc_concatenation':'ImageMetadataDescription',
#     'has_text':'ImageMetadataHasText'}, inplace=True)
# all_test_images_metadata.to_csv("all_test_images_metadata.csv")

test_all_images_metadata = pd.read_csv(
    "../input/tfg-pet-adoption-data/all_test_images_metadata.csv",
    index_col=0)
test_all_images_metadata

In [None]:
# all_test_images_metadata_text_agg = transformers.aggregate_images_metadata(all_test_images_metadata, type_data="test")

test_all_images_metadata_agg = pd.read_csv(
    "../input/tfg-pet-adoption-data/all_test_images_metadata_agg.csv",
    index_col=0)
test_all_images_metadata_agg

In [None]:
# _ = utils_eda.get_image_properties(type_data="test", all_images=True)

In [None]:
# _ = utils_eda.get_image_properties(type_data="test")

In [None]:
# path = '../input/tfg-pet-adoption-data/test_profile_images_properties.json'
# test_profile_image_properties_json = utils_eda.load_json(path)
# test_profile_image_properties = pd.DataFrame.from_dict(test_profile_image_properties_json, orient='index')
# test_profile_image_properties.rename(columns={
#     "dullness": "ImagePropertyDullness_num",
#     "whiteness": "ImagePropertyWhiteness_num",
#     "blurrness": "ImagePropertyBlurrness_num",
#     "size": "ImagePropertySize_num",
#     "width": "ImagePropertyWidth_num",
#     "height": "ImagePropertyHeight_num"
# }, inplace=True)
# test_profile_image_properties.to_csv("test_profile_images_properties.csv")

test_profile_image_properties = pd.read_csv(
    "../input/tfg-pet-adoption-data/test_profile_images_properties.csv",
    index_col=0)
test_profile_image_properties

In [None]:
# path = '../input/tfg-pet-adoption-data/all_test_images_properties.json'
# all_test_images_properties_json = utils_eda.load_json(path)
# all_test_images_properties = pd.DataFrame.from_dict(all_test_images_properties_json, orient='index')
# all_test_images_properties.rename(columns={
#     "dullness": "ImagePropertyDullness",
#     "whiteness": "ImagePropertyWhiteness",
#     "blurrness": "ImagePropertyBlurrness",
#     "size": "ImagePropertySize",
#     "width": "ImagePropertyWidth",
#     "height": "ImagePropertyHeight"
# }, inplace=True)
# rounded_width = round(all_test_images_properties["ImagePropertyWidth"]/100)*100
# rounded_height = round(all_test_images_properties["ImagePropertyHeight"]/100)*100
# all_test_images_properties["ImagePropertyAspectRatio"] = np.divide(rounded_width, rounded_height, out=np.zeros_like(rounded_width), where=rounded_height!=0)
# all_test_images_properties.to_csv("all_test_images_properties.csv")

# Load the stored properties of every test image (first CSV column is the
# index) and display the table.
test_all_images_properties = pd.read_csv(
    "../input/tfg-pet-adoption-data/all_test_images_properties.csv",
    index_col=0)
test_all_images_properties

In [None]:
# all_test_images_properties_agg = transformers.aggregate_images_properties(all_test_images_properties, type_data="test")

# Load the stored aggregation of the test-image properties (first CSV column
# is the index) and display it.
test_all_images_properties_agg = pd.read_csv(
    "../input/tfg-pet-adoption-data/all_test_images_properties_agg.csv",
    index_col=0)
test_all_images_properties_agg

In [None]:
# regression_model_dropout_64_16_ensemble_2 = load_model("../input/tfg-pet-adoption-data/DenseNet121_regression__ensemble__2__06-epochs_val_loss-1.160883.h5")

In [None]:
# model = Model(inputs=regression_model_dropout_64_16_ensemble_2.input,
#               outputs=regression_model_dropout_64_16_ensemble_2.layers[-2].output)
# ife = transformers.ImageFeatureExtractor(construct_from_cnn_backbone=False, model=model,
#                  images_directory="../input/petfinder-adoption-prediction/test_images",
#                  from_image=True, preprocess_input=preprocess_input_densenet, img_size=256,
#                  model_name=f"test_DenseNet121_regression__ensemble__2__layer-16",
#                  save=True, debug=True, include_feats=False, multiple_instances_per_petid=True)

# _ = ife.fit_transform(test, y=[])

In [None]:
# Load the aggregated image features previously saved for the test set
# (file name indicates DenseNet121 regression ensemble 2, layer-16, input 256).
test_aggregated_image_features_regression_model_ensemble_2_layer_16_features = pd.read_csv(
    "../input/tfg-pet-adoption-data/AGGREGATED_image_features_test_DenseNet121_regression__ensemble__2__layer-16_in-256.csv",
    index_col=0)
test_aggregated_image_features_regression_model_ensemble_2_layer_16_features

In [None]:
# Small two-step pipeline run on the test set only to obtain, for each pet,
# its description together with the (corrected) description language.
pipeline_transformers = [
    ('include_desc_metadata', transformers.IncludeDescriptionMetadata(
        description_metadata=test_description_metadata)),
    ('correct_desc_language', transformers.CorrectDescriptionLanguage()),
]

pipeline = Pipeline(steps=pipeline_transformers)
# Any warning raised while fitting/transforming is silenced for this call.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # The test set has no target, so an empty list is passed as `y`.
    result = pipeline.fit_transform(test,y=[])

# Keep only the columns needed for the description processing below.
test_pets_descriptions = result[["PetID", "DescriptionLanguage", "Description"]]

In [None]:
# test_pets_descriptions

In [None]:
# Frequency of each description language in the test set, counting NaN too.
test_pets_descriptions["DescriptionLanguage"].value_counts(dropna=False)

In [None]:
# _ = glove2word2vec("../input/glove-twitter/glove.twitter.27B.25d.txt", "w2v_glove.twitter.27B.25d.txt")
# kv_model = KeyedVectors.load_word2vec_format("./w2v_glove.twitter.27B.25d.txt", binary=False)
# cont = Contractions(kv_model=kv_model)
# cont.load_models()

In [None]:
# desc_transformer = transformers.DescriptionTransformer(kv_model=kv_model,
#                                                        save=True,
#                                                        debug=True)
# _ = desc_transformer.fit_transform(
#     test_pets_descriptions.loc[
#         test_pets_descriptions["DescriptionLanguage"].isin({"en", "ms", np.nan})],
#     y=[])

In [None]:
# desc_transformer_2 = transformers.DescriptionTransformer(kv_model=kv_model,
#                                                          save=True,
#                                                          debug=True)
# _ = desc_transformer_2.fit_transform(
#     test_pets_descriptions.loc[
#         test_pets_descriptions["DescriptionLanguage"] == "zh"],
#     y=[])

(translations of zh-cn descriptions were done separately since the API imposes a limit)

In [None]:
# desc_transformer.transformations_df

In [None]:
# desc_transformer_2.transformations_df

In [None]:
# test_transformations_df = desc_transformer.transformations_df.append(
#     desc_transformer_2.transformations_df)
# test_transformations_df.to_csv("test_description_transformations.csv")

# Load the stored description transformations of the test set (first CSV
# column is the index).
test_desc_transformations_df = pd.read_csv(
    "../input/tfg-pet-adoption-data/test_description_transformations.csv",
    index_col=0)
# Missing text is read back from the CSV as NaN; turn it into an empty string
# in each of the three text columns.
for _text_column in ("Description", "translation", "expanded"):
    test_desc_transformations_df[_text_column] = \
        test_desc_transformations_df[_text_column].replace(np.nan, '')
test_desc_transformations_df

In [None]:
# Stack every train lookup table on top of its test counterpart so that the
# final pipeline can serve both training and test rows.
# `pd.concat` is used instead of `DataFrame.append`, which is deprecated and
# no longer exists in pandas >= 2.0; with default arguments both keep the
# original indices and the train-then-test row order.
mixed_description_metadata = pd.concat(
    [train_description_metadata, test_description_metadata])
mixed_profile_image_metadata = pd.concat(
    [train_profile_image_metadata, test_profile_image_metadata])
mixed_all_images_metadata_agg = pd.concat(
    [train_all_images_metadata_agg, test_all_images_metadata_agg])
mixed_profile_image_properties = pd.concat(
    [train_profile_image_properties, test_profile_image_properties])
mixed_all_images_properties_agg = pd.concat(
    [train_all_images_properties_agg, test_all_images_properties_agg])
mixed_aggregated_image_features_regression_model_ensemble_2_layer_16_features = pd.concat(
    [train_aggregated_image_features_regression_model_ensemble_2_layer_16_features,
     test_aggregated_image_features_regression_model_ensemble_2_layer_16_features])
mixed_desc_transformations_df = pd.concat(
    [train_desc_transformations_df, test_desc_transformations_df])

In [None]:
# Display the combined (train + test) description transformations.
mixed_desc_transformations_df

In [None]:
# Image feature step fed with the already-extracted train + test features
# (presumably it looks them up instead of running a CNN - confirm in
# transformers.ImageFeatureExtractor).
ife = transformers.ImageFeatureExtractor(
    construct_from_cnn_backbone=False,
    loaded_features=mixed_aggregated_image_features_regression_model_ensemble_2_layer_16_features
)

# TF-IDF step configured with 16 SVD components and the global seed.
tfidf_vectorizer = transformers.CustomTfidfVectorizer(svd_n_components=16,
                                                     seed=seed)

# Ordered list of (name, transformer) preprocessing steps shared by every
# model pipeline built below; an estimator is appended as the last 'model'
# step. All lookup tables passed here are the "mixed" train + test versions.
final_pipeline_transformers = [
    ('replace_breeds',
     transformers.LeftJoinReplace(values_dict=breeds_dict,
                                  variables=["Breed1", "Breed2"])),
    ('replace_colors',
     transformers.LeftJoinReplace(values_dict=colors_dict,
                                  variables=["Color1", "Color2", "Color3"])),
    ('replace_states',
     transformers.LeftJoinReplace(values_dict=states_dict, variables=["State"])),
    ('replace_by_strings',
     FunctionTransformer(func=transformers.replace_integers_by_strings)),
    ('has_name', FunctionTransformer(func=transformers.has_significant_name)),
    ('pure_breed', FunctionTransformer(func=transformers.has_pure_breed)),
    ('breed_matches_fur_length',
     FunctionTransformer(func=transformers.breed_matches_fur_length)),
    ('impute_breed', transformers.BreedImputer()),
    ('include_prof_im_metadata',
     transformers.IncludeProfileImageMetadata(
         profile_image_metadata=mixed_profile_image_metadata,
         all_images_metadata_agg=mixed_all_images_metadata_agg)),
    ('correct_wrong_type', transformers.CorrectWrongType(breeds)),
    ('encode_breed', transformers.BreedEncoding()),
    ('ordinal_vars_encoder', 
     transformers.OrdinalVariableEncoder(
         columns=["MaturitySize", "FurLength", "Health"],
         enc_type="ordinal", mapping=ordinal_vars_mapping)),
    ('state_gdp',
     transformers.ReplaceState(gdp_per_capita=gdp_per_capita,
                               impute_nan_value=46450)),
    ('rescuer_count', transformers.ReplaceRescuerID()),
    ('description_length',
     FunctionTransformer(func=transformers.include_description_length)),
    ('include_desc_metadata',
     transformers.IncludeDescriptionMetadata(
         description_metadata=mixed_description_metadata)),
    ('correct_desc_language', transformers.CorrectDescriptionLanguage()),
    ('one_hot_encoder',
     transformers.CustomOneHotEncoder(columns=["Gender", "Color1", "Color2",
                                     "Color3", "Vaccinated", "Dewormed",
                                     "Sterilized", "DescriptionLanguage"])),
    ('include_prof_im_properties',
     transformers.IncludeProfileImageProperties(
         profile_image_properties=mixed_profile_image_properties,
         aggregated_images_properties=mixed_all_images_properties_agg)),
    ('drop_columns',
     transformers.ColumnRemover(columns=columns_to_be_removed_desc_feats_eval)),
    ('round_im_dims_aspect_ratio',
     FunctionTransformer(func=transformers.include_aspect_ratio)),
    ('image_features_extractor', ife),
    ('description_transformer',
     transformers.DescriptionTransformer(
         transformations_df=mixed_desc_transformations_df)),
    ('tfidf_vectorizer', tfidf_vectorizer),
    # PetID / DescriptionLanguage / Description are dropped only here, after
    # the image-feature, description and TF-IDF steps have run.
    ('drop_petid_desc',
     transformers.ColumnRemover(
         columns=["PetID", "DescriptionLanguage", "Description"])),
    ('useless_vars_remover',
     transformers.UselessVariablesRemover(tolerance=0.000001)),
    ('impute_malay_desc_missing_prof_im_props',
     transformers.CustomIterativeImputer())
#     ('custom_standard_scaler', CustomStandardScaler(numeric_columns_desc_feats_eval))
]

In [None]:
def create_submission(model, X_train, y_train, test_data,
                      output_path="submission.csv"):
    """Fit `model`, predict the test set and write a submission file.

    Parameters
    ----------
    model : estimator with `fit(X, y)` and `predict(X)`
        Fitted in place on the training data.
    X_train, y_train :
        Training features and target passed to `model.fit`.
    test_data : pandas.DataFrame
        Test rows passed unchanged to `model.predict`; must contain a
        'PetID' column, which is copied into the submission.
    output_path : str or None, default "submission.csv"
        Where the CSV is written (without the index). Pass None to skip
        writing and only get the DataFrame back.

    Returns
    -------
    pandas.DataFrame
        Columns 'PetID' and 'AdoptionSpeed' (int64), one row per test row.
    """
    model.fit(X_train, y_train)
    # Coerce to a plain array: accepts list output and prevents pandas from
    # index-aligning a Series of predictions against `test_data`.
    predictions = np.asarray(model.predict(test_data))
    submission = test_data[['PetID']].copy()
    submission["AdoptionSpeed"] = predictions.astype("int64")
    if output_path is not None:
        submission.to_csv(output_path, index=False)
    return submission

In [None]:
# XGBoost classifier using the histogram tree method and multi-class log-loss
# as evaluation metric; seeded for reproducibility, all cores.
xgb_clf = xgb.XGBClassifier(
    tree_method="hist",
    eval_metric="mlogloss",
    random_state=seed,
    use_label_encoder=False,
    n_jobs=-1
)

# Preprocessing steps + classifier. The pipeline is cloned before the tuned
# hyper-parameters are applied, so the step objects listed in
# `final_pipeline_transformers` are not modified by this model.
xgb_model = Pipeline(steps=final_pipeline_transformers + [('model', xgb_clf)])
xgb_model = clone(xgb_model).set_params(**xgb_clf_best_params)

In [None]:
# Fit the tuned XGBoost pipeline on (X, y), predict the test set and write
# the submission CSV; the resulting table is displayed.
submission = create_submission(xgb_model, X, y, test)
submission

In [None]:
# Relative frequency of each predicted AdoptionSpeed class (NaN included).
submission["AdoptionSpeed"].value_counts(normalize=True,dropna=False)

In [None]:
# Graphviz attributes for the condition (split) nodes of the plotted tree.
# NOTE(review): '#78cbe' has only five hex digits, whereas an RGB hex colour
# needs six - confirm the intended colour and complete the value.
node_params = {
    'shape': 'box',
    'style': 'filled, rounded',
    'fillcolor': '#78cbe'
}

# Graphviz attributes for the leaf nodes of the plotted tree.
leaf_params = {
    'shape': 'box',
    'style': 'filled',
    'fillcolor': '#e48038'
}

In [None]:
# Take the fitted classifier out of the pipeline and draw one of its trees
# (index 4) with the node styles defined above.
trained_xgb_clf = xgb_model.named_steps["model"]
xgb.to_graphviz(trained_xgb_clf, num_trees=4, size="10,10",
               condition_node_params=node_params,
               leaf_node_params=leaf_params)

# Set a different dpi (works only if format == 'png').
# NOTE(review): `image` is not defined in this cell; presumably it was meant
# to hold the value returned by `xgb.to_graphviz` - confirm before
# uncommenting the lines below.
# image.graph_attr = {'dpi':'400'}

# image.render("xgb_trees", format="png")

In [None]:
#_, ax = plt.subplots(figsize=(15,50))
# Plot the 10 most important features of the fitted classifier, ranked by
# the "gain" importance type.
xgb.plot_importance(trained_xgb_clf, max_num_features=10,
                    importance_type="gain", xlabel="Average gain",
                    grid=False)

The proportions of the classes in the predictions are very far from the original proportions in the training data...

### Just in case... Regression (competition mindset)

In [None]:
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from functools import partial
import scipy as sp

class OptimizedRounder:
    """Convert continuous regression scores into the ordinal classes 0-4.

    Four cut points split the real line into five buckets. `fit` searches,
    starting from [0.5, 1.5, 2.5, 3.5], for the cut points that maximise the
    quadratic-weighted Cohen's kappa between the true labels and the
    bucketed scores; `predict` applies the cut points found.

    Attributes
    ----------
    coefficients : numpy.ndarray
        The four cut points. Only exists after `fit` has been called, so
        calling `predict` first raises AttributeError.
    """

    @staticmethod
    def _bucketize(scores, thresholds):
        """Map each score to a class in 0..4 using four cut points.

        A score gets class k (k = 0..3) for the first k such that
        `score < thresholds[k]`, and class 4 otherwise (including NaN
        scores). Taking the *first* match keeps the result well defined even
        if the optimiser proposes cut points that are not sorted.

        The result has the same dtype as the input scores (e.g. 2.0 rather
        than 2 for float scores).
        """
        scores = np.asarray(scores)
        below = [scores < thresholds[k] for k in range(4)]
        return np.select(below, [0, 1, 2, 3], default=4).astype(scores.dtype)

    def _kappa_loss(self, coef, y, y_pred):
        """Return minus the quadratic-weighted kappa obtained with `coef`.

        The sign is flipped because the optimiser minimises.
        """
        labels = self._bucketize(y_pred, coef)
        return -cohen_kappa_score(y, labels, weights='quadratic')

    def fit(self, y, y_pred):
        """Find the cut points that maximise kappa and store them.

        Parameters
        ----------
        y : array-like
            True ordinal labels.
        y_pred : array-like
            Continuous predictions for the same samples.
        """
        # Import the submodule explicitly: `import scipy as sp` alone does
        # not guarantee that `sp.optimize` is loaded on every SciPy version.
        from scipy import optimize

        initial_coef = [0.5, 1.5, 2.5, 3.5]
        result = optimize.minimize(self._kappa_loss, initial_coef,
                                   args=(y, y_pred), method='nelder-mead')
        self.coefficients = result['x']

    def predict(self, y_pred):
        """Bucket `y_pred` with the fitted cut points.

        Returns a new array; the input is not modified.
        """
        return self._bucketize(y_pred, self.coefficients)

In [None]:
# Random-forest regressor wrapped in the shared preprocessing pipeline.
# NOTE(review): the hyper-parameters applied are the ones stored in
# `rf_clf_best_params` (name suggests they were tuned for the classifier);
# confirm every key is also valid for RandomForestRegressor.
rf_reg = RandomForestRegressor(random_state=seed, n_jobs=-1)
rf_reg_model = Pipeline(steps=final_pipeline_transformers + [('model', rf_reg)])
rf_reg_model = clone(rf_reg_model).set_params(**rf_clf_best_params)

In [None]:
# XGBoost regressor (histogram tree method, seeded, all cores) wrapped in the
# shared preprocessing pipeline.
xgb_reg = xgb.XGBRegressor(
    tree_method="hist",
    random_state=seed,
    n_jobs=-1
)

# NOTE(review): reuses `xgb_clf_best_params` (name suggests classifier
# tuning) for the regressor - confirm this is intended.
xgb_reg_model = Pipeline(steps=final_pipeline_transformers + [('model', xgb_reg)])
xgb_reg_model = clone(xgb_reg_model).set_params(**xgb_clf_best_params)

In [None]:
# LightGBM regressor (GOSS boosting, 60% column subsampling per tree, seeded,
# all cores) wrapped in the shared preprocessing pipeline.
lgbm_reg = lgbm.LGBMRegressor(
    boosting_type="goss",
    colsample_bytree=0.6,
    random_state=seed,
    n_jobs=-1
)

lgbm_reg_model = Pipeline(steps=final_pipeline_transformers + [('model', lgbm_reg)])
# Clone the LightGBM pipeline itself (not the XGBoost one): otherwise the
# 'lgbm' member of the stack would be a second XGBoost regressor receiving
# LightGBM hyper-parameters.
# NOTE(review): reuses `lgbm_clf_best_params` (name suggests classifier
# tuning) for the regressor - confirm this is intended.
lgbm_reg_model = clone(lgbm_reg_model).set_params(**lgbm_clf_best_params)

In [None]:
# Base learners of the stack: the three regression pipelines defined above.
level0 = [
    ('rf', rf_reg_model),
    ('xgb', xgb_reg_model),
    ('lgbm', lgbm_reg_model)
]

# Meta-learner that combines the base learners' predictions.
level1 = LinearRegression(n_jobs=-1)

# `cv` is the cross-validation splitter defined earlier in the notebook.
stacking_reg_model = StackingRegressor(estimators=level0, final_estimator=level1,
                                   cv=cv, n_jobs=-1)

In [None]:
# Train the stacked regressor on the full training set.
stacking_reg_model.fit(X, y)

# Fitting rounder:
# NOTE(review): the cut points are optimised on predictions for the same
# rows the model was just fitted on (in-sample), so they may be optimistic;
# consider out-of-fold predictions instead.
optR = OptimizedRounder()
y_pred_train = stacking_reg_model.predict(X)
optR.fit(y, y_pred_train)

# Predict continuous scores for the test set and bucket them into classes.
predictions = stacking_reg_model.predict(test)
predictions = optR.predict(predictions)

# Build and write the submission file (this overwrites any "submission.csv"
# written earlier in the working directory).
submission = test[['PetID']].copy()
submission["AdoptionSpeed"] = predictions.astype("int64")
submission.to_csv("submission.csv", index=False)

# Relative frequency of each predicted class (NaN included).
submission["AdoptionSpeed"].value_counts(normalize=True,dropna=False)