# PetFinder adoption prediction capstone project for 3.4

In this project we will try to predict animal adoption speed based on features of the animal, such as breed, color, size and health condition. There are also additional data sources derived from the descriptions and images, which were processed with the Google Natural Language and Google Vision APIs. This work is divided into two parts:
- Data loading and exploratory data analysis
- Model training (with data cleaning and feature engineering) and result evaluation

In [None]:
import pandas as pd
import seaborn as sns
import warnings
import numpy as np
import json
import scipy as sp
from matplotlib import pyplot as plt
from collections import Counter
from functools import partial
from math import sqrt
import optuna

import sklearn.metrics
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb
from xgboost import plot_importance

warnings.filterwarnings('ignore')
sns.set(style="darkgrid")

In [None]:
# Load the tabular train and test CSV files into DataFrames.
train = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv')
test = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')

## Helper functions for EDA

In [None]:
def autolabel(bar_plot) -> None:
    """
    Write each bar's height (formatted without decimals) just above the bar.

    bar_plot is expected to expose `patches` (the bars) and `annotate`.
    """
    for bar in bar_plot.patches:
        height = bar.get_height()
        center_x = bar.get_x() + bar.get_width() / 2.
        bar_plot.annotate(
            format(height, '.0f'),
            (center_x, height),
            ha='center',
            va='center',
            # Shift the text 9 points upwards so it sits above the bar top
            xytext=(0, 9),
            textcoords='offset points',
        )
        

def generate_groupBY_data(col_name: str) -> pd.DataFrame:
    """
    Helper function to generate percentage plots.

    Reads the module-level `train` DataFrame and returns a table indexed by
    AdoptionSpeed with one column per value of `col_name`. Each cell is the
    percentage of pets having that `col_name` value that fall into the given
    adoption speed, so every column sums to 100 over the speeds present.
    """
    # Number of pets for every (AdoptionSpeed, col_name) combination
    counts = train.groupby(['AdoptionSpeed', col_name])[['PetID']].count().reset_index().rename(
        columns={'PetID': 'count'})

    # Total number of pets per col_name value, aligned row by row with `counts`
    totals = counts.groupby(col_name)['count'].transform('sum')
    counts['fraction'] = counts['count'] * 100 / totals

    # Keyword arguments: positional index/columns/values are rejected by pandas 2.x
    return counts.pivot(index='AdoptionSpeed', columns=col_name, values='fraction')


def plot_by_feature(data: pd.DataFrame, feature: str, plt1_title: str, plt2_title: str) -> None:
    """
    Draws a two-panel figure for one feature: a labelled count plot on the left
    and a heatmap of adoption-speed percentages on the right.

    The heatmap values come from generate_groupBY_data(feature); `data` is only
    used for the count plot.
    """
    figure = plt.figure(figsize=(15,8))

    # Left panel: number of pets per feature value
    count_axes = figure.add_subplot(1,2,1)
    sns.countplot(x=feature, data=data, palette="YlOrRd", edgecolor="black")
    autolabel(count_axes)
    count_axes.set_xlabel(feature)
    count_axes.set_ylabel('Adoption count')
    count_axes.set_title(plt1_title)

    # Right panel: percentage of each feature value per adoption speed
    percentages = generate_groupBY_data(feature)
    heatmap_axes = figure.add_subplot(1,2,2)
    sns.heatmap(percentages, annot=True, cmap='YlOrRd')
    heatmap_axes.set_xlabel(feature)
    heatmap_axes.set_ylabel('Adoption Speed')
    heatmap_axes.set_title(plt2_title)

    plt.show()

## Helper functions for sentiment and metadata extraction

In [None]:
def extract_sentiment(dataset: pd.DataFrame, ids: list, folder: str) -> pd.DataFrame:
    """
    Reads `<folder>/<pet id>.json` for every id and stores the document
    sentiment magnitude and score in two new columns, `doc_sent_mag` and
    `doc_sent_score`. Pets without a sentiment file get np.nan in both.

    The columns are added to `dataset` in place and the same object is returned.
    """
    magnitudes, scores = [], []

    for pet_id in ids:
        try:
            with open(f'{folder}/{pet_id}.json', 'r', encoding='utf-8') as handle:
                document = json.load(handle)['documentSentiment']
                magnitude, score = document['magnitude'], document['score']
        except FileNotFoundError:
            # No sentiment file for this pet
            magnitude = score = np.nan

        magnitudes.append(magnitude)
        scores.append(score)

    dataset['doc_sent_mag'] = magnitudes
    dataset['doc_sent_score'] = scores

    return dataset


def get_label_score(json_file: dict, json_keys: list) -> float:
    """
    Returns the mean `score` over the entries of `labelAnnotations` in the
    provided image metadata.

    Returns np.nan when 'labelAnnotations' is not listed in `json_keys` or
    when the annotation list is empty.
    """
    if 'labelAnnotations' not in json_keys:
        return np.nan

    scores = [label['score'] for label in json_file['labelAnnotations']]
    # An empty list has no mean; return NaN explicitly rather than letting
    # numpy emit a "mean of empty slice" RuntimeWarning.
    return np.asarray(scores).mean() if scores else np.nan


def get_img_color_score_pixelfrac(json_file: dict, json_keys: list) -> tuple:
    """
    Returns a `(mean score, mean pixelFraction)` tuple computed over the
    dominant colors listed under `imagePropertiesAnnotation` in the provided
    image metadata.

    Returns `(np.nan, np.nan)` when 'imagePropertiesAnnotation' is not listed
    in `json_keys` or when the color list is empty.
    """
    if 'imagePropertiesAnnotation' not in json_keys:
        return np.nan, np.nan

    img_colors = json_file['imagePropertiesAnnotation']['dominantColors']['colors']
    if not img_colors:
        # Nothing to average; avoid numpy's "mean of empty slice" warning
        return np.nan, np.nan

    img_color_score = np.asarray([x['score'] for x in img_colors]).mean()
    img_color_pixelfrac = np.asarray([x['pixelFraction'] for x in img_colors]).mean()

    return img_color_score, img_color_pixelfrac
    
    
def get_img_crop_conf_importance(json_file: dict, json_keys: list) -> tuple:
    """
    Returns a `(mean confidence, mean importanceFraction)` tuple computed over
    the crop hints listed under `cropHintsAnnotation` in the provided image
    metadata.

    The importance value is np.nan when the first crop hint carries no
    'importanceFraction' key. Both values are np.nan when
    'cropHintsAnnotation' is not listed in `json_keys` or when the crop hint
    list is empty.
    """
    if 'cropHintsAnnotation' not in json_keys:
        return np.nan, np.nan

    img_crops = json_file['cropHintsAnnotation']['cropHints']
    if not img_crops:
        # Guard against an empty list: indexing img_crops[0] below would raise
        return np.nan, np.nan

    img_crop_conf = np.asarray([x['confidence'] for x in img_crops]).mean()

    if 'importanceFraction' in img_crops[0]:
        img_crop_importance = np.asarray([x['importanceFraction'] for x in img_crops]).mean()
    else:
        img_crop_importance = np.nan

    return img_crop_conf, img_crop_importance
    

def extract_metadata(dataset: pd.DataFrame, ids: list, folder: str) -> pd.DataFrame:
    """
    Averages image metadata per pet and stores it in five new columns.

    For every pet id the files `<folder>/<pet id>-1.json`, `-2.json`, ... are
    read until one is missing. The per-image label score, color score, color
    pixel fraction, crop confidence and crop importance are averaged with
    np.mean over the pet's images. The columns are added to `dataset` in place
    and the same object is returned.
    """
    column_names = (
        'metadata_label_score',
        'metadata_color_score',
        'metadata_color_pixelfrac',
        'metadata_crop_conf',
        'metadata_crop_importance',
    )
    per_pet_means = {name: [] for name in column_names}

    for pet_id in ids:
        per_image_values = {name: [] for name in column_names}
        image_number = 1

        # Image files are numbered from 1; the first missing number ends the scan
        while True:
            try:
                with open(f'{folder}/{pet_id}-{image_number}.json', 'r', encoding='utf-8') as handle:
                    metadata = json.load(handle)
                    present_keys = list(metadata.keys())

                    label_score = get_label_score(metadata, present_keys)
                    color_score, color_pixelfrac = get_img_color_score_pixelfrac(metadata, present_keys)
                    crop_conf, crop_importance = get_img_crop_conf_importance(metadata, present_keys)
            except FileNotFoundError:
                break

            image_values = (label_score, color_score, color_pixelfrac, crop_conf, crop_importance)
            for name, value in zip(column_names, image_values):
                per_image_values[name].append(value)

            image_number += 1

        for name in column_names:
            per_pet_means[name].append(np.mean(per_image_values[name]))

    for name in column_names:
        dataset[name] = per_pet_means[name]

    return dataset

## Helper functions for data cleaning and feature engineering

In [None]:
def drop_columns(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a copy of the dataframe without the Name, RescuerID, Description
    and PetID columns
    """
    irrelevant_columns = ['Name', 'RescuerID', 'Description', 'PetID']
    return dataframe.drop(columns=irrelevant_columns)


def convert_to_cat(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Casts the categorical feature columns to the pandas 'category' dtype.
    The dataframe is modified in place and returned.
    """
    for column in ('Type', 'Breed1', 'Breed2', 'Gender',
                   'Color1', 'Color2', 'Color3',
                   'MaturitySize', 'FurLength',
                   'Vaccinated', 'Dewormed', 'Sterilized',
                   'Health', 'State'):
        dataframe[column] = dataframe[column].astype('category')

    return dataframe


def append_description_length(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Appends a `Description_length` column holding the number of
    whitespace-separated words in `Description`.

    Any non-string value (NaN, None, ...) counts as an empty description and
    yields 0. The dataframe is modified in place and returned.
    """
    # isinstance check instead of `x is not np.nan`: the identity test only
    # recognises the np.nan singleton and crashes on None or other NaN objects.
    dataframe['Description_length'] = dataframe['Description'].apply(
        lambda text: len(text.split()) if isinstance(text, str) else 0)
    return dataframe


def fill_na(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Fills NaN with zeroes in the image-metadata and sentiment columns.

    Columns that are absent (e.g. in a dataset without metadata and sentiment)
    are skipped, so every column that is present gets filled regardless of
    which others are missing. The dataframe is modified in place and returned.
    """
    columns_to_fill = [
        'metadata_label_score',
        'metadata_color_score',
        'metadata_color_pixelfrac',
        'metadata_crop_conf',
        'metadata_crop_importance',
        'doc_sent_mag',
        'doc_sent_score',
    ]

    for column in columns_to_fill:
        if column in dataframe.columns:
            # Assign back instead of chained `inplace=True`, which does not
            # reliably modify the parent frame under pandas copy-on-write.
            dataframe[column] = dataframe[column].fillna(0)

    return dataframe


def feature_eng(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Runs the cleaning pipeline: adds the description length, drops irrelevant
    columns, converts categorical columns and fills NaNs, in that order
    """
    pipeline = (
        append_description_length,  # must run before 'Description' is dropped
        drop_columns,
        convert_to_cat,
        fill_na,
    )
    for step in pipeline:
        dataframe = step(dataframe)

    return dataframe

## Helper functions for quadratic weighted kappa score

The following 3 functions have been taken from Ben Hamner's github repository
https://github.com/benhamner/Metrics

In [None]:
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings

    The result is a square list of lists where entry [i][j] counts the items
    rated `min_rating + i` by rater_a and `min_rating + j` by rater_b. When
    `min_rating` / `max_rating` are not given they are taken from the smallest
    and largest rating found in either sequence.
    """
    assert(len(rater_a) == len(rater_b))
    # Take the bounds per rater: `rater_a + rater_b` only concatenates lists,
    # for numpy arrays it adds element-wise and yields wrong bounds.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Counts how often each rating between min_rating and max_rating occurs.

    Entry k of the returned list is the number of ratings equal to
    `min_rating + k`. Bounds that are not given default to the smallest and
    largest rating in `ratings`.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating

    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        counts[rating - lowest] += 1
    return counts


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa between two sets of ratings.

    The quadratic weighted kappa measures inter-rater agreement between two
    raters that provide discrete numeric ratings. Values range from -1
    (complete disagreement) to 1 (complete agreement); 0 is expected when all
    agreement is due to chance.

    `y` and `y_pred` must have the same length. Both are converted to integer
    numpy arrays, and the rating range is taken from the smallest and largest
    value found in either of them.

    Note: when both raters only ever use one and the same rating, the weight
    computation divides by zero and a ZeroDivisionError is raised.
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert(len(rater_a) == len(rater_b))

    min_rating = min(min(rater_a), min(rater_b))
    max_rating = max(max(rater_a), max(rater_b))

    conf_mat = confusion_matrix(rater_a, rater_b, min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    # Largest possible squared distance between two ratings, used to scale weights
    max_squared_distance = pow(num_ratings - 1, 2.0)

    observed = 0.0
    expected = 0.0
    for i, row in enumerate(conf_mat):
        for j, observed_count in enumerate(row):
            weight = pow(i - j, 2.0) / max_squared_distance
            # Count expected for this cell if the two raters were independent
            expected_count = hist_rater_a[i] * hist_rater_b[j] / num_scored_items
            observed += weight * observed_count / num_scored_items
            expected += weight * expected_count / num_scored_items

    return (1.0 - observed / expected)

## Helper functions for Optimized Rounder
Most successful Kaggle notebooks which are using lgb also use optimized rounder.

In [None]:
class OptimizedRounder(object):
    """
    Optimized rounder is used to convert regression problem to classification problem.
    After regressor returns predicted values, these values are put into bins depending
    on certain coefficients.

    `fit` searches for the four thresholds that maximise the quadratic weighted
    kappa, `coefficients` returns them and `predict` applies thresholds to
    continuous predictions, producing classes 0-4.
    """
    def __init__(self):
        # Replaced by the scipy optimisation result once fit() has run
        self.coef_ = 0

    @staticmethod
    def _bin(X, coef):
        """
        Maps every value of X to a class in 0..4 using the four thresholds in coef.

        The conditions are evaluated in order and the first match wins, exactly
        like an if/elif chain, so unsorted thresholds behave consistently.
        Values matching no condition (including NaN) fall into class 4. The
        result is a new array with the same dtype as np.copy(X).
        """
        X_p = np.copy(X)
        binned = np.select(
            [X_p < coef[0], X_p < coef[1], X_p < coef[2], X_p < coef[3]],
            [0, 1, 2, 3],
            default=4,
        )
        return binned.astype(X_p.dtype)

    def _kappa_loss(self, coef, X, y):
        """Returns the negative quadratic weighted kappa of X binned with coef against y."""
        return -quadratic_weighted_kappa(y, self._bin(X, coef))

    def fit(self, X, y):
        """
        Finds the thresholds minimising the kappa loss with Nelder-Mead,
        starting from [0.5, 1.5, 2.5, 3.5], and stores the optimisation result
        in `self.coef_`.
        """
        # Imported explicitly so the submodule is guaranteed to be loaded
        from scipy.optimize import minimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef=None):
        """
        Bins the continuous predictions X into classes 0-4.

        `coef` holds the four thresholds; when omitted, the thresholds found by
        fit() are used.
        """
        if coef is None:
            coef = self.coefficients()
        return self._bin(X, coef)

    def coefficients(self):
        """Returns the fitted thresholds; only valid after fit() has been called."""
        return self.coef_['x']
    
    
def rmse(actual, predicted):
    """
    Returns the square root of the mean squared error between actual and predicted
    """
    mse = mean_squared_error(actual, predicted)
    return sqrt(mse)

# Exploratory data analysis
### Adoption Speed Counts and Adoption Speed by Type
- 0 - Pet was adopted on the same day as it was listed.
- 1 - Pet was adopted between 1 and 7 days (1st week) after being listed.
- 2 - Pet was adopted between 8 and 30 days (1st month) after being listed.
- 3 - Pet was adopted between 31 and 90 days (2nd & 3rd month) after being listed.
- 4 - No adoption after 100 days of being listed. (There are no pets in this dataset that waited between 90 and 100 days).

As we can see, only a small minority of animals are adopted on the day they are listed. Cats tend to be adopted more often at the faster adoption speeds, but as the speed gets lower, dogs start to take over. A significant share of pets are not adopted at all, even after 100 days.

In [None]:
fig = plt.figure(figsize=(15,8))

# One panel per entry: extra countplot arguments and the panel title
panels = [
    ({}, 'Adoption speed counts'),
    ({'hue': "Type"}, 'Adoption speed by type (1 = Dog, 2 = Cat)'),
]

for position, (extra_kwargs, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(1,2,position)
    ax = sns.countplot(x="AdoptionSpeed", data=train, palette="YlOrRd", edgecolor="black", **extra_kwargs)
    autolabel(ax)
    ax.set_ylabel('Count')
    ax.set_xlabel('Adoption Speed')
    ax.set_title(title)

plt.show()

### Adoption speed by Type
It seems that dogs tend to be abandoned a little more often than cats. Of course this is just a guess since proportions of animals in the country are unknown. Cats tend to be adopted faster than dogs, but at lower speed dogs tend to take over.

In [None]:
# Counts and adoption-speed percentages per pet type
plot_by_feature(
    data=train,
    feature='Type',
    plt1_title='Count of pets by Type (1 = Dog, 2 = Cat)',
    plt2_title='Percentage of pets adopted by Type (1 = Dog, 2 = Cat)',
)

### Adoption speed by Gender
Females tend to be abandoned more often. Again, this is just a guess since proportions of animal gender in the country are unknown. Males tend to get adopted more often at lower speeds, then females take over.

In [None]:
# Counts and adoption-speed percentages per gender
plot_by_feature(
    data=train,
    feature='Gender',
    plt1_title='Adoption count of pets by Gender (1 = Male, 2 = Female, 3 = Mixed)',
    plt2_title='Percentage of pets adopted by Gender',
)

### Adoption speed by Color1
Black and brown pets dominate the shelters it seems.

In [None]:
# Counts and adoption-speed percentages per primary color
plot_by_feature(
    data=train,
    feature='Color1',
    plt1_title='Adoption count of pets by Color1 (1-Black, 2-Brown, 3-Golden, 4-Yellow, 5-Cream, 6-Gray, 7-White)',
    plt2_title='Percentage of pets adopted by Color1',
)

### Adoption speed by MaturitySize

Medium sized pets are most often in shelters. Interesting thing is that almost all extra large animals are adopted sooner or later.

In [None]:
# Counts and adoption-speed percentages per maturity size
plot_by_feature(
    data=train,
    feature='MaturitySize',
    plt1_title='Adoption count of pets by MaturitySize (1 = Small, 2 = Medium, 3 = Large, 4 = Extra Large)',
    plt2_title='Percentage of pets adopted by MaturitySize',
)

### Adoption speed by FurLength

The majority of abandoned animals have short-medium length hair. Long-haired pets have a pretty good chance to get adopted earlier.

In [None]:
# Counts and adoption-speed percentages per fur length
plot_by_feature(
    data=train,
    feature='FurLength',
    plt1_title='Adoption count of pets by FurLength (1 = Short, 2 = Medium, 3 = Long)',
    plt2_title='Percentage of pets adopted by FurLength',
)

### Adoption speed by Vaccinated

It's a good practice to vaccinate the animal before adoption, but a bigger part of them are adopted without vaccination anyway.

In [None]:
# Counts and adoption-speed percentages per vaccination status
plot_by_feature(
    data=train,
    feature='Vaccinated',
    plt1_title='Adoption count of pets by Vaccinated (1 = Yes, 2 = No, 3 = Not Sure)',
    plt2_title='Percentage of pets adopted by Vaccinated',
)

### Adoption speed by Dewormed

It's a good practice to deworm the animal before adoption, but deworming doesn't seem like an important factor for adoption percentage.

In [None]:
# Counts and adoption-speed percentages per deworming status
plot_by_feature(
    data=train,
    feature='Dewormed',
    plt1_title='Adoption count of pets by Dewormed (1 = Yes, 2 = No, 3 = Not Sure)',
    plt2_title='Percentage of pets adopted by Dewormed',
)

### Adoption speed by Sterilized

Majority of abandoned animals still have means to reproduce. Good for them! Also it seems that absence of sterilization doesn't matter much. In fact, the percentage of sterilized and yet not adopted pets is much higher.

In [None]:
# Counts and adoption-speed percentages per sterilization status
plot_by_feature(
    data=train,
    feature='Sterilized',
    plt1_title='Adoption count of pets by Sterilized (1 = Yes, 2 = No, 3 = Not Sure)',
    plt2_title='Percentage of pets adopted by Sterilized',
)

### Adoption speed by Health

Almost all abandoned pets are in perfect health. It's good to see that the majority of injured animals is adopted sooner or later as well.

In [None]:
# Counts and adoption-speed percentages per health condition
plot_by_feature(
    data=train,
    feature='Health',
    plt1_title='Adoption count of pets by Health (1 = Healthy, 2 = Minor Injury, 3 = Serious Injury)',
    plt2_title='Percentage of pets adopted by Health',
)

### Adoption speed by PhotoAmount
It looks like the number of photos provided has only a minor influence on adoption speed.

In [None]:
# Box plot of the photo count per adoption speed, outliers hidden
g = sns.catplot(
    x="AdoptionSpeed",
    y="PhotoAmt",
    kind="box",
    data=train,
    height=8,
    aspect=1.5,
    palette="YlOrRd",
    showfliers=False,
)

# catplot returns a grid; the single plot lives in its top-left cell
box_axes = g.axes[0,0]
box_axes.set_xlabel('Adoption Speed')
box_axes.set_ylabel('Photo Amount')
box_axes.set_title('Photo Amount vs Adoption Speed ')

plt.show()

# Data addition

### This part only loads additional sentiment and metadata information. Feature engineering and data cleaning methods are provided above and they are used in model training part.

In [None]:
# Work on a copy so the raw `train` frame stays untouched, then append the
# image-metadata columns followed by the description-sentiment columns.
train_full = train.copy()
train_full = extract_metadata(train_full, train_full['PetID'], '../input/petfinder-adoption-prediction/train_metadata')
train_full = extract_sentiment(train_full, train_full['PetID'], '../input/petfinder-adoption-prediction/train_sentiment')

In [None]:
# Same enrichment for the test set, reading from the test_* folders.
test_full = test.copy()
test_full = extract_metadata(test_full, test_full['PetID'], '../input/petfinder-adoption-prediction/test_metadata')
test_full = extract_sentiment(test_full, test_full['PetID'], '../input/petfinder-adoption-prediction/test_sentiment')

# Model training

### In this part three models were selected for evaluation - light gradient boosting, extreme gradient boosting and logistic regression. CV and Kaggle scores are provided at the beginning of each section and a summary table is shown at the end.

## Light gradient boosting

Here we will use Optuna hyperparameter optimization framework to find best parameters for light gradient boosting algorithm. Then lgb will be trained on three sets of data: vanilla, full and with top selected features. Vanilla dataset is loaded as it is provided, cleaned and engineered. Full dataset consists of additional metadata and sentiment data. Top features are selected after checking feature importance on model, trained on full dataset.

Vanilla dataset (with no metadata and sentiment)
- Prediction time: 11.3 ms
- Kaggle score: 0.351
- QWK on validation: 0.403

Full dataset
- Prediction time: 9.95 ms
- Kaggle score: 0.333
- QWK on validation: 0.391

Top features dataset
- Prediction time: 9.69 ms
- Kaggle score: 0.348
- QWK on validation: 0.412

In [None]:
def objective(trial): 
    """
    Parameter search method for optuna framework

    Trains a 5-class LightGBM model with the hyperparameters suggested by
    `trial` and returns its accuracy on the validation split. Relies on the
    module-level `train_df` / `val_df` LightGBM datasets and on the
    `x_val` / `y_val` validation frames.
    """
    param = {
        # Multiclass setup: required by the multi_logloss metric and by the
        # argmax over per-class probabilities below
        'objective': 'multiclass',
        'num_class': 5,
        'metric': 'multi_logloss',
        "verbosity": -1,
        'data_random_seed': 42,
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'num_iterations': trial.suggest_int('num_iterations', 1, 1000),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 0, 11),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1.0),
        # Lets the same Dataset objects be reused across trials with
        # different min_child_samples values
        'feature_pre_filter': False
    }
    
    # lgb.train only accepts lgb.Dataset objects, not raw DataFrames
    gbm = lgb.train(param, train_df, valid_sets=[val_df]) 
    preds = gbm.predict(x_val).argmax(axis=1) 
    accuracy = sklearn.metrics.accuracy_score(y_val, preds)
    
    return accuracy


def run_lgb(params, train_data: pd.DataFrame, test_data:pd.DataFrame):
    """
    Runs light gradient boosting algorithm with stratified-k fold cross validation.

    params: LightGBM parameter dictionary used for every fold.
    train_data: engineered training frame including the 'AdoptionSpeed' target.
    test_data: engineered test frame with the same feature columns.

    Returns a tuple of:
    - the model trained on the last fold,
    - out-of-fold predictions for the training rows,
    - test predictions, one column per fold,
    - the feature frame (train_data without the target), used for feature importance.
    """
    labels = train_data.drop('AdoptionSpeed', axis=1)
    target = train_data['AdoptionSpeed']
    
    early_stop = 1000
    verbose_eval = 100
    num_rounds = 10000
    n_splits = 10

    kfold = StratifiedKFold(n_splits=n_splits)

    oof_train = np.zeros((train_data.shape[0]))
    oof_test = np.zeros((test_data.shape[0], n_splits))

    for fold, (train_index, valid_index) in enumerate(kfold.split(labels, target)):

        X_tr = labels.iloc[train_index, :]
        y_tr = target.iloc[train_index]

        X_val = labels.iloc[valid_index, :]
        y_val = target.iloc[valid_index]

        d_train = lgb.Dataset(X_tr, label=y_tr)
        d_valid = lgb.Dataset(X_val, label=y_val)
        watchlist = [d_train, d_valid]

        # Use the parameters handed in by the caller rather than a global dict.
        # NOTE(review): verbose_eval / early_stopping_rounds are keyword
        # arguments of older LightGBM releases - confirm the installed version.
        model = lgb.train(params,
                          train_set=d_train,
                          num_boost_round=num_rounds,
                          valid_sets=watchlist,
                          verbose_eval=verbose_eval,
                          early_stopping_rounds=early_stop)

        val_pred = model.predict(X_val, num_iteration=model.best_iteration)
        test_pred = model.predict(test_data, num_iteration=model.best_iteration)

        oof_train[valid_index] = val_pred
        oof_test[:, fold] = test_pred

    return model, oof_train, oof_test, labels


def plot_feature_importance(model, labels):
    """
    Draws a horizontal bar plot of the model's feature importances, most
    important feature first. `labels` supplies the feature names through its
    columns.
    """
    importance = pd.Series(model.feature_importance(), index=labels.columns)
    importance = importance.sort_values(ascending=False)

    # Two-column frame: the importance value and the feature name it belongs to
    importance_df = importance.rename('importance').to_frame()
    importance_df['feature'] = importance_df.index

    fig = plt.figure(figsize=(15,10))
    ax = sns.barplot(x="importance", y="feature", data=importance_df)
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    plt.show()

### OPTUNA HYPERPARAMETER OPTIMIZATION FOR LGB

In [None]:
# Hold out 20% of the engineered training data for scoring the optuna trials.
# NOTE(review): `train_engineered` is first assigned in the "full dataset" cell
# further down this notebook, so that cell must have been executed before this one.
# NOTE(review): no random_state is passed, so the split differs between runs.
x_train, x_val, y_train, y_val = train_test_split(train_engineered.drop(['AdoptionSpeed'], axis=1), train_engineered['AdoptionSpeed'], test_size=0.2)
# LightGBM Dataset wrappers for the two splits
train_df = lgb.Dataset(data=x_train, label=y_train)
val_df = lgb.Dataset(data=x_val, label=y_val)

# Maximise the value returned by `objective` over 30 trials
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

In [None]:
# Hard-coded LightGBM parameters used for the run_lgb calls below.
# The task is configured as regression with an RMSE metric, so run_lgb
# produces continuous predictions.
best_trial_params = {'application': 'regression',
              'boosting': 'gbdt',
              'metric': 'rmse',
              'lambda_l1': 9.74072609344885e-07,
              'num_iterations': 993, 
              'lambda_l2': 1.0171525633411312e-08, 
              'num_leaves': 185, 
              'feature_fraction': 0.5408811645819958, 
              'bagging_fraction': 0.6886405927130945, 
              'bagging_freq': 6, 
              'min_child_samples': 8, 
              'max_depth': 0, 
              'learning_rate': 0.0011093564515343194}

### Running lgb on vanilla dataset

In [None]:
# Baseline run: copies of the raw train/test frames, i.e. without the
# image-metadata and sentiment columns.
train_vanilla = train.copy()
test_vanilla = test.copy()

train_vanilla = feature_eng(train_vanilla)
test_vanilla = feature_eng(test_vanilla)
model_vanilla, oof_train_vanilla, oof_test_vanilla, labels_vanilla = run_lgb(best_trial_params, train_vanilla, test_vanilla)

### Running lgb on full dataset (with metadata and sentiment)

In [None]:
# Full run: copies of the frames enriched with metadata and sentiment columns.
train_meta_sent = train_full.copy()
test_meta_sent = test_full.copy()

# train_engineered / test_engineered are reused by later cells
train_engineered = feature_eng(train_meta_sent)
test_engineered = feature_eng(test_meta_sent)
model_full, oof_train_full, oof_test_full, labels_full = run_lgb(best_trial_params, train_engineered, test_engineered)

Let's look at feature importances

In [None]:
# Importances of the model returned by the full-dataset run
plot_feature_importance(model_full, labels_full)

### Running lgb on top important features

In [None]:
# Features removed for the "top features" run; the same list is applied to
# both the train and the test frame so their columns stay aligned.
dropped_features = ['Color3', 'VideoAmt', 'Health', 'Breed2', 'metadata_crop_conf', 'Color1', 'Color2', 'State', 'Type']

train_top_features = train_engineered.drop(dropped_features, axis=1)
test_top_features = test_engineered.drop(dropped_features, axis=1)

# Train on the reduced frames (not on train_engineered / test_engineered,
# which would simply repeat the full-dataset run).
model_top_features, oof_train_top_features, oof_test_top_features, labels_top_features = run_lgb(best_trial_params, train_top_features, test_top_features)

# XGBoost

Here we will use Optuna hyperparameter optimization framework to find best parameters for XGBoost algorithm. Then xgb will be trained on three sets of data: vanilla, full and with top selected features. Vanilla dataset is loaded as it is provided, cleaned and engineered. Full dataset consists of additional metadata and sentiment data. Top features are selected after checking feature importance on model, trained on full dataset.

Vanilla dataset (with no metadata and sentiment)
- Prediction time: 11.5 ms
- Kaggle score: 0.17
- QWK on validation: 0.24

Full dataset
- Prediction time:  10.5 ms
- Kaggle score: 0.23
- QWK on validation: 0.31

Top features dataset
- Prediction time: 10.0 ms
- Kaggle score: 0.21
- QWK on validation: 0.27

In [None]:
def objective(trial): 
    """
    Parameter search method for optuna framework (XGBoost)

    Trains a 5-class XGBoost model with the hyperparameters suggested by
    `trial` and returns its accuracy on the validation split. Relies on the
    module-level `train_df` / `val_df` DMatrix objects, on `eval_sets` and on
    the `y_val` validation target.
    """
    param = {
        # Classification objective that makes predict() return class labels,
        # which is what accuracy_score below needs
        "objective": "multi:softmax",
        "num_class": 5,
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.6),
        'subsample': trial.suggest_uniform('subsample', 0.3, 0.9),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 0.9),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 4)
             }
    
    xgb_opt = xgb.train(param, train_df, evals=eval_sets) 
    preds = xgb_opt.predict(val_df) 
    accuracy = sklearn.metrics.accuracy_score(y_val, preds)
    
    return accuracy


def run_xgb(params, train_data: pd.DataFrame, test_data:pd.DataFrame):
    """
    Runs XGBoost algorithm with stratified-k fold cross validation.

    params: XGBoost parameter dictionary used for every fold.
    train_data: engineered training frame including the 'AdoptionSpeed' target.
    test_data: engineered test frame with the same feature columns.

    Returns a tuple of:
    - the model trained on the last fold,
    - out-of-fold predictions for the training rows,
    - test predictions, one column per fold,
    - the feature frame (train_data without the target), used for feature importance.
    """
    labels = train_data.drop('AdoptionSpeed', axis=1)
    target = train_data['AdoptionSpeed']
    
    early_stop = 100
    verbose_eval = 100
    num_rounds = 10000
    n_splits = 10

    kfold = StratifiedKFold(n_splits=n_splits)

    oof_train = np.zeros((train_data.shape[0]))
    oof_test = np.zeros((test_data.shape[0], n_splits))

    # The test matrix is the same for every fold, so build it only once
    d_test = xgb.DMatrix(test_data, feature_names=test_data.columns, enable_categorical=True)

    for fold, (train_index, valid_index) in enumerate(kfold.split(labels, target)):

        X_tr = labels.iloc[train_index, :]
        y_tr = target.iloc[train_index]

        X_val = labels.iloc[valid_index, :]
        y_val = target.iloc[valid_index]
        
        d_train = xgb.DMatrix(data=X_tr, label=y_tr, feature_names=X_tr.columns, enable_categorical=True)
        d_valid = xgb.DMatrix(data=X_val, label=y_val, feature_names=X_val.columns, enable_categorical=True)
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]

        model = xgb.train(dtrain=d_train, 
                          num_boost_round=num_rounds, 
                          evals=watchlist,
                          early_stopping_rounds=early_stop, 
                          verbose_eval=verbose_eval, 
                          params=params)

        # Predict with the trees up to the best early-stopping iteration.
        # d_valid is reused: the labels it carries do not influence predict().
        # NOTE(review): ntree_limit / best_ntree_limit belong to older XGBoost
        # releases - confirm the installed version still provides them.
        valid_pred = model.predict(d_valid, ntree_limit=model.best_ntree_limit)
        test_pred = model.predict(d_test, ntree_limit=model.best_ntree_limit)

        oof_train[valid_index] = valid_pred
        oof_test[:, fold] = test_pred

    return model, oof_train, oof_test, labels

### OPTUNA HYPERPARAMETER OPTIMIZATION FOR XGBOOST

In [None]:
# Hold out 20% of the engineered training data for scoring the optuna trials.
# NOTE(review): no random_state is passed, so the split differs between runs.
x_train, x_val, y_train, y_val = train_test_split(train_engineered.drop(['AdoptionSpeed'], axis=1), train_engineered['AdoptionSpeed'], test_size=0.2)
# XGBoost DMatrix wrappers; these rebind the train_df / val_df names
train_df = xgb.DMatrix(data=x_train, label=y_train, enable_categorical=True)
val_df = xgb.DMatrix(data=x_val, label=y_val, enable_categorical=True)
eval_sets = [(train_df, 'train'), (val_df, 'eval')]

# Maximise the value returned by `objective` over 30 trials
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

In [None]:
# Hard-coded XGBoost parameters used for the run_xgb calls below.
# No 'objective' or 'num_class' entry is set here.
# NOTE(review): presumably XGBoost's default objective applies - confirm this is intended.
xgb_params = {'learning_rate': 0.27590960722568275, 
              'subsample': 0.5877312259287172, 
              'max_depth': 6, 
              'colsample_bytree': 0.8009700602022006, 
              'min_child_weight': 1}

### Running xgb on vanilla dataset

In [None]:
# Baseline run: copies of the raw train/test frames, i.e. without the
# image-metadata and sentiment columns.
train_vanilla = train.copy()
test_vanilla = test.copy()

train_vanilla = feature_eng(train_vanilla)
test_vanilla = feature_eng(test_vanilla)

# model / oof_train / oof_test / labels are overwritten by every XGBoost run
model, oof_train, oof_test, labels = run_xgb(xgb_params, train_vanilla, test_vanilla)

### Running xgb on full dataset (with metadata and sentiment)

In [None]:
# Full run: copies of the frames enriched with metadata and sentiment columns.
train_meta_sent = train_full.copy()
test_meta_sent = test_full.copy()

train_engineered = feature_eng(train_meta_sent)
test_engineered = feature_eng(test_meta_sent)

# model / oof_train / oof_test / labels are overwritten by every XGBoost run
model, oof_train, oof_test, labels = run_xgb(xgb_params, train_engineered, test_engineered)

xgboost method plot_importance() stopped working one day, returning an error. Good thing that I managed to get feature importance before so I will use that information. Sorry for no picture.

In [None]:
#plot_importance(model) XGBoostError: [11:34:15] ../include/xgboost/feature_map.h:85: unknown feature type, use i for indicator and q for quantity

### Running xgb on top important features

In [None]:
# Columns kept for the reduced model; the same list is applied to both frames.
top_feature_columns = [
    'metadata_label_score',
    'metadata_color_score',
    'metadata_color_pixelfrac',
    'Description_length',
    'Age',
    'doc_sent_mag',
    'doc_sent_score',
    'PhotoAmt',
    'Fee',
    'Quantity',
    'metadata_crop_importance',
    'Type',
]

# The training frame additionally keeps the target column.
train_top_features = train_engineered.copy()[top_feature_columns + ['AdoptionSpeed']]
test_top_features = test_engineered.copy()[top_feature_columns]

model, oof_train, oof_test, labels = run_xgb(
    xgb_params,
    train_top_features,
    test_top_features,
)

# Logistic Regression

Vanilla dataset (with no metadata and sentiment)
- Prediction time: 9.77 ms
- Kaggle score: 0.18668
- QWK on validation: 0.17221939557390697

Full dataset
- Prediction time: 12.4 ms
- Kaggle score: 0.20678
- QWK on validation: 0.20868

Top features dataset
- Prediction time: 5.26 ms
- Kaggle score: 0.05049
- QWK on validation: 0.0831

In [None]:
def objective(trial):
    """
    Optuna objective for logistic regression.

    Samples a hyperparameter set from `trial`, fits a LogisticRegression on the
    notebook-level `x_train` / `y_train` split and returns its accuracy on
    `x_val` / `y_val` (the study is expected to maximize this value).

    NOTE(review): the score returned here is plain accuracy, while the results
    reported in this notebook are quadratic weighted kappa - confirm that
    accuracy is the intended tuning target.
    """
    param = {
        'solver': trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'sag']),
        'penalty': trial.suggest_categorical('penalty', ["l2", "none"]),
        # suggest_float is the supported replacement for the deprecated
        # suggest_uniform; it samples from the same [0.1, 1.0] range.
        'C': trial.suggest_float('C', 0.1, 1.0),
        'class_weight': trial.suggest_categorical('class_weight', ["balanced", None]),
    }

    # Fixed seed and iteration cap so trials differ only by sampled parameters.
    model = LogisticRegression(**param, max_iter=500, random_state=12)
    model.fit(x_train, y_train)
    preds = model.predict(x_val)

    return sklearn.metrics.accuracy_score(y_val, preds)

### OPTUNA HYPERPARAMETER OPTIMIZATION FOR LOGISTIC REGRESSION 

In [None]:
# Fresh random 80/20 split of the engineered training frame; the resulting
# notebook-level names are the ones read by the logistic regression objective.
x_train, x_val, y_train, y_val = train_test_split(
    train_engineered.drop(columns=['AdoptionSpeed']),
    train_engineered['AdoptionSpeed'],
    test_size=0.2,
)

# Run 50 trials, keeping the parameter set with the highest objective value.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

In [None]:
# Fixed logistic regression settings used for every fit below.
# NOTE(review): with penalty 'none' the C value is likely unused - confirm
# against the scikit-learn version in use.
params = dict(
    solver='newton-cg',
    penalty='none',
    C=0.1,
    class_weight='balanced',
)
clf = LogisticRegression(max_iter=5000, **params)

### Running logistic regression on vanilla dataset

In [None]:
# Feature-engineer copies of the raw frames (no metadata / sentiment columns).
train_vanilla = feature_eng(train.copy())
test_vanilla = feature_eng(test.copy())

# Separate predictors from the AdoptionSpeed label.
features = train_vanilla.drop(columns='AdoptionSpeed')
target = train_vanilla['AdoptionSpeed']

clf.fit(features, target)

### Running logistic regression on full dataset

In [None]:
# Feature-engineer copies of the merged (metadata + sentiment) frames.
train_full_copy = feature_eng(train_full.copy())
test_full_copy = feature_eng(test_full.copy())

# Separate predictors from the AdoptionSpeed label.
features = train_full_copy.drop(columns='AdoptionSpeed')
target = train_full_copy['AdoptionSpeed']

clf.fit(features, target)

### Checking feature importance with RFE

In [None]:
# Recursive feature elimination down to a single feature, which gives every
# column of `features` its own rank (1 = kept longest).
selector = RFE(clf, n_features_to_select=1)
selector = selector.fit(features, target)

# ranking_[j] is the rank of the j-th column of `features`, so each rank must
# be paired with the column at the same position. Using the rank value itself
# as a column index picks the wrong names.
feature_ranks = [
    f'{rank}. {name}'
    for rank, name in sorted(
        zip(selector.ranking_, features.columns),
        key=lambda pair: pair[0],
    )
]

### Running logistic regression on top features

In [None]:
# Feature-engineer copies of the merged (metadata + sentiment) frames.
train_full_copy = feature_eng(train_full.copy())
test_full_copy = feature_eng(test_full.copy())

# Reduced column set for logistic regression, defined once so the train and
# test frames cannot drift apart.
lr_top_feature_columns = [
    'Color1', 'Gender', 'Breed2', 'Fee', 'PhotoAmt', 'metadata_color_score',
    'metadata_crop_importance', 'metadata_color_pixelfrac',
    'Color3', 'Quantity', 'Health', 'MaturitySize', 'doc_sent_score',
]

# Take the label from the same frame as the predictors so rows are guaranteed
# to line up (previously it was read from the raw `train` frame).
target = train_full_copy['AdoptionSpeed']
features = train_full_copy[lr_top_feature_columns]
test_features = test_full_copy[lr_top_feature_columns]

clf.fit(features, target)

- Code snippet to calculate quadratic weighted kappa score for validation dataset for logistic regression

In [None]:
# Random 80/20 hold-out split of whichever `features` / `target` are current.
x_train, x_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
)

# Refit on the training part only, then score the held-out part.
pred_val = clf.fit(x_train, y_train).predict(x_val)
qwk = quadratic_weighted_kappa(y_val, pred_val)

- Code snippet to predict with logistic regression and submit predictions

In [None]:
# Predict the test set with the fitted classifier.
test_predictions = clf.predict(test_features)

# Two-column submission: pet identifier plus integer class prediction.
submission = pd.DataFrame(
    {
        'PetID': test['PetID'].values,
        'AdoptionSpeed': test_predictions.astype(np.int32),
    }
)
submission.to_csv('submission.csv', index=False)

- Code snippet to calculate quadratic weighted kappa score for validation datasets for LGBM and XGBoost

In [None]:
# Convert the out-of-fold predictions into class labels and score them with
# quadratic weighted kappa.
# NOTE(review): `target` is a notebook-level name that several cells reassign;
# it must be the label series matching the rows of `oof_train` - confirm
# before running this snippet.
optR = OptimizedRounder()
# Presumably learns the rounding thresholds from predictions vs. labels
# (OptimizedRounder is defined elsewhere in the notebook - verify).
optR.fit(oof_train, target)
coefficients = optR.coefficients()
# The same out-of-fold predictions used for fitting are scored here.
pred_test_y_k = optR.predict(oof_train, coefficients)
qwk = quadratic_weighted_kappa(target, pred_test_y_k)

- Code snippet to predict with LGBM and XGBoost and submit predictions

In [None]:
# oof_test holds one column of test predictions per fold; average across the
# folds, then map the averaged values to classes with the fitted coefficients.
test_predictions = optR.predict(np.mean(oof_test, axis=1), coefficients)

# Two-column submission: pet identifier plus integer class prediction.
submission = pd.DataFrame(
    {
        'PetID': test['PetID'].values,
        'AdoptionSpeed': test_predictions.astype(np.int32),
    }
)
submission.to_csv('submission.csv', index=False)

# Summary table

In [None]:
# Per-model scores as (validation QWK, Kaggle score), one value per feature
# set in the order: vanilla, all, top.
model_scores = {
    'Logistic reg': ([0.172, 0.209, 0.083], [0.187, 0.206, 0.05]),
    'XGBoost': ([0.241, 0.317, 0.272], [0.173, 0.235, 0.218]),
    'LightGBM': ([0.403, 0.391, 0.412], [0.351, 0.333, 0.348]),
}

# Assemble the column mapping: the row labels first, then a CV / Kaggle column
# pair for each model.
data = {'Algorithm': ['Vanilla features', 'All features', 'Top features']}
for model_name, (cv_scores, kaggle_scores) in model_scores.items():
    data[f'{model_name} CV'] = cv_scores
    data[f'{model_name} Kaggle'] = kaggle_scores

df = pd.DataFrame(data)
df