In [4]:
import numpy as np
import pandas as pd

import os
import json
from tqdm import tqdm

import lightgbm as lgb
import scipy as sp

from functools import partial
from sklearn.model_selection import StratifiedKFold

In [43]:
# Single seed shared by the notebook: used for NumPy's global RNG and in the LightGBM parameters.
RANDOM_STATE = 322

In [47]:
# Seed NumPy's global random generator with the notebook-wide seed.
np.random.seed(RANDOM_STATE)

In [5]:
# Root directory of the input files (relative path); the train/ and test/ data are read from below it.
input_path = "../../input"

In [6]:
# Load the training table from <input_path>/train/train.csv.
train_df = pd.read_csv(os.path.join(input_path, 'train/train.csv'))

In [7]:
# Load the test table from <input_path>/test/test.csv.
test_df = pd.read_csv(os.path.join(input_path, 'test/test.csv'))

### Looking at the label balance in the dataset

In [8]:
# Plot the distribution of the AdoptionSpeed target to check the label balance.
adoption_speed_hist = train_df.hist('AdoptionSpeed')

### Quadratic weighted kappa score

In [9]:
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings

    rater_a, rater_b: equal-length sequences (lists or NumPy arrays) of
        integer ratings.
    min_rating, max_rating: inclusive rating bounds. When omitted they are
        taken from the smallest / largest rating found in either sequence.

    The result is a list of lists in which conf_mat[i][j] counts the items
    rated (min_rating + i) by rater_a and (min_rating + j) by rater_b.
    """
    assert(len(rater_a) == len(rater_b))
    # Compute the bounds per rater instead of on `rater_a + rater_b`: `+`
    # concatenates lists but adds NumPy arrays element-wise, which would
    # yield wrong bounds for array inputs.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Count how often each rating value occurs.

    ratings: sequence of integer ratings.
    min_rating, max_rating: inclusive bounds of the rating scale; each one
        defaults to the smallest / largest value present in `ratings`.

    Returns a list whose k-th entry is the number of ratings equal to
    (min_rating + k).
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating
    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        counts[rating - lowest] += 1
    return counts


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa between two sets of ratings.

    Quadratic weighted kappa is a measure of inter-rater agreement between
    two raters that provide discrete numeric ratings. Potential values range
    from -1 (representing complete disagreement) to 1 (representing complete
    agreement). A kappa value of 0 is expected if all agreement is due to
    chance.

    y, y_pred: equal-length sequences of ratings. Both are converted to
        integer arrays first, so fractional values are truncated, not
        rounded.

    The rating scale is taken to span from the smallest to the largest value
    found in either input.

    Returns the kappa value as a float. When both inputs consist of one and
    the same rating throughout, 1.0 is returned.
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert(len(rater_a) == len(rater_b))
    min_rating = min(min(rater_a), min(rater_b))
    max_rating = max(max(rater_a), max(rater_b))
    if min_rating == max_rating:
        # Only one rating value occurs at all: the disagreement weights below
        # would be 0/0 (ZeroDivisionError). The raters agree on every item,
        # so report complete agreement.
        return 1.0
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            # Count expected for cell (i, j) if the two raters were independent.
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            # Quadratic disagreement weight, normalised to [0, 1].
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

### Coefficients optimizer

In [10]:
class OptimizedRounder(object):
    """
    Learns four thresholds that turn continuous predictions into the
    classes 0..4 so that the quadratic weighted kappa is maximised.

    Typical use: call fit(X, y), read the thresholds with coefficients(),
    then call predict(X) or predict(X, coef) with explicit thresholds.
    """

    def __init__(self):
        # Placeholder until fit() stores the scipy optimisation result here.
        self.coef_ = 0

    @staticmethod
    def _apply_thresholds(X, coef):
        """
        Map every value of X to a class using the thresholds in `coef`.

        The first matching rule wins: < coef[0] -> 0, < coef[1] -> 1,
        < coef[2] -> 2, < coef[3] -> 3, otherwise 4. The thresholds are not
        required to be sorted. The result is a new array with the same shape
        and dtype as X; X itself is not modified.
        """
        values = np.asarray(X)
        labels = np.select(
            [values < coef[0], values < coef[1],
             values < coef[2], values < coef[3]],
            [0, 1, 2, 3],
            default=4,
        )
        # Keep the input dtype, as the element-wise version of this code did.
        return labels.astype(values.dtype)

    def _kappa_loss(self, coef, X, y):
        """Negative quadratic weighted kappa of X thresholded with `coef` against y."""
        X_p = self._apply_thresholds(X, coef)
        return -quadratic_weighted_kappa(y, X_p)

    def fit(self, X, y):
        """
        Search for thresholds that maximise the kappa of X against y.

        Runs Nelder-Mead from the start point [0.5, 1.5, 2.5, 3.5] and stores
        the optimisation result in self.coef_.
        """
        # Import the submodule explicitly: `import scipy as sp` alone is not
        # guaranteed to make `sp.optimize` available on every SciPy version.
        from scipy.optimize import minimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef=None):
        """
        Convert continuous predictions X into classes 0..4.

        coef: sequence of four thresholds. When omitted, the thresholds found
            by fit() are used.

        Returns a new array with the shape and dtype of X.
        """
        if coef is None:
            coef = self.coefficients()
        return self._apply_thresholds(X, coef)

    def coefficients(self):
        """Return the thresholds found by fit() (the 'x' entry of the stored result)."""
        return self.coef_['x']

### Generate features based on 'Name' ('No_name',  'Bad_name')

In [11]:
# Replace missing names with the placeholder 'No Name' in both tables.
train_df['Name'] = train_df['Name'].fillna('No Name')
test_df['Name'] = test_df['Name'].fillna('No Name')

In [12]:
# No_name = 1 when the name is one of the "unnamed" placeholders, else 0.
train_df['No_name'] = train_df['Name'].isin(['No Name', 'No Name Yet']).astype('int64')
test_df['No_name'] = test_df['Name'].isin(['No Name', 'No Name Yet']).astype('int64')

In [13]:
# Bad_name = 1 when the name, taken as a string, has fewer than 3 characters.
train_df['Bad_name'] = (train_df['Name'].astype(str).str.len() < 3).astype('int64')

In [14]:
# Bad_name = 1 when the name, taken as a string, has fewer than 3 characters.
test_df['Bad_name'] = (test_df['Name'].astype(str).str.len() < 3).astype('int64')

In [15]:
train_df.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed,No_name,Bad_name
0,2,Nibble,3,299,0,1,1,7,0,1,...,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2,0,0
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0,1,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3,0,0
3,1,Miko,4,307,0,2,1,2,0,2,...,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2,0,0
4,1,Hunter,1,307,0,1,1,0,0,2,...,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2,0,0


### Add features from sentiment data

In [16]:
def add_sentiment_features(df, id_column_name, sent_dir):
    """
    Add document-level sentiment columns read from per-row JSON files.

    For every value in df[id_column_name] the file '<sent_dir>/<id>.json' is
    opened and its ['documentSentiment']['magnitude'] and
    ['documentSentiment']['score'] entries are stored in the new columns
    'doc_sent_mag' and 'doc_sent_score'. Rows whose file is missing,
    unreadable or lacks those entries get -1 in both columns and are counted
    in the printed "Not found" total.

    df is modified in place and also returned.
    """
    doc_sent_mag = []
    doc_sent_score = []
    not_found_count = 0

    for pet_id in tqdm(df[id_column_name]):
        try:
            with open(os.path.join(sent_dir, pet_id + '.json'), 'r') as f:
                document_sentiment = json.load(f)['documentSentiment']
            # Read both values before appending either one, so a failure on
            # the second lookup cannot leave the two lists out of step.
            magnitude = document_sentiment['magnitude']
            score = document_sentiment['score']
        except (OSError, ValueError, KeyError, TypeError):
            # Missing/unreadable file, invalid JSON or unexpected structure:
            # fall back to the sentinel value. Anything else is a programming
            # error and is allowed to surface.
            not_found_count += 1
            magnitude = score = -1
        doc_sent_mag.append(magnitude)
        doc_sent_score.append(score)
    df.loc[:, 'doc_sent_mag'] = doc_sent_mag
    df.loc[:, 'doc_sent_score'] = doc_sent_score
    print("Number of all objects: {}".format(df.shape[0]))
    print("Not found: {}".format(not_found_count))
    return df

In [17]:
train_df = add_sentiment_features(train_df, 'PetID', input_path + '/train_sentiment')

100%|██████████| 14993/14993 [00:04<00:00, 3548.55it/s]

Number of all objects: 14993
Not found: 551





In [18]:
test_df = add_sentiment_features(test_df, 'PetID', input_path + '/test_sentiment')

100%|██████████| 3948/3948 [00:01<00:00, 3363.39it/s]

Number of all objects: 3948
Not found: 133





In [19]:
def add_image_metadata_features(df, id_column_name, metadata_dir):
    """
    Add features taken from the metadata JSON of each row's first image.

    For every value in df[id_column_name] the file
    '<metadata_dir>/<id>-1.json' is read. From the first crop hint it takes
    the x/y of vertex 2, the confidence and the importance fraction (-1 when
    absent); from the first dominant color the blue/green/red values, pixel
    fraction and score; and from the first label annotation its score
    (-1 when there are no label annotations). When the file does not exist,
    every feature of that row is -1.

    Prints the number of missing files and then the number of files without
    label annotations. df is modified in place and also returned.
    """
    feature_names = (
        'vertex_x', 'vertex_y', 'bounding_confidence', 'bounding_importance',
        'dominant_blue', 'dominant_green', 'dominant_red',
        'dominant_pixel_frac', 'dominant_score', 'label_score',
    )
    collected = {name: [] for name in feature_names}
    nf_count = 0
    nl_count = 0

    for pet in tqdm(df[id_column_name]):
        try:
            with open(os.path.join(metadata_dir, pet + '-1.json'), 'r') as f:
                data = json.load(f)
        except FileNotFoundError:
            nf_count += 1
            row = (-1,) * len(feature_names)
        else:
            crop_hint = data['cropHintsAnnotation']['cropHints'][0]
            corner = crop_hint['boundingPoly']['vertices'][2]
            vertex_x = corner['x']
            vertex_y = corner['y']
            bounding_confidence = crop_hint['confidence']
            bounding_importance_frac = crop_hint.get('importanceFraction', -1)

            top_color = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]
            dominant_blue = top_color['color']['blue']
            dominant_green = top_color['color']['green']
            dominant_red = top_color['color']['red']
            dominant_pixel_frac = top_color['pixelFraction']
            dominant_score = top_color['score']

            if data.get('labelAnnotations'):
                label_score = data['labelAnnotations'][0]['score']
            else:
                nl_count += 1
                label_score = -1

            row = (vertex_x, vertex_y, bounding_confidence, bounding_importance_frac,
                   dominant_blue, dominant_green, dominant_red,
                   dominant_pixel_frac, dominant_score, label_score)

        for name, value in zip(feature_names, row):
            collected[name].append(value)

    print(nf_count)
    print(nl_count)
    for name in feature_names:
        df.loc[:, name] = collected[name]
    return df

In [20]:
train_df = add_image_metadata_features(train_df, 'PetID', input_path + '/train_metadata')

100%|██████████| 14993/14993 [00:04<00:00, 3312.87it/s]


341
2


In [21]:
test_df = add_image_metadata_features(test_df, 'PetID', input_path + '/test_metadata')

100%|██████████| 3948/3948 [00:01<00:00, 3227.52it/s]

128
0





### Generate simple features (without word embeddings) based on 'Description'

In [22]:
def add_text_length(df, text_column='Description'):
    """
    Add a '<text_column>Length' column holding the character count of each text.

    Missing texts are treated as a single space. df is modified in place and
    also returned.
    """
    texts = df[text_column].fillna(" ")
    df[text_column + 'Length'] = texts.apply(len)
    return df

In [23]:
def add_number_of_words(df, text_column='Description'):
    """
    Add a 'NumberOfWords' column with the whitespace-separated word count.

    Missing texts are treated as a single space (zero words). df is modified
    in place and also returned.
    """
    texts = df[text_column].fillna(" ")
    df['NumberOfWords'] = texts.apply(lambda text: len(text.split()))
    return df

In [24]:
def add_average_word_length(df, text_length_col='DescriptionLength', number_of_words_col='NumberOfWords'):
    """
    Add an 'AverageWordLength' column: text length divided by word count.

    Infinite ratios (division of a non-zero length by zero words) are
    replaced by 0. df is modified in place and also returned.
    """
    ratio = df[text_length_col].div(df[number_of_words_col])
    df['AverageWordLength'] = ratio.replace([np.inf, -np.inf], 0)
    return df

In [25]:
def add_simple_text_features(df, text_column='Description'):
    """
    Add the length, word-count and average-word-length features for a text column.

    Creates '<text_column>Length', 'NumberOfWords' and 'AverageWordLength'
    by calling the three helper functions in turn. Returns the resulting
    DataFrame.
    """
    df = add_text_length(df, text_column)
    df = add_number_of_words(df, text_column)
    # Use the length column that was just created for `text_column`; a
    # hard-coded 'DescriptionLength' breaks for any other column name.
    df = add_average_word_length(df, text_column + 'Length', 'NumberOfWords')
    return df

In [26]:
train_df = add_simple_text_features(train_df)

In [27]:
test_df = add_simple_text_features(test_df)

### TF-IDF features from description

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [29]:
# Number of SVD components kept from the TF-IDF matrix (one svd_<i> column each).
SVD_COMPONENTS = 120

train_desc = train_df.Description.fillna("none").values
test_desc = test_df.Description.fillna("none").values

# The on/off switches are passed as real booleans; newer scikit-learn
# releases validate parameter types and may reject the integer 1.
tfv = TfidfVectorizer(min_df=3, max_features=10000,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
        stop_words='english')

# Fit TF-IDF on the training descriptions only, then transform both sets.
tfv.fit(list(train_desc))
X_tf_idf = tfv.transform(train_desc)
X_test_tf_idf = tfv.transform(test_desc)

# Seed the SVD explicitly so the components do not depend on the state of
# NumPy's global RNG (i.e. on which cells were run before this one).
svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=RANDOM_STATE)
svd.fit(X_tf_idf)

svd_columns = ['svd_{}'.format(i) for i in range(SVD_COMPONENTS)]

# Build the component frames on the index of the table they are joined to,
# so the column-wise concat lines rows up whatever that index is. Dropping
# any existing svd_* columns first keeps the cell safe to re-run.
X_tf_idf = pd.DataFrame(svd.transform(X_tf_idf), columns=svd_columns, index=train_df.index)
train_df = pd.concat((train_df.drop(columns=svd_columns, errors='ignore'), X_tf_idf), axis=1)
X_test_tf_idf = pd.DataFrame(svd.transform(X_test_tf_idf), columns=svd_columns, index=test_df.index)
test_df = pd.concat((test_df.drop(columns=svd_columns, errors='ignore'), X_test_tf_idf), axis=1)

### Drop unused features

In [30]:
# Columns removed from the feature matrices: the two hashed identifier
# columns and the raw description text.
drop_list = ['PetID', 
             'RescuerID', 
             'Description']

In [31]:
pet_id_test = test_df['PetID']

In [32]:
y = train_df['AdoptionSpeed']

In [33]:
X = train_df.drop(drop_list, axis=1)

In [34]:
# X = X.drop('AdoptionSpeed', axis=1)

In [35]:
X_test = test_df.drop(drop_list, axis=1)

### Select categorical features

In [36]:
def get_column_indeces_from_names(df,name_list):
    """
    Translate column names into their positions in df.columns.

    Returns a list with one `df.columns.get_loc(name)` result per entry of
    name_list, in the same order.
    """
    return [df.columns.get_loc(column_name) for column_name in name_list]

In [48]:
# Names of the columns to be treated as categorical features.
# NOTE(review): no later cell visible in this notebook references this list
# (the LightGBM fit does not receive it) — confirm whether it should be used.
cat_feature_names = [
    'Type',
    'Breed1',
    'Breed2',
    'Gender',
    'Color1',
    'Color2',
    'Color3',
    'Vaccinated',
    'Dewormed',
    'Sterilized',
    'Health',
    'State',
    'No_name',
    'Bad_name'
]

### Prepare the feature matrices and run cross-validation

In [38]:
X = X.drop(['Name', 'AdoptionSpeed'], axis=1)

In [39]:
X.head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,svd_110,svd_111,svd_112,svd_113,svd_114,svd_115,svd_116,svd_117,svd_118,svd_119
0,2,3,299,0,1,1,7,0,1,1,...,-0.044593,0.042551,-0.050962,0.005873,-0.012918,0.024615,-0.02477,0.078667,0.039082,0.015529
1,2,1,265,0,1,1,2,0,2,2,...,0.063578,0.004131,-0.02184,-0.026501,-0.021954,0.026265,-0.014663,0.073447,-0.042642,-0.029909
2,1,1,307,0,1,2,7,0,2,2,...,-0.027405,0.024693,-0.031883,-0.018671,0.005586,-0.029937,0.02708,-0.01468,0.003163,0.024323
3,1,4,307,0,2,1,2,0,2,1,...,0.064175,0.01091,0.00962,0.03565,0.013052,0.009769,0.006746,-0.005976,-0.020458,-0.006212
4,1,1,307,0,1,1,0,0,2,1,...,-0.044997,-0.009131,-0.000749,-0.000777,0.015006,3e-06,0.018006,0.041699,-0.022665,0.036202


In [56]:
X_test = X_test.drop(['Name'], axis=1)

In [59]:
def cross_val(X, y, n_splits):
    """
    Train one LightGBM regressor per stratified fold and predict the test set.

    For every fold the model is fitted with early stopping on the validation
    part, rounding thresholds are optimised on the validation predictions
    with OptimizedRounder, and the quadratic weighted kappa of the
    thresholded validation predictions is printed. Because the thresholds
    are tuned on the same validation data they are scored on, the printed
    QWK is an optimistic estimate.

    X, y: pandas feature matrix and target, split by row position.
    n_splits: number of folds.

    Reads the notebook-level `X_test` for the test-set predictions.

    Returns a dict with
        'predictions'  - (n_test, 1) array, test predictions averaged over folds
        'coefficients' - the four thresholds averaged over folds
        'cv_results'   - list with one dict per fold (Fold, Model, QWK, Coef)
    """
    # Without shuffling the folds are already deterministic. random_state
    # only applies when shuffle=True, and current scikit-learn raises an
    # error when it is supplied together with shuffle=False.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=False)
    test_predictions = np.zeros((X_test.shape[0], 1))

    # Identical for every fold, so built once. The original literal listed
    # 'max_depth' twice (17, then 11); the later value was the one in effect.
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'learning_rate': 0.005,
        'subsample': .8,
        'colsample_bytree': 0.8,
        'min_split_gain': 0.006,
        'min_child_samples': 150,
        'min_child_weight': 0.1,
        'n_estimators': 10000,
        'num_leaves': 80,
        'silent': -1,
        'verbose': -1,
        'max_depth': 11,
        'random_state': RANDOM_STATE
    }

    cv_results = []
    scores = []
    coefficients = np.zeros((n_splits, 4))
    for fold, (tr_ind, val_ind) in enumerate(skf.split(X, y)):
        # split() yields row positions, so index by position; label-based
        # .loc is only equivalent when the index is 0..n-1.
        X_train, y_train = X.iloc[tr_ind], y.iloc[tr_ind]
        X_valid, y_valid = X.iloc[val_ind], y.iloc[val_ind]

        model = lgb.LGBMRegressor(**lgb_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='rmse',
            verbose=500,
            early_stopping_rounds=100
        )

        y_pred = model.predict(X_valid, num_iteration=model.best_iteration_)
        test_pred = model.predict(X_test, num_iteration=model.best_iteration_)
        test_predictions += test_pred.reshape(-1, 1)

        optR = OptimizedRounder()
        optR.fit(y_pred, y_valid.values)
        coefficients[fold, :] = optR.coefficients()

        pred = optR.predict(y_pred, coefficients[fold, :])

        kappa_scr = quadratic_weighted_kappa(y_valid, pred)

        print("Fold = {}. QWK = {}. Coef = {}".format(fold, kappa_scr, coefficients[fold,:]))
        cv_results.append({
            'Fold': fold,
            'Model': model,
            'QWK': kappa_scr,
            'Coef': coefficients[fold, :],
        })
        scores.append(kappa_scr)
    print('Average: {}'.format(sum(scores)/n_splits))
    test_predictions = test_predictions * 1./n_splits

    return {'predictions': test_predictions,
            'coefficients': np.mean(coefficients, axis=0),
            'cv_results': cv_results}

In [60]:
results = cross_val(X, y, n_splits = 3)

Training until validation scores don't improve for 100 rounds.
[500]	valid_0's rmse: 1.0718	valid_0's l2: 1.14875
[1000]	valid_0's rmse: 1.05697	valid_0's l2: 1.11719
[1500]	valid_0's rmse: 1.05277	valid_0's l2: 1.10833
[2000]	valid_0's rmse: 1.05084	valid_0's l2: 1.10426
[2500]	valid_0's rmse: 1.04951	valid_0's l2: 1.10148
[3000]	valid_0's rmse: 1.04876	valid_0's l2: 1.0999
Early stopping, best iteration is:
[3231]	valid_0's rmse: 1.04859	valid_0's l2: 1.09955
Fold = 0. QWK = 0.42118468687948296. Coef = [0.5206481  1.79718725 2.49789042 2.92877396]
Training until validation scores don't improve for 100 rounds.
[500]	valid_0's rmse: 1.06401	valid_0's l2: 1.13212
[1000]	valid_0's rmse: 1.04824	valid_0's l2: 1.0988
[1500]	valid_0's rmse: 1.04258	valid_0's l2: 1.08698
[2000]	valid_0's rmse: 1.04033	valid_0's l2: 1.08228
[2500]	valid_0's rmse: 1.0391	valid_0's l2: 1.07972
[3000]	valid_0's rmse: 1.0385	valid_0's l2: 1.07849
[3500]	valid_0's rmse: 1.03793	valid_0's l2: 1.0773
Early stopping,

In [61]:
print(results['predictions'])

[[1.82790913]
 [3.00003842]
 [2.92666864]
 ...
 [1.3671936 ]
 [1.9859169 ]
 [2.71706688]]


In [63]:
optR = OptimizedRounder()

In [64]:
predictions = optR.predict(results['predictions'], results['coefficients']).astype(int)


In [67]:
predictions = list(map(lambda x: x[0], predictions))

In [68]:
predictions

[1,
 4,
 4,
 3,
 4,
 4,
 4,
 4,
 3,
 2,
 4,
 2,
 4,
 2,
 4,
 4,
 3,
 4,
 3,
 4,
 3,
 2,
 3,
 4,
 4,
 3,
 2,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 3,
 3,
 4,
 3,
 2,
 3,
 4,
 2,
 3,
 3,
 4,
 3,
 4,
 2,
 3,
 4,
 2,
 4,
 4,
 4,
 3,
 3,
 3,
 4,
 4,
 4,
 4,
 4,
 2,
 3,
 4,
 3,
 2,
 4,
 4,
 4,
 3,
 3,
 1,
 4,
 4,
 3,
 2,
 3,
 3,
 4,
 4,
 4,
 2,
 3,
 4,
 4,
 4,
 3,
 4,
 3,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 3,
 4,
 4,
 2,
 2,
 4,
 2,
 4,
 4,
 4,
 4,
 4,
 4,
 3,
 2,
 4,
 4,
 4,
 4,
 3,
 4,
 4,
 2,
 3,
 4,
 4,
 4,
 3,
 3,
 3,
 4,
 3,
 4,
 3,
 2,
 4,
 3,
 4,
 4,
 4,
 2,
 4,
 2,
 4,
 1,
 4,
 2,
 1,
 2,
 2,
 3,
 2,
 2,
 2,
 4,
 2,
 3,
 2,
 2,
 3,
 3,
 2,
 3,
 2,
 2,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 3,
 2,
 2,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 1,
 2,
 2,
 1,
 1,
 2,
 2,
 2,
 1,
 2,
 2,
 2,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 2,
 2,
 4,
 2,
 1,
 1,
 3,
 1,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 3,
 2,
 4,
 4,
 3,
 4,
 3,


In [71]:
submit_df = pd.read_csv(os.path.join(input_path, 'test/sample_submission.csv'))


In [72]:
# The predictions follow the row order of test.csv and are attached by
# position, so verify that the sample submission lists the same PetIDs in the
# same order instead of silently writing mismatched rows.
assert list(submit_df['PetID']) == list(pet_id_test), \
    "sample_submission.csv rows are not aligned with test.csv"
submit_df['AdoptionSpeed'] = predictions


In [73]:
submit_df.head()

Unnamed: 0,PetID,AdoptionSpeed
0,378fcc4fc,1
1,73c10e136,4
2,72000c4c5,4
3,e147a4b9f,3
4,43fbba852,4


In [74]:
submit_df.to_csv('submission.csv', index=False)