## Based almost entirely on Abishek's and Bluefool's kernels. For no other reason than to annoy them. No original contribution whatsoever. Peter Hurford also did some stuff, but it's not even worth trolling. You are welcome.


In [None]:
import json

import scipy as sp
import pandas as pd
import numpy as np

from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix
from sklearn.model_selection import StratifiedKFold

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from collections import Counter
from os.path import join

import nltk
import string
from gensim.models import word2vec
from tqdm import tqdm
from keras.preprocessing.text import text_to_word_sequence
from nltk.corpus import stopwords

import lightgbm as lgb
np.random.seed(369)

In [None]:
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings

    rater_a and rater_b are equal-length sequences (lists or numpy arrays)
    of integer ratings.  min_rating / max_rating bound the rating scale;
    when omitted they are taken from the smallest / largest rating found
    across both raters.

    The result is a list of lists in which entry [i][j] counts the items
    rated (min_rating + i) by rater_a and (min_rating + j) by rater_b.
    """
    assert(len(rater_a) == len(rater_b))
    # Take the extremes of each rater separately.  `rater_a + rater_b`
    # concatenates lists but adds numpy arrays element-wise, which would
    # produce wrong bounds for array input.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Counts how many times each rating value was given.

    The returned list has one slot per value in [min_rating, max_rating];
    slot k holds the number of ratings equal to (min_rating + k).  Bounds
    that are not supplied are taken from the ratings themselves.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating
    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        counts[rating - lowest] += 1
    return counts


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa

    quadratic_weighted_kappa(y, y_pred) measures the agreement between two
    raters that provide discrete numeric ratings.  Potential values range
    from -1 (representing complete disagreement) to 1 (representing
    complete agreement).  A kappa value of 0 is expected if all agreement
    is due to chance.

    y and y_pred are equal-length sequences of ratings.  Both are cast to
    integer arrays, and the rating scale is taken to run from the smallest
    to the largest value found across the two sequences.

    When both raters use one and the same single rating for every item the
    statistic is 0/0; this function returns 1.0 (complete agreement) for
    that case instead of raising ZeroDivisionError.
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert(len(rater_a) == len(rater_b))
    min_rating = min(min(rater_a), min(rater_b))
    max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    if num_ratings == 1:
        # Only one rating value exists, so every pair of ratings agrees and
        # the distance weights below would divide by (num_ratings - 1) == 0.
        return 1.0
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            # Count expected for cell (i, j) if the raters were independent.
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            # Quadratic disagreement weight, normalised to [0, 1].
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

In [None]:
class OptimizedRounder(object):
    """
    Turns continuous predictions into the ordinal classes 0..4 by cutting
    them at four thresholds, and searches for the thresholds that maximise
    the quadratic weighted kappa against known labels.
    """

    def __init__(self):
        # Replaced by the scipy optimisation result once fit() has run.
        self.coef_ = 0

    @staticmethod
    def _apply_thresholds(X, coef):
        """
        Map every value in X to the index of the first threshold it is
        strictly below (0..3), or to 4 when it is below none of them
        (which includes NaN).  The result keeps the dtype of X.
        """
        values = np.asarray(X)
        labels = np.full(values.shape, 4, dtype=values.dtype)
        # Assign from the last threshold to the first so that the lowest
        # matching threshold wins, exactly like an if/elif chain would.
        for label in (3, 2, 1, 0):
            labels[values < coef[label]] = label
        return labels

    def _kappa_loss(self, coef, X, y):
        """Negative quadratic weighted kappa of X cut at `coef`, against y."""
        X_p = self._apply_thresholds(X, coef)
        ll = quadratic_weighted_kappa(y, X_p)
        return -ll

    def fit(self, X, y):
        """Search for the thresholds on X that give the best kappa against y."""
        # Import the submodule explicitly: `scipy.optimize` is only reachable
        # as an attribute of `scipy` if something has already imported it.
        from scipy import optimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """Return X cut into classes 0..4 at the four thresholds in `coef`."""
        return self._apply_thresholds(X, coef)

    def coefficients(self):
        """Return the thresholds found by fit() (the optimiser's 'x' entry)."""
        return self.coef_['x']

In [None]:
def rmse(actual, predicted):
    """Root of the mean squared error between `actual` and `predicted`."""
    mse = mean_squared_error(actual, predicted)
    return sqrt(mse)

In [None]:
def get_sentiment(ids, path):
    """
    Collect the document-level sentiment of every pet in `ids`.

    For each id the file `path + id + '.json'` is read and its
    documentSentiment magnitude and score are recorded.  Pets without a
    file get NaN for both values.

    Returns a dict with the keys "doc_sent_mag" and "doc_sent_score", each
    holding a list aligned with `ids`.
    """
    dic = {
        "doc_sent_mag": [],
        "doc_sent_score": []
    }
    for pet in ids:
        try:
            # JSON is UTF-8 text; do not depend on the platform default.
            with open(path + pet + '.json', 'r', encoding='utf-8') as f:
                sentiment = json.load(f)
        except FileNotFoundError:
            magnitude = score = np.nan
        else:
            doc_sentiment = sentiment['documentSentiment']
            magnitude = doc_sentiment['magnitude']
            score = doc_sentiment['score']
        # Append both values together so the two lists always stay aligned.
        dic["doc_sent_mag"].append(magnitude)
        dic["doc_sent_score"].append(score)
    return dic

def _first_color_channel(colors, channel):
    """Return `channel` of the first dominant colour entry, or 0 when it is absent."""
    try:
        return colors[0]['color'][channel]
    except (KeyError, IndexError, TypeError):
        return 0


def _extract_meta(data):
    """Pull the per-image values used by get_meta out of one parsed metadata document."""
    crop_hint = data['cropHintsAnnotation']['cropHints'][0]
    # The third vertex of the crop box supplies the x / y features.
    corner = crop_hint['boundingPoly']['vertices'][2]
    colors = data['imagePropertiesAnnotation']['dominantColors']['colors']
    labels = data.get('labelAnnotations')
    if labels:
        label_description = labels[0]['description']
        label_score = labels[0]['score']
    else:
        label_description = 'nothing'
        label_score = np.nan
    return {
        "vertex_xs": corner['x'],
        "vertex_ys": corner['y'],
        "bounding_confidences": crop_hint['confidence'],
        "bounding_importance_fracs": crop_hint.get('importanceFraction', -1),
        "dominant_blues": _first_color_channel(colors, 'blue'),
        "dominant_greens": _first_color_channel(colors, 'green'),
        "dominant_reds": _first_color_channel(colors, 'red'),
        "dominant_pixel_fracs": colors[0]['pixelFraction'],
        "dominant_scores": colors[0]['score'],
        "label_descriptions": label_description,
        "label_scores": label_score,
    }


def get_meta(ids, path, n):
    """
    Collect image-metadata features for the n-th image of every pet in `ids`.

    For each id the file `path + id + '-<n>.json'` is read.  Recorded per
    pet: the third vertex of the first crop hint, that hint's confidence
    and importance fraction (-1 when absent), the first dominant colour's
    blue / green / red channels (0 when a channel is absent), its pixel
    fraction and score, and the first label's description and score
    ('nothing' / NaN when there are no labels).

    Pets without a metadata file get NaN for every numeric value and
    'nothing' as label description.

    Returns a dict of lists aligned with `ids`.  The "mean_red",
    "mean_blue" and "mean_green" keys are part of the returned dict but are
    never populated.
    """
    dic = {
        "vertex_xs": [],
        "vertex_ys": [],
        "bounding_confidences": [],
        "bounding_importance_fracs": [],
        "dominant_blues": [],
        "dominant_greens": [],
        "dominant_reds": [],
        "dominant_pixel_fracs": [],
        "dominant_scores": [],
        "mean_red": [],
        "mean_blue": [],
        "mean_green": [],
        "label_descriptions": [],
        "label_scores": []
    }
    for pet in ids:
        try:
            # JSON is UTF-8 text; do not depend on the platform default.
            with open(path + pet + '-{}.json'.format(str(n)), 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            row = {
                "vertex_xs": np.nan,
                "vertex_ys": np.nan,
                "bounding_confidences": np.nan,
                "bounding_importance_fracs": np.nan,
                "dominant_blues": np.nan,
                "dominant_greens": np.nan,
                "dominant_reds": np.nan,
                "dominant_pixel_fracs": np.nan,
                "dominant_scores": np.nan,
                "label_descriptions": 'nothing',
                "label_scores": np.nan,
            }
        else:
            row = _extract_meta(data)
        # Append a complete row at once so all lists stay the same length.
        for key, value in row.items():
            dic[key].append(value)
    return dic

def replace_nan(train, test, replace_dic):
    """
    Replace per-column placeholder values with NaN in both frames.

    `replace_dic` maps a column name to the value that should become NaN
    in that column.  Both frames are modified in place and returned.
    """
    for col in replace_dic:
        placeholder = replace_dic[col]
        for frame in (train, test):
            frame[col] = frame[col].replace(placeholder, np.nan)
    return train, test

def get_color(x):
    """
    Build a 7-slot indicator vector for the colour codes 1..7.

    Slot k-1 of the returned Series is 1 when code k occurs among the
    values of `x`, otherwise 0.
    """
    present = list(x.values)
    flags = [1 if color_code in present else 0 for color_code in range(1, 8)]
    return pd.Series(flags)

In [None]:
%%time
# Read the train and test tables from the ../input directory.
train = pd.read_csv("../input/train/train.csv")
test = pd.read_csv("../input/test/test.csv")

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Root folder of the extra feature files, and how many images per pet to use.
fe_input_path = "../input/"
n_meta = 1

# Keep the label and the identifiers before they leave the feature frames.
target = train['AdoptionSpeed']
train_id, test_id = train['PetID'], test['PetID']
train.drop(columns=['AdoptionSpeed', 'PetID'], inplace=True)
test.drop(columns=['PetID'], inplace=True)

# One indicator column per colour code 1..7, built from the three colour slots.
color_pick_col = ["Color1", "Color2", "Color3"]
color_result_col = ["color{}".format(code) for code in range(1, 8)]
train_color = pd.DataFrame(
    train[color_pick_col].apply(get_color, axis=1).values,
    columns=color_result_col,
)
test_color = pd.DataFrame(
    test[color_pick_col].apply(get_color, axis=1).values,
    columns=color_result_col,
)

# Swap the raw colour slots for the indicator columns.
train = pd.concat([train, train_color], axis=1).drop(columns=color_pick_col)
test = pd.concat([test, test_color], axis=1).drop(columns=color_pick_col)

In [None]:
%%time
dic = get_sentiment(train_id, join(fe_input_path, 'train_sentiment/'))
train.loc[:, 'doc_sent_mag'] = dic["doc_sent_mag"]
train.loc[:, 'doc_sent_score'] = dic["doc_sent_score"]

dic = get_sentiment(test_id, join(fe_input_path, 'test_sentiment/'))
test.loc[:, 'doc_sent_mag'] = dic["doc_sent_mag"]
test.loc[:, 'doc_sent_score'] = dic["doc_sent_score"]

In [None]:
# (column-name template, key in the dict returned by get_meta), in the
# order in which the columns are added to the frames.
meta_columns = (
    ('vertex_x_{}', "vertex_xs"),
    ('vertex_y_{}', "vertex_ys"),
    ('bounding_confidence_{}', "bounding_confidences"),
    ('bounding_importance_{}', "bounding_importance_fracs"),
    ('dominant_blue_{}', "dominant_blues"),
    ('dominant_green_{}', "dominant_greens"),
    ('dominant_red_{}', "dominant_reds"),
    ('dominant_pixel_frac_{}', "dominant_pixel_fracs"),
    ('dominant_score_{}', "dominant_scores"),
    ('label_description_{}', "label_descriptions"),
    ('label_score_{}', "label_scores"),
)

# For every image index, add the metadata columns to train and then to test.
for i in range(1, n_meta+1):
    for meta_frame, meta_ids, meta_folder in ((train, train_id, 'train_metadata/'),
                                              (test, test_id, 'test_metadata/')):
        dic = get_meta(meta_ids, join(fe_input_path,  meta_folder), i)
        for meta_template, meta_key in meta_columns:
            meta_frame.loc[:, meta_template.format(str(i))] = dic[meta_key]

In [None]:
# English stop-word set from NLTK.
eng_stopwords = set(stopwords.words("english"))
# Translation table that turns every ASCII punctuation character into a space.
remove_punctuation_map = {ord(symbol): ' ' for symbol in string.punctuation}
# Stemmer applied to every token; the Snowball variant is kept for reference.
#stemmer = nltk.stem.snowball.SnowballStemmer('english')
stemmer = nltk.stem.porter.PorterStemmer()

def stem_tokens(tokens):
    """Stem every token and return the stems joined by single spaces."""
    return ' '.join(stemmer.stem(token) for token in tokens)

def get_textfeats(df, col, flag=True):
    """
    Normalise the text column `col` of `df` in place and return `df`.

    Missing values become 'none'; the text is lower-cased, punctuation is
    replaced by spaces, and the result is tokenised and stemmed.  `flag`
    is accepted but not used.
    """
    def _normalise(text):
        without_punctuation = text.translate(remove_punctuation_map)
        return stem_tokens(nltk.word_tokenize(without_punctuation))

    lowered = df[col].fillna('none').astype(str).str.lower()
    df[col] = lowered.apply(_normalise)

    return df

def load_text(train, test):
    """
    Normalise the "Description" column of both frames and tokenise it.

    Returns (train_corpus, test_corpus), each a list with one word list
    per row of the corresponding frame.
    """
    def _tokenise(frame):
        descriptions = frame['Description'].values
        return [text_to_word_sequence(description) for description in tqdm(descriptions)]

    # Both frames are normalised before either one is tokenised.
    train, test = [get_textfeats(frame, "Description") for frame in (train, test)]
    train_corpus = _tokenise(train)
    test_corpus = _tokenise(test)

    return train_corpus, test_corpus

def get_result(corpus, model):
    """
    Average the word vectors of every text in `corpus`.

    `corpus` is a list of word lists and `model` exposes the vectors as
    `model.wv[word]`.  Words the model does not know (lookup raises
    KeyError) are skipped.  Each text yields the mean of its known words'
    vectors; a text with no known word at all yields a zero vector of
    length `model.wv.vector_size`.

    Returns a list with one vector per text.
    """
    vectors = model.wv
    result = []
    for text in corpus:
        # Start from scratch for every text so nothing leaks over from the
        # previous one, whichever word positions turn out to be unknown.
        total = None
        n_found = 0
        for word in text:
            try:
                word_vec = vectors[word]
            except KeyError:
                continue
            total = word_vec if total is None else total + word_vec
            n_found += 1
        if total is None:
            # Empty text or only unknown words: keep the row shape intact.
            result.append(np.zeros(vectors.vector_size, dtype=np.float32))
        else:
            result.append(total / n_found)

    return result

# Tokenise the descriptions and train word2vec on train + test together.
train_corpus, test_corpus = load_text(train, test)
model = word2vec.Word2Vec(train_corpus+test_corpus, size=200, window=10, max_vocab_size=50000, seed=0)

# One averaged 200-dimensional vector per description, as columns wv1..wv200.
w2v_cols = ["wv{}".format(dim) for dim in range(1, 201)]
train_result = pd.DataFrame(get_result(train_corpus, model), columns=w2v_cols)
test_result = pd.DataFrame(get_result(test_corpus, model), columns=w2v_cols)

# Append the embedding columns to the feature frames.
train = pd.concat([train, train_result], axis=1)
test = pd.concat([test, test_result], axis=1)

In [None]:
# Preview the training table with the added feature columns.
train.head()

In [None]:
%%time
# Remove the free-text / identifier columns from both feature frames.
train.drop(['Name', 'RescuerID', 'Description'], axis=1, inplace=True)
test.drop(['Name', 'RescuerID', 'Description'], axis=1, inplace=True)

In [None]:
# Per-column codes that are treated as missing and replaced by NaN.
# (The original literal listed "MaturitySize" twice with the same value.)
replace_dic = {
    "Gender": 3,
    "MaturitySize": 0,
    "FurLength": 0,
    "Vaccinated": 3,
    "Dewormed": 3,
    "Sterilized": 3,
    "Health": 0
}
train, test = replace_nan(train, test, replace_dic)
cat_cols = ['Type', 'Breed1', 'Breed2', 'Gender', 'MaturitySize', 
            'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health',
            'State', 'label_description_1']
# Assign with df[cols] = ... so the columns are replaced and really become
# categorical; df.loc[:, cols] = ... writes the values into the existing
# columns and can leave their dtype unchanged.
train[cat_cols] = train[cat_cols].astype('category')
test[cat_cols] = test[cat_cols].astype('category')
print(train.shape)
print(test.shape)
train.head()

In [None]:
def run_cv_model(train, test, target, model_fn, params=None, eval_fn=None, label='model'):
    """
    Run `model_fn` over a 5-fold stratified split and gather the results.

    Parameters
    ----------
    train, test : feature tables; `train` may be a DataFrame (sliced with
        .iloc) or anything that supports indexing with an index array.
    target : labels for `train`, indexed with the fold index arrays.
    model_fn : called as model_fn(dev_X, dev_y, val_X, val_y, test, params)
        and expected to return (validation predictions, test predictions,
        feature importances, 4 coefficients, qwk).
    params : dict handed to `model_fn`; a fresh copy is passed per fold.
    eval_fn : optional metric eval_fn(val_y, pred_val_y); scores (and the
        fold's qwk) are only recorded when it is given.
    label : name used in the printed progress lines.

    Returns
    -------
    dict with the keys 'label', 'train' (out-of-fold predictions), 'test'
    (test predictions averaged over folds), 'cv', 'qwk', 'importance' and
    'coefficients' (one row of 4 per fold).
    """
    if params is None:
        params = {}
    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
    fold_splits = kf.split(train, target)
    cv_scores = []
    qwk_scores = []
    pred_full_test = 0
    # Fold predictions are broadcast across 5 columns here; the code that
    # reads results['train'] in this file only uses column 0.
    pred_train = np.zeros((train.shape[0], 5))
    all_coefficients = np.zeros((n_splits, 4))
    feature_importance_df = pd.DataFrame()
    for i, (dev_index, val_index) in enumerate(fold_splits, start=1):
        print('Started ' + label + ' fold ' + str(i) + '/' + str(n_splits))
        if isinstance(train, pd.DataFrame):
            dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]
        else:
            dev_X, val_X = train[dev_index], train[val_index]
        dev_y, val_y = target[dev_index], target[val_index]
        # model_fn may consume entries of the dict, so give it its own copy.
        params2 = params.copy()
        pred_val_y, pred_test_y, importances, coefficients, qwk = model_fn(dev_X, dev_y, val_X, val_y, test, params2)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        all_coefficients[i-1, :] = coefficients
        if eval_fn is not None:
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            qwk_scores.append(qwk)
            print(label + ' cv score {}: RMSE {} QWK {}'.format(i, cv_score, qwk))
        fold_importance_df = pd.DataFrame()
        fold_importance_df['feature'] = train.columns.values
        fold_importance_df['importance'] = importances
        fold_importance_df['fold'] = i
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print('{} cv RMSE scores : {}'.format(label, cv_scores))
    print('{} cv mean RMSE score : {}'.format(label, np.mean(cv_scores)))
    # This line used to print the mean a second time under the "std" label.
    print('{} cv std RMSE score : {}'.format(label, np.std(cv_scores)))
    print('{} cv QWK scores : {}'.format(label, qwk_scores))
    print('{} cv mean QWK score : {}'.format(label, np.mean(qwk_scores)))
    print('{} cv std QWK score : {}'.format(label, np.std(qwk_scores)))
    pred_full_test = pred_full_test / float(n_splits)
    results = {'label': label,
               'train': pred_train, 'test': pred_full_test,
               'cv': cv_scores, 'qwk': qwk_scores,
               'importance': feature_importance_df,
               'coefficients': all_coefficients}
    return results

# Settings for the LightGBM run.  'num_rounds', 'verbose_eval' and
# 'early_stop' are taken out by runLGB before the rest goes to lgb.train.
params = dict(
    application='regression',
    boosting='gbdt',
    metric='rmse',
    num_leaves=80,
    max_depth=9,
    learning_rate=0.01,
    bagging_fraction=0.85,
    feature_fraction=0.8,
    min_split_gain=0.01,
    min_child_samples=150,
    min_child_weight=0.1,
    verbosity=-1,
    data_random_seed=3,
    early_stop=100,
    verbose_eval=100,
    num_rounds=10000,
)

def runLGB(train_X, train_y, test_X, test_y, test_X2, params):
    """
    Train LightGBM on one fold and predict the validation and test sets.

    Parameters
    ----------
    train_X, train_y : data the model is trained on.
    test_X, test_y : validation data; also used to fit the rounding
        thresholds.
    test_X2 : data to predict once training is done.
    params : LightGBM parameters plus the control entries 'num_rounds' and
        'verbose_eval' (required) and 'early_stop' (optional).  The
        caller's dict is left untouched.

    Returns
    -------
    (validation predictions as a column, test_X2 predictions as a column,
     feature importances, fitted thresholds, QWK on the validation data)
    """
    # Work on a copy so the control entries removed below do not disappear
    # from the caller's dict.
    params = dict(params)
    print('Prep LGB')
    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(test_X, label=test_y)
    watchlist = [d_train, d_valid]
    print('Train LGB')
    num_rounds = params.pop('num_rounds')
    verbose_eval = params.pop('verbose_eval')
    # Always remove 'early_stop' so that a falsy value (0 / None) is not
    # passed on to LightGBM as a model parameter; falsy means "disabled".
    early_stop = params.pop('early_stop', None) or None
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=num_rounds,
                      valid_sets=watchlist,
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stop)
    print('Predict 1/2')
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    # Fit the class thresholds on the validation predictions of this fold.
    optR = OptimizedRounder()
    optR.fit(pred_test_y, test_y)
    coefficients = optR.coefficients()
    pred_test_y_k = optR.predict(pred_test_y, coefficients)
    print("Valid Counts = ", Counter(test_y))
    print("Predicted Counts = ", Counter(pred_test_y_k))
    print("Coefficients = ", coefficients)
    qwk = quadratic_weighted_kappa(test_y, pred_test_y_k)
    print("QWK = ", qwk)
    print('Predict 2/2')
    pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
    return pred_test_y.reshape(-1, 1), pred_test_y2.reshape(-1, 1), model.feature_importance(), coefficients, qwk

# Run the cross-validated LightGBM model, scoring each fold with rmse.
results = run_cv_model(train, test, target, runLGB, params, rmse, 'lgb')

In [None]:
# Average every feature's importance over the folds.  Only the numeric
# 'importance' column is selected: the grouping key cannot be averaged, and
# selecting several columns with a bare tuple is no longer accepted by pandas.
imports = results['importance'].groupby('feature')['importance'].mean().reset_index()
imports.sort_values('importance', ascending=False)

In [None]:
# Average the per-fold thresholds and apply them to the out-of-fold
# predictions (first column of results['train']).
optR = OptimizedRounder()
coefficients_ = np.mean(results['coefficients'], axis=0)
print(coefficients_)
train_predictions = optR.predict(
    [row[0] for row in results['train']],
    coefficients_,
).astype(int)
Counter(train_predictions)

In [None]:
# Apply the same averaged thresholds to the fold-averaged test predictions.
optR = OptimizedRounder()
test_predictions = optR.predict(
    [row[0] for row in results['test']],
    coefficients_,
).astype(int)
Counter(test_predictions)

In [None]:
# Confusion matrix of the labels against the thresholded out-of-fold predictions.
pd.DataFrame(sk_cmatrix(target, train_predictions), index=list(range(5)), columns=list(range(5)))

In [None]:
# Quadratic weighted kappa of the thresholded out-of-fold predictions.
quadratic_weighted_kappa(target, train_predictions)

In [None]:
# RMSE of the raw (unrounded) out-of-fold predictions.
rmse(target, [r[0] for r in results['train']])

In [None]:
# Pair every test PetID with its predicted AdoptionSpeed class.
submission = pd.DataFrame({'PetID': test_id, 'AdoptionSpeed': test_predictions})
submission.head()

In [None]:
# Write the submission file without the index column.
submission.to_csv('submission.csv', index=False)