In [75]:
import gensim
import numpy as np
import pandas as pd

import os
import json
from tqdm import tqdm

import lightgbm as lgb
import scipy as sp

from functools import partial
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder

In [8]:
input_path = "../../input"

In [9]:
train_df = pd.read_csv(os.path.join(input_path, 'train/train.csv'))

In [11]:
test_df = pd.read_csv(os.path.join(input_path, 'test/test.csv'))

In [12]:
train_texts = train_df['Description']

In [13]:
test_texts = test_df['Description']

In [19]:
all_texts = pd.concat([train_texts, test_texts])

In [24]:
all_texts.isna().sum()

14

In [27]:
all_texts.fillna('none', inplace=True)

In [28]:
all_texts[all_texts == 'none'].shape

(14,)

In [29]:
def read_corpus(iterable):
    for i, line in enumerate(iterable):
        yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])

In [30]:
tagged_texts = list(read_corpus(all_texts))

In [31]:
tagged_texts[0]

TaggedDocument(words=['nibble', 'is', 'month', 'old', 'ball', 'of', 'cuteness', 'he', 'is', 'energetic', 'and', 'playful', 'rescued', 'couple', 'of', 'cats', 'few', 'months', 'ago', 'but', 'could', 'not', 'get', 'them', 'neutered', 'in', 'time', 'as', 'the', 'clinic', 'was', 'fully', 'scheduled', 'the', 'result', 'was', 'this', 'little', 'kitty', 'do', 'not', 'have', 'enough', 'space', 'and', 'funds', 'to', 'care', 'for', 'more', 'cats', 'in', 'my', 'household', 'looking', 'for', 'responsible', 'people', 'to', 'take', 'over', 'nibble', 'care'], tags=[0])

In [32]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)


In [34]:
model.build_vocab(tagged_texts)

In [36]:
%time
model.train(tagged_texts, total_examples=model.corpus_count, epochs=model.epochs)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.53 µs


In [37]:
model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires'])


array([-0.3874086 , -0.30622107,  0.4088354 , -0.68452317, -0.12602532,
        0.5572939 , -0.21954854,  0.13485023,  0.21205647, -0.20292778,
        0.5622685 , -0.20333122,  0.00886884,  0.10412114, -0.73048705,
        0.13834874,  0.435352  , -0.39406323,  0.408562  ,  0.01949188,
        0.566398  , -0.12355072, -0.2045855 , -0.39323974,  0.08722531,
       -0.1482043 , -0.0808804 , -0.01379645,  0.01692487,  0.5998542 ,
        0.23881292,  0.23251271,  0.06813861,  0.303746  ,  0.00425552,
       -0.3191005 , -0.08286846,  0.05431944,  0.35676932,  0.27131963,
        0.465069  ,  0.47620323,  0.16257784, -0.64490014, -0.1765105 ,
       -0.15225255,  0.10303286, -0.39230847,  0.18953569, -0.19346207],
      dtype=float32)

In [45]:
train=train_df[['PetID', 'Description', 'AdoptionSpeed']]
train.Description=train.Description.fillna('none')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [47]:
%time
doc2vec_df = pd.DataFrame([model.infer_vector(gensim.utils.simple_preprocess(line)) for line in train.Description], 
                              columns = ["doc2vec_"+str(int(i)) for i in range(50)])

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


In [55]:
train_doc2vec = pd.concat([train, doc2vec_df], axis=1)

In [56]:
train_doc2vec.head()

Unnamed: 0,PetID,Description,AdoptionSpeed,doc2vec_0,doc2vec_1,doc2vec_2,doc2vec_3,doc2vec_4,doc2vec_5,doc2vec_6,...,doc2vec_40,doc2vec_41,doc2vec_42,doc2vec_43,doc2vec_44,doc2vec_45,doc2vec_46,doc2vec_47,doc2vec_48,doc2vec_49
0,86e1089a3,Nibble is a 3+ month old ball of cuteness. He ...,2,0.362608,-1.339198,-0.115744,-1.040151,-0.323848,0.788398,-0.018753,...,-0.061661,0.530647,-0.127925,-0.178366,-0.24026,-0.929224,-0.10895,-0.666617,-0.093832,0.635729
1,6296e909a,I just found it alone yesterday near my apartm...,0,-0.612035,0.272964,-0.400818,-1.278494,0.825669,0.344825,-0.347659,...,0.421728,0.821713,-0.222975,-0.511119,-0.31478,-0.830124,-0.346739,0.293434,0.334306,0.42051
2,3422e4906,Their pregnant mother was dumped by her irresp...,3,-0.01245,-0.774957,0.234869,0.204644,0.226701,0.400874,0.068928,...,0.240239,1.639499,0.622158,0.059481,-0.313485,-0.618437,-0.094084,-0.145679,-0.318513,0.135195
3,5842f1ff5,"Good guard dog, very alert, active, obedience ...",2,0.371585,-0.701058,-0.018497,-0.288783,0.764215,-0.358407,-0.309732,...,-0.008311,0.679632,0.66463,0.089409,0.166947,0.1889,0.233804,-1.117431,0.766134,-0.392208
4,850a43f90,This handsome yet cute boy is up for adoption....,2,0.501965,0.346176,-1.496494,-1.006186,-0.092925,0.738305,-1.487715,...,0.159439,1.7854,-0.656976,0.746208,1.035529,0.079319,-0.35282,-0.401899,-0.690531,0.056028


In [72]:
train_doc2vec = train_doc2vec.drop(['Description'], axis=1)

In [86]:
# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa
    axquadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.
    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.
    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.
    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = y
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

In [87]:
class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4

        ll = quadratic_weighted_kappa(y, X_p)
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        return X_p

    def coefficients(self):
        return self.coef_['x']

In [79]:
RANDOM_STATE = 322

In [81]:
def cross_val(X, y, n_splits):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=False, random_state=RANDOM_STATE)
#     test_predictions = np.zeros((X_test.shape[0], 1))
    train_predictions = np.zeros((X.shape[0], 1))

    cv_results = []
    scores = []
    coefficients = np.zeros((n_splits, 4))
    fold = 0
    for tr_ind, val_ind in skf.split(X, y):
        X_train = X.loc[tr_ind]
        y_train = y.loc[tr_ind]
        
        X_valid = X.loc[val_ind]
        y_valid = y.loc[val_ind]
        
        lgb_params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'learning_rate': 0.005,
            'subsample': .8,
            'colsample_bytree': 0.8,
            'min_split_gain': 0.006,
            'min_child_samples': 150,
            'min_child_weight': 0.1,
            'max_depth': 20,
            'n_estimators': 10000,
            'num_leaves': 80,
            'silent': -1,
            'verbose': -1,
            'random_state': RANDOM_STATE
        }
        
        model = lgb.LGBMRegressor(**lgb_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='rmse',
            verbose=200,
            early_stopping_rounds=100
        )
        
        y_pred = model.predict(X_valid, num_iteration=model.best_iteration_)
#         y_pred = list(map(lambda x: int(x[0]), y_pred))
#         test_pred = model.predict(X_test, num_iteration=model.best_iteration_)
#         test_predictions += test_pred.reshape(-1, 1)
        
        train_predictions[val_ind] = y_pred.reshape(-1, 1)

        
        optR = OptimizedRounder()
        optR.fit(y_pred, y_valid.values)
        coefficients[fold, :] = optR.coefficients()
        
        pred = optR.predict(y_pred, coefficients[fold, :])
        
        kappa_scr = quadratic_weighted_kappa(y_valid, pred)
        
        print("Fold = {}. QWK = {}. Coef = {}".format(fold, kappa_scr, coefficients[fold,:]))
        cv_result = {}
        cv_result['Fold'] = fold
        cv_result['Model'] = model
        cv_result['QWK'] = kappa_scr
        cv_result['Coef'] = coefficients[fold, :]
        cv_results.append(cv_result)
        scores.append(kappa_scr)
        fold += 1
    print('Average: {}'.format(sum(scores)/n_splits))
#     test_predictions = test_predictions * 1./n_splits

    return {
            'train_predictions': train_predictions,
            'predictions': test_predictions,
            'coefficients': np.mean(coefficients, axis=0)
        }

In [73]:
y = train_doc2vec.AdoptionSpeed

In [90]:
X = train_doc2vec.drop(['AdoptionSpeed', 'PetID'], axis=1)

In [91]:
X.head()

Unnamed: 0,doc2vec_0,doc2vec_1,doc2vec_2,doc2vec_3,doc2vec_4,doc2vec_5,doc2vec_6,doc2vec_7,doc2vec_8,doc2vec_9,...,doc2vec_40,doc2vec_41,doc2vec_42,doc2vec_43,doc2vec_44,doc2vec_45,doc2vec_46,doc2vec_47,doc2vec_48,doc2vec_49
0,0.362608,-1.339198,-0.115744,-1.040151,-0.323848,0.788398,-0.018753,-0.197545,0.6907,-0.118282,...,-0.061661,0.530647,-0.127925,-0.178366,-0.24026,-0.929224,-0.10895,-0.666617,-0.093832,0.635729
1,-0.612035,0.272964,-0.400818,-1.278494,0.825669,0.344825,-0.347659,-0.802429,-1.072299,0.287547,...,0.421728,0.821713,-0.222975,-0.511119,-0.31478,-0.830124,-0.346739,0.293434,0.334306,0.42051
2,-0.01245,-0.774957,0.234869,0.204644,0.226701,0.400874,0.068928,1.078775,-1.386476,-0.039092,...,0.240239,1.639499,0.622158,0.059481,-0.313485,-0.618437,-0.094084,-0.145679,-0.318513,0.135195
3,0.371585,-0.701058,-0.018497,-0.288783,0.764215,-0.358407,-0.309732,-0.036906,-0.283879,0.687535,...,-0.008311,0.679632,0.66463,0.089409,0.166947,0.1889,0.233804,-1.117431,0.766134,-0.392208
4,0.501965,0.346176,-1.496494,-1.006186,-0.092925,0.738305,-1.487715,0.523871,-0.167349,1.066349,...,0.159439,1.7854,-0.656976,0.746208,1.035529,0.079319,-0.35282,-0.401899,-0.690531,0.056028


In [92]:
results = cross_val(X, y, n_splits = 3)

Training until validation scores don't improve for 100 rounds.
[200]	valid_0's l2: 1.36135	valid_0's rmse: 1.16677
[400]	valid_0's l2: 1.35696	valid_0's rmse: 1.16489
[600]	valid_0's l2: 1.35606	valid_0's rmse: 1.1645
[800]	valid_0's l2: 1.35551	valid_0's rmse: 1.16426
Early stopping, best iteration is:
[836]	valid_0's l2: 1.35527	valid_0's rmse: 1.16416
Fold = 0. QWK = 0.08451325783201258. Coef = [0.5079581  1.47608797 2.61185901 3.44789229]
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's l2: 1.36045	valid_0's rmse: 1.16638
[400]	valid_0's l2: 1.34793	valid_0's rmse: 1.161
[600]	valid_0's l2: 1.34277	valid_0's rmse: 1.15878
[800]	valid_0's l2: 1.3411	valid_0's rmse: 1.15806
Early stopping, best iteration is:
[812]	valid_0's l2: 1.34104	valid_0's rmse: 1.15803
Fold = 1. QWK = 0.09793454868105433. Coef = [0.51810275 1.47835325 2.44566464 3.55682648]
Training until validation scores don't improve for 100 rounds.
[200]	valid_0's l2: 1.35722	valid_0's rmse: 1

NameError: name 'test_predictions' is not defined