In [49]:
import scipy as sp
import numpy as np

from collections import Counter
from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix

# FROM: https://www.kaggle.com/myltykritik/simple-lgbm-image-features

# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between two raters' ratings.

    Args:
        rater_a: sequence of integer ratings given by the first rater.
        rater_b: sequence of integer ratings given by the second rater;
            must have the same length as ``rater_a``.
        min_rating: lowest possible rating. Defaults to the smallest value
            found in either sequence.
        max_rating: highest possible rating. Defaults to the largest value
            found in either sequence.

    Returns:
        A square list of lists ``m`` where ``m[i][j]`` counts the items rated
        ``min_rating + i`` by rater A and ``min_rating + j`` by rater B.

    Raises:
        AssertionError: if the two sequences differ in length.
        ValueError: if the sequences are empty and no range is supplied.
    """
    assert(len(rater_a) == len(rater_b))
    # Take the extremes of each sequence separately. `rater_a + rater_b`
    # concatenates lists but adds NumPy arrays element-wise, which silently
    # produced a wrong rating range for array inputs.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Counts how many times each rating value occurs.

    Args:
        ratings: sequence of integer ratings.
        min_rating: lowest rating to allocate a bin for; defaults to
            ``min(ratings)``.
        max_rating: highest rating to allocate a bin for; defaults to
            ``max(ratings)``.

    Returns:
        A list of ``max_rating - min_rating + 1`` counts, where position
        ``k`` holds the number of occurrences of ``min_rating + k``.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating
    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        # Bin position is the rating's offset from the lowest rating.
        counts[rating - lowest] += 1
    return counts


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa between two sets of ratings.

    Quadratic weighted kappa measures inter-rater agreement between two
    raters that provide discrete numeric ratings. Values range from -1
    (complete disagreement) to 1 (complete agreement); 0 is expected when
    all agreement is due to chance.

    Args:
        y: sequence of ratings from the first rater (e.g. the true labels).
        y_pred: sequence of ratings from the second rater (e.g. the
            predictions); must have the same length as ``y``.

    Both inputs are converted with ``np.array(..., dtype=int)``, so
    fractional values are truncated. The rating range is taken from the
    smallest and largest value found in either input.

    Returns:
        The kappa score as a float. When both raters use one and the same
        single rating the score is formally 0/0; 1.0 (perfect agreement) is
        returned instead of raising ``ZeroDivisionError``.

    Raises:
        AssertionError: if the inputs differ in length.
        ValueError: if the inputs are empty.
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert(len(rater_a) == len(rater_b))

    min_rating = min(min(rater_a), min(rater_b))
    max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    if num_ratings == 1:
        # Only one distinct rating overall: every item agrees, and the
        # weight normalisation below would divide by (num_ratings - 1) == 0.
        return 1.0
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            # Count expected for cell (i, j) if the raters were independent.
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            # Quadratic disagreement weight, scaled to [0, 1].
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    # With more than one rating in play at least one off-diagonal expected
    # count is positive, so the denominator cannot be zero here.
    return (1.0 - numerator / denominator)

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, LogisticRegression
import xgboost as xgb

In [51]:
# Read the training set; the first row of the CSV supplies the column names.
dataframe = pd.read_csv("Data/train.csv", header=0)
labels = dataframe.columns
array = dataframe.values

# Columns 0-22 are kept as raw features; column 23 is the target, cast to float.
X = array[:, :23]
Y = array[:, 23].astype('float')

print(Y)
print(labels)

[2. 0. 3. ... 3. 4. 3.]
Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed'],
      dtype='object')


In [52]:
# Feature synthesis:
# NOTE(review): the lists built below are not appended to X in this cell.

# Name/No Name
has_name = [str(p[1]) != "nan" for p in X]

# Name length (0 when the name is missing)
name_len = [len(str(p[1])) if str(p[1]) != "nan" else 0 for p in X]

# Adoption Speed -> linear rather than categorical
def to_linear(y, skew):
    """
    Maps an adoption-speed class (0-4) to a value on a linear scale.

    Args:
        y: adoption-speed class; compared with ``==`` against 0..4.
        skew: multiplier applied to the values of classes 0-3.

    Returns:
        The scaled value for classes 0-3, the constant 220 for class 4,
        and None for any other input.
    """
    if y == 0:  # same day
        return 0.5*skew
    elif y == 1: # 1 week
        return 4.0*skew
    elif y == 2: # 1 month
        return 19.0*skew
    elif y == 3: # 1-3 months
        return 60.0*skew
    elif y == 4: # > 100 days
        # NOTE(review): the original comment said "estimated 320" while the
        # code returns 220 and ignores skew - confirm the intended value.
        return 220
    return None

Y_trans = [to_linear(y, 0.75) for y in Y]

# Description length (0 when the description is missing).
# The missing-value check looks at the description column (20) itself; it
# previously tested the name column (1), which zeroed the description length
# of every unnamed pet.
desc_len = [len(str(p[20])) if str(p[20]) != "nan" else 0 for p in X]

# Pictures/no pictures
has_pic = [p[22] != 0 for p in X]

# Free/not free
is_free = [p[16] == 0 for p in X]

# Clearing: name, description, rescuer ID, pet ID, state
# NOTE: X is rebound here, so running this cell a second time deletes five
# more columns from the already reduced array; re-run the loading cell first.
X = np.delete(X, [1,17,18,20,21], axis=1)

In [53]:
from functools import partial
import numpy as np
import scipy as sp

class OptimizedRounder(object):
    """
    Finds four cut points that turn continuous predictions into classes 0-4.

    ``fit`` searches, with Nelder-Mead, for the thresholds that maximise the
    quadratic weighted kappa between the discretised predictions and the
    reference labels; ``predict`` applies a set of thresholds.
    """

    def __init__(self):
        # Optimisation result stored by fit(); None until fit() has run.
        self.coef_ = None

    @staticmethod
    def _apply_thresholds(X, coef):
        """
        Discretises X into 0..4 using the four cut points in coef.

        The conditions are evaluated in order and the first match wins, so
        the result is the same as an if/elif chain per element even when the
        cut points are not sorted. Values matching no condition (including
        NaN) become 4. The output keeps the dtype of the input array.
        """
        values = np.asarray(X)
        conditions = [
            values < coef[0],
            (values >= coef[0]) & (values < coef[1]),
            (values >= coef[1]) & (values < coef[2]),
            (values >= coef[2]) & (values < coef[3]),
        ]
        classes = np.select(conditions, [0, 1, 2, 3], default=4)
        return classes.astype(values.dtype)

    def _kappa_loss(self, coef, X, y):
        """Returns the negated kappa of X discretised with coef against y."""
        X_p = self._apply_thresholds(X, coef)
        ll = quadratic_weighted_kappa(y, X_p)
        # The optimiser minimises, so negate the score to maximise kappa.
        return -ll

    def fit(self, X, y):
        """
        Optimises the cut points for predictions X against labels y.

        The result object returned by ``scipy.optimize.minimize`` is stored
        in ``self.coef_``; nothing is returned.
        """
        # Import the submodule explicitly: `import scipy as sp` alone does
        # not guarantee that `sp.optimize` is loaded on every SciPy version.
        from scipy import optimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [1, 2, 3, 4]
        self.coef_ = optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef=None):
        """
        Discretises X into classes 0-4.

        Args:
            X: array-like of continuous predictions.
            coef: four cut points. When omitted, the cut points found by
                ``fit`` are used.

        Returns:
            A NumPy array shaped like X holding the class for each element.

        Raises:
            RuntimeError: if coef is omitted and ``fit`` has not been called.
        """
        if coef is None:
            coef = self.coefficients()
        return self._apply_thresholds(X, coef)

    def coefficients(self):
        """
        Returns the cut points found by ``fit``.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        if self.coef_ is None:
            raise RuntimeError("OptimizedRounder has not been fitted yet; call fit() first.")
        return self.coef_['x']

In [89]:
# Linear baseline (Lasso); it is only constructed here, not fitted.
model = Lasso(alpha=0.1)

# Booster settings: depth-2 trees, learning rate 1, two boosting rounds.
param = dict(max_depth=2, eta=1, silent=1)
num_round = 2

# Random train/test split; no random_state is given, so each run differs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

# Train on the training fold, then predict the held-out fold.
xgb_train = xgb.DMatrix(X_train, label=Y_train)
xgb_test = xgb.DMatrix(X_test)
bst = xgb.train(param, xgb_train, num_round)
pred = bst.predict(xgb_test)


# transform back to categorical
def to_categorical(y):
    """
    Maps a continuous prediction onto an adoption-speed class 0.0-4.0.

    The upper bounds are inclusive and checked in ascending order; anything
    above the last bound (or not comparable as <=, such as NaN) is class 4.0.
    """
    upper_bounds = (
        (1.07, 0.0),  # same day
        (2.37, 1.0),  # 1 week
        (2.54, 2.0),  # 1 month
        (3.58, 3.0),  # 1-3 months
    )
    for bound, speed_class in upper_bounds:
        if y <= bound:
            return speed_class
    return 4.0  # > 100 days

# Lasso baseline, currently disabled:
# model.fit(X_train, Y_train)
# Y_pred = model.predict(X_test)

# Search for kappa-maximising cut points on the raw predictions and print them.
# NOTE(review): the search is run against Y_test, i.e. the held-out labels.
OptR = OptimizedRounder()
OptR.fit(pred, Y_test)
print(OptR.coefficients())

# Discretise with the fixed thresholds in to_categorical; the cut points
# printed above are not used for this step.
pred = list(map(to_categorical, pred))

# Despite its name, `accuracy` holds the quadratic weighted kappa.
accuracy = quadratic_weighted_kappa(Y_test, pred)
print(accuracy)

[0.946875 2.16875  2.971875 3.775   ]
0.22153835805334132


In [55]:
# Adoption Speed -> linear rather than categorical
def to_linear(y, skew):
    """
    Maps an adoption-speed class (0-4) to a value on a linear scale.

    Only the values for classes 2 and 3 are multiplied by ``skew``; classes
    0, 1 and 4 map to fixed values. Any other input yields None.
    """
    # (class, base value, whether skew applies)
    scale = (
        (0, 0.5, False),   # same day
        (1, 4.0, False),   # 1 week
        (2, 19.0, True),   # 1 month
        (3, 60.0, True),   # 1-3 months
        # > 100 days. NOTE(review): the original comment said
        # "estimated 320" while the code returns 220 - confirm.
        (4, 220, False),
    )
    for speed_class, base, scaled in scale:
        if y == speed_class:
            return base * skew if scaled else base
    return None

# Sweep skew from 0.60 to 0.79 in steps of 0.01. For every value, average the
# quadratic weighted kappa over 20 random train/test splits.
weights = []
for pct in range(60, 80):
    skew = pct / 100

    split_scores = []
    for _ in range(20):
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

        # Regress on the linearised target, then map predictions back to classes.
        Y_train = [to_linear(label, skew) for label in Y_train]
        model.fit(X_train, Y_train)
        Y_pred = [to_categorical(value) for value in model.predict(X_test)]

        split_scores.append(quadratic_weighted_kappa(Y_test, Y_pred))
    acc = sum(split_scores) / 20
    weights.append((skew, acc))


# One point per skew value: x = skew, y = mean kappa.
skews, mean_kappas = zip(*weights)
plt.scatter(skews, mean_kappas)
plt.show()



KeyboardInterrupt: 