In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
import os
from sklearn.decomposition import PCA
import json


In [None]:
# Show the files available in the pretrained-weights input folder (the VGG16
# weights loaded further down are read from this directory).
os.listdir("../input/keras-pretrained-models/")

In [None]:
# Load the labelled training table and split the target off, so that the train
# and test frames share the same feature columns.
train = pd.read_csv("../input/petfinder-adoption-prediction/train/train.csv")
train_label = train.pop("AdoptionSpeed")

# Tag every row with its origin so the stacked frame can be split again later.
train["traintest"] = "train"
print(train.shape)

test = pd.read_csv("../input/petfinder-adoption-prediction/test/test.csv")
test["traintest"] = "test"
print(test.shape)

# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it the
# test rows reuse the row labels of the first training rows, and any later
# label-aligned operation on fulldf would silently mix up rows.
fulldf = pd.concat([train, test], axis=0, ignore_index=True)
print(fulldf.shape)

In [None]:
# Eyeball five random rows of the stacked train + test frame.
fulldf.sample(5)

In [None]:
# Spot check: look up which split ("train" or "test") one given PetID belongs to.
fulldf.loc[fulldf['PetID'] == "269c5b546"]["traintest"].values[0]

In [None]:
def add_feature(feature_column, original_dataset, join_column_name):
    """Left-join a table of new features onto ``original_dataset``.

    Parameters
    ----------
    feature_column : pandas.DataFrame or anything ``pd.DataFrame`` accepts
        The new feature(s). Must contain the column ``join_column_name``.
    original_dataset : pandas.DataFrame
        The dataset to enrich. It is not modified.
    join_column_name : str
        Name of the key column present in both tables.

    Returns
    -------
    pandas.DataFrame or None
        A new frame with one row per row of ``original_dataset`` (in the same
        order) and the feature columns appended. Rows without a match get NaN.
        ``None`` is returned, after printing a message, when the key column is
        missing from ``feature_column``.
    """
    if not isinstance(feature_column, pd.DataFrame):
        feature_column = pd.DataFrame(feature_column)
    if join_column_name not in feature_column.columns:
        print("No ID column to merge on")
        return

    # A single left merge is all that is needed. Merging the result back onto
    # original_dataset (an inner join on every shared column) only repeats the
    # work and multiplies rows whenever original_dataset holds duplicated rows.
    return original_dataset.merge(feature_column, how='left', on=join_column_name, sort=False)

In [None]:
# Number of rows (train + test) recorded for each rescuer, as a two-column
# table: RescuerID, RescuerExp.
ft = (
    fulldf.groupby("RescuerID")["RescuerID"]
    .count()
    .rename("RescuerExp")
    .reset_index()
)
ft.head()

In [None]:
# Attach the per-rescuer row count to every pet via its RescuerID.
fulldf = add_feature (ft,fulldf,"RescuerID")
fulldf.head()

In [None]:
# Histogram of Age, restricted to rows with Age < 30 so the bulk of the
# distribution is visible.
# NOTE: the name `df` is reused for an unrelated table further down.
df = fulldf[fulldf.Age < 30]
plt.hist(df["Age"])
plt.show()

In [None]:
# Rank of the rescuer's row count (ties share the lowest rank of their group).
fulldf['RescuerExpRank'] = fulldf["RescuerExp"].rank(method='min')

# Binary age bucket: 0 when Age < 6, 1 otherwise.
fulldf['AgeCat'] = np.where(fulldf['Age'] < 6, 0, 1)
fulldf['AgeCat'] = fulldf['AgeCat'].astype("category")

# A pet counts as named only when a name is present AND it is not the
# placeholder text. NaN != "No Name Yet" evaluates to True, so without the
# notna() check every pet with a missing name would be flagged as named.
has_real_name = fulldf['Name'].notna() & (fulldf['Name'] != "No Name Yet")
fulldf['Named'] = np.where(has_real_name, 1, 0)
fulldf['Named'] = fulldf['Named'].astype("category")

In [None]:
# Distribution of the rescuer-experience rank computed above.
plt.hist(fulldf["RescuerExpRank"])
plt.show()

In [None]:
# PCA over the contiguous block of columns from "Vaccinated" to "Health"
# (label-based slice, both ends included). Three components are computed;
# the last expression shows the shape of the projected array.
df_type = fulldf.loc[:,"Vaccinated":"Health"]
pca = PCA(n_components=3)
pca.fit(df_type)
df_pcatype = pca.transform(df_type)
df_pcatype.shape

In [None]:
# Share of variance captured by each component of the PCA fitted just above.
pca.explained_variance_ratio_

In [None]:
# Keep the first two health components as features; the third computed
# component is not stored.
fulldf["PCA_health1"] =  df_pcatype[:,0]
fulldf["PCA_health2"] =  df_pcatype[:,1]
fulldf.head(2)

In [None]:
# Same procedure on the column block from "Breed1" to "FurLength". This
# overwrites `df_type`, `pca` and `df_pcatype` from the health PCA.
# NOTE(review): these columns look like categorical codes; PCA treats them as
# numeric magnitudes - confirm that this is intended.
df_type = fulldf.loc[:,"Breed1":"FurLength"]
pca = PCA(n_components=3)
pca.fit(df_type)
df_pcatype = pca.transform(df_type)
df_pcatype.shape

In [None]:
# Share of variance captured by each component of the Breed1..FurLength PCA.
pca.explained_variance_ratio_

In [None]:
# Only the first component of this PCA is kept as a feature.
fulldf["PCA_type1"] =  df_pcatype[:,0]
fulldf.head(2)

In [None]:
# File names of all train / test images, plus the folder prefixes used to
# rebuild full paths from a file name.
train_images = os.listdir("../input/petfinder-adoption-prediction/train_images")
train_folder = "../input/petfinder-adoption-prediction/train_images/"

test_images = os.listdir("../input/petfinder-adoption-prediction/test_images")
test_folder = "../input/petfinder-adoption-prediction/test_images/"

In [None]:
# Only the first image of each pet is kept. File names are parsed as
# "<PetID>-<image number>.<extension>", so an image is a first image when the
# part between "-" and "." equals "1".

# Number of first images in the training set (sizes the feature matrix later).
compteur = sum(
    1
    for file_name in train_images
    if file_name.split("-")[1].split(".")[0] == "1"
)
print(compteur)

# Same count for the test set.
compteur2 = sum(
    1
    for file_name in test_images
    if file_name.split("-")[1].split(".")[0] == "1"
)
print(compteur2)

In [None]:
from keras.preprocessing import image
import os
from keras.applications.vgg16  import VGG16,preprocess_input

# VGG16 without its classification head, loaded from the local weight file,
# is used as a fixed feature extractor.
model = VGG16(weights='../input/keras-pretrained-models/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5', include_top=False)

# One row per first image: the network output flattened to 7 * 7 * 512 values,
# plus a parallel column vector holding the matching PetID.
extracted_features = np.zeros((compteur, 7 * 7 * 512))
ids = np.empty((compteur,1),dtype=object)
line = 0  # next free row in extracted_features / ids

for i in range(len(train_images)):
    # Process only files whose image number is "1" (first image of the pet).
    if (train_images[i].split("-")[1].split(".")[0] == "1"):
        img_path = train_folder + train_images[i]
        img_id = train_images[i].split("-")[0]  # PetID part of the file name
        img = image.load_img(img_path, target_size=(224, 224))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)  # add a batch axis: one image per predict call
        x = preprocess_input(x)
        features = model.predict(x)
        features = np.reshape(features, (1, 7 * 7 * 512))
        extracted_features[line,:] = features
        ids[line,0] = img_id
        line += 1

In [None]:
# Do the same for the test images; train and test features are stacked
# together before the PCA.
# Relies on `model`, `image` and `preprocess_input` from the training
# extraction cell.
extracted_features2 = np.zeros((compteur2, 7 * 7 * 512))
ids2 = np.empty((compteur2,1),dtype=object)
line2 = 0  # next free row in extracted_features2 / ids2

for i in range(len(test_images)):
    # Process only files whose image number is "1" (first image of the pet).
    if (test_images[i].split("-")[1].split(".")[0] == "1"):
        img_path = test_folder + test_images[i]
        img_id = test_images[i].split("-")[0]  # PetID part of the file name
        img = image.load_img(img_path, target_size=(224, 224))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)  # add a batch axis: one image per predict call
        x = preprocess_input(x)
        features = model.predict(x)
        features = np.reshape(features, (1, 7 * 7 * 512))
        extracted_features2[line2,:] = features
        ids2[line2,0] = img_id
        line2 += 1

In [None]:
# Stack train rows on top of test rows; `ids` / `ids2` are stacked in the same
# order further down, so rows stay aligned with their PetID.
full_extracted_features = np.vstack((extracted_features,extracted_features2))
full_extracted_features.shape

In [None]:
#release memory
import gc

In [None]:
# The per-split matrices are now duplicated inside full_extracted_features;
# drop them and force a collection to release the memory.
del extracted_features
del extracted_features2
gc.collect()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every image-feature column (fitted on train + test together)
# before the PCA.
std_scaler = StandardScaler()
full_extracted_features = std_scaler.fit_transform(full_extracted_features)

In [None]:
# Compress the standardised image features to 20 principal components and
# report the total share of variance they retain.
pca = PCA(n_components=20)
df_pcaimgs = pca.fit_transform(full_extracted_features)
print(df_pcaimgs.shape)
print(np.sum(pca.explained_variance_ratio_))

In [None]:
# Explained-variance ratio per image PCA component, in component order.
plt.plot(pca.explained_variance_ratio_)
plt.show()

In [None]:
# The raw feature matrix is no longer needed once the PCA projection exists.
del full_extracted_features
gc.collect()

# Stack the PetID columns in the same train-then-test order as the features.
full_ids = np.vstack((ids,ids2))
del ids
del ids2
full_ids.shape

In [None]:
# Put the PetID column and the PCA components side by side.
# The two parts are concatenated as separate DataFrames so that each keeps its
# own dtype: np.hstack on an object array plus a float array yields a single
# object array, which would turn every PCA column into dtype object.
# ignore_index=True numbers the columns 0..k, exactly as the renaming step
# that follows expects (column 0 = id, columns 1..k = components).
df = pd.concat(
    [pd.DataFrame(full_ids), pd.DataFrame(df_pcaimgs)],
    axis=1,
    ignore_index=True,
)

del full_ids
del df_pcaimgs
gc.collect()

df.shape

In [None]:
# Name the columns for the merge: the first one is the join key "PetID", the
# remaining ones are numbered from 1 ("pca_img1", "pca_img2", ...).
df.columns = ["PetID"] + ["pca_img" + str(position) for position in range(1, df.shape[1])]

gc.collect()

In [None]:
# Attach the image PCA components to every pet; pets without a first image
# get NaN in these columns.
fulldf = add_feature (df,fulldf,"PetID")

del df
gc.collect()

# Force the component columns to float. The assignment goes through [] on
# purpose: `fulldf.loc[:, cols] = ...` writes the values into the existing
# columns and keeps their current dtype, so an object column would stay
# object, whereas [] replaces the columns and takes the dtype of the
# right-hand side.
img_cols = fulldf.loc[:,"pca_img1":"pca_img20"].columns
fulldf[img_cols] = fulldf[img_cols].astype("float")

fulldf.shape

In [None]:
# Metadata extraction
# File listings and folder prefixes of the per-image metadata JSON files.

train_metadata = os.listdir("../input/petfinder-adoption-prediction/train_metadata/")
train_metadata_folder = "../input/petfinder-adoption-prediction/train_metadata/"

test_metadata = os.listdir("../input/petfinder-adoption-prediction/test_metadata/")
test_metadata_folder = "../input/petfinder-adoption-prediction/test_metadata/"

In [None]:
def _missing_image_metadata():
    """Return a fresh placeholder record for a pet without a metadata file."""
    return {
        'face_annotations': [],
        'text_annotations': [],
        'vertex_x': -1,
        'vertex_y': -1,
        'bounding_confidence': -1,
        'bounding_importance_frac': -1,
        'dominant_blue': -1,
        'dominant_green': -1,
        'dominant_red': -1,
        'dominant_pixel_frac': -1,
        'dominant_score': -1,
        'label_annotations': [],
        'label_description': 'nothing',
        'label_score': -1,
    }


def _read_image_metadata(json_path):
    """Parse one image-annotation JSON file into a flat record.

    Reads the first crop hint (third vertex of its bounding polygon,
    confidence, importance fraction), the first dominant colour and the first
    label annotation. Missing labels fall back to the placeholder values.

    Raises FileNotFoundError when ``json_path`` does not exist, and KeyError /
    IndexError when a mandatory field is absent from the file.
    """
    with open(json_path, 'r') as f:
        data = json.load(f)

    crop_hint = data['cropHintsAnnotation']['cropHints'][0]
    vertex = crop_hint['boundingPoly']['vertices'][2]
    top_color = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]

    record = _missing_image_metadata()
    record.update({
        'face_annotations': data.get('faceAnnotations', []),
        'text_annotations': data.get('textAnnotations', []),
        'vertex_x': vertex['x'],
        'vertex_y': vertex['y'],
        'bounding_confidence': crop_hint['confidence'],
        'bounding_importance_frac': crop_hint.get('importanceFraction', -1),
        'dominant_blue': top_color['color']['blue'],
        'dominant_green': top_color['color']['green'],
        'dominant_red': top_color['color']['red'],
        'dominant_pixel_frac': top_color['pixelFraction'],
        'dominant_score': top_color['score'],
    })

    labels = data.get('labelAnnotations')
    if labels:
        record['label_annotations'] = labels
        record['label_description'] = labels[0]['description']
        record['label_score'] = labels[0]['score']
    return record


# Parallel lists, one entry per row of fulldf and in the same order.
vertex_xs = []
vertex_ys = []
bounding_confidences = []
bounding_importance_fracs = []
dominant_blues = []
dominant_greens = []
dominant_reds = []
dominant_pixel_fracs = []
dominant_scores = []
label_descriptions = []
label_scores = []
face_annotations = []
label_annotations = []
text_annotations = []
pet_ids = []

nf_count = 0  # pets whose "<PetID>-1.json" file does not exist
nl_count = 0  # pets whose file exists but carries no label annotation

# Walk PetID and split side by side: this avoids one boolean scan of the whole
# frame per pet just to find out which folder to read from.
for pet, split in zip(fulldf["PetID"], fulldf["traintest"]):
    folder = train_metadata_folder if split == "train" else test_metadata_folder
    try:
        record = _read_image_metadata(folder + pet + '-1.json')
        if not record['label_annotations']:
            nl_count += 1
    except FileNotFoundError:
        nf_count += 1
        record = _missing_image_metadata()

    # All lists are extended only once the record is complete, so they can
    # never end up with different lengths.
    pet_ids.append(pet)
    face_annotations.append(record['face_annotations'])
    text_annotations.append(record['text_annotations'])
    vertex_xs.append(record['vertex_x'])
    vertex_ys.append(record['vertex_y'])
    bounding_confidences.append(record['bounding_confidence'])
    bounding_importance_fracs.append(record['bounding_importance_frac'])
    dominant_blues.append(record['dominant_blue'])
    dominant_greens.append(record['dominant_green'])
    dominant_reds.append(record['dominant_red'])
    dominant_pixel_fracs.append(record['dominant_pixel_frac'])
    dominant_scores.append(record['dominant_score'])
    label_annotations.append(record['label_annotations'])
    label_descriptions.append(record['label_description'])
    label_scores.append(record['label_score'])


In [None]:
# Encode the top image label: 0 for "cat", 1 for "dog", 2 for anything else
# (including the 'nothing' placeholder).
label_codes = {"cat": 0, "dog": 1}
label_descs = [label_codes.get(label, 2) for label in label_descriptions]

cols = ["PetID","vertex_x","vertex_y","bounding_confidences","bounding_importance_fracs","dominant_blues","dominant_greens","dominant_reds",
        "dominant_pixel_fracs","dominant_scores","label_descriptions","label_scores"]

# Each list becomes one row of the stacked array; transposing turns the lists
# into columns. Stacking ids together with numbers makes every cell a string,
# hence the explicit casts below.
stacked_metadata = np.vstack((pet_ids,vertex_xs,vertex_ys,bounding_confidences,bounding_importance_fracs,dominant_blues,
                              dominant_greens,dominant_reds,dominant_pixel_fracs,dominant_scores,label_descs,label_scores))
metadata_df = pd.DataFrame(stacked_metadata).T
metadata_df.columns = cols

# Everything except the id and the label code is numeric.
numeric_cols = [name for name in cols if name not in ("PetID", "label_descriptions")]
metadata_df = metadata_df.astype({name: "float64" for name in numeric_cols})

metadata_df["label_descriptions"] = metadata_df["label_descriptions"].astype('category')
print(metadata_df.dtypes)

metadata_df.head()

In [None]:
# Attach the image-metadata features to every pet via PetID, then free the
# temporary table.
fulldf = add_feature (metadata_df,fulldf,"PetID")

del metadata_df
gc.collect()

fulldf.head()

In [None]:
# Replace missing descriptions BEFORE casting to str: astype(str) turns NaN
# into the literal text 'nan', after which fillna() has nothing left to fill.
fulldf["Description"] = fulldf["Description"].fillna('missing').astype(str).str.lower()

# Counts are based on whitespace-separated tokens.
fulldf["Description_num_words"] = fulldf["Description"].apply(lambda comment: len(comment.split()))
fulldf["Description_num_unique_words"] = fulldf["Description"].apply(lambda comment: len(set(comment.split())))
# Share of distinct words, in percent. A description without any token gives
# 0 / 0, i.e. NaN.
fulldf["Description_words_vs_unique"] = fulldf["Description_num_unique_words"] / fulldf["Description_num_words"] * 100

fulldf.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer

# Word-level TF-IDF over unigrams and bigrams, capped at 20000 features, with
# accent stripping, English stop-word removal and sublinear tf scaling.
# token_pattern r'\w{1,}' also keeps one-character tokens.
word_vect = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            analyzer='word',
            token_pattern=r'\w{1,}',
            stop_words='english',
            ngram_range=(1, 2),
            max_features=20000)

In [None]:
# Learn the vocabulary on train + test descriptions and vectorise them in one
# pass.
word_features = word_vect.fit_transform(fulldf['Description'])

print(type(word_features))
# `.shape` is an attribute; `get_shape` without parentheses only displays the
# bound method, not the dimensions.
word_features.shape

In [None]:
# Peek at five consecutive vocabulary entries.
# NOTE(review): newer scikit-learn releases rename this method to
# get_feature_names_out() - confirm against the installed version.
word_vect.get_feature_names()[9000:9005]

In [None]:
# Convert the TF-IDF matrix to a dense DataFrame (one column per vocabulary
# entry), then drop the sparse matrix and the vectorizer.
# NOTE(review): the dense copy holds rows x vocabulary-size floats and can be
# very large - confirm that it fits in memory.
fulldf_word_features = pd.DataFrame(word_features.toarray())

del word_features
del word_vect
gc.collect()

fulldf_word_features.shape

In [None]:
from sklearn.decomposition import TruncatedSVD

# Standardise every TF-IDF column, then reduce to 20 components.
# StandardScaler comes from the import in the image-feature scaling cell.
std_scaler = StandardScaler()
fulldf_word_features = std_scaler.fit_transform(fulldf_word_features)


svd = TruncatedSVD(n_components=20, random_state=42)

fulldf_svd_word_features = pd.DataFrame(svd.fit_transform(fulldf_word_features))

# Components are named desc_svd_tfid1 .. desc_svd_tfid20 (numbering starts at 1).
cols = ["desc_svd_tfid" + str(i+1) for i in range(fulldf_svd_word_features.shape[1])]
fulldf_svd_word_features.columns = cols


del fulldf_word_features
gc.collect()

In [None]:
# Explained-variance ratio per SVD component of the description features.
plt.plot(svd.explained_variance_ratio_)
plt.show()

In [None]:
# Append the SVD components column-wise. Rows are paired by index label, so
# this relies on fulldf and the component table sharing the same 0..n-1 index.
fulldf = pd.concat([fulldf, fulldf_svd_word_features], axis=1)

del fulldf_svd_word_features
gc.collect()

fulldf.head()

In [None]:
# Add sentiment features
# First look at one sentiment file to see its three parts: the document-level
# sentiment, the entity list and the sentence list.
samplesentiment = pd.read_json('../input/petfinder-adoption-prediction/train_sentiment/{}'.format("4fdebca57.json"), orient='index', typ='series')
print("Document Sentiment")
print(samplesentiment["documentSentiment"])
print("\nEntities")
print(samplesentiment["entities"][0])
print("\nSentences")
print(samplesentiment['sentences'][0])


In [None]:
try:
    # Top-level location; the pandas.io.json path is deprecated since pandas 1.0.
    from pandas import json_normalize
except ImportError:
    # Older pandas releases only provide the function here.
    from pandas.io.json import json_normalize

sentiment_list = os.listdir('../input/petfinder-adoption-prediction/train_sentiment')

# One single-row frame per file, collected in a list and concatenated once at
# the end. Growing a DataFrame with pd.concat inside the loop re-copies all
# previous rows at every iteration (quadratic cost).
sentiment_frames = []
for x in sentiment_list:
    samplesentiment = pd.read_json('../input/petfinder-adoption-prediction/train_sentiment/{}'.format(x), orient='index', typ='series')

    # Per-description aggregates over the sentence-level sentiment values,
    # flattened into columns such as "sentiment.magnitude_mean".
    sentences = json_normalize(samplesentiment.sentences).loc[:,['sentiment.magnitude', 'sentiment.score']].agg(
                    {
                       'sentiment.magnitude' : ['count','mean','std'],
                       'sentiment.score' : ['mean','std', 'sum']

                    }).unstack().to_frame().sort_index(level=1).T
    sentences.columns = sentences.columns.map('_'.join)

    # Document-level sentiment followed by the sentence aggregates.
    sentiment = pd.concat([json_normalize(samplesentiment["documentSentiment"]),
                           sentences,
                          ], axis =1)
    # The first nine characters of the file name are used as the row label
    # (later copied into the PetID column for the merge).
    sentiment.index = [x[:9]]
    sentiment_frames.append(sentiment)

train_sentiment_df = pd.concat(sentiment_frames, axis =0) if sentiment_frames else pd.DataFrame()

train_sentiment_df.sample(5)

In [None]:
# Same extraction for the test sentiment files. `json_normalize` comes from
# the import in the training sentiment cell.
sentiment_list = os.listdir('../input/petfinder-adoption-prediction/test_sentiment/')

# One single-row frame per file, collected in a list and concatenated once at
# the end. Growing a DataFrame with pd.concat inside the loop re-copies all
# previous rows at every iteration (quadratic cost).
sentiment_frames = []
for x in sentiment_list:
    samplesentiment = pd.read_json('../input/petfinder-adoption-prediction/test_sentiment/{}'.format(x), orient='index', typ='series')

    # Per-description aggregates over the sentence-level sentiment values,
    # flattened into columns such as "sentiment.magnitude_mean".
    sentences = json_normalize(samplesentiment.sentences).loc[:,['sentiment.magnitude', 'sentiment.score']].agg(
                    {
                       'sentiment.magnitude' : ['count','mean','std'],
                       'sentiment.score' : ['mean','std', 'sum']

                    }).unstack().to_frame().sort_index(level=1).T
    sentences.columns = sentences.columns.map('_'.join)

    # Document-level sentiment followed by the sentence aggregates.
    sentiment = pd.concat([json_normalize(samplesentiment["documentSentiment"]),
                           sentences,
                          ], axis =1)
    # The first nine characters of the file name are used as the row label
    # (later copied into the PetID column for the merge).
    sentiment.index = [x[:9]]
    sentiment_frames.append(sentiment)

test_sentiment_df = pd.concat(sentiment_frames, axis =0) if sentiment_frames else pd.DataFrame()

test_sentiment_df.sample(5)

In [None]:
# Stack train and test sentiment rows, expose the row label as a PetID column
# and attach the sentiment features to every pet (NaN where no file exists).
sentiments_df = pd.concat([train_sentiment_df,test_sentiment_df],axis=0)
sentiments_df["PetID"] = sentiments_df.index
fulldf = add_feature (sentiments_df,fulldf,"PetID")
fulldf.head()

In [None]:
# Drop the raw text / identifier columns; the features derived from them above
# stay in the frame.
fulldf = fulldf.drop(["Name","Description","RescuerID"],axis=1)
fulldf.head()

In [None]:
# Gradient-boosting model and submission.
# Split the feature frame back into its train and test parts using the
# "traintest" tag added at load time.
filtertrain = (fulldf["traintest"] == "train")
train = fulldf[filtertrain]

filtertest = (fulldf["traintest"] == "test")
test = fulldf[filtertest]

# PetID is an identifier, not a feature: remove it from both parts.
train.pop("PetID")

del fulldf
gc.collect()

print(train.shape)
print(test.shape)
# The popped test ids are only displayed here, not stored. The submission
# below takes its ids from sample_submission.csv and relies on row order.
test.pop("PetID").head()

In [None]:
# The split tag has served its purpose; remove it so it is not used as a
# feature (the trailing ';' suppresses the cell output).
train.pop("traintest")
test.pop("traintest");

In [None]:
import lightgbm as lgb

from sklearn.model_selection import KFold

# Baseline: 5-fold stratified cross-validation of a 5-class LightGBM
# classifier, scored with multi-class error, at most 50 rounds with early
# stopping after 10 rounds without improvement.
# NOTE: KFold is imported but not used in the visible code.
lgb_data = lgb.Dataset(data = train, label = train_label)

lgb_params = {'objective': 'multiclass','num_leaves': 20, 'num_class': 5 }


lgb_cv = lgb.cv(nfold=5, params=lgb_params,train_set=lgb_data,num_boost_round=50, metrics='multi_error',early_stopping_rounds=10,stratified=True)

print(lgb_cv)

In [None]:
from sklearn.metrics import cohen_kappa_score, make_scorer

# Quadratic-weighted-kappa scorer.
# NOTE: the grid search defined in the next cell is configured with
# 'neg_mean_squared_error', so this scorer is not used in the visible code.
kappa_scorer = make_scorer(cohen_kappa_score,weights='quadratic')

# Hyper-parameter grid: 3 x 1 x 1 x 2 x 2 = 12 combinations.
parameters_lgb = {'num_leaves': np.array([20,50,200]) ,'n_estimators': np.array([50]) ,
                  'learning_rate': np.array([0.05]) ,'min_child_samples':np.array([30,150]),'reg_alpha': np.array([0.1,0.5])}

In [None]:
from sklearn.model_selection import GridSearchCV

# The target is modelled as a continuous value with a regressor; the
# predictions are cut into the five classes afterwards (see OptimizedRounder).
lgbc = lgb.LGBMRegressor()

# 4-fold grid search selecting the combination with the lowest mean squared error.
gs = GridSearchCV(estimator=lgbc, param_grid=parameters_lgb, cv=4,scoring = 'neg_mean_squared_error')

In [None]:
# Run the grid search on the full training set.
gs.fit(train, train_label)

In [None]:
# Mean cross-validated score (negative MSE) of every parameter combination.
gs.cv_results_['mean_test_score']

In [None]:
# Best mean score; scores are negative MSE, so the maximum is the lowest error.
np.max(gs.cv_results_['mean_test_score'])

In [None]:
# Estimator configured with the best parameter combination found by the search.
gs.best_estimator_

In [None]:
# Continuous predictions on the training rows (the same rows the search was
# fitted on), used below to tune the rounding thresholds.
train_preds = gs.predict(train)

In [None]:
# Continuous predictions on the test rows; cut into classes further down.
test_preds = gs.predict(test)

In [None]:
import scipy as sp

from collections import Counter
from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix


# FROM: https://www.kaggle.com/myltykritik/simple-lgbm-image-features

# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings

    rater_a and rater_b are equally long sequences of integer ratings.
    min_rating / max_rating give the rating range; when omitted they are
    taken from the smallest / largest rating found in either sequence.

    The result is a list of lists where entry [i][j] counts the items rated
    (min_rating + i) by rater_a and (min_rating + j) by rater_b.
    """
    assert(len(rater_a) == len(rater_b))
    # Look at each sequence separately: `rater_a + rater_b` concatenates
    # lists but adds NumPy arrays element-wise, which gives a wrong range.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Counts how many times each rating value occurs.

    The result is a list covering min_rating..max_rating (both included);
    position k holds the number of ratings equal to (min_rating + k). Bounds
    that are not given are taken from the ratings themselves.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating
    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        counts[rating - lowest] += 1
    return counts


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa

    quadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.

    quadratic_weighted_kappa(y, y_pred), where y and y_pred each correspond
    to a sequence of integer ratings (they are converted with
    np.array(..., dtype=int)).  These sequences must have the same length
    and must not be empty.

    The rating range is taken from the data: it runs from the smallest to
    the largest rating found in either sequence.

    When both sequences contain one single, identical rating there is no
    disagreement to measure and 1.0 is returned.
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert(len(rater_a) == len(rater_b))
    min_rating = min(min(rater_a), min(rater_b))
    max_rating = max(max(rater_a), max(rater_b))
    if min_rating == max_rating:
        # Only one rating value overall: every weight below would be 0 / 0.
        return 1.0
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    # numerator: observed weighted disagreement
    # denominator: weighted disagreement expected if the raters were independent
    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            # Quadratic weight: squared distance between the two ratings,
            # scaled to [0, 1].
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

In [None]:
class OptimizedRounder(object):
    """Turn continuous predictions into the classes 0..4 with tuned cut points.

    Four thresholds split the real line into five intervals (right-closed);
    ``fit`` searches the thresholds that maximise the quadratic weighted
    kappa against the true labels.
    """

    def __init__(self):
        # Holds the optimisation result after fit(); 0 means "not fitted".
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """Negative quadratic weighted kappa obtained with thresholds ``coef``."""
        preds = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])
        return -cohen_kappa_score(y, preds, weights='quadratic')

    def fit(self, X, y):
        """Search the thresholds, starting from [0.5, 1.5, 2.5, 3.5].

        X holds the continuous predictions, y the true integer labels.
        The optimisation result is stored in ``self.coef_``.
        """
        # Imported here so the submodule is guaranteed to be loaded:
        # `import scipy as sp` alone does not always expose `sp.optimize`.
        from scipy import optimize

        loss_partial = partial(self._kappa_loss, X = X, y = y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef=None):
        """Cut ``X`` into the classes 0..4 using the thresholds ``coef``.

        ``coef`` may be given in any order (it is sorted). When omitted, the
        thresholds found by ``fit`` are used.
        """
        if coef is None:
            coef = self.coefficients()
        preds = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])
        return preds

    def coefficients(self):
        """Return the fitted thresholds; raises RuntimeError before ``fit``."""
        if not hasattr(self.coef_, '__getitem__'):
            raise RuntimeError("OptimizedRounder has not been fitted yet")
        return self.coef_['x']

In [None]:
# Tune the rounding thresholds on the training predictions and report the
# resulting quadratic weighted kappa.
# NOTE(review): despite its name, `valid_pred` holds predictions for the same
# rows the model was fitted on, so this score is an in-sample figure.
optR = OptimizedRounder()
optR.fit(train_preds,train_label.values)
coefficients = optR.coefficients()
valid_pred = optR.predict(train_preds, coefficients)
qwk = quadratic_weighted_kappa(train_label.values, valid_pred)
print("QWK = ", qwk)

In [None]:
# Work on a copy so the tuned thresholds stay available unchanged; the
# commented lines are optional manual overrides of single thresholds.
coefficients_ = coefficients.copy()
#coefficients_[0] = 1.66
#coefficients_[1] = 2.13
#coefficients_[3] = 2.85

# predict() returns a pandas Categorical and .astype(np.int8) turns it into a
# plain NumPy array, which has neither .head() nor .to_csv(). Wrapping the
# result in a Series keeps the rest of the notebook working.
test_predictions = pd.Series(
    optR.predict(test_preds, coefficients_).astype(np.int8),
    name="AdoptionSpeed",
)

print(test_predictions.shape)
test_predictions.head()

In [None]:
# Start from the sample submission and fill in the predicted classes.
# NOTE(review): values are assigned by position, so this assumes the test rows
# are in the same order as sample_submission.csv - confirm.
submission = pd.read_csv("../input/petfinder-adoption-prediction/test/sample_submission.csv")

print(submission.shape)
submission["AdoptionSpeed"] = test_predictions
print(submission.shape)

submission.head()

In [None]:
# Save the raw class predictions and the submission file.
# pd.Series(...) makes the first line work whether test_predictions is a NumPy
# array (no .to_csv method) or already a Series; header=True keeps the output
# layout the same across pandas versions.
pd.Series(test_predictions, name="AdoptionSpeed").to_csv('test_predictions', index=False, header=True)
submission.to_csv('submission.csv', index=False)