In [None]:
import pandas as pd
import matplotlib as plt
import os
import seaborn as sns
import numpy as np
import json
import matplotlib.pyplot as plt

import optuna
import lightgbm as lgb
import sklearn.metrics
from sklearn.feature_selection import RFECV

from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

In [None]:
# lgb

# #private score: 0.34864
# #weighted kappa on cross validation: 0.4010444202401364 

# #private score: 0.35116 with conversion to categorical (vanilla)
# #0.40327950719004346 qwk cv

# #private score: 0.33312 with all metadata/sentiment
# #0.39 qwk cv

# #private score:0.34807 with top 11 important features
# #0.41 qwk cv

# #private score:0.34558 with 6 least important features removal
# #0.41 qwk cv 

# xgboost

# QWK =  0.31 with all metadata/sentiment cv
# private score: 0.22969


# QWK =  0.24007281878061848 vanilla xgboost cv
# private score:0.16972

# QWK = 0.27 top 12 features cv
# private score: 0.21

In [None]:
# Load the competition's tabular train and test splits (in that order).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/petfinder-adoption-prediction/train/train.csv',
        '../input/petfinder-adoption-prediction/test/test.csv',
    )
)

In [None]:
def extract_sentiment(dataset, ids, folder):
    """Attach document-level sentiment columns to ``dataset``.

    For each id in ``ids`` the file ``{folder}/{id}.json`` is opened and the
    ``magnitude`` and ``score`` entries of its ``documentSentiment`` object
    are collected. Ids without a file contribute NaN for both values.

    Args:
        dataset: DataFrame that receives the new columns (modified in place).
        ids: iterable of pet ids, one per row of ``dataset`` and in row order.
        folder: directory holding the per-pet sentiment JSON files.

    Returns:
        The same ``dataset`` object, now carrying ``doc_sent_mag`` and
        ``doc_sent_score``.
    """
    magnitudes, scores = [], []

    for pet_id in ids:
        sentiment_path = f'{folder}/{pet_id}.json'
        try:
            with open(sentiment_path, 'r', encoding='utf-8') as handle:
                document = json.load(handle)['documentSentiment']
        except FileNotFoundError:
            # No sentiment file for this pet: keep the row, mark both as missing.
            magnitude, score = np.nan, np.nan
        else:
            magnitude, score = document['magnitude'], document['score']

        magnitudes.append(magnitude)
        scores.append(score)

    dataset['doc_sent_mag'] = magnitudes
    dataset['doc_sent_score'] = scores

    return dataset


def get_label_score(json_file, json_keys):
    """Mean ``score`` over the image's ``labelAnnotations`` entries.

    Args:
        json_file: parsed metadata JSON of one image.
        json_keys: collection of the top-level keys present in ``json_file``.

    Returns:
        The mean label score, or NaN when ``'labelAnnotations'`` is not among
        ``json_keys``.
    """
    if 'labelAnnotations' not in json_keys:
        return np.nan

    label_scores = [annotation['score'] for annotation in json_file['labelAnnotations']]
    return np.asarray(label_scores).mean()


def get_img_color_score_pixelfrac(json_file, json_keys):
    """Mean dominant-colour ``score`` and ``pixelFraction`` of one image.

    Args:
        json_file: parsed metadata JSON of one image.
        json_keys: collection of the top-level keys present in ``json_file``.

    Returns:
        ``(mean_score, mean_pixel_fraction)`` over the entries of
        ``imagePropertiesAnnotation.dominantColors.colors``, or ``(nan, nan)``
        when ``'imagePropertiesAnnotation'`` is not among ``json_keys``.
    """
    if 'imagePropertiesAnnotation' not in json_keys:
        return np.nan, np.nan

    dominant_colors = json_file['imagePropertiesAnnotation']['dominantColors']['colors']
    mean_score = np.asarray([color['score'] for color in dominant_colors]).mean()
    mean_pixelfrac = np.asarray([color['pixelFraction'] for color in dominant_colors]).mean()

    return mean_score, mean_pixelfrac
    
def get_img_crop_conf_importance(json_file, json_keys):
    """Mean crop-hint ``confidence`` and ``importanceFraction`` of one image.

    Args:
        json_file: parsed metadata JSON of one image.
        json_keys: collection of the top-level keys present in ``json_file``.

    Returns:
        ``(mean_confidence, mean_importance)`` over the entries of
        ``cropHintsAnnotation.cropHints``.

        * ``(nan, nan)`` when ``'cropHintsAnnotation'`` is not among
          ``json_keys`` or the hint list is empty.
        * ``mean_importance`` is averaged over the hints that actually carry
          an ``importanceFraction`` field, and is NaN when none of them do.
    """
    if 'cropHintsAnnotation' not in json_keys:
        return np.nan, np.nan

    crop_hints = json_file['cropHintsAnnotation']['cropHints']
    if not crop_hints:
        # Previously this case crashed on crop_hints[0]; treat it as "no data".
        return np.nan, np.nan

    img_crop_conf = np.asarray([hint['confidence'] for hint in crop_hints]).mean()

    # Look at every hint rather than only the first one, so a hint that lacks
    # the optional field can no longer trigger a KeyError.
    importances = [
        hint['importanceFraction']
        for hint in crop_hints
        if 'importanceFraction' in hint
    ]
    img_crop_importance = np.asarray(importances).mean() if importances else np.nan

    return img_crop_conf, img_crop_importance
    

def extract_metadata(dataset, ids, folder):
    """Attach per-pet averages of image-metadata features to ``dataset``.

    For each id in ``ids`` the files ``{folder}/{id}-1.json``,
    ``{folder}/{id}-2.json``, ... are read until the first missing number.
    Five values are taken from every image (label score, colour score, colour
    pixel fraction, crop confidence, crop importance) and averaged per pet
    with ``np.mean``; a pet with no image file therefore ends up with NaN.

    Args:
        dataset: DataFrame that receives the new columns (modified in place).
        ids: iterable of pet ids, one per row of ``dataset`` and in row order.
        folder: directory holding the per-image metadata JSON files.

    Returns:
        The same ``dataset`` object with the five ``metadata_*`` columns set.
    """
    # Output columns, in the order the per-image values are produced below.
    column_names = (
        'metadata_label_score',
        'metadata_color_score',
        'metadata_color_pixelfrac',
        'metadata_crop_conf',
        'metadata_crop_importance',
    )
    pet_means = {name: [] for name in column_names}

    for pet in ids:
        # One bucket of per-image values for each output column.
        image_values = [[] for _ in column_names]
        image_number = 1

        while True:
            try:
                with open(f'{folder}/{pet}-{image_number}.json', 'r', encoding='utf-8') as f:
                    metadata = json.load(f)
                    keys = list(metadata.keys())

                    label_score = get_label_score(metadata, keys)
                    color_score, color_pixelfrac = get_img_color_score_pixelfrac(metadata, keys)
                    crop_conf, crop_importance = get_img_crop_conf_importance(metadata, keys)

                    extracted = (label_score, color_score, color_pixelfrac,
                                 crop_conf, crop_importance)
                    for bucket, value in zip(image_values, extracted):
                        bucket.append(value)
            except FileNotFoundError:
                # Image numbers are consecutive, so the first gap ends this pet.
                break

            image_number += 1

        for name, bucket in zip(column_names, image_values):
            pet_means[name].append(np.mean(bucket))

    for name in column_names:
        dataset[name] = pet_means[name]

    return dataset

In [None]:
%%time
# Image-metadata features are added first, then the description-sentiment
# features; both helpers write their columns onto `train` and return it.
train = extract_sentiment(
    extract_metadata(train, train['PetID'], '../input/petfinder-adoption-prediction/train_metadata'),
    train['PetID'],
    '../input/petfinder-adoption-prediction/train_sentiment',
)

In [None]:
%%time 
# Same two feature-extraction steps as for the training frame, applied to `test`.
test = extract_sentiment(
    extract_metadata(test, test['PetID'], '../input/petfinder-adoption-prediction/test_metadata'),
    test['PetID'],
    '../input/petfinder-adoption-prediction/test_sentiment',
)

In [None]:
# Number of whitespace-separated words in each description. Anything that is
# not a string (i.e. a missing description) counts as 0 words. The type check
# replaces an identity test against `np.nan`, which only recognises that one
# NaN object and would let any other NaN float reach `.split()`.
train['Description_length'] = train['Description'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
test['Description_length'] = test['Description'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

In [None]:
# Columns removed from both feature frames before modelling.
columns_to_drop = ['Name', 'RescuerID', 'Description', 'PetID']

X_train = train.drop(columns=columns_to_drop)
X_test = test.drop(columns=columns_to_drop)

In [None]:
# Columns cast to pandas' `category` dtype in both feature frames.
categorical_cols = [
    'Type', 'Breed1', 'Breed2', 'Gender',
    'Color1', 'Color2', 'Color3',
    'MaturitySize', 'FurLength',
    'Vaccinated', 'Dewormed', 'Sterilized', 'Health',
    'State',
]

# Each frame is converted on its own, so the category sets of train and test
# are derived independently of each other.
for frame in (X_train, X_test):
    for col in categorical_cols:
        frame[col] = frame[col].astype('category')

In [None]:
# Columns left in the training feature frame after the drop above.
X_train.columns

In [None]:
# Target column, taken from the full training frame.
y_train = train['AdoptionSpeed']

In [None]:
# Quick shape check of the training features, the target and the test features.
X_train.shape, y_train.shape, X_test.shape

In [None]:
# import optuna

# import lightgbm as lgb
# import sklearn.metrics
# from sklearn.model_selection import train_test_split

# x_train, x_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2)
# train_df = lgb.Dataset(data=x_train, label=y_train, categorical_feature = categorical_cols, free_raw_data=False)
# test_df = lgb.Dataset(data=x_test, label=y_test, categorical_feature = categorical_cols)
    
# def objective(trial): 
#     param = {
#         'objective': 'multiclass',
#         'metric': 'multi_logloss',
#         'num_class': 5,
#         "verbosity": -1,
#         'data_random_seed': 42,
#         'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
#         'num_iterations': trial.suggest_int('num_iterations', 1, 1000),
#         'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 256),
#         'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
#         'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
#         'max_depth': trial.suggest_int('max_depth', 0, 11),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 256),
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1.0),
#         'feature_pre_filter': False
#     }
    
#     gbm = lgb.train(param, train_df, valid_sets=test_df) #,  early_stopping_rounds=10, num_boost_round=100
#     preds = gbm.predict(x_test).argmax(axis=1) #, num_iteration=gbm.best_iteration
#     accuracy = sklearn.metrics.accuracy_score(y_test, preds)
#     return accuracy

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=30)
 
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)
# print("  Value: {}".format(study.best_trial.value))

In [None]:
# print("  Value: {}".format(study.best_trial.params))

In [None]:
import scipy as sp

from collections import Counter
from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix


# FROM: https://www.kaggle.com/myltykritik/simple-lgbm-image-features

# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings

    rater_a and rater_b are equally long sequences of integer ratings
    (lists or numpy arrays). min_rating / max_rating give the rating range;
    when omitted they are taken from the smallest / largest rating that
    occurs in either sequence.

    The result is a list of lists: entry [i][j] counts the items that
    rater_a rated (min_rating + i) and rater_b rated (min_rating + j).
    """
    assert(len(rater_a) == len(rater_b))
    # Take the extremes of each rater separately. `rater_a + rater_b` only
    # concatenates for lists; for numpy arrays it adds element-wise and would
    # produce a wrong rating range.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Counts how often each rating value occurs.

    Entry k of the returned list is the number of ratings equal to
    (min_rating + k). Bounds that are not supplied default to the smallest /
    largest value found in `ratings`.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating

    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        counts[rating - lowest] += 1
    return counts


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa

    quadratic_weighted_kappa(y, y_pred) measures the agreement between two
    raters that provide discrete numeric ratings. Potential values range
    from -1 (representing complete disagreement) to 1 (representing complete
    agreement). A kappa value of 0 is expected if all agreement is due to
    chance.

    y and y_pred are equally long sequences of ratings; both are converted
    to integer arrays before scoring. The rating range is taken from the
    smallest and largest rating that occurs in either sequence.

    When only one distinct rating occurs at all, both raters necessarily
    agree on every item and 1.0 is returned.
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert(len(rater_a) == len(rater_b))

    min_rating = min(min(rater_a), min(rater_b))
    max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)

    if num_ratings == 1:
        # A single rating value makes the weight normaliser
        # (num_ratings - 1) ** 2 zero; the agreement is trivially perfect.
        return 1.0

    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            # Count expected for cell (i, j) if the two raters were independent.
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            # Quadratic disagreement weight, scaled to the range [0, 1].
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

class OptimizedRounder(object):
    """Turns continuous predictions into the classes 0-4 via four thresholds.

    ``fit`` searches for the thresholds that maximise the quadratic weighted
    kappa against the true labels; ``predict`` applies a set of thresholds.
    """

    def __init__(self):
        # Replaced by the scipy optimisation result once fit() has run.
        self.coef_ = 0

    @staticmethod
    def _apply_thresholds(X, coef):
        """Bucket every value of ``X`` into 0-4 using the four cut points.

        The rules are checked in order (first match wins), so thresholds that
        are not sorted are handled the same way an if/elif chain would handle
        them; values matching no rule (including NaN) fall into class 4.
        The result is a new array with the dtype of ``X``.
        """
        values = np.asarray(X)
        conditions = [
            values < coef[0],
            (values >= coef[0]) & (values < coef[1]),
            (values >= coef[1]) & (values < coef[2]),
            (values >= coef[2]) & (values < coef[3]),
        ]
        return np.select(conditions, [0, 1, 2, 3], default=4).astype(values.dtype)

    def _kappa_loss(self, coef, X, y):
        """Negative quadratic weighted kappa of ``X`` thresholded by ``coef``."""
        X_p = self._apply_thresholds(X, coef)
        ll = quadratic_weighted_kappa(y, X_p)
        return -ll

    def fit(self, X, y):
        """Optimise the thresholds with Nelder-Mead and store the result.

        Args:
            X: continuous predictions.
            y: true integer labels, aligned with ``X``.
        """
        # Import the submodule explicitly: a bare `import scipy` does not
        # guarantee that `scipy.optimize` has been loaded.
        from scipy.optimize import minimize

        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = minimize(self._kappa_loss, initial_coef, args=(X, y), method='nelder-mead')

    def predict(self, X, coef=None):
        """Apply thresholds to ``X`` and return the resulting classes.

        Args:
            X: continuous predictions.
            coef: four cut points; when omitted, the thresholds found by
                ``fit`` are used.

        Returns:
            A new array of class values (0-4) with the dtype of ``X``.
        """
        if coef is None:
            coef = self.coefficients()
        return self._apply_thresholds(X, coef)

    def coefficients(self):
        """Thresholds found by ``fit`` (only meaningful after ``fit`` has run)."""
        return self.coef_['x']
    
def rmse(actual, predicted):
    """Root of the mean squared error between ``actual`` and ``predicted``."""
    mse = mean_squared_error(actual, predicted)
    return sqrt(mse)

In [None]:
# import lightgbm as lgb


# #private score: 0.34864
# #weighted kappa on cross validation: 0.4010444202401364

# #0.35116 with conversion to categorical (vanilla)
# #0.40327950719004346 qwk cv

# #0.33312 with all metadata/sentiment
# #0.39 qwk cv

# #0.34807 with top 11 important features
# #0.41 qwk cv

# #0.34558 with 6 least important features removal
# #0.41 qwk cv 


# #XGBOOST
# #QWK =  0.31921077436316847
# # private score: 0.22969
# params = {'application': 'regression',
#           'boosting': 'gbdt',
#           'metric': 'rmse',
#           'num_leaves': 70,
#           'max_depth': 9,
#           'learning_rate': 0.01,
#           'bagging_fraction': 0.85,
#           'feature_fraction': 0.8,
#           'min_split_gain': 0.02,
#           'min_child_samples': 150,
#           'min_child_weight': 0.02,
#           'lambda_l2': 0.0475,
#           'verbosity': -1,
#           'data_random_seed': 17}

# # params = {
# #         'application': 'regression',
# #         'boosting': 'gbdt',
# #         'metric': 'rmse',
# #         'lambda_l1': 0.21785449693658834, 
# #         'num_iterations': 570, 
# #         'lambda_l2': 0.0027941677997421446, 
# #         'num_leaves': 18, 
# #         'feature_fraction': 0.94, 
# #         'bagging_fraction': 0.85, 
# #         'bagging_freq': 5, 
# #         'min_child_samples': 85, 
# #         'max_depth': 6, 
# #         'learning_rate': 1.0}


# # params = study.best_trial.params

# # print(study.best_trial.params)
# # Additional parameters:
# early_stop = 500
# verbose_eval = 100
# num_rounds = 10000
# n_splits = 5

In [None]:
# X_train.shape

In [None]:
# from sklearn.model_selection import StratifiedKFold


# kfold = StratifiedKFold(n_splits=n_splits)

# #out of fold 
# oof_train = np.zeros((X_train.shape[0]))
# oof_test = np.zeros((X_test.shape[0], n_splits))


# i = 0

# #Indexes of train and test rows
# for train_index, valid_index in kfold.split(X_train, y_train):
    
#     #training data is split to train and validation sets 
#     X_tr = X_train.iloc[train_index, :]
#     X_val = X_train.iloc[valid_index, :]
    
#     #target values are taken from training dataset
#     y_tr = X_tr['AdoptionSpeed'].values
#     #training data taken with no target
#     X_tr = X_tr.drop(['AdoptionSpeed'], axis=1)
    
#     #target values are taken from validation dataset
#     y_val = X_val['AdoptionSpeed'].values
#     #validation data taken with no target
#     X_val = X_val.drop(['AdoptionSpeed'], axis=1)
    
    
#     print('\ny_tr distribution: {}'.format(Counter(y_tr)))
    
#     #make datasets for lgb? 
#     d_train = lgb.Dataset(X_tr, label=y_tr)
#     d_valid = lgb.Dataset(X_val, label=y_val)
#     #see loss functions in lgb
#     watchlist = [d_train, d_valid]
    
#     print('training LGB:')
#     model = lgb.train(params,
#                       train_set=d_train,
#                       num_boost_round=num_rounds,
#                       valid_sets=watchlist,
#                       verbose_eval=verbose_eval,
#                       early_stopping_rounds=early_stop)
    
#     val_pred = model.predict(X_val, num_iteration=model.best_iteration)
#     test_pred = model.predict(X_test, num_iteration=model.best_iteration)
    
#     oof_train[valid_index] = val_pred
#     oof_test[:, i] = test_pred
    
#     i += 1

In [None]:
# plt.hist(oof_train)

In [None]:
# X_train.columns

In [None]:
# features_importance = pd.Series(model.feature_importance(), index=X_train.drop(['AdoptionSpeed'], axis=1).columns)
# features_importance = features_importance.sort_values(ascending=False)
# df = features_importance.to_frame()
# df['feature'] = df.index
# df = df.rename(columns={0: 'importance'})

# fig = plt.figure(figsize=(15,10))
# ax = sns.barplot(x="importance", y="feature", data=df)
# ax.set_xlabel('Importance')
# ax.set_ylabel('Feature')
# plt.show()

In [None]:
# X_train2 = X_train.drop(['VideoAmt', 'metadata_crop_conf', 'Type', 'Health', 'metadata_crop_importance', 'Color3'], axis=1)

# X_test2 = X_test.drop(['VideoAmt', 'metadata_crop_conf', 'Type', 'Health', 'metadata_crop_importance', 'Color3'], axis=1)

# X_train2.columns

In [None]:
# X_train2['metadata_label_score'].fillna(0,inplace=True)
# X_train2['metadata_color_pixelfrac'].fillna(0,inplace=True)
# X_train2['metadata_color_score'].fillna(0,inplace=True)
# X_train2['doc_sent_mag'].fillna(0,inplace=True)
# X_train2['doc_sent_score'].fillna(0,inplace=True)

# X_test2['metadata_label_score'].fillna(0,inplace=True)
# X_test2['metadata_color_pixelfrac'].fillna(0,inplace=True)
# X_test2['metadata_color_score'].fillna(0,inplace=True)
# X_test2['doc_sent_mag'].fillna(0,inplace=True)
# X_test2['doc_sent_score'].fillna(0,inplace=True)

In [None]:
# X_train2.isnull().sum()

In [None]:
# #out of fold 
# oof_train = np.zeros((X_train2.shape[0]))
# oof_test = np.zeros((X_test2.shape[0], n_splits))


# i = 0

# #Indexes of train and test rows
# for train_index, valid_index in kfold.split(X_train2, y_train):
    
#     #training data is split to train and validation sets 
#     X_tr = X_train2.iloc[train_index, :]
#     X_val = X_train2.iloc[valid_index, :]
    
#     #target values are taken from training dataset
#     y_tr = X_tr['AdoptionSpeed'].values
#     #training data taken with no target
#     X_tr = X_tr.drop(['AdoptionSpeed'], axis=1)
    
#     #target values are taken from validation dataset
#     y_val = X_val['AdoptionSpeed'].values
#     #validation data taken with no target
#     X_val = X_val.drop(['AdoptionSpeed'], axis=1)
    
    
#     print('\ny_tr distribution: {}'.format(Counter(y_tr)))
    
#     #make datasets for lgb? 
#     d_train = lgb.Dataset(X_tr, label=y_tr)
#     d_valid = lgb.Dataset(X_val, label=y_val)
#     #see loss functions in lgb
#     watchlist = [d_train, d_valid]
    
#     print('training LGB:')
#     model = lgb.train(params,
#                       train_set=d_train,
#                       num_boost_round=num_rounds,
#                       valid_sets=watchlist,
#                       verbose_eval=verbose_eval,
#                       early_stopping_rounds=early_stop)
    
#     val_pred = model.predict(X_val, num_iteration=model.best_iteration)
#     test_pred = model.predict(X_test2, num_iteration=model.best_iteration)
    
#     oof_train[valid_index] = val_pred
#     oof_test[:, i] = test_pred
    
#     i += 1

In [None]:
# # Compute QWK based on OOF train predictions:
# optR = OptimizedRounder()
# # optR.fit(oof_train, X_train['AdoptionSpeed'].values)
# optR.fit(oof_train, y_train)
# coefficients = optR.coefficients()
# pred_test_y_k = optR.predict(oof_train, coefficients)
# # print("\nValid Counts = ", Counter(X_train['AdoptionSpeed'].values))
# print("\nValid Counts = ", Counter(y_train))
# print("Predicted Counts = ", Counter(pred_test_y_k))
# print("Coefficients = ", coefficients)
# # qwk = quadratic_weighted_kappa(X_train['AdoptionSpeed'].values, pred_test_y_k)
# qwk = quadratic_weighted_kappa(y_train, pred_test_y_k)
# print("QWK = ", qwk)

In [None]:
# # Manually adjusted coefficients:

# coefficients_ = coefficients.copy()

# # coefficients_[0] = 1.645
# # coefficients_[1] = 2.115
# # coefficients_[3] = 2.84

# train_predictions = optR.predict(oof_train, coefficients_).astype(int)
# print('train pred distribution: {}'.format(Counter(train_predictions)))

# test_predictions = optR.predict(oof_test.mean(axis=1), coefficients_)
# print('test pred distribution: {}'.format(Counter(test_predictions)))

In [None]:
# # Distribution inspection of original target and predicted train and test:

# print("True Distribution:")
# print(pd.value_counts(X_train['AdoptionSpeed'], normalize=True).sort_index())
# print("\nTrain Predicted Distribution:")
# print(pd.value_counts(train_predictions, normalize=True).sort_index())
# print("\nTest Predicted Distribution:")
# print(pd.value_counts(test_predictions, normalize=True).sort_index())

# xgb

In [None]:
#OPTUNA FOR XGB

# X_train_optuna = X_train.drop(['AdoptionSpeed'], axis=1)
# y_train_optuna = X_train['AdoptionSpeed']

# X_train_optuna['Type'] = lbl.fit_transform(X_train_optuna['Type'].astype(int))
# X_train_optuna['Breed1'] = lbl.fit_transform(X_train_optuna['Type'].astype(int))
# X_train_optuna['Breed2'] = lbl.fit_transform(X_train_optuna['Type'].astype(int))
# X_train_optuna['Gender'] = lbl.fit_transform(X_train_optuna['Type'].astype(int))
# X_train_optuna['Color1'] = lbl.fit_transform(X_train_optuna['Type'].astype(int))
# X_train_optuna['Color2'] = lbl.fit_transform(X_train_optuna['Type'].astype(int))
# X_train_optuna['Color3'] = lbl.fit_transform(X_train_optuna['Type'].astype(int))
# X_train_optuna['MaturitySize'] = lbl.fit_transform(X_train_optuna['Type'].astype(int))
# X_train_optuna['FurLength'] = lbl.fit_transform(X_train_optuna['Type'].astype(int))
# X_train_optuna['Vaccinated'] = lbl.fit_transform(X_train_optuna['Type'].astype(int))
# X_train_optuna['Dewormed'] = lbl.fit_transform(X_train_optuna['Type'].astype(int))
# X_train_optuna['Sterilized'] = lbl.fit_transform(X_train_optuna['Type'].astype(int))
# X_train_optuna['Health'] = lbl.fit_transform(X_train_optuna['Type'].astype(int))
# X_train_optuna['State'] = lbl.fit_transform(X_train_optuna['Type'].astype(int))

# import xgboost as xgb
# import sklearn.metrics
# from sklearn.model_selection import train_test_split

# x_train, x_val, y_train, y_val = train_test_split(X_train_optuna, y_train_optuna, test_size=0.2)
# train_df = xgb.DMatrix(data=x_train, label=y_train)
# val_df = xgb.DMatrix(data=x_val, label=y_val)
# eval_sets = [(train_df, 'train'), (val_df, 'eval')]
    
# def objective(trial): 
    
#     param = {
#         "num_class": 5,
#         'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.6),
#         'subsample': trial.suggest_uniform('subsample', 0.3, 0.9),
#         'max_depth': trial.suggest_int('max_depth', 3, 9),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 0.9),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 4)
#              }
    
#     xgb_opt = xgb.train(param, train_df, evals=eval_sets) #,  early_stopping_rounds=10, num_boost_round=100
#     preds = xgb_opt.predict(val_df) #, num_iteration=gbm.best_iteration
#     accuracy = sklearn.metrics.accuracy_score(y_test, preds)
#     return accuracy

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=30)

In [None]:
# print('Best trial:', study.best_trial.params)
# print("  Value: {}".format(study.best_trial.value))

In [None]:
# from sklearn.preprocessing import LabelEncoder

# lbl = LabelEncoder()
# X_train3 = X_train.copy()
# cats = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'State']
# X_train3['Type'] = lbl.fit_transform(X_train3['Type'].astype(int))
# X_train3['Breed1'] = lbl.fit_transform(X_train3['Type'].astype(int))
# X_train3['Breed2'] = lbl.fit_transform(X_train3['Type'].astype(int))
# X_train3['Gender'] = lbl.fit_transform(X_train3['Type'].astype(int))
# X_train3['Color1'] = lbl.fit_transform(X_train3['Type'].astype(int))
# X_train3['Color2'] = lbl.fit_transform(X_train3['Type'].astype(int))
# X_train3['Color3'] = lbl.fit_transform(X_train3['Type'].astype(int))
# X_train3['MaturitySize'] = lbl.fit_transform(X_train3['Type'].astype(int))
# X_train3['FurLength'] = lbl.fit_transform(X_train3['Type'].astype(int))
# X_train3['Vaccinated'] = lbl.fit_transform(X_train3['Type'].astype(int))
# X_train3['Dewormed'] = lbl.fit_transform(X_train3['Type'].astype(int))
# X_train3['Sterilized'] = lbl.fit_transform(X_train3['Type'].astype(int))
# X_train3['Health'] = lbl.fit_transform(X_train3['Type'].astype(int))
# X_train3['State'] = lbl.fit_transform(X_train3['Type'].astype(int))

In [None]:
# X_test3 = X_test.copy()
# X_test3['Type'] = lbl.fit_transform(X_test3['Type'].astype(int))
# X_test3['Breed1'] = lbl.fit_transform(X_test3['Type'].astype(int))
# X_test3['Breed2'] = lbl.fit_transform(X_test3['Type'].astype(int))
# X_test3['Gender'] = lbl.fit_transform(X_test3['Type'].astype(int))
# X_test3['Color1'] = lbl.fit_transform(X_test3['Type'].astype(int))
# X_test3['Color2'] = lbl.fit_transform(X_test3['Type'].astype(int))
# X_test3['Color3'] = lbl.fit_transform(X_test3['Type'].astype(int))
# X_test3['MaturitySize'] = lbl.fit_transform(X_test3['Type'].astype(int))
# X_test3['FurLength'] = lbl.fit_transform(X_test3['Type'].astype(int))
# X_test3['Vaccinated'] = lbl.fit_transform(X_test3['Type'].astype(int))
# X_test3['Dewormed'] = lbl.fit_transform(X_test3['Type'].astype(int))
# X_test3['Sterilized'] = lbl.fit_transform(X_test3['Type'].astype(int))
# X_test3['Health'] = lbl.fit_transform(X_test3['Type'].astype(int))
# X_test3['State'] = lbl.fit_transform(X_test3['Type'].astype(int))

In [None]:
# X_train3_ = X_train3[['metadata_label_score', 'metadata_color_score', 'metadata_color_pixelfrac', 'Description_length', 'Age', 'doc_sent_mag', 'doc_sent_score',
#                    'PhotoAmt', 'Fee', 'Quantity', 'metadata_crop_importance', 'Type', 'AdoptionSpeed']]

In [None]:
# X_test3_ = X_test3[['metadata_label_score', 'metadata_color_score', 'metadata_color_pixelfrac', 'Description_length', 'Age', 'doc_sent_mag', 'doc_sent_score',
#                    'PhotoAmt', 'Fee', 'Quantity', 'metadata_crop_importance', 'Type']]

In [None]:
# def run_xgb(params, X_train, X_test):X_test
#     n_splits = 10
#     verbose_eval = 1000
#     num_rounds = 60000
#     early_stop = 500

#     kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1337)

#     oof_train = np.zeros((X_train.shape[0]))
#     oof_test = np.zeros((X_test.shape[0], n_splits))

#     i = 0

#     for train_idx, valid_idx in kf.split(X_train, X_train['AdoptionSpeed'].values):

#         X_tr = X_train.iloc[train_idx, :]
#         X_val = X_train.iloc[valid_idx, :]

#         y_tr = X_tr['AdoptionSpeed'].values
#         X_tr = X_tr.drop(['AdoptionSpeed'], axis=1)

#         y_val = X_val['AdoptionSpeed'].values
#         X_val = X_val.drop(['AdoptionSpeed'], axis=1)

#         d_train = xgb.DMatrix(data=X_tr, label=y_tr, feature_names=X_tr.columns, enable_categorical=True)
#         d_valid = xgb.DMatrix(data=X_val, label=y_val, feature_names=X_val.columns, enable_categorical=True)

#         watchlist = [(d_train, 'train'), (d_valid, 'valid')]
#         model = xgb.train(dtrain=d_train, num_boost_round=num_rounds, evals=watchlist,
#                          early_stopping_rounds=early_stop, verbose_eval=verbose_eval, params=params)

#         valid_pred = model.predict(xgb.DMatrix(X_val, feature_names=X_val.columns), ntree_limit=model.best_ntree_limit)
#         test_pred = model.predict(xgb.DMatrix(X_test, feature_names=X_test.columns), ntree_limit=model.best_ntree_limit)

#         oof_train[valid_idx] = valid_pred
#         oof_test[:, i] = test_pred

#         i += 1
#     return model, oof_train, oof_test

In [None]:
# xgb_params = {'learning_rate': 0.1888087749957744, 'subsample': 0.7500614175599446, 'max_depth': 4, 'colsample_bytree': 0.6971444735272232, 'min_child_weight': 3}

# model, oof_train, oof_test = run_xgb(xgb_params, X_train3_, X_test3_)

In [None]:
# from xgboost import plot_importance
# plot_importance(model)

In [None]:
# def plot_pred(pred):
#     sns.distplot(pred, kde=True, hist_kws={'range': [0, 5]})

In [None]:
# plot_pred(oof_train)

In [None]:
# plot_pred(oof_test.mean(axis=1))

In [None]:
# optR = OptimizedRounder()
# optR.fit(oof_train, X_train3_['AdoptionSpeed'].values)
# coefficients = optR.coefficients()
# valid_pred = optR.predict(oof_train, coefficients)
# qwk = quadratic_weighted_kappa(X_train3['AdoptionSpeed'].values, valid_pred)
# print("QWK = ", qwk)

In [None]:
# coefficients_ = coefficients.copy()
# train_predictions = optR.predict(oof_train, coefficients_).astype(np.int8)
# print(f'train pred distribution: {Counter(train_predictions)}')
# test_predictions = optR.predict(oof_test.mean(axis=1), coefficients_).astype(np.int8)
# print(f'test pred distribution: {Counter(test_predictions)}')

In [None]:
# submission = pd.DataFrame({'PetID': test['PetID'].values, 'AdoptionSpeed': test_predictions.astype(np.int32)})
# submission.head()
# submission.to_csv('submission.csv', index=False)

# LogisticRegression

In [None]:
# X_train_optuna = X_train.copy()
# y_train_optuna = X_train['AdoptionSpeed']

In [None]:
# X_train_optuna.isnull().sum()

In [None]:
# X_train_optuna['metadata_label_score'].fillna(0,inplace=True)
# X_train_optuna['metadata_color_pixelfrac'].fillna(0,inplace=True)
# X_train_optuna['metadata_color_score'].fillna(0,inplace=True)
# X_train_optuna['metadata_crop_conf'].fillna(0,inplace=True)
# X_train_optuna['metadata_crop_importance'].fillna(0,inplace=True)
# X_train_optuna['doc_sent_mag'].fillna(0,inplace=True)
# X_train_optuna['doc_sent_score'].fillna(0,inplace=True)

# # X_train_optuna['metadata_label_score'].fillna(0,inplace=True)
# # X_test2['metadata_color_pixelfrac'].fillna(0,inplace=True)
# # X_test2['metadata_color_score'].fillna(0,inplace=True)
# # X_test2['doc_sent_mag'].fillna(0,inplace=True)
# # X_test2['doc_sent_score'].fillna(0,inplace=True)

In [None]:
# from sklearn.linear_model import LogisticRegression
# import sklearn.metrics
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler



# x_train, x_val, y_train, y_val = train_test_split(X_train_optuna, y_train_optuna, test_size=0.2)
# # train_df = xgb.DMatrix(data=x_train, label=y_train)
# # val_df = xgb.DMatrix(data=x_val, label=y_val)
# # eval_sets = [(x_train, 'train'), (x_val, 'eval')]


    
# def objective(trial):

#     param = {
#             'solver': trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'sag']),
#             'penalty': trial.suggest_categorical('penalty', ["l2", "none"]),
#             'C': trial.suggest_uniform('C', 0.1, 1.0),
#             'class_weight': trial.suggest_categorical('class_weight', ["balanced", None])
#         }
        
#     model = LogisticRegression(**param, max_iter=500, random_state=12)
#     model.fit(x_train, y_train) #, num_iteration=gbm.best_iteration
#     preds = model.predict(x_val)
#     accuracy = sklearn.metrics.accuracy_score(y_val, preds)
#     return accuracy

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=50)

In [None]:
# The image-metadata and sentiment features can be NaN; replace those gaps
# with 0 in both frames. Filling through a {column: value} mapping returns a
# new frame and leaves X_train / X_test untouched. It also avoids
# `frame[col].fillna(0, inplace=True)`, a chained assignment that does not
# modify `frame` when pandas runs with Copy-on-Write.
nan_fill_cols = [
    'metadata_label_score',
    'metadata_color_pixelfrac',
    'metadata_color_score',
    'metadata_crop_conf',
    'metadata_crop_importance',
    'doc_sent_mag',
    'doc_sent_score',
]
zero_fill = dict.fromkeys(nan_fill_cols, 0)

X_train_optuna = X_train.fillna(zero_fill)
X_test_optuna = X_test.fillna(zero_fill)

In [None]:
# Count of missing values per column in the filled test features.
X_test_optuna.isnull().sum()

In [None]:
# Best trial: {'solver': 'newton-cg', 'penalty': 'none', 'C': 0.9219805871547243, 'class_weight': None}
#   Value: 0.349783261087029
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn import metrics

# def run_lr(params, X_train):
    
#     clf = LogisticRegression(**params, max_iter=10000):
#     x_tr, y_tr = X_train.drop(['AdoptionSpeed'], axis=1), X_train['AdoptionSpeed']
#     predicted = cross_validation.cross_val_predict(clf, x_tr, y_tr, cv=10)

# def run_lr(params, X_train, X_test):
#     pred_test_full=0
#     cv_score=[]
#     i=1
#     n_splits = 10
    
#     oof_train = np.zeros((X_train.shape[0]))
#     oof_test = np.zeros((X_test.shape[0], n_splits))

#     kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1337)
    
#     for train_idx,valid_idx in kf.split(X_train, X_train['AdoptionSpeed'].values):
        
#         X_tr = X_train.iloc[train_idx, :]
#         X_val = X_train.iloc[valid_idx, :]
        
#         y_tr = X_tr['AdoptionSpeed'].values
#         X_tr = X_tr.drop(['AdoptionSpeed'], axis=1)
        
#         y_val = X_val['AdoptionSpeed'].values
#         X_val = X_val.drop(['AdoptionSpeed'], axis=1)
        
#         clf = LogisticRegression(**params, max_iter=500)
#         clf.fit(X_tr, y_tr)
        
#         valid_pred = clf.predict(X_val)
#         test_pred = clf.predict(X_test)
        
#         oof_train[valid_idx] = valid_pred
#         oof_test[:, i] = test_pred
        
#         i +=1
        
#     return clf, oof_train, oof_test

In [None]:
# The cells below call `clf.fit` / `clf.predict`, so the classifier has to be
# created here; with the constructor commented out, a fresh-kernel run stops
# with a NameError.
from sklearn.linear_model import LogisticRegression

# Rounded values of the best trial recorded in the comment of the cell above.
# NOTE(review): recent scikit-learn releases spell "no penalty" as
# `penalty=None` instead of the string 'none' - adjust to the installed version.
params = {'solver': 'newton-cg', 'penalty': 'none', 'C': 0.92, 'class_weight': None}

clf = LogisticRegression(**params, max_iter=10000)
# Features are every column except the target; the target is AdoptionSpeed.
x_tr, y_tr = X_train_optuna.drop(['AdoptionSpeed'], axis=1), X_train_optuna['AdoptionSpeed']
# predicted = cross_val_score(clf, x_tr, y_tr, cv=10)

# model, oof_train, oof_test = run_lr(params, X_train_optuna, X_test_optuna)

In [None]:
# print(predicted.mean()) #0.35909997776295305 

In [None]:
# Fit on every training row (no hold-out split is made here).
clf.fit(x_tr, y_tr)

In [None]:
# Predictions of the fitted model for the filled test features.
ypred = clf.predict(X_test_optuna)

In [None]:
# One row per test pet: its PetID and the prediction cast to int32.
submission = pd.DataFrame({'PetID': test['PetID'].values, 'AdoptionSpeed': ypred.astype(np.int32)})
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)