## Import libraries

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
import scipy as sp 

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split 
from sklearn.metrics import cohen_kappa_score, accuracy_score
from xgboost import XGBClassifier, XGBRegressor

import zipfile, json
import datetime as dt 

import sys, glob, os 
print('python version:', sys.version)
print('pandas version:', pd.__version__)

python version: 3.6.7 (default, Dec  5 2018, 15:02:05) 
[GCC 4.8.5 20150623 (Red Hat 4.8.5-36)]
pandas version: 0.24.1


## Read basic data

In [2]:
print(dt.datetime.now(), 'start reading data')

if os.path.isdir('../input') and not os.path.isfile('../input/test.zip'): 
    envx = 'remote' 
    dirc = '../input/'
else: 
    envx = 'local'
    dirc = '../../../data/PetFinder.my-Adoption-Prediction/'
print('kernel running environment:', envx) 
print('data path:', dirc) 

train = pd.read_csv(dirc+'train/train.csv')
test = pd.read_csv(dirc+'test/test.csv')
sample_sub = pd.read_csv(dirc+'test/sample_submission.csv')

print('train data shape:', train.shape)
print('test data shape:', test.shape)

data = pd.concat([train, test], sort=False)
print('full data shape:', data.shape)

breed_labels = pd.read_csv(dirc+'breed_labels.csv')
color_labels = pd.read_csv(dirc+'color_labels.csv')
state_labels = pd.read_csv(dirc+'state_labels.csv')

print(dt.datetime.now(), 'finish reading data')

2019-03-12 04:16:40.340796 start reading data
kernel running environment: local
data path: ../../../data/PetFinder.my-Adoption-Prediction/
train data shape: (14993, 24)
test data shape: (3948, 23)
full data shape: (18941, 24)
2019-03-12 04:16:40.561293 finish reading data


In [3]:
with pd.option_context('display.max_columns', 200):
    print(train.head())

   Type         Name  Age  Breed1  Breed2  Gender  Color1  Color2  Color3  \
0     2       Nibble    3     299       0       1       1       7       0   
1     2  No Name Yet    1     265       0       1       1       2       0   
2     1       Brisco    1     307       0       1       2       7       0   
3     1         Miko    4     307       0       2       1       2       0   
4     1       Hunter    1     307       0       1       1       0       0   

   MaturitySize  FurLength  Vaccinated  Dewormed  Sterilized  Health  \
0             1          1           2         2           2       1   
1             2          2           3         3           3       1   
2             2          2           1         1           2       1   
3             2          1           1         1           2       1   
4             2          1           2         2           2       1   

   Quantity  Fee  State                         RescuerID  VideoAmt  \
0         1  100  41326  8480853f

## Start parsing data from sentiment and metadata

### Utility function?

In [4]:
def getmean(ary, key): 
    ''' 
    get mean value associated with 'key' from a iterable 'ary'
    '''
    return np.array([x.get(key, np.nan) for x in ary]).mean()

def getsum(ary, key): 
    ''' 
    get sum value associated with 'key' from a iterable 'ary'
    '''
    return np.array([x.get(key, np.nan) for x in ary]).sum() 


### Parse sentiment data

In [5]:
print(dt.datetime.now(), 'start getting sentiment')

def get_sentiment(s): 
    '''
    parse sentiment from a sentiment json file 's' 
    '''
    docSentiMag = s['documentSentiment']['magnitude'] 
    docSentiScore = s['documentSentiment']['score']

    mag_sco = [x['sentiment'] for x in s['sentences']]
    
    fullMag = getsum(mag_sco, 'magnitude')
    fullScore = getsum(mag_sco, 'score')
    
    return [docSentiMag, docSentiScore, fullMag, fullScore]

def get_sentiment_f(myfile): 
    '''
    forward a file name string to get_sentiment 
    '''
    s = json.load(open(myfile))       
    return [myfile[myfile.rfind('/')+1:-5], *get_sentiment(s)]

def get_sentiment_zip(zipfilename):
    sentiment_proc = np.asarray([
        get_sentiment_f(myfile) for myfile in glob.glob(zipfilename)
    ])
    df_senti = pd.DataFrame(sentiment_proc, 
             columns=['PetID', 'docSentiMag', 'docSentiScore', 'fullMag', 'fullScore'])  
    df_senti['docSentiMag'] = df_senti['docSentiMag'].astype('float')
    df_senti['docSentiScore'] = df_senti['docSentiScore'].astype('float')
    df_senti['fullMag'] = df_senti['fullMag'].astype('float')
    df_senti['fullScore'] = df_senti['fullScore'].astype('float')
    return df_senti

 
train_senti = get_sentiment_zip(dirc+'train_sentiment/*.json')
test_senti = get_sentiment_zip(dirc+'test_sentiment/*.json')   
data_senti = pd.concat([train_senti, test_senti], sort=False)

print('train sentiment shape:', train_senti.shape)
print('test sentiment shape', test_senti.shape)
print('full sentiment shape', data_senti.shape)


print(dt.datetime.now(), 'finish getting sentiment')


2019-03-12 04:16:40.721473 start getting sentiment
train sentiment shape: (14442, 5)
test sentiment shape (3815, 5)
full sentiment shape (18257, 5)
2019-03-12 04:16:45.418610 finish getting sentiment


In [6]:
data_senti.isna().sum()

PetID            0
docSentiMag      0
docSentiScore    0
fullMag          0
fullScore        0
dtype: int64

In [7]:
data_senti.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18257 entries, 0 to 3814
Data columns (total 5 columns):
PetID            18257 non-null object
docSentiMag      18257 non-null float64
docSentiScore    18257 non-null float64
fullMag          18257 non-null float64
fullScore        18257 non-null float64
dtypes: float64(4), object(1)
memory usage: 855.8+ KB


In [8]:
data_senti.head()

Unnamed: 0,PetID,docSentiMag,docSentiScore,fullMag,fullScore
0,e6ff63097,1.2,0.3,1.0,1.0
1,14831f659,0.4,-0.4,0.4,-0.4
2,065906a54,1.7,0.2,1.5,1.5
3,03aae5768,1.1,0.5,1.0,1.0
4,e4dd90768,2.9,0.5,2.7,2.7


### Parse metadata

In [9]:
print(dt.datetime.now(), 'start getting metadata')

def parse_metadata(s): 
    '''
    parse metadata from a metadata json file 's' 
    '''
    if 'labelAnnotations' in s: 
        s_label_Anno = s['labelAnnotations']
        labelAnnoScore = getmean(s_label_Anno, 'score')
        labelAnnoDesc = s_label_Anno[0]['description']
    else: 
        labelAnnoScore = np.nan 
        labelAnnoDesc = np.nan

    if 'imagePropertiesAnnotation' in s:
        s_ipa_dom_colors = s['imagePropertiesAnnotation']['dominantColors']['colors']
        imagePropAnnoScore = getmean(s_ipa_dom_colors, 'score')
        imagePropAnnoPixelFrac = getmean(s_ipa_dom_colors, 'pixelFraction')
    else: 
        imagePropAnnoScore = np.nan 
        imagePropAnnoPixelFrac = np.nan

    if 'cropHintsAnnotation' in s:
        s_cHA = s['cropHintsAnnotation']['cropHints']
        cropHintsAnnoConf = getmean(s_cHA, 'confidence')
        cropHintAnnoImport = getmean(s_cHA, 'importanceFraction')
    else: 
        cropHintsAnnoConf = np.nan 
        cropHintAnnoImport = np.nan

    return [labelAnnoScore, labelAnnoDesc, 
              imagePropAnnoScore, imagePropAnnoPixelFrac, 
              cropHintsAnnoConf, cropHintAnnoImport]

def parse_metadata_f(myfile): 
    s = json.load(open(myfile))
    return [myfile[myfile.rfind('/')+1:myfile.rfind('-')], *parse_metadata(s)]

def parse_metadata_zip(zipfilename): 
    metadata_proc = np.asarray([
        parse_metadata_f(myfile) for myfile in glob.glob(zipfilename)
    ])
    df_metadata = pd.DataFrame(metadata_proc,
                columns = ['PetID', 
                            'labelAnnoScore', 'labelAnnoDesc', 
                            'imagePropAnnoScore', 'imagePropAnnoPixelFrac', 
                            'cropHintsAnnoConf', 'cropHintAnnoImport'])
    df_metadata['labelAnnoScore'] = df_metadata['labelAnnoScore'].astype('float')
    df_metadata['imagePropAnnoScore'] = df_metadata['imagePropAnnoScore'].astype('float')
    df_metadata['imagePropAnnoPixelFrac'] = df_metadata['imagePropAnnoPixelFrac'].astype('float')
    df_metadata['cropHintsAnnoConf'] = df_metadata['cropHintsAnnoConf'].astype('float')
    df_metadata['cropHintAnnoImport'] = df_metadata['cropHintAnnoImport'].astype('float')
    return df_metadata

train_metad = parse_metadata_zip(dirc+'train_metadata/*.json')
test_metad = parse_metadata_zip(dirc+'test_metadata/*.json')
data_metad = pd.concat([train_metad, test_metad], sort=False)

print('train metadata shape:', train_metad.shape)
print('test metadata shape', test_metad.shape)
print('full metadata shape', data_metad.shape)  

print(dt.datetime.now(), 'finish getting metadata')

2019-03-12 04:16:45.659661 start getting metadata
train metadata shape: (58311, 7)
test metadata shape (15040, 7)
full metadata shape (73351, 7)
2019-03-12 04:17:11.567676 finish getting metadata


In [10]:
data_metad.dtypes

PetID                      object
labelAnnoScore            float64
labelAnnoDesc              object
imagePropAnnoScore        float64
imagePropAnnoPixelFrac    float64
cropHintsAnnoConf         float64
cropHintAnnoImport        float64
dtype: object

In [11]:
data_metad.head()

Unnamed: 0,PetID,labelAnnoScore,labelAnnoDesc,imagePropAnnoScore,imagePropAnnoPixelFrac,cropHintsAnnoConf,cropHintAnnoImport
0,e736f4022,0.744926,floor,0.097666,0.065789,0.8,1.0
1,f861fe441,0.7433,dog breed,0.082835,0.074638,0.8,1.0
2,6da1ee245,0.707046,cat,0.090962,0.048281,0.8,1.0
3,8f32c880e,0.801786,dog,0.079665,0.052886,0.8,1.0
4,9ca31b395,0.793212,cat,0.097187,0.091496,0.8,1.0


### Impute missing data in metadata

In [12]:
data_metad.isna().sum()

PetID                      0
labelAnnoScore            15
labelAnnoDesc              0
imagePropAnnoScore         0
imagePropAnnoPixelFrac     0
cropHintsAnnoConf          0
cropHintAnnoImport        24
dtype: int64

In [13]:
data_metad_missing_cols = ['labelAnnoScore', 'cropHintAnnoImport']

myimputer = SimpleImputer() 
data_metad[data_metad_missing_cols] = myimputer.fit_transform(data_metad[data_metad_missing_cols])

In [14]:
data_metad.isna().sum()

PetID                     0
labelAnnoScore            0
labelAnnoDesc             0
imagePropAnnoScore        0
imagePropAnnoPixelFrac    0
cropHintsAnnoConf         0
cropHintAnnoImport        0
dtype: int64

### Group PetID from image metadata

In [15]:
if not data_metad.PetID.is_unique:
    data_metad = data_metad.groupby('PetID').mean().reset_index()
print(data_metad.shape)
data_metad.head()

(18473, 6)


Unnamed: 0,PetID,labelAnnoScore,imagePropAnnoScore,imagePropAnnoPixelFrac,cropHintsAnnoConf,cropHintAnnoImport
0,0008c5398,0.777046,0.071256,0.050027,0.8,1.0
1,000a290e4,0.752176,0.080857,0.057316,0.8,1.0
2,000c21f80,0.776582,0.070803,0.054727,0.8,1.0
3,000fb9572,0.76548,0.089756,0.064844,0.8,1.0
4,0011d7c25,0.78412,0.083866,0.075841,0.8,1.0


## Join and Process data

### Merge and impute 

In [16]:
print(dt.datetime.now(), 'start merging data')

data1 = data.copy()

data1 = pd.merge(data1, data_senti, on='PetID', how='left')
data1 = pd.merge(data1, data_metad, on='PetID', how='left')

# Impute missing values, because Metadata or Sentiment is not complete 
data1_na_columns = list(data1.columns[\
                            (data1.isna().sum() != 0) & (data1.dtypes != 'object') \
                            ].drop('AdoptionSpeed'))
print('impute columns with nan:', data1_na_columns)
for col in data1_na_columns: 
    myimputer1 = SimpleImputer() 
    data1[col] = myimputer1.fit_transform(data1[col].values.reshape(-1, 1))

# Irrelevant columns 
data1.drop(['Description', 'Name', 'RescuerID', 'PetID'], axis=1, inplace=True)

# astype to int 
data1['PhotoAmt'] = data1.PhotoAmt.astype('int')

print(dt.datetime.now(), 'finish merging data')

2019-03-12 04:17:11.849846 start merging data
impute columns with nan: ['docSentiMag', 'docSentiScore', 'fullMag', 'fullScore', 'labelAnnoScore', 'imagePropAnnoScore', 'imagePropAnnoPixelFrac', 'cropHintsAnnoConf', 'cropHintAnnoImport']
2019-03-12 04:17:11.948629 finish merging data


### Get dummies for categorical data

In [17]:
print(dt.datetime.now(), 'start processing data')

# get_dummies 
col_dummied = ['Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'State']
data1_dummies = pd.get_dummies(data1[col_dummied].astype('object'))
data1 = pd.concat([data1, data1_dummies], axis=1)
data1.drop(col_dummied, axis=1, inplace=True)

# 'Not Sure' in Vaccinated, Sterilized, or Dewormed -> 'No' 
col_not_sure = ['Vaccinated', 'Sterilized', 'Dewormed']
for col in col_not_sure: 
    data1[col] = data1[col].map(lambda x: 2 if x == 3 else x)

train1 = data1[data1.AdoptionSpeed.notna()]
test1 = data1[data1.AdoptionSpeed.isna()].drop('AdoptionSpeed', axis=1)

x = train1.drop('AdoptionSpeed', axis=1)
y = train1['AdoptionSpeed'].astype('int')
    
print(data1.shape)
print(data1.info(memory_usage='deep'))

print(dt.datetime.now(), 'finish processing data')

data1.head()

2019-03-12 04:17:12.005762 start processing data
(18941, 396)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18941 entries, 0 to 18940
Columns: 396 entries, Type to State_41415
dtypes: float64(10), int64(12), uint8(374)
memory usage: 10.1 MB
None
2019-03-12 04:17:12.198654 finish processing data


Unnamed: 0,Type,Age,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,...,State_41330,State_41332,State_41335,State_41336,State_41342,State_41345,State_41361,State_41367,State_41401,State_41415
0,2,3,1,1,2,2,2,1,1,100,...,0,0,0,0,0,0,0,0,0,0
1,2,1,2,2,2,2,2,1,1,0,...,0,0,0,0,0,0,0,0,1,0
2,1,1,2,2,1,1,2,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,4,2,1,1,1,2,1,1,150,...,0,0,0,0,0,0,0,0,1,0
4,1,1,2,1,2,2,2,1,1,0,...,0,0,0,0,0,0,0,0,0,0


### Training

In [18]:
train_x, val_x, train_y, val_y = train_test_split(x, y, test_size = 0.10)

def neg_quad_weighted_kappa(y_pred, dy_true): 
    return 'quadratic_weighted_kappa', -cohen_kappa_score(
        dy_true.get_label(), np.argmax(y_pred, axis=1), weights='quadratic'
    ) 

print(dt.datetime.now(), 'start training...')

'''
model_params = {'n_jobs': -1, 'tree_method': 'gpu_hist', 'learning_rate': 0.01, 
                'max_delta_step': 2, 'colsample_bylevel': 0.6, 'colsample_bytree': 0.1, 
                'gamma': 0.004, 'max_bin': 256, 'max_depth': 8, 'max_leaves': 27, 
                'min_child_weight': 96, 
                'reg_alpha': 0.003, 'reg_lambda': 0.060, 'subsample': 0.4}
'''
model_params = {'n_jobs': -1, 'tree_method': 'gpu_hist',
                'boosting': 'gbdt', 'metric': 'rmse', 'num_leaves': 70, 'max_depth': 9,
                'learning_rate': 0.01, 'bagging_fraction': 0.85, 'feature_fraction': 0.8,
                'min_split_gain': 0.02, 'min_child_samples': 150, 'min_child_weight': 0.02,
                'lambda_l2': 0.0475,  'verbosity': 200}
#model = XGBClassifier(n_estimators=20000, **model_params) 
#model.fit(train_x, train_y, eval_set=[(val_x, val_y)], 
#          verbose=200, eval_metric=neg_quad_weighted_kappa, early_stopping_rounds=2000)
#pred = model.predict(val_x) 
#print('cohen quadratic weighted kappa score:', cohen_kappa_score(val_y, pred, weights='quadratic'))

model = XGBRegressor(n_estimators=20000, **model_params) 
model.fit(train_x, train_y, eval_set=[(val_x, val_y)], 
          verbose=200, eval_metric='rmse', early_stopping_rounds=2000)
pred = model.predict(train_x) 

print(dt.datetime.now(), 'finish training')

2019-03-12 04:17:12.301511 start training...


  if getattr(data, 'base', None) is not None and \


[0]	validation_0-rmse:2.29762
Will train until validation_0-rmse hasn't improved in 2000 rounds.
[200]	validation_0-rmse:1.11184
[400]	validation_0-rmse:1.06878
[600]	validation_0-rmse:1.06508
[800]	validation_0-rmse:1.06398
[1000]	validation_0-rmse:1.06292
[1200]	validation_0-rmse:1.0625
[1400]	validation_0-rmse:1.06283
[1600]	validation_0-rmse:1.06247
[1800]	validation_0-rmse:1.06269
[2000]	validation_0-rmse:1.06284
[2200]	validation_0-rmse:1.06316
[2400]	validation_0-rmse:1.06387
[2600]	validation_0-rmse:1.06471
[2800]	validation_0-rmse:1.06567
[3000]	validation_0-rmse:1.06631
[3200]	validation_0-rmse:1.06724
[3400]	validation_0-rmse:1.06766
[3600]	validation_0-rmse:1.06862
Stopping. Best iteration:
[1618]	validation_0-rmse:1.06238

2019-03-12 04:18:25.391911 finish training


### Minimizer  
Ref: https://www.kaggle.com/wrosinski/baselinemodeling

In [19]:
import scipy as sp

from collections import Counter
from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix


# FROM: https://www.kaggle.com/myltykritik/simple-lgbm-image-features

# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa
    axquadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.
    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.
    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.
    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = y
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4

        ll = quadratic_weighted_kappa(y, X_p)
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        return X_p

    def coefficients(self):
        return self.coef_['x']
    
def rmse(actual, predicted):
    return sqrt(mean_squared_error(actual, predicted))

# Compute QWK based on OOF train predictions:
optR = OptimizedRounder()
optR.fit(
    model.predict(train_x), train_y.values
)
coefficients = optR.coefficients()
pred_test_y_k = optR.predict(
    model.predict(val_x), coefficients
)
print("\nValid Counts = ", Counter(val_y.values))
print("Predicted Counts = ", Counter(pred_test_y_k))
print("Coefficients = ", coefficients)

qwk = quadratic_weighted_kappa(val_y, pred_test_y_k)
print("QWK = ", qwk)


Valid Counts =  Counter({2: 425, 4: 403, 3: 330, 1: 296, 0: 46})
Predicted Counts =  Counter({2.0: 536, 3.0: 450, 4.0: 263, 1.0: 251})
Coefficients =  [ 0.54172229  2.00069959  2.46851556  2.98165246]
QWK =  0.3872998706039128


### Submission 

In [20]:
#model_params = {'n_jobs': -1, 'tree_method': 'gpu_hist', 'learning_rate': 0.05}
#model1 = XGBClassifier(n_estimators=model.best_iteration, **model_params) 
#model1.fit(x, y)

sample_sub['AdoptionSpeed'] = optR.predict(
                                model.predict(test1), coefficients 
                                )
sample_sub['AdoptionSpeed'] = sample_sub['AdoptionSpeed'].astype('int')

sample_sub.to_csv('submission.csv', index=False)
sample_sub.head()

Unnamed: 0,PetID,AdoptionSpeed
0,378fcc4fc,2
1,73c10e136,3
2,72000c4c5,4
3,e147a4b9f,3
4,43fbba852,3
