## Import libraries

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
import scipy as sp 

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold 
from sklearn.metrics import cohen_kappa_score, accuracy_score
from xgboost import XGBClassifier, XGBRegressor
import lightgbm as lgb

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import SparsePCA, TruncatedSVD, LatentDirichletAllocation, NMF

import zipfile, json
import datetime as dt 
from collections import Counter

import sys, glob, os 
print('python version:', sys.version)
print('pandas version:', pd.__version__)

python version: 3.6.7 (default, Dec  5 2018, 15:02:05) 
[GCC 4.8.5 20150623 (Red Hat 4.8.5-36)]
pandas version: 0.24.1


### Text vectorization 

In [2]:
def text_vec(pd_series, n_components = 5, random_state = None):
    '''
    Turn a text column into dense numeric features.

    The series is TF-IDF vectorized once and the resulting matrix is then
    decomposed twice, with TruncatedSVD and with NMF.

    Parameters
    ----------
    pd_series : pandas.Series
        Text values to vectorize; its ``name`` is used in the output
        column names.
    n_components : int
        Number of components kept by each decomposition.
    random_state : int, RandomState instance or None
        Forwarded to both decompositions. The default (None) leaves them
        unseeded, exactly as before; pass an int to get reproducible
        features.

    Returns
    -------
    pandas.DataFrame
        ``2 * n_components`` columns named ``SVD_<name>_<i>`` and
        ``NMF_<name>_<i>``. The rows carry a fresh 0..n-1 index; the index
        of ``pd_series`` is not propagated.
    '''
    col_name = pd_series.name
    print('generating features from: {}'.format(col_name))

    # Both decompositions share one TF-IDF matrix.
    tfidf_col = TfidfVectorizer().fit_transform(pd_series.values)

    svd_ = TruncatedSVD(n_components=n_components, random_state=random_state)
    nmf_ = NMF(n_components=n_components, random_state=random_state)

    svd_col = pd.DataFrame(svd_.fit_transform(tfidf_col))
    svd_col = svd_col.add_prefix('SVD_{}_'.format(col_name))

    nmf_col = pd.DataFrame(nmf_.fit_transform(tfidf_col))
    nmf_col = nmf_col.add_prefix('NMF_{}_'.format(col_name))

    # Combine all extracted features side by side.
    return pd.concat([svd_col, nmf_col], axis=1)

def text_vec_df(pd_dataframe, n_components = 5):
    '''
    Vectorize every column of a text dataframe with text_vec and return the
    per-column feature blocks concatenated column-wise.
    '''
    per_column = [text_vec(pd_dataframe[name], n_components)
                  for name in pd_dataframe.columns]
    return pd.concat(per_column, axis=1)

### Minimizer  
Ref: https://www.kaggle.com/wrosinski/baselinemodeling

In [3]:
import scipy as sp

from collections import Counter
from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix


# FROM: https://www.kaggle.com/myltykritik/simple-lgbm-image-features

# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings.

    rater_a, rater_b: equally long sequences (lists or arrays) of integer
    ratings.
    min_rating, max_rating: bounds of the rating scale; when omitted they
    are taken from the smallest / largest rating seen in either sequence.

    The result is a list of lists where entry [i][j] counts the items that
    rater_a scored ``min_rating + i`` and rater_b scored ``min_rating + j``.
    """
    assert(len(rater_a) == len(rater_b))
    # Take min/max per rater instead of over `rater_a + rater_b`: `+`
    # concatenates lists but adds numpy arrays element-wise, which would
    # yield a wrong rating range for array inputs.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Count how often each rating occurs.

    Position ``k`` of the returned list holds the number of ratings equal
    to ``min_rating + k``. Bounds that are not supplied are taken from the
    ratings themselves.
    """
    low = min(ratings) if min_rating is None else min_rating
    high = max(ratings) if max_rating is None else max_rating
    counts = [0] * int(high - low + 1)
    for rating in ratings:
        counts[rating - low] += 1
    return counts


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa between two sets of ratings.

    Quadratic weighted kappa is a measure of inter-rater agreement between
    two raters that provide discrete numeric ratings. Potential values
    range from -1 (representing complete disagreement) to 1 (representing
    complete agreement). A kappa value of 0 is expected if all agreement is
    due to chance.

    y, y_pred: equally long sequences of ratings. Both are converted to
    integer arrays, and the rating scale is taken to run from the smallest
    to the largest value found in either sequence.

    Returns a float. When both sequences contain one single, identical
    rating the weights are undefined (0/0); that case is reported as
    perfect agreement (1.0) instead of raising ZeroDivisionError.
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert(len(rater_a) == len(rater_b))

    min_rating = min(min(rater_a), min(rater_b))
    max_rating = max(max(rater_a), max(rater_b))
    if min_rating == max_rating:
        # Only one rating value exists, so the raters agree on every item.
        # This is also the only input for which the denominator below
        # would be zero.
        return 1.0

    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            # Count expected in cell (i, j) if the raters were independent.
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            # Quadratic disagreement weight, scaled to [0, 1].
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

class OptimizedRounder(object):
    """
    Learns four cut points that turn continuous predictions into the
    classes 0..4 so that quadratic weighted kappa is maximised.

    Usage: call ``fit(X, y)``, read the thresholds with ``coefficients()``
    and pass them to ``predict(X, coef)``.
    """

    def __init__(self):
        # Replaced by the scipy optimisation result once fit() has run.
        self.coef_ = 0

    @staticmethod
    def _apply_thresholds(X, coef):
        """
        Bucket every value of X into 0..4 using the cut points coef[0..3].

        The conditions mirror an if/elif chain evaluated in order, so the
        first matching bucket wins and anything that matches none of them
        (including NaN) falls into class 4. The input is not modified; the
        result has the same dtype as the input array.
        """
        values = np.asarray(X)
        conditions = [
            values < coef[0],
            (values >= coef[0]) & (values < coef[1]),
            (values >= coef[1]) & (values < coef[2]),
            (values >= coef[2]) & (values < coef[3]),
        ]
        buckets = np.select(conditions, [0, 1, 2, 3], default=4)
        return buckets.astype(values.dtype)

    def _kappa_loss(self, coef, X, y):
        """Negative quadratic weighted kappa of X bucketed with coef against y."""
        X_p = self._apply_thresholds(X, coef)
        ll = quadratic_weighted_kappa(y, X_p)
        return -ll

    def fit(self, X, y):
        """
        Search for the thresholds that minimise _kappa_loss with
        Nelder-Mead, starting from the midpoints between classes. The
        optimisation result is stored in ``self.coef_``.
        """
        # Import the submodule explicitly: `import scipy as sp` alone does
        # not guarantee that `sp.optimize` has been loaded.
        from scipy import optimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """Return a new array with X bucketed into 0..4 using the thresholds coef."""
        return self._apply_thresholds(X, coef)

    def coefficients(self):
        """Thresholds found by fit() (the ``'x'`` entry of the stored result)."""
        return self.coef_['x']
    
def rmse(actual, predicted):
    '''
    root mean squared error between 'actual' and 'predicted'
    '''
    mse = mean_squared_error(actual, predicted)
    return sqrt(mse)

### Utility functions

In [4]:
def getmean(ary, key):
    '''
    Mean of the values stored under 'key' in each dict of the iterable
    'ary'. A dict that lacks the key contributes NaN.
    '''
    values = [item.get(key, np.nan) for item in ary]
    return np.array(values).mean()

def getsum(ary, key):
    '''
    Sum of the values stored under 'key' in each dict of the iterable
    'ary'. A dict that lacks the key contributes NaN.
    '''
    values = [item.get(key, np.nan) for item in ary]
    return np.array(values).sum()

def getjoin(ary, key):
    '''
    Space-separated concatenation of the strings stored under 'key' in each
    dict of the iterable 'ary'. A dict that lacks the key contributes an
    empty string.
    '''
    pieces = (item.get(key, '') for item in ary)
    return ' '.join(pieces)

## Read basic data

In [5]:
print(dt.datetime.now(), 'start reading data')

# An '../input' directory without a zipped test set is treated as the remote
# kernel layout; anything else falls back to the local data checkout.
if os.path.isdir('../input') and not os.path.isfile('../input/test.zip'):
    envx, dirc = 'remote', '../input/'
else:
    envx, dirc = 'local', '../../../data/PetFinder.my-Adoption-Prediction/'
print('kernel running environment:', envx)
print('data path:', dirc)

# Tabular competition files.
train = pd.read_csv(dirc+'train/train.csv')
test = pd.read_csv(dirc+'test/test.csv')
sample_sub = pd.read_csv(dirc+'test/sample_submission.csv')

print('train data shape:', train.shape)
print('test data shape:', test.shape)

# Stack train and test under one continuous 0..n-1 index so that features are
# engineered on both at once.
data = pd.concat([train, test], sort=False).reset_index(drop=True)
fullpetid = data.PetID
print('full data shape:', data.shape)

# Lookup tables for the categorical codes.
breed_labels = pd.read_csv(dirc+'breed_labels.csv')
color_labels = pd.read_csv(dirc+'color_labels.csv')
state_labels = pd.read_csv(dirc+'state_labels.csv')

print(dt.datetime.now(), 'finish reading data')

2019-03-13 18:10:01.687631 start reading data
kernel running environment: local
data path: ../../../data/PetFinder.my-Adoption-Prediction/
train data shape: (14993, 24)
test data shape: (3948, 23)
full data shape: (18941, 24)
2019-03-13 18:10:01.931484 finish reading data


In [6]:
# Print the first training rows with the column display limit raised to 200
# for this statement only, so wide frames are not elided.
with pd.option_context('display.max_columns', 200):
    print(train.head())

   Type         Name  Age  Breed1  Breed2  Gender  Color1  Color2  Color3  \
0     2       Nibble    3     299       0       1       1       7       0   
1     2  No Name Yet    1     265       0       1       1       2       0   
2     1       Brisco    1     307       0       1       2       7       0   
3     1         Miko    4     307       0       2       1       2       0   
4     1       Hunter    1     307       0       1       1       0       0   

   MaturitySize  FurLength  Vaccinated  Dewormed  Sterilized  Health  \
0             1          1           2         2           2       1   
1             2          2           3         3           3       1   
2             2          2           1         1           2       1   
3             2          1           1         1           2       1   
4             2          1           2         2           2       1   

   Quantity  Fee  State                         RescuerID  VideoAmt  \
0         1  100  41326  8480853f

## Start parsing data from sentiment and metadata

### Parse sentiment data

In [7]:
# Timestamp the start of the sentiment parsing stage.
print(dt.datetime.now(), 'start getting sentiment')
# Column names of the sentiment table, in the order get_sentiment_f returns
# its values (PetID first, then the fields produced by get_sentiment).
sentiment_cols = ['PetID', 'docSentiMag', 'docSentiScore', 'fullMag', 'fullScore', 'fullentities']

def get_sentiment(s):
    '''
    Extract features from a parsed sentiment json document 's'.

    Returns a list of: document-level magnitude, document-level score,
    the magnitude and the score summed over all sentences, and the entity
    names joined with spaces.
    '''
    doc_sentiment = s['documentSentiment']
    sentence_sentiments = [sentence['sentiment'] for sentence in s['sentences']]

    return [
        doc_sentiment['magnitude'],
        doc_sentiment['score'],
        getsum(sentence_sentiments, 'magnitude'),
        getsum(sentence_sentiments, 'score'),
        getjoin(s['entities'], 'name'),
    ]

def get_sentiment_f(myfile):
    '''
    Load the sentiment json file 'myfile' and forward it to get_sentiment.

    Returns a list whose first element is the file name without directory
    and extension (used as the PetID), followed by the values returned by
    get_sentiment.
    '''
    # The context manager closes the handle instead of leaking it.
    with open(myfile) as f:
        s = json.load(f)
    # os.path handles whichever path separator the platform's glob returns.
    pet_id = os.path.splitext(os.path.basename(myfile))[0]
    return [pet_id, *get_sentiment(s)]

def get_sentiment_zip(zipfilename):
    '''
    Parse every sentiment file matching a glob pattern into one dataframe.

    Despite the parameter name, 'zipfilename' is a glob pattern (for
    example '<dir>/*.json'), not a zip archive.

    Returns a dataframe with the columns listed in sentiment_cols, one row
    per matched file; the four numeric columns are float. A pattern that
    matches no file yields an empty dataframe with the same columns.
    '''
    # Build the frame straight from the row lists, which keeps numbers as
    # numbers instead of routing every value through a string array.
    rows = [get_sentiment_f(myfile) for myfile in glob.glob(zipfilename)]
    df_senti = pd.DataFrame(rows, columns = sentiment_cols)

    float_cols = ['docSentiMag', 'docSentiScore', 'fullMag', 'fullScore']
    return df_senti.astype({col: 'float' for col in float_cols})


# Parse the per-pet sentiment json files of the train and test sets.
train_senti = get_sentiment_zip(dirc+'train_sentiment/*.json')
test_senti = get_sentiment_zip(dirc+'test_sentiment/*.json')   

# Stack both tables; each part keeps its own row index (no reset here).
data_senti = pd.concat([train_senti, test_senti], sort=False)

'''
# find missing PetID
fullpetid_ct = Counter(fullpetid)
data_senti_ct = Counter(data_senti.PetID)
fullpetid_ct.subtract(data_senti_ct)
missing_petid = list(+fullpetid_ct)

# generate DataFrame associated with missied PetID 
temp = [[missing_petid[i], np.nan, np.nan, np.nan, np.nan, '<MISSING>'] 
        for i in range(len(missing_petid))]
df_missing_senti = pd.DataFrame(temp, columns=sentiment_cols)

data_senti = pd.concat([data_senti, df_missing_senti], axis=0, sort=False).\
    reset_index().drop('index', axis=1)

vec_col = 'fullentities'

data_senti_proc = text_vec(data_senti[vec_col], 5)
data_senti = pd.concat([data_senti, data_senti_proc], axis=1)
data_senti.drop(vec_col, axis=1, inplace=True)
print('missing PetID sentiment shape', data_senti_proc.shape)
'''

# Report the table sizes and timestamp the end of the sentiment stage.
print('train sentiment shape:', train_senti.shape)
print('test sentiment shape', test_senti.shape)
print('full sentiment shape', data_senti.shape)


print(dt.datetime.now(), 'finish getting sentiment')


2019-03-13 18:10:02.117308 start getting sentiment
train sentiment shape: (14442, 6)
test sentiment shape (3815, 6)
full sentiment shape (18257, 6)
2019-03-13 18:10:07.401904 finish getting sentiment


In [8]:
# Preview the first rows of the combined sentiment table.
data_senti.head()

Unnamed: 0,PetID,docSentiMag,docSentiScore,fullMag,fullScore,fullentities
0,e6ff63097,1.2,0.3,1.0,1.0,pup adoption breed golden retriever mix pup Ca...
1,14831f659,0.4,-0.4,0.4,-0.4,dog pity dog ss2 sea park area dog dog house i...
2,065906a54,1.7,0.2,1.5,1.5,owner care cat someone cat Malaysia rules rest...
3,03aae5768,1.1,0.5,1.0,1.0,Looks whippet' breed greyhound family
4,e4dd90768,2.9,0.5,2.7,2.7,puppy walker hiking trail Melawati cutie cooki...


### Parse metadata

In [9]:
# Timestamp the start of the image-metadata parsing stage.
print(dt.datetime.now(), 'start getting metadata')
# Column names of the metadata table, in the order parse_metadata_f returns
# its values (PetID first, then the fields produced by parse_metadata).
metadata_cols = ['PetID', 
                'labelAnnoScore', 'labelAnnoDesc', 
                'imagePropAnnoScore', 'imagePropAnnoPixelFrac', 
                'cropHintsAnnoConf', 'cropHintAnnoImport']

def parse_metadata(s):
    '''
    Extract features from a parsed image-metadata json document 's'.

    Returns a list of six values: mean score and joined descriptions of
    the leading label annotations, mean score and mean pixel fraction of
    the dominant colors, and mean confidence and mean importance fraction
    of the crop hints. Values belonging to an annotation group that is
    absent from 's' are NaN.
    '''
    # Defaults used when an annotation group is missing.
    labelAnnoScore = labelAnnoDesc = np.nan
    imagePropAnnoScore = imagePropAnnoPixelFrac = np.nan
    cropHintsAnnoConf = cropHintAnnoImport = np.nan

    if 'labelAnnotations' in s:
        labels = s['labelAnnotations']
        # Only the first int(0.3 * n) + 1 labels are used.
        top_labels = labels[:int(0.3*len(labels))+1]
        labelAnnoScore = getmean(top_labels, 'score')
        labelAnnoDesc = getjoin(top_labels, 'description')

    if 'imagePropertiesAnnotation' in s:
        dominant_colors = s['imagePropertiesAnnotation']['dominantColors']['colors']
        imagePropAnnoScore = getmean(dominant_colors, 'score')
        imagePropAnnoPixelFrac = getmean(dominant_colors, 'pixelFraction')

    if 'cropHintsAnnotation' in s:
        crop_hints = s['cropHintsAnnotation']['cropHints']
        cropHintsAnnoConf = getmean(crop_hints, 'confidence')
        cropHintAnnoImport = getmean(crop_hints, 'importanceFraction')

    return [labelAnnoScore, labelAnnoDesc,
            imagePropAnnoScore, imagePropAnnoPixelFrac,
            cropHintsAnnoConf, cropHintAnnoImport]

def parse_metadata_f(myfile):
    '''
    Load the image-metadata json file 'myfile' and forward it to
    parse_metadata.

    Returns a list whose first element is the PetID taken from the file
    name (the part of the base name before its last '-'; presumably the
    files are named '<PetID>-<image number>.json'), followed by the values
    returned by parse_metadata.
    '''
    # The context manager closes the handle instead of leaking it.
    with open(myfile) as f:
        s = json.load(f)
    # Work on the base name only, so that a '-' in a directory name or a
    # platform-specific path separator cannot affect the extracted id.
    stem = os.path.splitext(os.path.basename(myfile))[0]
    pet_id, sep, _ = stem.rpartition('-')
    if not sep:
        # No image suffix in the name: use the whole stem.
        pet_id = stem
    return [pet_id, *parse_metadata(s)]

def parse_metadata_zip(zipfilename):
    '''
    Parse every image-metadata file matching a glob pattern into one
    dataframe with the columns listed in metadata_cols (one row per file).

    Despite the parameter name, 'zipfilename' is a glob pattern, not a zip
    archive. The rows pass through a single numpy array (presumably a
    string array, since each row mixes text and numbers), which is why the
    numeric columns are cast back to float afterwards.
    '''
    rows = [parse_metadata_f(path) for path in glob.glob(zipfilename)]
    df_metadata = pd.DataFrame(np.asarray(rows), columns = metadata_cols)

    float_cols = ('labelAnnoScore', 'imagePropAnnoScore', 'imagePropAnnoPixelFrac',
                  'cropHintsAnnoConf', 'cropHintAnnoImport')
    for col in float_cols:
        df_metadata[col] = df_metadata[col].astype('float')
    return df_metadata

# Parse the per-image metadata json files of the train and test sets and
# stack them; a PetID can appear on several rows (one per matched file).
train_metad = parse_metadata_zip(dirc+'train_metadata/*.json')
test_metad = parse_metadata_zip(dirc+'test_metadata/*.json')
data_metad = pd.concat([train_metad, test_metad], sort=False)


# Report the table sizes and timestamp the end of the metadata stage.
print('train metadata shape:', train_metad.shape)
print('test metadata shape', test_metad.shape)
print('full metadata shape', data_metad.shape)  

print(dt.datetime.now(), 'finish getting metadata')

2019-03-13 18:10:07.613393 start getting metadata
train metadata shape: (58311, 7)
test metadata shape (15040, 7)
full metadata shape (73351, 7)
2019-03-13 18:10:34.118140 finish getting metadata


In [10]:
# Preview the first rows of the per-image metadata table.
data_metad.head()

Unnamed: 0,PetID,labelAnnoScore,labelAnnoDesc,imagePropAnnoScore,imagePropAnnoPixelFrac,cropHintsAnnoConf,cropHintAnnoImport
0,e736f4022,0.946534,floor flooring tile,0.097666,0.065789,0.8,1.0
1,f861fe441,0.874389,dog breed dog street dog dog breed group,0.082835,0.074638,0.8,1.0
2,6da1ee245,0.872424,cat small to medium sized cats cat like mammal...,0.090962,0.048281,0.8,1.0
3,8f32c880e,0.961112,dog dog like mammal dog breed,0.079665,0.052886,0.8,1.0
4,9ca31b395,0.917344,cat small to medium sized cats cat like mammal...,0.097187,0.091496,0.8,1.0


### Group PetID from image metadata

In [11]:
# Text part: for every PetID, join the distinct label descriptions of its
# images into a single space-separated string.
# (An earlier variant summed all object columns per PetID instead.)
data_metad_obj = data_metad.groupby('PetID').labelAnnoDesc.unique().apply(lambda x: ' '.join(x))

# The text is kept raw here (as a one-column frame indexed by PetID);
# vectorization at this point is disabled.
data_metad_obj_proc = data_metad_obj.to_frame()

# Numeric part: keep PetID plus every non-object column.
data_metad_num = data_metad.loc[:, (data_metad.dtypes != 'object') | (data_metad.columns == 'PetID')].copy()
# Fill missing values column by column with a default SimpleImputer (mean
# strategy per the scikit-learn defaults). The fill value is computed over
# all image rows, before any grouping by PetID.
for col in list(data_metad.columns[data_metad.dtypes != 'object']): 
    myimputer = SimpleImputer() 
    data_metad_num[col] = myimputer.fit_transform(data_metad_num[col].values.reshape(-1, 1))
# Aggregate the image rows of each pet into a mean and a sum per column.
data_metad_num = data_metad_num.groupby('PetID').agg(['mean', 'sum']).reset_index()

# Flatten the two-level column labels to '<column>_<aggregate>'; the PetID
# column comes out as 'PetID_' and is renamed back.
data_metad_num.columns = ["_".join(x) for x in data_metad_num.columns.ravel()]
data_metad_num.rename(columns={'PetID_': 'PetID'}, inplace=True)

# Combine numeric and text parts per PetID (a merge replaced the earlier
# positional concat); 'PetID' is a column on the left and the index name on
# the right.
data_metad1 = pd.merge(data_metad_num, data_metad_obj_proc, how='left', on='PetID')
print(data_metad1.shape)
data_metad1.head()

(18473, 12)


Unnamed: 0,PetID,labelAnnoScore_mean,labelAnnoScore_sum,imagePropAnnoScore_mean,imagePropAnnoScore_sum,imagePropAnnoPixelFrac_mean,imagePropAnnoPixelFrac_sum,cropHintsAnnoConf_mean,cropHintsAnnoConf_sum,cropHintAnnoImport_mean,cropHintAnnoImport_sum,labelAnnoDesc
0,0008c5398,0.916467,5.498802,0.071256,0.427536,0.050027,0.30016,0.8,4.8,1.0,6.0,cat small to medium sized cats cat like mammal...
1,000a290e4,0.930162,1.860325,0.080857,0.161713,0.057316,0.114633,0.8,1.6,1.0,2.0,dog dog breed dog like mammal dog breed group
2,000c21f80,0.910548,2.731643,0.070803,0.212409,0.054727,0.164181,0.8,2.4,1.0,3.0,cat small to medium sized cats cat like mammal...
3,000fb9572,0.92533,5.551982,0.089756,0.538534,0.064844,0.389065,0.8,4.8,1.0,6.0,dog like mammal dog breed dog dog breed group ...
4,0011d7c25,0.917093,2.75128,0.083866,0.251597,0.075841,0.227523,0.8,2.4,1.0,3.0,cat small to medium sized cats whiskers cat li...


## Join and Process data

### Count RescuerID occurrences: 
(The number of listings per rescuer turns out to be an important feature, although the reason is not yet understood.)

In [12]:
# Number of pets (train and test together) listed by each rescuer.
rescuer_count = (
    data.groupby('RescuerID')['PetID']
    .count()
    .rename('RescuerID_COUNT')
    .reset_index()
)

# Attach the count to every pet and discard the raw rescuer identifier.
data_resc = data.merge(rescuer_count, how='left', on='RescuerID').drop('RescuerID', axis=1)

#data_resc.head()

### Merge sentiment and metadata, vectorize text columns

In [13]:
# Left-join the sentiment and the aggregated image metadata onto the pets.
data_sm = (
    data_resc
    .merge(data_senti, how='left', on='PetID')
    .merge(data_metad1, how='left', on='PetID')
)

# Free-text columns: mark missing text explicitly, then replace each column
# by its SVD/NMF components.
str_cols = ['Description', 'fullentities', 'labelAnnoDesc']
data_sm[str_cols] = data_sm[str_cols].fillna('<MISSING>')
data_text_vec = text_vec_df(data_sm[str_cols], n_components=5)

data_sm = pd.concat([data_sm.drop(str_cols, axis=1), data_text_vec], axis=1)
# Name and PetID are identifiers and are not used as features.
data_sm = data_sm.drop(['Name', 'PetID'], axis=1)

print(data_sm.shape)

generating features from: Description
generating features from: fullentities
generating features from: labelAnnoDesc
(18941, 65)


### Impute missing numeric values? 

In [14]:
# Work on a copy so this cell can be re-run without touching data_sm.
data1 = data_sm.copy()

# Switch: fill NaNs in numeric feature columns with a default SimpleImputer.
impute_missing_values = False
print('impute missing values?', impute_missing_values)

if impute_missing_values:
    print(dt.datetime.now(), 'start imputing')

    # Impute missing values, because Metadata or Sentiment is not complete.
    # The target is excluded: its NaNs mark the test rows. errors='ignore'
    # keeps this working even if the target is not among the candidates.
    has_nan = data1.isna().sum() != 0
    is_numeric = data1.dtypes != 'object'
    data1_na_columns = list(
        data1.columns[has_nan & is_numeric].drop('AdoptionSpeed', errors='ignore')
    )
    print('impute numeric columns with nan:', data1_na_columns)
    for col in data1_na_columns:
        myimputer1 = SimpleImputer()
        # fit_transform returns an (n, 1) array; flatten it for the column.
        data1[col] = myimputer1.fit_transform(data1[col].values.reshape(-1, 1)).ravel()

    # The closing message used to read 'finish merging data', which did not
    # match the stage being timed.
    print(dt.datetime.now(), 'finish imputing')

# astype to int
data1['PhotoAmt'] = data1.PhotoAmt.astype('int')


impute missing values? False


In [15]:
# Preview the first rows of the feature table after the imputation step.
data1.head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,SVD_labelAnnoDesc_0,SVD_labelAnnoDesc_1,SVD_labelAnnoDesc_2,SVD_labelAnnoDesc_3,SVD_labelAnnoDesc_4,NMF_labelAnnoDesc_0,NMF_labelAnnoDesc_1,NMF_labelAnnoDesc_2,NMF_labelAnnoDesc_3,NMF_labelAnnoDesc_4
0,2,3,299,0,1,1,7,0,1,1,...,0.271465,0.730297,3.018277e-07,-0.043794,-0.1053,0.0,0.090134,0.0,0.000144,0.004518
1,2,1,265,0,1,1,2,0,2,2,...,0.370153,0.905112,4.227043e-07,-0.0015,0.092516,0.0,0.099664,0.0,0.000176,0.055519
2,1,1,307,0,1,2,7,0,2,2,...,0.938243,-0.34,9.423154e-08,-0.027664,-0.007926,0.098023,0.000766,0.0,0.0,0.0
3,1,4,307,0,2,1,2,0,2,1,...,0.934383,-0.35026,-2.939679e-09,-0.027566,0.012194,0.098066,0.0,0.0,0.0,0.0
4,1,1,307,0,1,1,0,0,2,1,...,0.465788,-0.142857,-7.014057e-07,-0.013312,-0.054549,0.047593,0.003383,0.0,0.000458,0.0


### Get dummies for categorical data

In [16]:
print(dt.datetime.now(), 'start processing data')

# All optional transformations below are applied to this copy, so data1
# stays as the previous cell left it and this cell can be re-run safely.
data2 = data1.copy()

# It seems like NOT getting dummies would give better results :(
# get_dummies
get_dummies_for_categorical = False
print('get dummies for categorica data?', get_dummies_for_categorical)

if get_dummies_for_categorical:
    col_dummied = ['Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'State']
    # Cast to object so the integer codes are treated as categories.
    data1_dummies = pd.get_dummies(data1[col_dummied].astype('object'))
    data2 = pd.concat([data1.drop(col_dummied, axis=1), data1_dummies], axis=1)

# 'Not Sure' in Vaccinated, Sterilized, or Dewormed -> 'No'
vac_ster_deworm_impute = False
print('change \'not sure\' in Vaccinated, Sterilized, Dewormed to No?',
      vac_ster_deworm_impute)
if vac_ster_deworm_impute:
    col_not_sure = ['Vaccinated', 'Sterilized', 'Dewormed']
    for col in col_not_sure:
        # Recode on data2: the train/test frames below are built from data2,
        # so the former write to data1 never reached the model.
        data2[col] = data2[col].replace(3, 2)

# Rows with a known target form the training set; the rest is the test set.
train1 = data2[data2.AdoptionSpeed.notna()]
test1 = data2[data2.AdoptionSpeed.isna()].drop('AdoptionSpeed', axis=1)

x = train1.drop('AdoptionSpeed', axis=1)
y = train1['AdoptionSpeed'].astype('int')

print(data2.shape)

print(dt.datetime.now(), 'finish processing data')

data2.head()

2019-03-13 18:10:50.858048 start processing data
get dummies for categorica data? False
change 'not sure' in Vaccinated, Sterilized, Dewormed to No? False
(18941, 65)
2019-03-13 18:10:50.878643 finish processing data


Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,...,SVD_labelAnnoDesc_0,SVD_labelAnnoDesc_1,SVD_labelAnnoDesc_2,SVD_labelAnnoDesc_3,SVD_labelAnnoDesc_4,NMF_labelAnnoDesc_0,NMF_labelAnnoDesc_1,NMF_labelAnnoDesc_2,NMF_labelAnnoDesc_3,NMF_labelAnnoDesc_4
0,2,3,299,0,1,1,7,0,1,1,...,0.271465,0.730297,3.018277e-07,-0.043794,-0.1053,0.0,0.090134,0.0,0.000144,0.004518
1,2,1,265,0,1,1,2,0,2,2,...,0.370153,0.905112,4.227043e-07,-0.0015,0.092516,0.0,0.099664,0.0,0.000176,0.055519
2,1,1,307,0,1,2,7,0,2,2,...,0.938243,-0.34,9.423154e-08,-0.027664,-0.007926,0.098023,0.000766,0.0,0.0,0.0
3,1,4,307,0,2,1,2,0,2,1,...,0.934383,-0.35026,-2.939679e-09,-0.027566,0.012194,0.098066,0.0,0.0,0.0,0.0
4,1,1,307,0,1,1,0,0,2,1,...,0.465788,-0.142857,-7.014057e-07,-0.013312,-0.054549,0.047593,0.003383,0.0,0.000458,0.0


### Training XGBoost or LightGBM
Ref: https://www.kaggle.com/wrosinski/baselinemodeling

In [17]:
# NOTE(review): StratifiedKFold is already imported in the first cell; this
# repeated import is redundant.
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold split. No shuffle or seed is passed, so the folds follow
# the scikit-learn defaults (presumably unshuffled, in row order - confirm).
n_splits = 5
kfold = StratifiedKFold(n_splits=n_splits)


# Out-of-fold predictions: one value per training row, and one column of
# test predictions per fold.
oof_train = np.zeros((x.shape[0]))
oof_test = np.zeros((test1.shape[0], n_splits))

# LightGBM settings: the target is fitted as a regression and cut into
# classes afterwards by OptimizedRounder.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
# effect when bagging_freq is non-zero; no bagging_freq is set here - confirm
# whether bagging was intended.
params = {'application': 'regression',
          'boosting': 'gbdt',
          'metric': 'rmse',
          'num_leaves': 70,
          'max_depth': 9,
          'learning_rate': 0.01,
          'bagging_fraction': 0.85,
          'feature_fraction': 0.8,
          'min_split_gain': 0.02,
          'min_child_samples': 150,
          'min_child_weight': 0.02,
          'lambda_l2': 0.0475,
          'verbosity': -1} #, 
          #'device_type': 'gpu'}

'''
model_params = {'n_jobs': -1, 'tree_method': 'gpu_hist', 'metric': 'rmse', 
                'num_leaves': 70, 'max_depth': 9, 'gamma': 0.004, 
                'learning_rate': 0.01, 'bagging_fraction': 0.85, 
                'min_split_gain': 0.02, 'min_child_samples': 10, 'min_child_weight': 0.02, 
                'reg_alpha': 0.003, 'reg_lambda': 0.0475,  'verbosity': 200}
'''


# Additional parameters passed to lgb.train in the cross-validation loop:
early_stop = 500      # early_stopping_rounds
verbose_eval = 200    # verbose_eval (logging interval)
num_rounds = 10000    # num_boost_round (upper bound on boosting rounds)


# One LightGBM model per fold: the validation part fills the matching slots
# of oof_train, and column i of oof_test receives that fold's test predictions.
for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    X_tr, y_tr = x.iloc[train_index, :], y.iloc[train_index]
    X_val, y_val = x.iloc[valid_index, :], y.iloc[valid_index]

    print('\ny_tr distribution: {}'.format(Counter(y_tr)))

    d_train = lgb.Dataset(X_tr, label=y_tr)
    d_valid = lgb.Dataset(X_val, label=y_val)
    watchlist = [d_train, d_valid]

    print('training LGB:')
    model = lgb.train(
        params,
        train_set=d_train,
        num_boost_round=num_rounds,
        valid_sets=watchlist,
        verbose_eval=verbose_eval,
        early_stopping_rounds=early_stop,
    )
    # Disabled alternative (XGBoost):
    # print('training XGB:')
    # model = XGBRegressor(n_estimators=num_rounds, **params, tree_method='gpu_hist')
    # model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
    #       verbose=verbose_eval, eval_metric='rmse', early_stopping_rounds=early_stop)

    val_pred = model.predict(X_val)
    test_pred = model.predict(test1)

    oof_train[valid_index] = val_pred
    oof_test[:, i] = test_pred
    
# Compute QWK based on OOF train predictions:
# fit the class thresholds on the out-of-fold predictions, then bucket those
# same predictions and score them against the true labels.
optR = OptimizedRounder()
optR.fit(oof_train, y)
coefficients = optR.coefficients()
# Despite its name, pred_test_y_k holds the bucketed out-of-fold TRAIN
# predictions (it is computed from oof_train).
pred_test_y_k = optR.predict(oof_train, coefficients)
print("\nValid Counts = ", Counter(y))
print("Predicted Counts = ", Counter(pred_test_y_k))
print("Coefficients = ", coefficients)
qwk = quadratic_weighted_kappa(y, pred_test_y_k)
print("QWK = ", qwk)


y_tr distribution: Counter({4: 3357, 2: 3229, 3: 2607, 1: 2472, 0: 328})
training LGB:
Training until validation scores don't improve for 500 rounds.
[200]	training's rmse: 1.00763	valid_1's rmse: 1.07084
[400]	training's rmse: 0.951017	valid_1's rmse: 1.05154
[600]	training's rmse: 0.913425	valid_1's rmse: 1.04468
[800]	training's rmse: 0.885805	valid_1's rmse: 1.04123
[1000]	training's rmse: 0.860528	valid_1's rmse: 1.0395
[1200]	training's rmse: 0.836763	valid_1's rmse: 1.03814
[1400]	training's rmse: 0.816436	valid_1's rmse: 1.03727
[1600]	training's rmse: 0.795229	valid_1's rmse: 1.03683
[1800]	training's rmse: 0.777283	valid_1's rmse: 1.03665
[2000]	training's rmse: 0.760743	valid_1's rmse: 1.03657
[2200]	training's rmse: 0.74554	valid_1's rmse: 1.03622
[2400]	training's rmse: 0.726105	valid_1's rmse: 1.03626
[2600]	training's rmse: 0.712124	valid_1's rmse: 1.03616
[2800]	training's rmse: 0.698418	valid_1's rmse: 1.0365
[3000]	training's rmse: 0.68507	valid_1's rmse: 1.03665
Ear

### Submission 

In [18]:
# Average the per-fold test predictions, bucket them with the thresholds
# fitted on the out-of-fold predictions, and store them as integer classes.
test_pred_mean = oof_test.mean(axis=1)
sample_sub['AdoptionSpeed'] = optR.predict(test_pred_mean, coefficients).astype('int')

sample_sub.to_csv('submission.csv', index=False)
sample_sub.head()

Unnamed: 0,PetID,AdoptionSpeed
0,378fcc4fc,1
1,73c10e136,4
2,72000c4c5,4
3,e147a4b9f,3
4,43fbba852,4
