In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from tqdm import tqdm, tqdm_notebook
import xgboost as xgb
import os
import lightgbm as lgb
from catboost import CatBoostClassifier
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error, roc_auc_score, cohen_kappa_score
from sklearn.linear_model import LogisticRegressionCV
import time
import gc
from gensim.models import Word2Vec
%matplotlib inline
import embedder

In [4]:
# Load the training table and derive the engineered columns in a single chain.
# `assign` evaluates its keyword arguments in order and hands each callable the
# frame built so far, so later columns can rely on earlier ones (e.g. `No_name`
# sees the already-filled `Name`, `desc_length` sees the filled `Description`).
A = (
    pd.read_csv(r"~/Deep-Learning-Project-Template/data/train.csv", dtype={'PetID': str})
    .assign(
        # Decode the numeric species code: 1 -> Dog, anything else -> Cat.
        Type=lambda d: d['Type'].eq(1).map({True: 'Dog', False: 'Cat'}),
        # Missing names become the sentinel 'Unnamed', flagged in `No_name`.
        Name=lambda d: d['Name'].fillna('Unnamed'),
        No_name=lambda d: d['Name'].eq('Unnamed').astype('int64'),
        # Age buckets; intervals are right-closed, so the first is (-1, 3].
        binned_age=lambda d: pd.cut(d['Age'], bins=[-1, 3, 6, 12, 24, 36, 10000]),
        # TODO: a "meaningless name" flag (names of <= 2 characters, maybe 3)
        # was sketched here but never enabled. The draft compared
        # len(A['Name']), which is the length of the whole column rather than
        # of each name; a working version needs a per-row length.
        # Flag rows with no second breed recorded (Breed2 == 0).
        Pure_breed=lambda d: d['Breed2'].eq(0).astype('int64'),
        # Composite key "<Vaccinated>_<Dewormed>_<Sterilized>_<Health>".
        health=lambda d: d['Vaccinated'].astype(str).str.cat(
            [
                d['Dewormed'].astype(str),
                d['Sterilized'].astype(str),
                d['Health'].astype(str),
            ],
            sep='_',
        ),
        # 1 when the adoption fee is exactly zero, else 0.
        Free=lambda d: d['Fee'].eq(0).astype('int64'),
        # Description statistics; a missing description counts as empty text.
        Description=lambda d: d['Description'].fillna(''),
        desc_length=lambda d: d['Description'].str.len(),
        desc_words=lambda d: d['Description'].str.split().str.len(),
        # Empty descriptions give 0/0; any non-finite ratio is replaced by 0.
        averate_word_length=lambda d: (d['desc_length'] / d['desc_words']).where(np.isfinite, 0),
    )
    # Drop the 'Unnamed: 0' column that comes in with the CSV.
    .drop(columns=['Unnamed: 0'])
)

In [5]:
# Preview the first rows of the frame to eyeball the engineered columns.
A.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,BinaryLabel,set,No_name,binned_age,Pure_breed,health,Free,desc_length,desc_words,averate_word_length
0,Cat,Nibble,3,299,0,1,1,7,0,1,...,1,0.0,0,"(-1, 3]",1,2_2_2_1,0,359,69,5.202899
1,Cat,No Name Yet,1,265,0,1,1,2,0,2,...,1,0.0,0,"(-1, 3]",1,3_3_3_1,1,118,23,5.130435
2,Dog,Brisco,1,307,0,1,2,7,0,2,...,1,0.0,0,"(-1, 3]",1,1_1_2_1,1,393,69,5.695652
3,Dog,Miko,4,307,0,2,1,2,0,2,...,1,1.0,0,"(3, 6]",1,1_1_2_1,0,146,25,5.84
4,Dog,Hunter,1,307,0,1,1,0,0,2,...,1,1.0,0,"(-1, 3]",1,2_2_2_1,1,390,81,4.814815


In [4]:
def add_col_name_to_val(df, rel_cols):
    """Prefix every value of the selected columns with its column name.

    Each column named in ``rel_cols`` is converted to strings and rewritten
    as ``"<column>_<value>"``, which keeps equal raw values from different
    columns distinct.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to rewrite. It is modified in place.
    rel_cols : iterable of str
        Names of the columns to rewrite; every name must exist in ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for col_name in rel_cols:
        prefix = col_name + "_"
        as_text = df[col_name].astype(str)
        df[col_name] = prefix + as_text
    return df

In [6]:
# Columns treated as categorical; this variant includes the engineered
# 'binned_age' bucket.
real_cat_cols = [
    'Type',
    'Breed1', 'Breed2',
    'Gender',
    'Color1', 'Color2', 'Color3',
    'MaturitySize', 'FurLength',
    'Vaccinated', 'Dewormed',
    'binned_age',
    'Sterilized', 'Health',
    'State',
]

In [None]:
from category_encoders import OrdinalEncoder

# Target vector.
y = A['BinaryLabel']

# Free-text, identifier and composite columns that are not fed to the model.
cols_to_drop = ['Description', 'Name', 'PetID', 'RescuerID', 'health']

# Feature matrix: everything except the columns above, both label columns
# and the age bucket.
X = A.drop(columns=[*cols_to_drop, 'BinaryLabel', 'AdoptionSpeed', 'binned_age'])

# Categorical feature names, without 'binned_age' since it was dropped from X.
real_cat_cols = [
    'Type',
    'Breed1', 'Breed2',
    'Gender',
    'Color1', 'Color2', 'Color3',
    'MaturitySize', 'FurLength',
    'Vaccinated', 'Dewormed',
    'Sterilized', 'Health',
    'State',
]

# Ordinal encoding is currently disabled:
# enc = OrdinalEncoder(cols=real_cat_cols).fit(X, y)
# X = enc.transform(X)

In [None]:
# (column, number of distinct values) for every categorical feature, in the
# column order of X.
cat_sz = [
    (col, len(X[col].unique()))
    for col in X.columns
    if col in real_cat_cols
]

# cat_vars = embedder.preprocessing.categorize(A)
# Choose an embedding width per categorical column, capped at 50.
embedding_dict = embedder.preprocessing.pick_emb_dim(cat_sz, max_dim=50)
X_encoded, encoders = embedder.preprocessing.encode_categorical(X)

import keras

# Stop training once 'val_loss' has not improved for 10 epochs.
# NOTE(review): with epochs=15 below this can only trigger near the end.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    mode='min',
    verbose=1,
)

embedder_instance = embedder.classification.Embedder(embedding_dict, model_json=None)
embedder_instance_res = embedder_instance.fit(
    X_encoded,
    y,
    epochs=15,
    early_stop=early_stop,
)

In [None]:
# Run the encoded frame through the object returned by fit() and report the
# shape and value range of the result, one item per line.
b = embedder_instance_res.transform(X_encoded)
print(b.shape, b.min(), b.max(), sep="\n")

In [None]:
# Disabled alternative: fetch the embedding matrices and look at one column.
# embedding_mats = embedder_instance.get_embeddings()
# embedding_mats['Breed1']
# Show the summary of the model held by the fitted embedder
# (presumably a Keras model, given the keras callback passed to fit — confirm).
embedder_instance_res.model.summary()