## 1. Importar las Librerías Necesarias

In [None]:
# Importación de Librerías
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import os
import pandas as pd 
import seaborn as sns 
from PIL import Image
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split, KFold
from tensorflow import keras

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Let pandas show up to 30 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 30)
# Default size used for every matplotlib figure created in the notebook.
plt.rcParams['figure.figsize'] = [12.0, 8.0]

from sklearn.metrics import cohen_kappa_score
def kappa(y_true, y_pred):
    """Return Cohen's kappa with quadratic weights for the given true and predicted labels."""
    score = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return score

## 2. Leer los Datos
Al menos los datos Tabulares de la base de "train"

In [None]:
# Read the tabular train/test files and tag every row with the split it came from,
# so the two can still be told apart inside the combined `all_data` frame.
train = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv').assign(dataset_type='train')
test = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv').assign(dataset_type='test')

# Row-wise stack of both splits (original row labels are kept, so they repeat).
all_data = pd.concat([train, test])

In [None]:
# Preview the first rows of the raw training table.
train.head()

In [None]:
# Share of each AdoptionSpeed class over the whole training set, ordered by class label.
# Used as the baseline (and as the default `main_count` argument) by the plotting helpers below.
main_count = train['AdoptionSpeed'].value_counts(normalize=True).sort_index()
def prepare_plot_dict(df, col, main_count):
    """Map bar heights to the relative deviation (in %) from the overall class share.

    For every distinct value of ``df[col]`` the AdoptionSpeed counts are compared
    with the baseline shares in ``main_count``.

    Parameters
    ----------
    df : DataFrame holding ``col`` and an ``'AdoptionSpeed'`` column.
    col : name of the grouping column.
    main_count : mapping/Series of AdoptionSpeed class -> overall share.

    Returns
    -------
    dict
        Keyed by the raw count of a (group, class) pair; the value is
        ``(share within group / overall share) * 100 - 100``. The key ``0``
        (value ``0``) is added whenever a class is absent from a group.

    Notes
    -----
    Because the key is the raw count, two pairs with the same count share a
    single entry and the pair processed last wins.
    """
    baseline = dict(main_count)
    deviations = {}
    for level in df[col].unique():
        level_speeds = df.loc[df[col] == level, 'AdoptionSpeed']
        counts = dict(level_speeds.value_counts().sort_index())
        total = sum(counts.values())

        for speed, overall_share in baseline.items():
            if speed not in counts:
                deviations[0] = 0
                continue
            n = counts[speed]
            deviations[n] = ((n / total) / overall_share) * 100 - 100

    return deviations

def make_count_plot(df, x, hue='AdoptionSpeed', title='', main_count=main_count):
    """Draw a count plot of ``x`` split by ``hue`` and annotate each bar.

    Each bar is labelled with how much its share deviates (in %) from the
    overall AdoptionSpeed share, as computed by ``prepare_plot_dict``.

    Parameters
    ----------
    df : DataFrame to plot.
    x : column shown on the x axis (also the grouping column for the labels).
    hue : column used to split the bars.
    title : text appended to ``'AdoptionSpeed '`` in the plot title.
    main_count : baseline class shares passed on to ``prepare_plot_dict``.

    Notes
    -----
    Labels are looked up by bar height, so bars with identical counts get the
    same label. Patches whose height has no entry in the lookup are left
    unlabelled instead of raising ``KeyError``.
    """
    g = sns.countplot(x=x, data=df, hue=hue)
    plt.title(f'AdoptionSpeed {title}')
    ax = g.axes

    plot_dict = prepare_plot_dict(df, x, main_count)

    for p in ax.patches:
        h = p.get_height()
        if np.isnan(h):
            # Missing bars are treated as zero-height bars.
            h = 0
        pct = plot_dict.get(h)
        if pct is None:
            # No (group, class) pair produced this height, so there is nothing to annotate.
            continue
        text = f"{pct:.0f}%" if pct < 0 else f"+{pct:.0f}%"
        ax.annotate(text, (p.get_x() + p.get_width() / 2., h),
             ha='center', va='center', fontsize=11, color='green' if pct > 0 else 'red', rotation=0, xytext=(0, 10),
             textcoords='offset points')

In [None]:
# Data analysis: number of pets in each AdoptionSpeed class (horizontal bars).
speed_counts = train['AdoptionSpeed'].value_counts().sort_index(ascending=False)
speed_counts.plot(kind='barh', color='teal');
plt.title('Adoption speed classes counts');

In [None]:
# Replace the numeric Type code with a readable label: 1 -> 'Dog', anything else -> 'Cat'.
# Values that are already 'Dog' are kept, so re-running this cell no longer
# turns every row into 'Cat'.
train['Type'] = train['Type'].apply(lambda x: 'Dog' if x in (1, 'Dog') else 'Cat')
sns.countplot(x='AdoptionSpeed', data=train, hue='Type');
plt.title('Number of cats and dogs by AdoptionSpeed');

In [None]:
# Left panel: adoption speed by gender with deviation labels.
# Right panel: gender counts in the train and test splits.
fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(18, 6))

# make_count_plot draws on the current axes, so select the left panel first.
plt.sca(left_ax)
make_count_plot(df=train, x='Gender', title='and gender')

sns.countplot(x='dataset_type', data=all_data, hue='Gender', ax=right_ax);
right_ax.set_title('Number of pets by gender in train and test data');

In [None]:
# Lookup table of breed labels; the BreedID and BreedName columns are used below.
breeds = pd.read_csv('../input/petfinder-adoption-prediction/breed_labels.csv')

In [None]:
# BreedID -> BreedName lookup.
breeds_dict = dict(zip(breeds['BreedID'], breeds['BreedName']))

In [None]:
def _breed_name(breed_id, default):
    """Return the breed name for ``breed_id`` with whitespace replaced by '_'.

    ``default`` is returned when the id is not present in ``breeds_dict``.
    """
    if breed_id in breeds_dict:
        return '_'.join(breeds_dict[breed_id].split())
    return default


# Add readable breed names to all three frames with one shared rule. Previously
# train['Breed2_name'] skipped the .split() and joined the individual characters.
for frame in (train, test, all_data):
    frame['Breed1_name'] = frame['Breed1'].apply(lambda b: _breed_name(b, 'Unknown'))
    frame['Breed2_name'] = frame['Breed2'].apply(lambda b: _breed_name(b, '-'))

In [None]:
# Pet ids that have at least one picture (file names look like '<PetID>-<n>.jpg').
images = [i.split('-')[0] for i in os.listdir('../input/petfinder-adoption-prediction/train_images/')]
size_dict = {1: 'Small', 2: 'Medium', 3: 'Large', 4: 'Extra Large'}
# all_data['Type'] still holds the numeric code; map it for the printed heading.
type_dict = {1: 'Dog', 2: 'Cat'}

# For every (type, maturity size) pair show one picture for each of the 5 most common breeds.
for t in all_data['Type'].unique():
    type_name = type_dict.get(t, t)
    for m in all_data['MaturitySize'].unique():
        df = all_data.loc[(all_data['Type'] == t) & (all_data['MaturitySize'] == m)]
        top_breeds = list(df['Breed1_name'].value_counts().index)[:5]
        # Fall back to the raw code for sizes missing from size_dict instead of raising KeyError.
        size_name = size_dict.get(m, m)
        print(f"Most common Breeds of {size_name} {type_name}s:")

        fig = plt.figure(figsize=(25, 4))

        for i, breed in enumerate(top_breeds):
            # excluding pets without pictures
            b_df = df.loc[(df['Breed1_name'] == breed) & (df['PetID'].isin(images)), 'PetID']
            if b_df.empty:
                # No pet of this breed has a picture: leave the slot empty.
                continue
            # Prefer the second matching pet when there is one, as before.
            pet_id = b_df.values[1] if len(b_df) > 1 else b_df.values[0]
            ax = fig.add_subplot(1, 5, i+1, xticks=[], yticks=[])

            # Close the file handle as soon as the pixels have been handed to matplotlib.
            with Image.open("../input/petfinder-adoption-prediction/train_images/" + pet_id + '-1.jpg') as im:
                ax.imshow(im)
            ax.set_title(f'Breed: {breed}')
        plt.show();

In [None]:
# 2x2 grid: adoption speed against each health-related column.
plt.figure(figsize=(20, 12));

health_panels = [
    ('Vaccinated', ['Yes', 'No', 'Not sure']),
    ('Dewormed', ['Yes', 'No', 'Not sure']),
    ('Sterilized', ['Yes', 'No', 'Not sure']),
    ('Health', ['Healthy', 'Minor Injury', 'Serious Injury']),
]

for position, (column, tick_labels) in enumerate(health_panels, start=1):
    plt.subplot(2, 2, position)
    make_count_plot(df=train, x=column, title=column)
    plt.xticks([0, 1, 2], tick_labels);
    # Overrides the title set inside make_count_plot.
    plt.title(f'AdoptionSpeed and {column}');

plt.suptitle('Adoption Speed and health conditions');

In [None]:
# Preview the training table again, now including the columns added above.
train.head()

In [None]:
def plot_two_graphs(col='', main_title='', dataset_title=''):
    """Plot ``col`` twice in the top row of a 2x2 grid.

    Left: adoption speed by ``col`` on ``train`` (via ``make_count_plot``),
    titled with ``main_title``. Right: counts of ``col`` per ``dataset_type``
    on ``all_data``, titled ``dataset_title``.
    """
    plt.figure(figsize=(20, 12))

    plt.subplot(2, 2, 1)
    speed_title = f'and {main_title}'
    make_count_plot(df=train, x=col, title=speed_title)

    plt.subplot(2, 2, 2)
    sns.countplot(data=all_data, x='dataset_type', hue=col)
    plt.title(dataset_title)

In [None]:
# Flag pets with a zero adoption fee as 'Free', everything else as 'Not Free'.
for frame in (train, test, all_data):
    frame['Free'] = frame['Fee'].apply(lambda fee: 'Free' if fee == 0 else 'Not Free')

In [None]:
# Adoption speed for free vs. non-free pets, and the Free split across train/test.
plot_two_graphs(col='Free', main_title='Free', dataset_title='Number of pets by Free in train and test data')

## 3. Pre-procesar Nulos
Verificar la existencia de nulos y decidir cómo imputarlos en caso de que existan.

Verificar la existencia de ceros u otros valores que puedan indicar datos perdidos.

In [None]:
# Count missing values per column.
train.isna().sum()
# The nulls are in Name and Description, which are columns that will be dropped.

In [None]:
# Convert every object-typed column to the pandas 'category' dtype.
object_columns = list(train.select_dtypes("O"))
train = train.astype({column: "category" for column in object_columns})

## 4. Convertir o eliminar las Columnas Categóricas

Por ejemplo, la Descripción habría que sacarla para un análisis independiente

In [None]:
# Use PetID as the row index.
# NOTE(review): this cell is not re-runnable as written — once PetID has moved
# into the index, a second run cannot find the column.
train = train.set_index('PetID')

In [None]:
# Drop the free-text columns.
train = train.drop(columns=['Name', 'Description'])

train.head()

# TODO: for rescuers with more than X records, compute the mean AdoptionSpeed and
# then build an ordinal variable to replace RescuerID.


## 5. Normalizar o Estandarizar las variables Numéricas (para los modelos que lo requieran)

Revisar si existen valores extremos y considerarlos para los modelos que afecte

In [None]:
#No es necesario para nuestro modelo LightGBM

## 6. Separar la base de Test (10%) y Train (90%)
Pueden ser otros porcentajes que les parezcan mejor

In [None]:
def metric(y_true, y_pred):
    """Quadratic-weighted kappa computed from a flat array of per-class scores.

    ``y_pred`` is reshaped column-major (``order="F"``) into
    ``(len(y_true), 5)`` and the arg-max of every row is taken as the
    predicted class.

    Returns the tuple ``("kappa", value, True)`` — presumably the
    (name, value, is_higher_better) form used for custom LightGBM eval
    metrics; confirm against the caller.
    """
    n_rows = y_true.shape[0]
    class_scores = y_pred.reshape((n_rows, 5), order="F")
    predicted = class_scores.argmax(axis=1)
    res = cohen_kappa_score(y_true, predicted, weights='quadratic')
    return "kappa", res, True

In [None]:
# Features: every non-object column except the target.
features = train.select_dtypes(exclude=['object']).drop("AdoptionSpeed", axis=1)

# NOTE(review): no test_size is passed, so scikit-learn's default split fraction
# applies rather than the 10% / 90% named in the section heading — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    features, train.AdoptionSpeed, random_state=1
)

In [None]:
# Preview the training features.
X_train.head()

In [None]:
# Preview the hold-out features.
X_test.head()

### 7. Para la parte de Train, armar un esquema de Cross Validation

Usar 10 Folds

In [None]:
# 10 shuffled folds; `folds` stores the index labels of each validation part.
kf = KFold(n_splits=10, random_state=1, shuffle=True)
folds = []
for _, valid_positions in kf.split(X_train):
    folds.append(X_train.index[valid_positions])

In [None]:
# Cross-validation over the pre-computed `folds`.
# For every fold: train on the remaining rows, score the held-out rows with
# quadratic-weighted kappa, and add this model's hold-out probabilities to a running mean.
validation_scores = []
# Start from zeros so the per-fold contributions can be accumulated.
test_probs = pd.DataFrame(0.0, index=X_test.index, columns=range(y_train.max() + 1))
for idx in folds:
    # Held-out part of this fold.
    Xv = X_train.loc[idx]
    yv = y_train.loc[idx]

    # Everything else is used for fitting.
    Xt = X_train.drop(idx)
    yt = y_train.drop(idx)

    model = LGBMClassifier()
    model.fit(Xt, yt)

    # One kappa value per fold (previously the raw probability matrix was stored here).
    validation_score = kappa(yv, model.predict(Xv))
    validation_scores.append(validation_score)

    # Label the columns with the classes the model actually saw, so a class
    # missing from one fold cannot shift the columns.
    fold_test_probs = pd.DataFrame(
        model.predict_proba(X_test), index=X_test.index, columns=model.classes_
    )
    test_probs = test_probs.add(fold_test_probs / len(folds), fill_value=0)

In [None]:
# NOTE(review): this averages `validation_score` (singular), i.e. the value left over
# from the last cross-validation fold, not the `validation_scores` list — confirm intent.
validation_score1=np.average(validation_score)
# Fit a fresh classifier on the full training split.
model= LGBMClassifier()
model.fit(X_train, y_train)
# NOTE(review): despite its name, `test_score` holds the predicted labels for X_test.
test_score=model.predict(X_test)

In [None]:
# Score of the final model on the hold-out split.
accuracy1= model.score(X_test, y_test)

In [None]:
# Display the hold-out score.
accuracy1

In [None]:
# NOTE(review): `Xv`/`yv` are the last fold's rows of X_train, and `model` was just fitted
# on all of X_train, so this scores the model on data it was trained on.
accuracy2= model.score(Xv, yv)

In [None]:
# Display the score computed on the last validation fold.
accuracy2

In [None]:
#resultados[f"fold_{i+1}"] = cohen_kappa_score( Xt,yt, weights= 'quadratic')

In [None]:
# Wrap both results in DataFrames.
validation_scores = pd.DataFrame(validation_scores)
test_probs= pd.DataFrame(test_probs)

# Stack them row-wise under a fresh integer index.
# NOTE(review): the two frames hold different kinds of values (per-fold validation output
# vs. per-row test probabilities) — confirm that combining them in one table is intended.
validation_scores = pd.concat([validation_scores,test_probs], ignore_index=True)

In [None]:
# Display the combined table.
validation_scores

In [None]:
# Display the test-probability table.
test_probs

In [None]:
# Manual k-fold validation on the training split, followed by a final fit.
# The previous version of this cell was an unfilled template (undefined `data` /
# `test_data`, `...` placeholders, and an `evaluate` method the classifier does not have).
k = 3
num_validation_samples = len(X_train) // k

# Shuffle row positions once with a fixed seed instead of shuffling the DataFrame
# in place; the frames themselves are left untouched and the cell stays re-runnable.
shuffled_positions = np.random.RandomState(1).permutation(len(X_train))

validation_scores = []
for fold in range(k):
    start = num_validation_samples * fold
    stop = num_validation_samples * (fold + 1)

    # Rows [start, stop) of the shuffled order are held out; the rest is used for fitting.
    valid_pos = shuffled_positions[start:stop]
    train_pos = np.concatenate([shuffled_positions[:start], shuffled_positions[stop:]])

    model = LGBMClassifier()
    model.fit(X_train.iloc[train_pos], y_train.iloc[train_pos])

    validation_score = kappa(y_train.iloc[valid_pos], model.predict(X_train.iloc[valid_pos]))
    validation_scores.append(validation_score)

# Mean quadratic-weighted kappa over the k folds.
validation_score = np.average(validation_scores)

# Final model: fit on the full training split, evaluate once on the hold-out split.
model = LGBMClassifier()
model.fit(X_train, y_train)
test_score = kappa(y_test, model.predict(X_test))

In [None]:
# Out-of-fold class probabilities for the training rows, plus the fold-averaged
# class probabilities for the hold-out rows.
valid_probs = []
# Zero-initialised accumulator: an empty (all-NaN) frame would stay NaN after every addition.
test_probs = pd.DataFrame(0.0, index=X_test.index, columns=range(y_train.max() + 1))
for idx in folds:
    # Held-out part of this fold.
    Xv = X_train.loc[idx]
    yv = y_train.loc[idx]

    # Remaining rows are used for fitting.
    Xt = X_train.drop(idx)
    yt = y_train.drop(idx)

    model = LGBMClassifier()
    model.fit(Xt, yt)

    # predict_proba is 2-D (rows x classes), so it needs a DataFrame indexed by
    # the row labels of Xv rather than a Series indexed by the frame itself.
    vp = pd.DataFrame(model.predict_proba(Xv), index=Xv.index, columns=model.classes_)
    valid_probs.append(vp)

    fold_test_probs = pd.DataFrame(
        model.predict_proba(X_test), index=X_test.index, columns=model.classes_
    )
    # Divide by the real number of folds so the accumulated values form a mean.
    test_probs = test_probs.add(fold_test_probs / len(folds), fill_value=0)
valid_probs = pd.concat(valid_probs)

## 8. Entrenar al menos un Modelo que prefieran y optimizar al menos un Hiperparámetro