## 1. Importar las Librerías Necesarias

In [None]:
#!pip install textblob

In [None]:
# Importación de Librerías
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import os
import pandas as pd 
import seaborn as sns 
from PIL import Image
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split, KFold
from tensorflow import keras

import nltk
import re
from nltk.corpus import stopwords
from textblob import TextBlob

from sklearn.metrics import cohen_kappa_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 30)
plt.rcParams['figure.figsize'] = [12.0, 8.0]

from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import ParameterGrid


## 2. Leer los Datos
Al menos los datos Tabulares de la base de "train"

In [None]:
# Load the tabular train and test splits.
train = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv')
test = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')

# Tag every row with its origin so the combined frame can be split back later.
train['dataset_type'] = 'train'
test['dataset_type'] = 'test'
# ignore_index=True gives the combined frame a unique RangeIndex; without it the
# row labels of `test` repeat those of `train`.
all_data = pd.concat([train, test], ignore_index=True)

In [None]:
train.shape

In [None]:
test.shape

In [None]:
all_data.shape

## **EDA**

In [None]:
main_count = train['AdoptionSpeed'].value_counts(normalize=True).sort_index()
def prepare_plot_dict(df, col, main_count):
    """Map bar heights to the relative deviation from the overall class share.

    For every distinct value of ``df[col]`` the AdoptionSpeed counts inside that
    group are compared with ``main_count`` (overall share per AdoptionSpeed class).
    The returned dict is keyed by the *count* itself (the bar height) and holds
    the percentage by which the group share is above or below the overall share.
    A class missing from a group registers the entry ``{0: 0}``.

    Note: because keys are counts, two bars with the same height share one entry
    (the last one computed wins).
    """
    baseline = dict(main_count)
    deviations = {}
    for level in df[col].unique():
        speed_counts = dict(
            df.loc[df[col] == level, 'AdoptionSpeed'].value_counts().sort_index()
        )
        level_total = sum(speed_counts.values())

        for speed in baseline:
            if speed not in speed_counts:
                deviations[0] = 0
                continue
            count = speed_counts[speed]
            deviations[count] = ((count / level_total) / baseline[speed]) * 100 - 100

    return deviations

def make_count_plot(df, x, hue='AdoptionSpeed', title='', main_count=main_count):
    """Draw a count plot of ``x`` split by ``hue`` and annotate every bar.

    Each bar is labelled with the percentage returned by ``prepare_plot_dict``
    for its height: green with a leading "+" when positive, red otherwise.

    Parameters
    ----------
    df : DataFrame containing ``x``, ``hue`` and 'AdoptionSpeed'.
    x : column plotted on the x axis.
    hue : column used to split the bars.
    title : text appended to the 'AdoptionSpeed ' plot title.
    main_count : overall share per AdoptionSpeed class used as the baseline.
    """
    g = sns.countplot(x=x, data=df, hue=hue)
    plt.title(f'AdoptionSpeed {title}')
    ax = g.axes

    plot_dict = prepare_plot_dict(df, x, main_count)

    for p in ax.patches:
        h = p.get_height()
        # Missing category/hue combinations are drawn with a NaN height.
        if np.isnan(h):
            h = 0
        # Skip patches whose height has no entry instead of raising KeyError.
        if h not in plot_dict:
            continue
        change = plot_dict[h]
        text = f"{change:.0f}%" if change < 0 else f"+{change:.0f}%"
        ax.annotate(
            text,
            (p.get_x() + p.get_width() / 2., h),
            ha='center', va='center', fontsize=11,
            color='green' if change > 0 else 'red',
            rotation=0, xytext=(0, 10),
            textcoords='offset points',
        )

In [None]:
#Analisis de Datos
train['AdoptionSpeed'].value_counts().sort_index(ascending = False).plot(kind='barh', color='teal');
plt.title('Adoption speed classes counts');

#La clase mas frecuente es 4 (No adoption), seguida por 2 (between 8 and 30 days), 3 (between 31 and 90 days) y 1 (between 1 and 7 days). 
#Solo una pequenia cantidad resultan adoptados en el mismo dia (0)

In [None]:
# Replace the numeric Type codes with readable labels for plotting.
# replace() is idempotent: re-running the cell keeps 'Dog'/'Cat' as they are,
# whereas the previous apply() turned every already-labelled row into 'Cat'.
# Values other than 1 and 2 are now left untouched instead of becoming 'Cat'.
train['Type'] = train['Type'].replace({1: 'Dog', 2: 'Cat'})
sns.countplot(x='AdoptionSpeed', data=train, hue='Type');
plt.title('Number of cats and dogs by AdoptionSpeed');

#Los gatos son adoptados mas rapido que los perros. 

In [None]:
plt.figure(figsize=(18, 6));
plt.subplot(1, 2, 1)
make_count_plot(df=train, x='Gender', title='and gender')

plt.subplot(1, 2, 2)
sns.countplot(x='dataset_type', data=all_data, hue='Gender');
plt.title('Number of pets by gender in train and test data');

#Los machos son adoptados mas rapido (gender 1) que las hembras (2) y que los mixtos (naturalmente, considerando que son grupos de mascotas). 
#Las hembras son mas frecuentes que los machos. 

In [None]:
breeds = pd.read_csv('../input/petfinder-adoption-prediction/breed_labels.csv')

In [None]:
breeds_dict = {k: v for k, v in zip(breeds['BreedID'], breeds['BreedName'])}

In [None]:
def _breed_label(breed_id, default):
    """Return the breed name with whitespace replaced by '_', or `default` for unknown ids."""
    if breed_id in breeds_dict:
        return '_'.join(breeds_dict[breed_id].split())
    return default


# Same mapping for the three frames. The previous train/Breed2 line was missing
# .split(), so it put '_' between every character instead of between words.
for frame in (train, test, all_data):
    frame['Breed1_name'] = frame['Breed1'].apply(_breed_label, args=('Unknown',))
    frame['Breed2_name'] = frame['Breed2'].apply(_breed_label, args=('-',))

In [None]:
# Pet ids that have at least one picture in the train image folder
# (file names look like "<PetID>-<n>.jpg").
images = [i.split('-')[0] for i in os.listdir('../input/petfinder-adoption-prediction/train_images/')]
pets_with_images = set(images)  # set membership is cheaper for isin()
size_dict = {1: 'Small', 2: 'Medium', 3: 'Large', 4: 'Extra Large'}
type_dict = {1: 'Dog', 2: 'Cat'}  # same coding as the Type relabelling used for the plots

for t in all_data['Type'].unique():
    for m in all_data['MaturitySize'].unique():
        df = all_data.loc[(all_data['Type'] == t) & (all_data['MaturitySize'] == m)]
        top_breeds = list(df['Breed1_name'].value_counts().index)[:5]
        if not top_breeds:
            # No pets for this type/size combination: nothing to show.
            continue

        # .get() keeps the loop alive for codes that are not in the lookup tables.
        size_name = size_dict.get(m, f'size {m}')
        type_name = type_dict.get(t, t)
        print(f"Most common Breeds of {size_name} {type_name}s:")

        fig = plt.figure(figsize=(25, 4))

        for i, breed in enumerate(top_breeds):
            # excluding pets without pictures
            b_df = df.loc[(df['Breed1_name'] == breed) & (df['PetID'].isin(pets_with_images)), 'PetID']
            if b_df.empty:
                # No pet of this breed has a train image (e.g. test-only pets).
                continue
            pet_id = b_df.values[1] if len(b_df) > 1 else b_df.values[0]
            ax = fig.add_subplot(1, 5, i+1, xticks=[], yticks=[])

            # Close the file handle once the pixels have been handed to matplotlib.
            with Image.open("../input/petfinder-adoption-prediction/train_images/" + pet_id + '-1.jpg') as im:
                ax.imshow(im)
            ax.set_title(f'Breed: {breed}')
        plt.show();

In [None]:
# One panel per health-related feature, all sharing the same layout.
plt.figure(figsize=(20, 12));
health_panels = [
    ('Vaccinated', ['Yes', 'No', 'Not sure']),
    ('Dewormed', ['Yes', 'No', 'Not sure']),
    ('Sterilized', ['Yes', 'No', 'Not sure']),
    ('Health', ['Healthy', 'Minor Injury', 'Serious Injury']),
]
for position, (feature, tick_labels) in enumerate(health_panels, start=1):
    plt.subplot(2, 2, position)
    make_count_plot(df=train, x=feature, title=feature)
    plt.xticks([0, 1, 2], tick_labels);
    # Overrides the title set inside make_count_plot.
    plt.title(f'AdoptionSpeed and {feature}');

plt.suptitle('Adoption Speed and health conditions');

#Los animales no vacunados son adoptados mas rapido que los vacunados
#los que no fueron desparasitados mas rapido que los desparasitados
#los no esterilizados mas rapido que los esterilizados


In [None]:
def plot_four_graphs(col='', main_title='', dataset_title=''):
    """Plot `col` against AdoptionSpeed and its distribution in train vs. test.

    Uses a 2x2 grid but only fills the top row:
      * top-left:  annotated count plot of `col` by AdoptionSpeed on `train`;
      * top-right: counts of `col` per dataset_type on `all_data`.
    The bottom row (per-species breakdown) is currently not drawn.
    """
    plt.figure(figsize=(20, 12))

    # Left panel: annotated adoption-speed counts (drawn on the current axes).
    plt.subplot(2, 2, 1)
    make_count_plot(df=train, x=col, title=f'and {main_title}')

    # Right panel: train/test comparison.
    right_ax = plt.subplot(2, 2, 2)
    sns.countplot(x='dataset_type', data=all_data, hue=col, ax=right_ax)
    right_ax.set_title(dataset_title)

In [None]:
# Label every listing as 'Free' (Fee == 0) or 'Not Free' in all three frames.
for frame in (train, test, all_data):
    frame['Free'] = np.where(frame['Fee'] == 0, 'Free', 'Not Free')

In [None]:
plot_four_graphs(col='Free', main_title='Free', dataset_title='Number of pets by Free in train and test data')
# Los animales gratuitos son adoptados ligeramente mas rapido que los que no son gratis. 

## 3. Pre-procesar Nulos
Verificar la existencia de Nulos y decidir como Imputarlos en caso de que existan

Verificar la existencia de Ceros u otros valores que puedan indicar que pueden ser perdidos

In [None]:
train.isna().sum()

#Los nulos en Name son indicativos que la mascota aun no fue nombrada. Consideramos que puede ser un predictor asi que lo transformamos en una columna mas.
# Mas adelante la eliminaremos



# Los nulos en description no son preocupantes: la transformaremos mediante sentiment analysis y eso no requiere tratamiento de nulos.

In [None]:
# HasName is 1 when the listing has a name and 0 when Name is missing.
# The previous version flagged the *missing* names with 1 (inverted relative to
# the column name) and relied on an in-place replace through attribute access,
# which does not reliably write back to the frame.
all_data['HasName'] = all_data['Name'].notnull().astype(int)

In [None]:
all_data.shape

In [None]:
# Los 0 en la variable Age es probable que indiquen edad perdida, pero podemos dejarlo asi para indicarle al arbol que ese es un valor distintivo (ya que puede resultar
# indicativo de otra cosa). Ademas, no sabemos si no se trata de recien nacidos. 

#Los 0 en Breed 1 deben ser valores perdidos. Con el mismo criterio lo dejamos en blanco. 


(train == 0).astype(int).sum(axis=0)


## 4. Convertir o eliminar las Columnas Categóricas

Por ejemplo, la Descripción habría que sacarla para un análisis independiente

### Target Encoding: Breed

In [None]:
# Convert AdoptionSpeed to days so it can be averaged: each class is mapped to the
# midpoint of the interval it represents, and class 4 (no adoption) is fixed at 100.
ASDays = pd.DataFrame({
    'AdoptionSpeed': [0, 1, 2, 3, 4],
    'Days': [0.5, 3.5, 19, 60, 100],
})

# Left join on the shared column(s); rows without a label get NaN in Days.
all_data = all_data.merge(ASDays, how='left')

In [None]:
# Target-encode the main breed, the Breed1/Breed2 combination and the mixed-breed flag.
# This cell handles the main breed: mean Days per Breed1, computed on train rows only.
train = all_data[all_data['dataset_type'] == "train"]

Breed = train.groupby('Breed1')['Days'].mean().to_frame(name='Breed')

Breed.head()

In [None]:
all_data= pd.merge(all_data, Breed, left_on='Breed1',right_on='Breed1', how='left')

all_data.head()

In [None]:
# Mezcla is 1 when the pet has a secondary breed (mixed breed) and 0 otherwise.
# The previous version flagged rows where Breed2 was null, which is the opposite
# of "mixed" and ignores the 0 code.
# NOTE(review): assumes Breed2 == 0 (or missing) means "no secondary breed" —
# confirm against the data dictionary.
all_data['Mezcla'] = all_data['Breed2'].fillna(0).ne(0).astype(int)


In [None]:
all_data['Breeds'] = all_data['Breed1'].astype(str) + ";" + all_data['Breed2'].astype(str)

In [None]:
# Mean Days per Breed1;Breed2 combination, computed on train rows only.
train = all_data[all_data['dataset_type'] == "train"]

Breeds = train.groupby('Breeds')['Days'].mean().to_frame(name='BreedsSpeed')

In [None]:
all_data= pd.merge(all_data, Breeds, left_on='Breeds',right_on='Breeds', how='left')


In [None]:
all_data.shape

### One Hot Encoding: Color

In [None]:
colores= all_data.loc[: , all_data.columns.isin(['Color1','Color2','Color3']) ]

In [None]:
colores = colores.astype({"Color1": str, "Color2": str, "Color3": str })


In [None]:
dummies = pd.get_dummies(colores)

In [None]:
all_data = pd.concat([all_data, dummies], axis=1)

### One Hot Encoding: State

In [None]:
# One-hot encode State: cast the codes to str so get_dummies treats them as
# categories, then append the State_<code> columns to all_data.
estados = all_data[['State']].astype(str)
dummiestate = pd.get_dummies(estados)
all_data = pd.concat([all_data, dummiestate], axis=1)

In [None]:
all_data.head()

### One Hot Encoding: Gender

In [None]:
# One-hot encode Gender: cast the codes to str so get_dummies treats them as
# categories, then append the Gender_<code> columns to all_data.
genero = all_data[['Gender']].astype(str)
dummiegender = pd.get_dummies(genero)
all_data = pd.concat([all_data, dummiegender], axis=1)

### Transformamos Description en variables de Sentiment Analysis

In [None]:
def text_cleaning(text, forbidden_words=None):
    """Normalise a free-text description for sentiment analysis.

    Steps: lowercase, drop URL-like tokens, replace every non-letter with a
    space, collapse whitespace and remove stop words.

    Parameters
    ----------
    text : str
        Raw description.
    forbidden_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        Remaining lowercase words joined by single spaces.
    """
    if forbidden_words is None:
        forbidden_words = set(stopwords.words('english'))

    text = text.lower()
    # URLs must be removed *before* punctuation is stripped: once '.' and '/'
    # are gone the patterns below can no longer match (this was the case before,
    # which left fragments such as "www example com" in the text).
    text = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', text)
    text = re.sub(r'http\S+', '', text)
    # Everything that is not a letter (digits, punctuation, @, #, quotes,
    # slashes, backslashes) becomes a space; split() then collapses the runs.
    text = re.sub(r'[^a-z]', ' ', text)
    return ' '.join(word for word in text.split() if word not in forbidden_words)

In [None]:
all_data['Description'] = all_data['Description'].apply(lambda text: text_cleaning(str(text)))

In [None]:
def sent(txt):
    """Return the TextBlob polarity of `txt` (coerced to str)."""
    blob = TextBlob(str(txt))
    return blob.polarity

# New column with the polarity of every cleaned description.
all_data['sentiment'] = all_data['Description'].apply(sent)

In [None]:
def subj(txt):
    """Return the TextBlob subjectivity of `txt` (coerced to str)."""
    blob = TextBlob(str(txt))
    return blob.subjectivity

# New column with the subjectivity of every cleaned description.
all_data['subjectivity'] = all_data['Description'].apply(subj)

all_data.head()

In [None]:
# Drop the raw columns that have been encoded above, plus the helper 'Days'.
all_data = all_data.drop(['Name','Description','RescuerID','Gender','State','Color1','Color2','Color3','Breed1','Breed2','Breeds','Days'], axis = 1)
# Also drop the text columns created only for the EDA plots: they are object
# dtype and would otherwise reach the final-model fits and the ratio features,
# which expect numeric input. errors='ignore' keeps the cell usable when the
# EDA cells were not run.
all_data = all_data.drop(columns=['Breed1_name', 'Breed2_name', 'Free'], errors='ignore')

In [None]:
all_data.columns

## 5. Normalizar o Estandarizar las variables Numericas (para los modelos que sean necesarios)

Revisar si existen valores extremos y considerarlos para los modelos que afecte

In [None]:
#No es necesario para nuestro modelo LightGBM

## 6. Separa la base de Test (10%) y Train (90%)
Pueden ser otros porcentajes que les parezcan mejor

In [None]:
# Breed and BreedsSpeed are target encodings computed on the whole training set,
# so using them inside cross-validation would leak the target; they are excluded here.
# NOTE(review): the original comment also mentioned a "RescuerSpeed" column, but no
# such column is created in the visible cells — confirm whether it was removed.



train=all_data.loc[all_data.dataset_type == "train"  , all_data.columns.isin(['Breed','BreedsSpeed'])==False ]


# Hold out 20% of the labelled rows; object-dtype columns are dropped from the features.
X_train, X_test, y_train, y_test = train_test_split(
    train.select_dtypes(exclude=['object']).drop("AdoptionSpeed", axis=1), 
    train.AdoptionSpeed, random_state=1, 
    test_size=0.2, train_size = 0.8
)

In [None]:
train.shape

In [None]:
X_train.head()

In [None]:
X_test.head()

### 7. Para la parte de Train, armar un esquema de Cross Validation

Usar 10 Folds

In [None]:
def metric(y_true, y_pred):
    """Quadratic-weighted Cohen's kappa, shaped as a custom eval metric.

    `y_pred` is rearranged column-major into (n_samples, 5) class scores, the
    highest-scoring class is taken per row and compared against `y_true`.

    Returns the tuple ("kappa", score, True); the final True is presumably the
    "higher is better" flag expected by LightGBM — confirm against its docs.
    """
    n_rows = y_true.shape[0]
    class_scores = y_pred.reshape((n_rows, 5), order="F")
    predicted = class_scores.argmax(axis=1)
    kappa = cohen_kappa_score(y_true, predicted, weights= 'quadratic')
    return "kappa", kappa, True

In [None]:
kf = KFold(n_splits=10, random_state=1, shuffle=True)
folds = [X_train.index[idx] for _, idx in kf.split(X_train)]

In [None]:
# 10-fold cross-validation of a default LGBMClassifier on X_train.
# Collects the out-of-fold predictions and true labels, the kappa of every fold,
# and the sum of the per-fold class probabilities on the hold-out X_test.
train_preds = []
train_true = []
test_preds = pd.DataFrame(np.zeros((X_test.shape[0], 5)), index=X_test.index, columns=range(5))
resultados = {}
# Single empty parameter set: the loop only mirrors the grid-search cell.
for param in [{}]:
    for i, (ti, vi) in enumerate(kf.split(X_train)):
        Xt, Xv = X_train.iloc[ti], X_train.iloc[vi]
        yt, yv = y_train.iloc[ti], y_train.iloc[vi]

        # metric="custom" together with eval_metric=metric makes the kappa
        # function the evaluation metric used for early stopping.
        model = LGBMClassifier(**param, n_estimators=1000, metric="custom")
        model.fit(Xt, yt, eval_set=[(Xt, yt), (Xv, yv)],
                  early_stopping_rounds=50, eval_metric=metric, verbose=50)
        preds = pd.Series(model.predict(Xv), index=Xv.index)
        train_preds.append(preds)
        train_true.append(yv)
        resultados[f"fold_{i+1}"] = cohen_kappa_score(yv, preds, weights= 'quadratic')
        # Probabilities are summed (not averaged); the later idxmax over columns
        # is unaffected by the missing division.
        test_preds = test_preds + pd.DataFrame(model.predict_proba(X_test), index=X_test.index, columns=range(5))
# train_preds = pd.concat(train_preds)

In [None]:
train_preds = pd.concat(train_preds)
train_true = pd.concat(train_true)
resultados["Train"] = cohen_kappa_score(train_true, train_preds, weights= 'quadratic')
resultados["Test"] = cohen_kappa_score(y_test, test_preds.idxmax(axis=1), weights= 'quadratic')
resultados

In [None]:

pd.crosstab(train_preds, train_true)

## 8. Entrenar al menos un Modelo que prefieran y optimizar al menos un Hiperparámetro

### Modelo 1 

In [None]:

train= all_data.loc[all_data.dataset_type == "train"  , :]
X_train = all_data.loc[all_data.dataset_type == "train"  ,  all_data.columns.isin(['AdoptionSpeed','dataset_type','PetID'])==False ]


In [None]:
X_train.shape

In [None]:
model = LGBMClassifier(n_estimators=1000, metric="custom")
model.fit(X_train, train.AdoptionSpeed , verbose=50)


In [None]:
test= all_data.loc[all_data.dataset_type == "test"  , :]
X_test = all_data.loc[all_data.dataset_type == "test"  ,  all_data.columns.isin(['AdoptionSpeed','dataset_type','PetID'])==False ]
y_test = all_data.loc[all_data.dataset_type == "test"  ,  all_data.columns=='PetID' ]

In [None]:
preds = pd.Series(model.predict(X_test), index=X_test.index, name='pred')



In [None]:
prediccion = pd.DataFrame()

In [None]:
prediccion['PetID'] = y_test.PetID
prediccion['pred'] = preds

prediccion.set_index('PetID',inplace = True)

In [None]:
prediccion.head()

In [None]:
prediccion.to_csv('prediccion1.csv')

### Modelo 2. Ajusto un hiperparametro con loop.

In [None]:
train=all_data.loc[all_data.dataset_type == "train"  , all_data.columns.isin(['Breed','BreedsSpeed'])==False ]


X_train, X_test, y_train, y_test = train_test_split(
    train.select_dtypes(exclude=['object']).drop("AdoptionSpeed", axis=1), 
    train.AdoptionSpeed, random_state=1, 
    test_size=0.2, train_size = 0.8
)

In [None]:
kf = KFold(n_splits=10, random_state=1, shuffle=True)
folds = [X_train.index[idx] for _, idx in kf.split(X_train)]

In [None]:
# Grid search over num_leaves: 10-fold cross-validation for every candidate value.
# The kappa of each (fold, parameter set) pair is stored in `resultados` under a
# key of the form "fold_<k>_<param dict>".
# NOTE(review): train_preds, train_true and test_preds keep accumulating across
# all parameter sets, so they mix the results of every candidate.
train_preds = []
train_true = []
test_preds = pd.DataFrame(np.zeros((X_test.shape[0], 5)), index=X_test.index, columns=range(5))
resultados = {}
for param in ParameterGrid({'num_leaves': [10,20,50,100,200,350] }):
    for i, (ti, vi) in enumerate(kf.split(X_train)):
        Xt, Xv = X_train.iloc[ti], X_train.iloc[vi]
        yt, yv = y_train.iloc[ti], y_train.iloc[vi]

        # metric="custom" together with eval_metric=metric makes the kappa
        # function the evaluation metric used for early stopping.
        model = LGBMClassifier(**param, n_estimators=1000, metric="custom")
        model.fit(Xt, yt, eval_set=[(Xt, yt), (Xv, yv)],
                  early_stopping_rounds=50, eval_metric=metric, verbose=50)
        preds = pd.Series(model.predict(Xv), index=Xv.index)
        train_preds.append(preds)
        train_true.append(yv)
        resultados[f"fold_{i+1}_{param}"] = cohen_kappa_score(yv, preds, weights= 'quadratic')
        test_preds = test_preds + pd.DataFrame(model.predict_proba(X_test), index=X_test.index, columns=range(5))
# train_preds = pd.concat(train_preds)

In [None]:
# One row per (fold, parameter set); keys look like "fold_1_{'num_leaves': 10}".
results=pd.DataFrame ( data= {'fold': list(resultados.keys()), 'score': list(resultados.values())})

# Splitting on "_" leaves the text after "num_" in position 3 (e.g. "leaves': 10}"),
# a string label that is unique per num_leaves value.
results['num_leaves'] = results['fold'].str.split('_').str[3]


# Mean validation kappa per num_leaves label.
leaves=results.groupby('num_leaves')['score'].mean()
leaves

In [None]:
# Pick the num_leaves label with the highest mean kappa. idxmax() returns the
# index label of the best score; index.max() only returned the lexicographically
# largest label, regardless of the scores.
best_label = str(leaves.idxmax())
# Keep only the digits of the label (e.g. "leaves': 50}" -> "50").
bestnumleave = "".join(filter(str.isdigit, best_label))

In [None]:
train= all_data.loc[all_data.dataset_type == "train"  , :]
X_train = all_data.loc[all_data.dataset_type == "train"  ,  all_data.columns.isin(['AdoptionSpeed','dataset_type','PetID'])==False ]

model = LGBMClassifier(num_leaves = int(bestnumleave), n_estimators=1000, metric="custom")
model.fit(X_train, train.AdoptionSpeed , verbose=50)



In [None]:
# Predict the unlabelled rows with the tuned model and write the submission file.
is_test = all_data.dataset_type == "test"
test = all_data.loc[is_test, :]
X_test = all_data.loc[is_test, ~all_data.columns.isin(['AdoptionSpeed','dataset_type','PetID'])]

y_test = all_data.loc[is_test, all_data.columns == 'PetID']
preds = pd.Series(model.predict(X_test), index=X_test.index, name='pred')

# PetID becomes the index so the CSV has the columns PetID, pred.
prediccion = pd.DataFrame({'PetID': y_test.PetID, 'pred': preds}).set_index('PetID')
prediccion.to_csv('prediccion2.csv')

### Modelo 3.  Data Augmentation

#### Data Augmentation

In [None]:
all_data.columns

In [None]:

#all_data1= all_data.drop(['PetID','dataset_type','HasName','Mezcla','Color1_1', 'Color1_2', 'Color1_3', 'Color1_4',
 #      'Color1_5', 'Color1_6', 'Color1_7', 'Color2_0', 'Color2_2', 'Color2_3',
  #     'Color2_4', 'Color2_5', 'Color2_6', 'Color2_7', 'Color3_0', 'Color3_3',
   #    'Color3_4', 'Color3_5', 'Color3_6', 'Color3_7', 'State_41324',
    #   'State_41325', 'State_41326', 'State_41327', 'State_41330',
     #  'State_41332', 'State_41335', 'State_41336', 'State_41342',
      # 'State_41345', 'State_41361', 'State_41367', 'State_41401',
       #'State_41415', 'State_41380' , 'Gender_1', 'Gender_2', 'Gender_3','Vaccinated', 'Dewormed',
       #'Sterilized', 'Health','Breed','BreedsSpeed','Type'], axis=1)
#all_data1.columns

In [None]:
all_data1= all_data.drop(['PetID','dataset_type'], axis=1)

In [None]:
from itertools import combinations

# Candidate predictors: everything except the target, restricted to non-object
# columns (strings cannot be divided) and cast to float so that integer and
# boolean dummy columns divide consistently.
columns = (
    all_data1.loc[:, all_data1.columns != 'AdoptionSpeed']
    .select_dtypes(exclude=['object'])
    .astype(float)
)

# One ratio per unordered pair of columns, named "<first>/<second>" and holding
# second / first. Division by zero yields inf/NaN.
# The ratios are collected in a dict and the frame is built once; inserting
# thousands of columns one by one is slow and fragments the DataFrame.
ratios = {
    "/".join((first, second)): columns[second] / columns[first]
    for first, second in combinations(columns.columns, 2)
}
data2 = pd.DataFrame(ratios, index=all_data1.index)

In [None]:
all_data=pd.concat([all_data, data2], axis=1)


In [None]:
all_data=all_data.replace([np.inf, -np.inf,np.nan], 0)


In [None]:
all_data.head()

In [None]:

# Cross-validate a default LGBMClassifier on the augmented feature set.
# The target-encoded columns Breed and BreedsSpeed are excluded by name.
# NOTE(review): ratio columns derived from Breed/BreedsSpeed (names containing
# "Breed/..." or ".../BreedsSpeed") are not excluded by this filter — confirm
# whether that is intended.
train=all_data.loc[all_data.dataset_type == "train"  , all_data.columns.isin(['Breed','BreedsSpeed'])==False ]


# Hold out 20% of the labelled rows; object-dtype columns are dropped from the features.
X_train, X_test, y_train, y_test = train_test_split(
    train.select_dtypes(exclude=['object']).drop("AdoptionSpeed", axis=1), 
    train.AdoptionSpeed, random_state=1, 
    test_size=0.2, train_size = 0.8
)

kf = KFold(n_splits=10, random_state=1, shuffle=True)
# Index labels of the validation part of every fold (not used below in this cell).
folds = [X_train.index[idx] for _, idx in kf.split(X_train)]

train_preds = []
train_true = []
test_preds = pd.DataFrame(np.zeros((X_test.shape[0], 5)), index=X_test.index, columns=range(5))
resultados = {}
for param in [{}]:
    for i, (ti, vi) in enumerate(kf.split(X_train)):
        Xt, Xv = X_train.iloc[ti], X_train.iloc[vi]
        yt, yv = y_train.iloc[ti], y_train.iloc[vi]

        model = LGBMClassifier(**param, n_estimators=1000, metric="custom")
        model.fit(Xt, yt, eval_set=[(Xt, yt), (Xv, yv)],
                  early_stopping_rounds=50, eval_metric=metric, verbose=50)
        preds = pd.Series(model.predict(Xv), index=Xv.index)
        train_preds.append(preds)
        train_true.append(yv)
        resultados[f"fold_{i+1}"] = cohen_kappa_score(yv, preds, weights= 'quadratic')
        # Sum of per-fold class probabilities on the hold-out set.
        test_preds = test_preds + pd.DataFrame(model.predict_proba(X_test), index=X_test.index, columns=range(5))
# train_preds = pd.concat(train_preds)

# "Train" is the kappa over all out-of-fold predictions; "Test" is the kappa of
# the class with the largest summed probability on the hold-out set.
train_preds = pd.concat(train_preds)
train_true = pd.concat(train_true)
resultados["Train"] = cohen_kappa_score(train_true, train_preds, weights= 'quadratic')
resultados["Test"] = cohen_kappa_score(y_test, test_preds.idxmax(axis=1), weights= 'quadratic')
resultados

In [None]:
# Fit the final model on every labelled row of the augmented data and write the
# predictions for the unlabelled rows.
train= all_data.loc[all_data.dataset_type == "train"  , :]
X_train = all_data.loc[all_data.dataset_type == "train"  ,  all_data.columns.isin(['AdoptionSpeed','dataset_type','PetID'])==False ]

# bestnumleave is the string of digits produced by the num_leaves search cell,
# so that cell must have been run first.
model = LGBMClassifier(num_leaves = int(bestnumleave), n_estimators=1000, metric="custom")
model.fit(X_train, train.AdoptionSpeed , verbose=50)
test= all_data.loc[all_data.dataset_type == "test"  , :]
X_test = all_data.loc[all_data.dataset_type == "test"  ,  all_data.columns.isin(['AdoptionSpeed','dataset_type','PetID'])==False ]

# Here y_test holds the PetID column of the unlabelled rows, not a target.
y_test = all_data.loc[all_data.dataset_type == "test"  ,  all_data.columns=='PetID' ]
preds = pd.Series(model.predict(X_test), index=X_test.index, name='pred')
prediccion = pd.DataFrame()
prediccion['PetID'] = y_test.PetID
prediccion['pred'] = preds

# PetID becomes the index so the CSV has the columns PetID, pred.
prediccion.set_index('PetID',inplace = True)
prediccion.to_csv('prediccion3.csv')

### Modelo 4.  RandomizedSearchCV