## 1. Importar las Librerías Necesarias

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
from wordcloud import WordCloud
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from xgboost import XGBClassifier
from sklearn.metrics import cohen_kappa_score
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline


%matplotlib inline
# Show up to 30 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 30)
# Default (width, height) used for every matplotlib figure.
plt.rcParams['figure.figsize'] = [12.0, 8.0]

In [None]:
#para sentiment analisis
import glob
import json
from pandas.io.json import json_normalize


## 2. Leer los Datos
Al menos los datos Tabulares de la base de "train"

In [None]:
# Load the tabular training data.
train = pd.read_csv(
    filepath_or_buffer='../input/petfinder-adoption-prediction/train/train.csv',
)

In [None]:
# Read the sentiment-analysis JSON files of the training set and collect, per
# file, the document score, the magnitude and their product.
import os

train_sentiment_files = sorted(glob.glob('../input/petfinder-adoption-prediction/train_sentiment/*.json'))
print('num of train sentiment files: {}'.format(len(train_sentiment_files)))
sentiment_rows = []

for path in train_sentiment_files:
    # The context manager closes the file; JSON is read explicitly as UTF-8.
    with open(path, encoding='utf-8') as json_file:
        sentiment_doc = json.load(json_file)
    score = sentiment_doc['documentSentiment']['score']
    magnitude = sentiment_doc['documentSentiment']['magnitude']
    # The file name without directory and extension is used as the PetID.
    # os.path handles both "/" and "\\" separators, unlike a manual rfind("/").
    pet_id = os.path.splitext(os.path.basename(path))[0]
    sentiment_rows.append([pet_id, score, magnitude, score * magnitude])

data_sentiment = pd.DataFrame(sentiment_rows, columns=['PetID', 'Score', 'Magnitude', 'Sentiment'])
data_sentiment.head(5)


## 3. Pre-procesar Nulos
Verificar la existencia de Nulos y decidir como Imputarlos en caso de que existan

Verificar la existencia de Ceros u otros valores que puedan indicar que pueden ser perdidos

In [None]:
# Count the missing values of every column.
# Only Name and Description appear to contain nulls, so "name is missing" and
# "description is missing" are added as extra input features for the model.
train.isnull().sum(axis=0)

In [None]:
# Flag columns are 1 when the source value is missing and 0 otherwise.
# NOTE: despite the "Tiene_..." ("has ...") names, 1 marks an ABSENT value.
for columna_flag, columna_origen in (
    ('Tiene_nombre?', 'Name'),
    ('Tiene_descripcion?', 'Description'),
):
    train[columna_flag] = np.where(train[columna_origen].isnull(), 1, 0)

In [None]:
# Print the column summary of the training frame after adding the two missing-value flags.
train.info()

In [None]:
# Descriptive statistics of the training frame, used to look for extreme values.
train.describe()

In [None]:
###outliers###

#Age
#no vamos a hacer ningún tratamiento de esto, dado que para LightGBM no es necesario

###variables a codificar###

#MaturitySize
#Vaccinated
#Dewormed
#Sterilized
#Health

###variables nuevas###

#Vaccinated & Dewormed
#Vaccinated & Sterilized
#Sterilized & Dewormed
#Sterilized & Dewormed & Vaccinated
#Has photo?
#Has video?
#Has photo & video?

In [None]:
### Variables to encode ###
# Fit the one-hot encoder once on the categorical columns of the training set.
# A single fit is enough: the encoded matrix itself is built afterwards with
# encoder.transform, so no separate fit_transform call is needed here.
encoder = OneHotEncoder().fit(train[["MaturitySize", "Vaccinated", "Dewormed", "Sterilized", "Health"]])


In [None]:
# Build a dense DataFrame with the one-hot columns, named "<column>_<category>".
columnas_origen = ["MaturitySize", "Vaccinated", "Dewormed", "Sterilized", "Health"]

nombres_codificados = []
for columna, categorias in zip(columnas_origen, encoder.categories_):
    for categoria in categorias:
        nombres_codificados.append(f"{columna}_{categoria}")

# transform() returns a sparse matrix; toarray() makes it dense for pandas.
matriz_codificada = encoder.transform(train[columnas_origen]).toarray()
codificadas = pd.DataFrame(matriz_codificada, columns=nombres_codificados)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Append the one-hot encoded columns to the training frame (column-wise).
train = pd.concat(
    objs=[train, codificadas],
    axis="columns",
)

In [None]:
# Print the column summary again, now including the one-hot encoded columns.
train.info()

In [None]:
#### New variables ###
# Pairwise combinations (Vaccinated&Dewormed, Vaccinated&Sterilized,
# Sterilized&Dewormed) and the individual Hasphoto?/Hasvideo? flags were tried
# in earlier iterations and are currently disabled.

# 1 when the pet is vaccinated, dewormed and sterilized at the same time
# (each of the three columns equals 1).
vacunado = train['Vaccinated'] == 1
desparasitado = train['Dewormed'] == 1
esterilizado = train['Sterilized'] == 1
train['Sterilized&Dewormed&Vaccinated'] = np.where(vacunado & desparasitado & esterilizado, 1, 0)

# 1 when the listing has at least one video and at least one photo.
tiene_video = train['VideoAmt'] > 0
tiene_foto = train['PhotoAmt'] > 0
train['Hasvideo&photo?'] = np.where(tiene_video & tiene_foto, 1, 0)

In [None]:
# Replace each RescuerID with the number of listings published by that rescuer.
rescuer_id = dict(train['RescuerID'].value_counts())

# Map only the RescuerID column. DataFrame.replace with this dictionary would
# scan every column of the frame against thousands of keys (slow, and it could
# touch any other cell that happens to equal a rescuer id), and it relies on
# pandas down-casting the column to a numeric dtype afterwards.
train['RescuerID'] = train['RescuerID'].map(rescuer_id)

# The column now holds a quantity rather than an identifier.
train.rename(columns={'RescuerID': 'RescuerQ'}, inplace=True)


In [None]:
# Attach the sentiment features; the left join keeps every training row, so
# pets without a sentiment file get NaN in the sentiment columns.
train = pd.merge(
    left=train,
    right=data_sentiment,
    how='left',
    on='PetID',
)

In [None]:
# List all column names after feature engineering and the sentiment merge.
train.columns

## 4. Convertir o eliminar las Columnas Categóricas

Por ejemplo, la Descripción habría que sacarla para un análisis independiente

In [None]:
# Columns removed because they are irrelevant or cannot be processed by LightGBM.
# NOTE(review): "Sterilized" is one-hot encoded as well but is not dropped here
# (the test-set cell does the same) - confirm this is intentional.
eliminar = [
    "Name",
    "Description",
    "PetID",
    "Breed2",
    "Color3",
    "MaturitySize",
    "Vaccinated",
    "Dewormed",
    "Health",
]
data = train.drop(columns=eliminar)
print(data.columns.values)
data.head()

## 5. Normalizar o Estandarizar las variables Numericas (para los modelos que sean necesarios)

Revisar si existen valores extremos y considerarlos para los modelos que afecte

In [None]:
#Dado que solo utilizaremos un algoritmo del tipo arbol de decision (LightGBM) el proceso este es irrelevante

## 6. Separar la base de Test (10%) y Train (90%)
Pueden ser otros porcentajes que les parezcan mejor

In [None]:
# Target: AdoptionSpeed; features: every other column of `data`.
y = data['AdoptionSpeed']
es_predictora = ~(data.columns == 'AdoptionSpeed')
X = data.loc[:, es_predictora]

# Hold out 10% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=2021,
)

## 7. Para la parte de Train, armar un esquema de Cross Validation

Usar 10 Folds

In [None]:
# Everything is tuned through GridSearchCV, so the steps are ordered differently:
# first define the metric that is meant to be maximized.

def metric(y_true, y_pred):
    """Return the quadratic-weighted Cohen's kappa as a (name, value, flag) tuple.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class labels.
    y_pred : array-like
        Per-class scores, either flattened class by class (all samples of the
        first class, then all samples of the second class, ...) or already
        shaped (n_samples, n_classes). The number of classes is inferred from
        the size of `y_pred` instead of being fixed to 5.

    Returns
    -------
    tuple
        ("kappa", score, True). This presumably follows the LightGBM custom
        eval-metric convention, where the last item means higher is better -
        TODO confirm against the installed LightGBM version.

    Notes
    -----
    The predicted label is the column index of the highest score, so this
    assumes the classes are labelled 0..n_classes-1.
    """
    y_true = np.asarray(y_true)
    # order="F" fills the matrix column by column, i.e. one column per class.
    scores = np.asarray(y_pred).reshape((y_true.shape[0], -1), order="F")
    res = cohen_kappa_score(y_true, scores.argmax(axis=1), weights='quadratic')
    return "kappa", res, True

In [None]:
# Wrap the classifier in a pipeline so the grid search can address it through
# the "classifier" step name.
pipe = Pipeline(
    steps=[
        (
            "classifier",
            LGBMClassifier(
                n_estimators=200,
                metric="custom",
                verbose=50,
                early_stopping_rounds=20,
            ),
        ),
    ]
)


In [None]:
# Hyper-parameter space explored by the grid search.
#
# Important: this process was iterative; some results of previous runs are kept
# as comments below, and all of them can be found in each version of the script.
# Variables were also created and removed along the way, so not every
# optimization result comes from the same feature set and the runs are not
# directly comparable with each other.

search_space = [
    {
        "classifier": [LGBMClassifier(n_estimators=200, metric="custom")],
        "classifier__max_depth": list(range(9, 14)),
        "classifier__min_data_in_leaf": list(range(28, 35)),
        "classifier__learning_rate": [0.1, 0.2, 0.25],
    },
]
#primer intento (Sin sentiment)
#LGBMClassifier(learning_rate=0.015, max_depth=7, metric='custom',
#               min_data_in_leaf=40, n_estimators=300)
#segundo intento (Sin sentiment)
#LGBMClassifier(learning_rate=0.017, max_depth=6, metric='custom',
#               min_data_in_leaf=45, n_estimators=300)
#tercer intento  (Sin sentiment)
#LGBMClassifier(learning_rate=0.022, max_depth=6, metric='custom',
#               min_data_in_leaf=44, n_estimators=300)
#cuarto intento (Sin sentiment)
#LGBMClassifier(learning_rate=0.05, max_depth=5, metric='custom',
#               min_data_in_leaf=46, n_estimators=200)
#quinto intento (Sin sentiment)
#LGBMClassifier(learning_rate=0.15, max_depth=5, metric='custom',
#               min_data_in_leaf=48, n_estimators=200)
#LGBMClassifier(learning_rate=0.1 ,max_depth=8, metric='custom', min_data_in_leaf=38,
#               n_estimators=200)
#LGBMClassifier(learning_rate=0.2, max_depth=9, metric='custom',
#               min_data_in_leaf=41, n_estimators=10)

#con varias modificacion (Menos variables y la nueva del rescatador)
#LGBMClassifier(max_depth=11, metric='custom', min_data_in_leaf=31)

#LGBMClassifier(max_depth=11, metric='custom', min_data_in_leaf=31)

In [None]:
# Hyper-parameter optimization with 10-fold cross validation.
#
# Without `scoring`, GridSearchCV ranks the candidates with the estimator's
# default score (accuracy for classifiers) instead of the quadratic-weighted
# kappa this notebook sets out to maximize, so the scorer is passed explicitly.
from sklearn.metrics import make_scorer

kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')
clf = GridSearchCV(pipe, search_space, cv=10, scoring=kappa_scorer, verbose=30)

## 8. Entrenar al menos un Modelo que prefieran y optimizar al menos un Hiperparámetro

In [None]:
# Run the grid search on the training split.
mejor_modelo = clf.fit(X_train, y_train)

# Show the classifier step (with its hyper-parameters) of the best pipeline found.
parametros_mejor_pipeline = mejor_modelo.best_estimator_.get_params()
print(parametros_mejor_pipeline["classifier"])

Probamos con el set de testing

# **Extra**
**Entrega para la competencia, para probar cómo va con un set de validación**

In [None]:
# For now only the tabular test data is loaded.
test = pd.read_csv(
    filepath_or_buffer='../input/petfinder-adoption-prediction/test/test.csv',
)

# Creamos todo el mismo conjunto de variables que en el dataset de entrenamiento

In [None]:

# Missing-value flags, built exactly as for the training set
# (1 = value is missing, 0 = value is present).
test['Tiene_nombre?'] = np.where(test['Name'].notnull(), 0, 1)
test['Tiene_descripcion?'] = np.where(test['Description'].notnull(), 0, 1)

### Variables to encode ###
# Reuse the encoder fitted on the TRAINING data instead of fitting a new one on
# the test set. A test-fitted encoder produces one column per category present
# in the test data, which can differ from the columns the model was trained on.
# With the default OneHotEncoder settings, transform raises if the test set
# contains a category that was not seen during training.
columnas_codificar = ["MaturitySize", "Vaccinated", "Dewormed", "Sterilized", "Health"]

codificadas = pd.DataFrame(
    encoder.transform(test[columnas_codificar]).toarray(),
    columns=[
        f"{column}_{category}"
        for categories, column in zip(encoder.categories_, columnas_codificar)
        for category in categories
    ],
)

test = pd.concat([test, codificadas], axis=1)

#### New variables ###
# Same combined flags as in the training set; the pairwise combinations and the
# individual Hasphoto?/Hasvideo? flags remain disabled there as well.
test['Sterilized&Dewormed&Vaccinated'] = np.where(
    (test['Vaccinated'] == 1) & (test['Dewormed'] == 1) & (test['Sterilized'] == 1), 1, 0
)
test['Hasvideo&photo?'] = np.where((test['VideoAmt'] > 0) & (test['PhotoAmt'] > 0), 1, 0)



In [None]:
# Replace each RescuerID with the number of listings that rescuer has in the
# test frame (the counts are computed on the test data itself).
rescuer_id = dict(test['RescuerID'].value_counts())

# Map only the RescuerID column. DataFrame.replace with this dictionary would
# scan every column of the frame against all keys (slow, and it could touch any
# other cell that happens to equal a rescuer id), and it relies on pandas
# down-casting the column to a numeric dtype afterwards.
test['RescuerID'] = test['RescuerID'].map(rescuer_id)

# The column now holds a quantity rather than an identifier.
test.rename(columns={'RescuerID': 'RescuerQ'}, inplace=True)

In [None]:
# Inspect the per-rescuer listing counts computed for the test frame.
test['RescuerQ']

In [None]:
# Read the sentiment-analysis JSON files of the test set and collect, per file,
# the document score, the magnitude and their product.
import os

test_sentiment_files = sorted(glob.glob('../input/petfinder-adoption-prediction/test_sentiment/*.json'))
# The message now names the test set (it previously said "train").
print('num of test sentiment files: {}'.format(len(test_sentiment_files)))
test_sentiment_rows = []

for path in test_sentiment_files:
    # The context manager closes the file; JSON is read explicitly as UTF-8.
    with open(path, encoding='utf-8') as json_file:
        sentiment_doc = json.load(json_file)
    score = sentiment_doc['documentSentiment']['score']
    magnitude = sentiment_doc['documentSentiment']['magnitude']
    # The file name without directory and extension is used as the PetID.
    # os.path handles both "/" and "\\" separators, unlike a manual rfind("/").
    pet_id = os.path.splitext(os.path.basename(path))[0]
    test_sentiment_rows.append([pet_id, score, magnitude, score * magnitude])

data_test_sentiment = pd.DataFrame(test_sentiment_rows, columns=['PetID', 'Score', 'Magnitude', 'Sentiment'])
data_test_sentiment.head(5)

In [None]:
# Attach the sentiment features; the left join keeps every test row, so pets
# without a sentiment file get NaN in the sentiment columns.
test = pd.merge(
    left=test,
    right=data_test_sentiment,
    how='left',
    on='PetID',
)

In [None]:
# Columns to remove (the same list that is dropped from the training frame).
sacar = [
    "Name",
    "Description",
    "PetID",
    "Breed2",
    "Color3",
    "MaturitySize",
    "Vaccinated",
    "Dewormed",
    "Health",
]
data2 = test.drop(columns=sacar)
print(data2.columns.values)

# Generamos la prediccion de este dataset

In [None]:
# Predict with the fitted grid-search object on the engineered test features.
predictions = mejor_modelo.predict(data2)
# Disabled post-processing step; `optR` is not defined in the visible notebook.
#predictions = optR.predict(predictions)

# Creamos el csv para kaggle

In [None]:
# Build the Kaggle submission: one row per PetID with its predicted class.
predicciones_enteras = predictions.astype("int64")
submission = test[['PetID']].copy()
submission["AdoptionSpeed"] = predicciones_enteras
submission.to_csv(path_or_buf="submission.csv", index=False)

# Share of each predicted class, as a quick sanity check of the output.
distribucion = submission["AdoptionSpeed"].value_counts(normalize=True, dropna=False)
distribucion

In [None]:
#print(predictions)

In [None]:
#analisamos importancia de variables con modelo simplificado
#simplificado = LGBMClassifier(learning_rate=0.2, max_depth=9, metric='custom', min_data_in_leaf=41, n_estimators=10)
#entrenamiento, testeo = train_test_split(data, test_size = 0.10, shuffle = False)


In [None]:
#testeo = testeo.fillna(999)

In [None]:
#data.describe

In [None]:
#simplificado.fit(entrenamiento, testeo, eval_metric = 'custom')

In [None]:
#headers = data.columns.values.tolist()

#var_imp = pd.DataFrame({
# 'feature':headers, 
# 'v_importance':simplificado.feature_importances_.tolist()
# })
#print(var_imp.sort_values(by = 'v_importance', ascending=False))