In [None]:
# Importación de Librerías
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split, KFold
from tensorflow import keras

from sklearn.metrics import cohen_kappa_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
data = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv', index_col="PetID")
data.info()

In [None]:
#for c in data.select_dtypes("O"):
 #   data[c] = data[c].astype("category")

In [None]:
data.info()

In [None]:
data['Description'].isnull().sum()

In [None]:
data['Description'] = data['Description'].fillna('nothing')

In [None]:
#Analisis de texto
#Sentiment analysis

import nltk
import re
from nltk.corpus import stopwords

def text_cleaning(text):
    """Normalise a free-text description before sentiment analysis.

    The text is lower-cased, URLs are removed, '@' and '#' prefixes are
    stripped, every remaining non-letter character becomes a separator and
    English stop words are dropped.

    Parameters
    ----------
    text : str
        Raw description.

    Returns
    -------
    str
        The remaining lower-case words joined by single spaces.
    """
    # Load the NLTK stop-word list only once; the function is applied to
    # every row, so rebuilding the set on each call is wasted work.
    forbidden_words = getattr(text_cleaning, '_forbidden_words', None)
    if forbidden_words is None:
        forbidden_words = frozenset(stopwords.words('english'))
        text_cleaning._forbidden_words = forbidden_words

    text = text.lower()
    # URLs have to be removed BEFORE '.' and '/' are turned into spaces,
    # otherwise the patterns below can never match a complete URL.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)
    text = re.sub(r'((http)\S+)', '', text)
    text = ' '.join(text.split('.'))
    text = re.sub(r'/', ' ', text)
    # Drop the '@' / '#' marker but keep the word that follows it.
    text = re.sub(r'@([^\s]+)', r'\1', text)
    text = re.sub(r'\\', ' ', text)
    text = re.sub(r'#([^\s]+)', r'\1', text)
    # Anything that is not an ASCII letter becomes whitespace.
    text = re.sub(r'[^a-z]+', ' ', text).strip()
    return ' '.join(word for word in text.split() if word not in forbidden_words)

In [None]:
# Apply the cleaning function to every entry of the Description column
data['Description'] = data['Description'].apply(text_cleaning)

In [None]:
#libreria de analisis de texto
!pip install textblob

In [None]:
from textblob import TextBlob

In [None]:
# Sentiment polarity of each description

def sent(txt):
    """Return the TextBlob ``polarity`` of ``txt`` (converted to ``str`` first)."""
    blob = TextBlob(str(txt))
    return blob.polarity

data['sentiment'] = data['Description'].apply(sent)      # new column of sentiment

In [None]:

def subj(txt):
    """Return the TextBlob ``subjectivity`` of ``txt`` (converted to ``str`` first)."""
    blob = TextBlob(str(txt))
    return blob.subjectivity

data['subjectivity'] = data['Description'].apply(subj)      # new column of subjectivity

In [None]:
#nube de palabras worldcloud
!pip install wordcloud

In [None]:
#importamos libreria matplotlib y wordcloud para graficos
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:

from wordcloud import WordCloud, STOPWORDS

# Build the corpus from EVERY description. str(Series) only yields pandas'
# truncated printout (a few rows plus index and dtype text), so joining its
# characters would not represent the column.
all_words = " ".join(data["Description"].astype(str))


wordcloud = WordCloud(max_words=150,
                      max_font_size=350, random_state=42,
                      width=2000, height=1000,
                      colormap="twilight",
                      background_color="white")
wordcloud.generate(all_words)

# Plot
plt.figure(figsize=(16, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

**DATA AUGMENTATION**

In [None]:
# DATA AUGMENTATION: start from a copy without the Name, Description and RescuerID columns
data1 = data.drop(columns=['Name', 'Description', 'RescuerID'])


In [None]:
data1

In [None]:
from itertools import combinations

# Predictor columns: everything in data1 except the target.
columns = data1.loc[:, data1.columns != 'AdoptionSpeed']

# One ratio feature per unordered pair of predictor columns (in column order).
# NOTE: the column named "A/B" holds data1[B] / data1[A]; this naming is kept
# unchanged so the feature names stay the same as before.
# All columns are collected first and the frame is built in one go, instead
# of inserting (and printing the whole frame) once per pair.
ratio_features = {
    "/".join(combination): data1[combination[1]] / data1[combination[0]]
    for combination in combinations(columns, 2)
}
data2 = pd.DataFrame(ratio_features, index=data1.index)

In [None]:
data2= pd.DataFrame(data2)

In [None]:
data2

In [None]:
# Call info(); without the parentheses the cell only shows the bound method.
data2.info()

In [None]:
data=pd.concat([data1, data2], axis=1)

In [None]:
data

In [None]:
# Replace +inf / -inf values with 0 (presumably produced by the ratio
# features when the denominator is 0 -- TODO confirm). NaN values are not
# touched by this call.

data=data.replace([np.inf, -np.inf], 0)

In [None]:
data

In [None]:
def metric(y_true, y_pred):
    """Quadratic-weighted Cohen's kappa computed from class probabilities.

    ``y_pred`` is reshaped column-major (``order="F"``) into
    ``(len(y_true), 5)`` and the arg-max of each row is used as the
    predicted class.

    Returns the tuple ``("kappa", score, True)`` -- presumably LightGBM's
    custom-eval format (name, value, is_higher_better); confirm against the caller.
    """
    n_rows = y_true.shape[0]
    class_probs = y_pred.reshape((n_rows, 5), order="F")
    predicted = class_probs.argmax(axis=1)
    kappa = cohen_kappa_score(y_true, predicted, weights= 'quadratic')
    return "kappa", kappa, True

In [None]:
data.isnull().sum()

In [None]:
# Hold out 30% of the rows as a test set (fixed seed). Features are all
# non-object columns except AdoptionSpeed; AdoptionSpeed is the target.
X_train, X_test, y_train, y_test = train_test_split(
    data.select_dtypes(exclude=['object']).drop("AdoptionSpeed", axis=1), 
    data.AdoptionSpeed, test_size=0.3, random_state=1
)
# Shuffled 5-fold splitter with a fixed seed.
kf = KFold(n_splits=5, random_state=1, shuffle=True)

In [None]:
# For every KFold split keep the index labels of the second part returned by
# kf.split (the held-out part of that split).
folds = [X_train.index[idx] for _, idx in kf.split(X_train)]

In [None]:

k=5
num_validation_samples=len(data)//k
# Per-fold class-probability matrices for the validation rows.
validation_scores=[]
# Class probabilities for X_test averaged over the fold models. The frame
# starts at 0.0 and is accumulated in the loop; previously it was created
# empty and never filled, so it displayed only NaN.
test_probs = pd.DataFrame(0.0, index=X_test.index, columns=range(y_train.max() + 1))
for idx in folds:
    # Validation part of this fold ...
    Xv = X_train.loc[idx]
    yv = y_train.loc[idx]

    # ... and the remaining rows used for fitting.
    Xt = X_train.drop(idx)
    yt = y_train.drop(idx)

    model = LGBMClassifier()
    model.fit(Xt, yt)
    validation_score= model.predict_proba(Xv)
    validation_scores.append(validation_score)
    # Each fold contributes 1/len(folds) of the final average.
    # Assumes every fold model outputs one column per class 0..y_train.max().
    test_probs += model.predict_proba(X_test) / len(folds)
    
#validation_scores = pd.concat([validation_scores,test_probs])

In [None]:
# Mean quadratic-weighted kappa over the CV folds.
# Averaging the raw probability matrix of one fold (as was done before)
# always gives 1 / n_classes because every row sums to 1, so it carries no
# information about model quality.
# Assumes the class labels are 0..n_classes-1 so that the arg-max column
# position equals the label.
validation_score1 = np.mean([
    cohen_kappa_score(y_train.loc[fold_idx], fold_probs.argmax(axis=1), weights='quadratic')
    for fold_idx, fold_probs in zip(folds, validation_scores)
])
# Refit on the whole training split and predict the hold-out set.
model= LGBMClassifier()
model.fit(X_train, y_train)
test_score=model.predict(X_test)

In [None]:
test_score

In [None]:
validation_score1

In [None]:
accuracy1= model.score(X_test, y_test)

In [None]:
accuracy1

In [None]:
# Xv / yv are the validation rows of the last CV fold, i.e. rows of X_train,
# and `model` was fitted on all of X_train, so this is an in-sample accuracy.
accuracy2= model.score(Xv, yv)

In [None]:
accuracy2

In [None]:
model.predict(X_test)

In [None]:
metric(yv, model.predict_proba(Xv))

In [None]:
 #resultados[f"fold_{i+1}"] = cohen_kappa_score( Xt,yt, weights= 'quadratic')

In [None]:
#validation_scores = pd.DataFrame(validation_scores)
#test_probs= pd.DataFrame(test_probs)

#validation_scores1 = pd.concat([validation_scores,test_probs], ignore_index=True)

In [None]:
validation_scores

In [None]:
test_probs

In [None]:
#Cross validation que no corrio

#k=3
#num_validation_samples=len(data)//k
#np.random.shuffle(data)
#validation_scores=[]
#for fold in range(k):
 #   validation_data=data[num_validation_samples * fold:
  #                       num_validation_samples * (fold+1)]
   #      training_data=np.concatenate(
    #    [data[:num_validation_samples * fold],
     #   data[num_validation_samples * (fold+1):]], axis=0)
    #model= LGBMClassifier()
    #model.fit(training_data, ...)
    #validation_score= model.evaluate(validation_data, ...)
    #validation_scores.append(validation_score)
#validation_score=np.average(validation_scores)
#model= LGBMClassifier()
#model.fit(data, ...)
#test_score=model.evaluate(test_data, ...)

In [None]:
#BUSQUEDA HIPERPARAMETROS
#Randomized search engine para hiperp tunning
#generamos numeros aleat uniformes simil bayesian optimization

from scipy.stats import loguniform


class loguniform_int:
    """Log-uniform distribution whose samples are cast to integers."""

    def __init__(self, a, b):
        # Frozen scipy log-uniform distribution built from (a, b).
        self._distribution = loguniform(a, b)

    def rvs(self, *args, **kwargs):
        """Sample the wrapped distribution and cast the result with ``astype(int)``."""
        samples = self._distribution.rvs(*args, **kwargs)
        return samples.astype(int)


In [None]:
#Now, we can define the randomized search using the different distributions. 
#Executing 10 iterations of 5-fold cross-validation for random parametrizations of this model on 
#this dataset can take from 10 seconds to several minutes, depending on the speed of the host computer
#and the number of available processors.

%time
from sklearn.model_selection import RandomizedSearchCV

# Search space. The keys must be parameters of the estimator being searched.
# `model` is a plain LGBMClassifier (there is no Pipeline with a
# 'classifier' step), so the keys carry no 'classifier__' prefix and use the
# LGBMClassifier constructor names where one exists.
param_distributions = {
    'reg_lambda': loguniform(1e-6, 1e3),          # L2 regularisation
    'learning_rate': loguniform(0.01, 0.3),
    'num_leaves': loguniform_int(20, 600),
    'colsample_bytree': loguniform(0.3, 1),       # fraction of features per tree
    'max_depth': loguniform_int(5, 256),
    'min_child_samples': loguniform_int(10, 100), # minimum rows per leaf
    'max_bin': loguniform_int(25, 35),
}

# 10 random candidates, each evaluated with 5-fold cross-validation.
# random_state makes the sampled candidates reproducible across runs.
model_random_search = RandomizedSearchCV(
    model, param_distributions=param_distributions, n_iter=10,
    cv=5, verbose=1, random_state=1,
)
model_random_search.fit(X_train, y_train)

In [None]:
#accuracy score on the test set
accuracy = model_random_search.score(X_test, y_test)

print(f"The test accuracy score of the best model is "
      f"{accuracy:.2f}")

In [None]:
# Accuracy on the validation rows of the last CV fold (Xv, yv).
# These rows belong to X_train, which the search was fitted on, so this is
# NOT a test-set score; the message below says so explicitly.
accuracy = model_random_search.score(Xv, yv)

print(f"The accuracy score of the best model on the last validation fold is "
      f"{accuracy:.2f}")

In [None]:
from pprint import pprint

print("The best parameters are:")
pprint(model_random_search.best_params_)

In [None]:
best_params=model_random_search.best_params_

In [None]:
#vemos los resultados using the attributes cv_results as we did previously.

def shorten_param(param_name):
    """Return the part of ``param_name`` after its last ``"__"``.

    Names that contain no ``"__"`` are returned unchanged.
    """
    _, separator, tail = param_name.rpartition("__")
    return tail if separator else param_name

In [None]:
# Columns to report: one "param_<name>" column per searched parameter,
# followed by the score summary columns.
column_results = [f"param_{name}" for name in param_distributions] + [
    "mean_test_score", "std_test_score", "rank_test_score"]

# Keep only those columns, best mean score first, with shortened column names.
cv_results = (
    pd.DataFrame(model_random_search.cv_results_)[column_results]
    .sort_values("mean_test_score", ascending=False)
    .rename(shorten_param, axis=1)
)
cv_results

In [None]:
#Aplicamos los hiperp obtenidos con iteraciones 30 (best_params)
#ver como aplicar los mejores parametros 

#model_random_search = RandomizedSearchCV(
 #    model ,param_distributions=param_name, n_iter=30,
  #   n_jobs=2, cv=5)
#model_random_search.fit(X_train, y_train)
#cv_results =  pd.DataFrame(model_random_search.cv_results_)
 #cv_results.to_csv("../figures/randomized_search_results.csv")

In [None]:
# Model with the hyper-parameters picked from the search.
# `max_bin` replaces the misspelt `cl_max_bin`, which is not a LightGBM
# parameter, and the L2 penalty is passed through the estimator's own
# constructor argument `reg_lambda`.
model = LGBMClassifier(feature_fraction=0.7416287920026037,
                       reg_lambda=5.809080248399578e-05,
                       learning_rate=0.08410834407255606,
                       max_bin=29,
                       max_depth=9,
                       min_data_in_leaf=31,
                       num_leaves=52)
model.fit(X_train, y_train)
test_score = model.predict(X_test)

In [None]:
accuracy1= model.score(X_test, y_test)

In [None]:
accuracy1

In [None]:
accuracy2= model.score(Xv, yv)

In [None]:
accuracy2

In [None]:
cv_results

**#levanto data set de test de kaggle para tirar predicciones**

In [None]:
data_test = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv', index_col="PetID")
data_test.info()

In [None]:
#hacemos mismo tratamiento de lenguaje de sentimiento
data_test['Description'] = data_test['Description'].fillna('nothing')

In [None]:
data_test['Description'].isnull().sum()

In [None]:
#aplicamos la funcion a la columna Description para limpieza de texto
data_test['Description'] = data_test['Description'].apply(lambda text: text_cleaning(text))

In [None]:
# Sentiment polarity of the test descriptions, using the `sent` helper
# already defined for the training data (no need to define it twice).

data_test['sentiment'] = data_test['Description'].apply(sent)      # new column of sentiment

In [None]:
# Subjectivity of the test descriptions, using the `subj` helper already
# defined for the training data (no need to define it twice).

data_test['subjectivity'] = data_test['Description'].apply(subj)      # new column of subjectivity

In [None]:
data_test

In [None]:
# Same feature engineering as for the training data
# DATA AUGMENTATION: remove the Name, Description and RescuerID columns
data_test = data_test.drop(columns=['Name', 'Description', 'RescuerID'])

In [None]:
from itertools import combinations

# Columns to combine (the test frame has no target column to exclude here).
columns = data_test.columns

# One ratio feature per unordered pair of columns (in column order).
# NOTE: the column named "A/B" holds data_test[B] / data_test[A]; this naming
# is kept unchanged so the feature names stay the same as before.
# All columns are collected first and the frame is built in one go, instead
# of inserting (and printing the whole frame) once per pair.
ratio_features_test = {
    "/".join(combination): data_test[combination[1]] / data_test[combination[0]]
    for combination in combinations(columns, 2)
}
data2_test = pd.DataFrame(ratio_features_test, index=data_test.index)

In [None]:
# Append the ratio features and, as was done for the training frame, replace
# +inf / -inf with 0 so train and test features are prepared the same way.
data_test = pd.concat([data_test, data2_test], axis=1).replace([np.inf, -np.inf], 0)

In [None]:
data2_test

In [None]:
data_test

In [None]:
# NOTE(review): the randomized search was already fitted on X_train / y_train
# in an earlier cell; this call runs the whole search again -- confirm it is
# intentional.
model_random_search.fit(X_train, y_train)

In [None]:
#Generamos prediccciones sobre dataset de test de kaggle
predictions=pd.DataFrame(model.predict(data_test))

In [None]:
predictions

In [None]:
predictions2=pd.DataFrame(model_random_search.predict(data_test))

In [None]:
predictions2

### ARCHIVO KAGGLE

In [None]:
#generamos archivo para submitir en kaggle Pet id y adopSpeed
predictions=pd.DataFrame(predictions )

In [None]:
predictions

In [None]:
#submission=pd.DataFrame(submission )

In [None]:
#submission

In [None]:
data_test.reset_index()

In [None]:
# `submission` is not created anywhere in the visible notebook (the only
# cell that mentions it earlier is commented out), so on a fresh kernel the
# assignment below raised NameError. Start from a copy of the predictions
# frame and attach the pet identifiers to it.
submission = predictions.copy()
submission["PetID"] = data_test.index

In [None]:
submission

In [None]:
# The renamed frame is only displayed: the result is not assigned and
# inplace is not used, so `submission` itself keeps its column names.
submission.rename(columns={ submission.columns[1]: "PetID" })

In [None]:
# The renamed frame is only displayed: the result is not assigned and
# inplace is not used, so `submission` itself keeps its column names.
submission.rename(columns={ submission.columns[0]: "AdoptionSpeed" })

In [None]:
submission = submission.reindex(columns=['PetID', 'AdoptionSpeed'])

In [None]:
submission2 = submission.reindex(columns=['PetID', 'AdoptionSpeed'])

In [None]:
submission

In [None]:
submission["AdoptionSpeed"] = predictions.astype("int64")

In [None]:
submission2["AdoptionSpeed"] = predictions2.astype("int64")

In [None]:
submission

In [None]:
submission2

In [None]:
submission.to_csv("submission.csv", index=False)

In [None]:
submission2.to_csv("submission2.csv", index=False)

In [None]:
submission["AdoptionSpeed"].value_counts(normalize=True,dropna=False)

**REDES CONVOLUCIONALES PARA COMPUTER VISION SOBRE IMAGENES**