In [None]:
import numpy as np 
import pandas as pd 

import os
# List the competition's input directory (the root that holds the train/ and
# test/ folders read below) to see which files are available.
os.listdir('/kaggle/input/petfinder-adoption-prediction')
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read train.csv with PetID as the row index; the bare name renders the frame.
train = pd.read_csv(
    filepath_or_buffer="/kaggle/input/petfinder-adoption-prediction/train/train.csv",
    index_col="PetID",
)
train

In [None]:
# Read test.csv with PetID as the row index; the bare name renders the frame.
test = pd.read_csv(
    filepath_or_buffer="/kaggle/input/petfinder-adoption-prediction/test/test.csv",
    index_col="PetID",
)
test

In [None]:
# Show the column labels of the training frame.
train.columns

In [None]:
# Absolute ("freq") and relative ("freq_rel") frequency of each AdoptionSpeed
# value, ordered by the value itself.
(
    train["AdoptionSpeed"]
    .value_counts()
    .sort_index()
    .rename("freq")
    .to_frame()
    .join(train["AdoptionSpeed"].value_counts(normalize=True).rename("freq_rel"))
)

In [None]:
# Print the pandas summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Row counts for every (VideoAmt, AdoptionSpeed) combination.
pd.crosstab(index=train["VideoAmt"], columns=train["AdoptionSpeed"])

In [None]:
# Same table as counts, but each row is divided by its own total, giving the
# AdoptionSpeed distribution within every VideoAmt value.
pd.crosstab(
    index=train["VideoAmt"],
    columns=train["AdoptionSpeed"],
    normalize="index",
)

In [None]:
# Row counts for every (PhotoAmt, AdoptionSpeed) combination.
pd.crosstab(index=train["PhotoAmt"], columns=train["AdoptionSpeed"])

In [None]:
# AdoptionSpeed distribution within each PhotoAmt value, with PhotoAmt limited
# to the range [0, 10] so everything above 10 falls into a single row.
pd.crosstab(
    index=train["PhotoAmt"].clip(lower=0, upper=10),
    columns=train["AdoptionSpeed"],
    normalize="index",
)

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score

def trainModel(train, test, target="AdoptionSpeed"):
    """Cross-validate a LightGBM classifier and predict labels for ``test``.

    Runs a 5-fold stratified split over ``train``, fits one ``LGBMClassifier``
    per fold with early stopping, prints the quadratic-weighted Cohen's kappa
    of every validation fold and of all out-of-fold predictions together, and
    accumulates the class probabilities each fold model predicts for ``test``.

    Parameters
    ----------
    train : pandas.DataFrame
        Feature columns plus the ``target`` column.
    test : pandas.DataFrame
        Must contain every feature column of ``train``; the columns are
        selected in training order before prediction.
    target : str, default "AdoptionSpeed"
        Name of the label column in ``train``.

    Returns
    -------
    pandas.Series
        Indexed like ``test`` and named ``target``: for each row, the class
        label whose probability summed over the folds is the largest.
    """
    # Function-scope import: these callbacks replace the `early_stopping_rounds`
    # and `verbose` keyword arguments of `fit`, which LightGBM 4.0 removed.
    from lightgbm import early_stopping, log_evaluation

    labels = train[target]
    features = train.drop(target, axis=1)
    # Give the model exactly the training columns, in the training order.
    test_features = test[features.columns]
    classes = np.sort(labels.unique())

    # Probability columns are keyed by the real class labels, so the final
    # idxmax returns labels even when they are not 0..n-1.
    test_probs = pd.DataFrame(0.0, index=test.index, columns=classes)
    # Out-of-fold predictions, stored by row position so duplicate index
    # values in `train` cannot misalign them with `labels`.
    oof_preds = np.empty(len(train), dtype=classes.dtype)

    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    for i, (train_pos, valid_pos) in enumerate(kf.split(features, labels)):
        X_train, y_train = features.iloc[train_pos], labels.iloc[train_pos]
        X_valid, y_valid = features.iloc[valid_pos], labels.iloc[valid_pos]

        learner = LGBMClassifier(n_estimators=10000)
        learner.fit(X_train, y_train, eval_metric="multi_logloss",
                    eval_set=[(X_train, y_train),
                              (X_valid, y_valid)],
                    callbacks=[early_stopping(stopping_rounds=10),
                               log_evaluation(period=100)])

        # `predict` maps the arg-max probability back to the class label.
        fold_preds = learner.predict(X_valid)
        oof_preds[valid_pos] = fold_preds
        res = cohen_kappa_score(y_valid, fold_preds, weights='quadratic')
        print(f"para el fold {i + 1} el resultado fue: {res}")

        fold_probs = pd.DataFrame(learner.predict_proba(test_features),
                                  index=test.index, columns=learner.classes_)
        # fill_value covers a class that is absent from this fold's model.
        test_probs = test_probs.add(fold_probs, fill_value=0.0)

    res = cohen_kappa_score(labels, oof_preds, weights='quadratic')
    print(f"Resultado Final: {res}")
    return test_probs.idxmax(axis=1).rename(target)

In [None]:
# Replace missing descriptions with an empty string. The result is assigned
# back to the column: calling fillna(inplace=True) on `frame.Description` is a
# chained in-place update, which acts on a temporary object and leaves the
# frame unchanged when pandas Copy-on-Write is active.
train["Description"] = train["Description"].fillna("")
test["Description"] = test["Description"].fillna("")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

vectorizer = TfidfVectorizer()
svd = TruncatedSVD(n_components=25, n_iter=7, random_state=42)

# Column names follow the SVD size, so changing n_components above is enough.
svd_cols = [f"svd_{i}" for i in range(svd.n_components)]

# TF-IDF and SVD are fitted on the training descriptions only; the fitted
# transforms are then applied to the test descriptions.
train_svd = pd.DataFrame(
    svd.fit_transform(vectorizer.fit_transform(train.Description.fillna(""))),
    index=train.index,
    columns=svd_cols,
)
test_svd = pd.DataFrame(
    svd.transform(vectorizer.transform(test.Description.fillna(""))),
    index=test.index,
    columns=svd_cols,
)

# Drop svd_* columns left by an earlier run of this cell first: `join` raises
# when both sides share column names.
train = train.drop(columns=svd_cols, errors="ignore").join(train_svd)
test = test.drop(columns=svd_cols, errors="ignore").join(test_svd)

In [None]:
# Cast every object-dtype column of `train` to pandas categorical, and apply
# the same cast to the identically named columns of `test`.
object_dtypes = dict.fromkeys(train.select_dtypes("O").columns, "category")
train = train.astype(object_dtypes)
test = test.astype(object_dtypes)

In [None]:
# Cross-validate and predict with the columns available at this point.
test_preds = trainModel(train=train, test=test)

In [None]:
# Cast the listed columns to pandas categorical dtype in both frames.
coded_dtypes = dict.fromkeys(
    [
        "Breed1", "Breed2", "Gender",
        "Color1", "Color2", "Color3",
        "State", "MaturitySize",
        "Vaccinated", "Dewormed", "Sterilized", "Health",
    ],
    "category",
)
train = train.astype(coded_dtypes)
test = test.astype(coded_dtypes)

In [None]:
# Re-run cross-validation and prediction on the current state of both frames.
test_preds = trainModel(train=train, test=test)

In [None]:
# "gage": Age bucketed into (up to) five quantile bins.
# The bin edges are learned on train only and reused for test, and the codes
# follow the bin order (0 = lowest ages), so a given code denotes the same age
# range in both frames. duplicates="drop" merges bins whose quantile edges
# coincide instead of raising.
_, age_edges = pd.qcut(train["Age"], 5, retbins=True, duplicates="drop")
age_edges = age_edges.astype(float)
# Open the outer edges so ages outside the training range still get a bin.
age_edges[0], age_edges[-1] = -np.inf, np.inf

# Missing ages get code -1.
train["gage"] = pd.cut(train["Age"], bins=age_edges).cat.codes.astype("int64")
test["gage"] = pd.cut(test["Age"], bins=age_edges).cat.codes.astype("int64")


In [None]:
# 1 when the description matches 'playful', 0 otherwise (missing counts as 0);
# added to both frames before re-running the model.
for frame in (train, test):
    matches = frame["Description"].str.contains('playful')
    frame["playful"] = matches.fillna(0).astype(int)

test_preds = trainModel(train, test)

In [None]:
# Character length of the description (0 when missing), added to both frames
# before re-running the model.
for frame in (train, test):
    lengths = frame["Description"].str.len()
    frame["desc_len"] = lengths.fillna(0)

test_preds = trainModel(train, test)

In [None]:
# Write the latest predictions (index plus the named prediction column) to disk.
test_preds.to_csv(path_or_buf="submission.csv")