In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 


shelter_train = pd.read_csv("../input/train.csv")
shelter_train_outcome = shelter_train["OutcomeType"]
shelter_test = pd.read_csv("../input/test.csv")

In [2]:

dog_train = shelter_train[shelter_train["AnimalType"]=="Dog"]
dog_train = dog_train.reset_index()
dog_train.drop("AnimalType", axis=1, inplace=True)
dog_train.drop("index", axis=1, inplace=True)
dog_train_outcome = dog_train["OutcomeType"]
dog_train.drop("OutcomeType", axis=1, inplace=True)
dog_test = shelter_test[shelter_test["AnimalType"]=="Dog"]
dog_test = dog_test.reset_index()
dog_test.drop("AnimalType", axis=1, inplace=True)
dog_test.drop("index", axis=1, inplace=True)

cat_train = shelter_train[shelter_train["AnimalType"]=="Cat"]
cat_train = cat_train.reset_index()
cat_train.drop("AnimalType", axis=1, inplace=True)
cat_train.drop("index", axis=1, inplace=True)
cat_train_outcome = cat_train["OutcomeType"]
cat_train.drop("OutcomeType", axis=1, inplace=True)
cat_test = shelter_test[shelter_test["AnimalType"]=="Cat"]
cat_test = cat_test.reset_index()
cat_test.drop("AnimalType", axis=1, inplace=True)
cat_test.drop("index", axis=1, inplace=True)


dog_test_ID = dog_test["ID"].values
dog_test_ID = np.array([dog_test_ID])
dog_test_ID = dog_test_ID.T
dog_test.drop("ID", axis=1, inplace=True)
cat_test_ID = cat_test["ID"].values
cat_test_ID = np.array([cat_test_ID])
cat_test_ID = cat_test_ID.T
cat_test.drop("ID", axis=1, inplace=True)



In [3]:

def pre_processing(shelter_train, shelter_test, animal_type):
    shelter_train.drop("AnimalID", axis=1, inplace=True)
    shelter_train.drop("OutcomeSubtype", axis=1, inplace=True)

    time_train = pd.to_datetime(shelter_train["DateTime"])
    time_test = pd.to_datetime(shelter_test["DateTime"])

    shelter_train["Year"] = time_train.dt.year
    shelter_test["Year"] = time_test.dt.year
    shelter_train["Month"] = time_train.dt.month
    shelter_test["Month"] = time_test.dt.month
    shelter_test["Day"] = time_test.dt.day
    shelter_train["Day"] = time_train.dt.day
    shelter_test["Hour"] = time_test.dt.hour
    shelter_train["Hour"] = time_train.dt.hour
    shelter_test["Minute"] = time_test.dt.minute
    shelter_train["Minute"] = time_train.dt.minute


    shelter_train.drop("DateTime", axis=1, inplace=True)
    shelter_test.drop("DateTime", axis=1, inplace=True)

    shelter_train["SexuponOutcome"].fillna("Spayed Female", inplace=True)
    shelter_test["SexuponOutcome"].fillna("Spayed Female", inplace=True)

    def intact_group(sex):
        try:
            intact_type = sex.split()
        except:
            return 0
        if intact_type[0] == "Neutered" or intact_type[0] ==  "Spayed":		
            return 1
        elif intact_type[0] == "Intact":
            return 2
        else:
            return 0

    shelter_train["Virginity"] = shelter_train["SexuponOutcome"].apply(intact_group)
    shelter_test["Virginity"] = shelter_test["SexuponOutcome"].apply(intact_group)


    def sex_group(sexs):
        try:
            sex_type = sexs.split()
        except:
            return 0
        #categorize
        if sex_type[0] == "Unknown":
            return 0
        elif sex_type[1] == "Male":
            return 1
        elif sex_type[1] == "Female":
            return 2
        else:
            return 0

    shelter_train["Sex"] = shelter_train["SexuponOutcome"].apply(sex_group)
    shelter_test["Sex"] = shelter_test["SexuponOutcome"].apply(sex_group)

    shelter_train.drop("SexuponOutcome", axis=1, inplace=True)
    shelter_test.drop("SexuponOutcome", axis=1, inplace=True)


    def check_has_name(name):
        if type(name) is str:
            return 1
        else: 
            return 0

    shelter_train["has_name"] = shelter_train["Name"].apply(check_has_name)
    shelter_test["has_name"] = shelter_test["Name"].apply(check_has_name)

    shelter_train.drop("Name", axis=1, inplace=True)
    shelter_test.drop("Name", axis=1, inplace=True)


    shelter_train["AgeuponOutcome"].fillna("1 month", inplace=True)
    shelter_test["AgeuponOutcome"].fillna("1 month", inplace=True)
    

    def age_group(age):
        try:
            age_list = age.split() 
        except:
            return None
        ages = int(age_list[0])
        if(age_list[1].find("s")): 
            age_list[1] = age_list[1].replace("s","")
        if age_list[1] == "day":
            return ages
        elif (age_list[1] == "week"):
            return ages*7
        elif (age_list[1] == "month"):
            return ages*30
        elif (age_list[1] == "year"):
            return ages*365

 
    shelter_train["AgeuponOutcome"] = shelter_train["AgeuponOutcome"].apply(age_group)
    shelter_test["AgeuponOutcome"] = shelter_test["AgeuponOutcome"].apply(age_group)
 


    def hair_group(breed):
        if breed.find("Shorthair") != -1:
            return 0
        elif breed.find("Longhair") != -1:
            return 1
        else:
            return 2

    shelter_train["Hairgroup"] = shelter_train["Breed"].apply(hair_group)
    shelter_test["Hairgroup"] = shelter_test["Breed"].apply(hair_group)


    def aggressive(breed):
        if breed.find("Pit Bull") != -1:
            return 1
        elif breed.find("Rottweiler") != -1:
            return 2
        elif breed.find("Husky") != -1:
            return 3
        elif breed.find("Shepherd") != -1:
            return 4
        elif breed.find("Malamute") != -1:
            return 5
        elif breed.find("Doberman") != -1:
            return 6
        elif breed.find("Chow") != -1:
            return 7
        elif breed.find("Dane") != -1:
            return 8
        elif breed.find("Boxer") != -1:
            return 9
        elif breed.find("Akita") != -1:
            return 10
        else:
            return 11

    if (animal_type == "Dog"):
        shelter_train["Aggresiveness"] = shelter_train["Breed"].apply(aggressive)
        shelter_test["Aggresiveness"] = shelter_test["Breed"].apply(aggressive)


    def allergic(breed):
        if breed.find("Akita") != -1:
            return 1
        elif breed.find("Malamute") != -1:
            return 2
        elif breed.find("Eskimo") != -1:
            return 3
        elif breed.find("Corgi") != -1:
            return 4
        elif breed.find("Chow") != -1:
            return 5
        elif breed.find("Shepherd") != -1:
            return 6
        elif breed.find("Pyrenees") != -1:
            return 7
        elif breed.find("Labrador") != -1:
            return 8
        elif breed.find("Retriever") != -1:
            return 9
        elif breed.find("Husky") != -1:
            return 10
        else:
            return 11

    if (animal_type == "Dog"):
        shelter_train["Allergic"] = shelter_train["Breed"].apply(allergic)
        shelter_test["Allergic"] = shelter_test["Breed"].apply(allergic)


    def weight(breed):
        if breed.find("Pit Bull") != -1:
            return 1
        elif breed.find("Husky") != -1:
            return 1
        elif breed.find("Doberman") != -1:
            return 1
        elif breed.find("Boxer") != -1:
            return 1
        elif breed.find("Akita") != -1:
            return 1
        elif breed.find("Chow") != -1:
            return 1
        elif breed.find("Rottweiler") != -1:
            return 2
        elif breed.find("Shepherd") != -1:
            return 2
        elif breed.find("Malamute") != -1:
            return 2
        elif breed.find("Dane") != -1:
            return 2
        else:
            return 3

    if (animal_type == "Dog"):
        shelter_train["Weight"] = shelter_train["Breed"].apply(weight)
        shelter_test["Weight"] = shelter_test["Breed"].apply(weight)



    def breed_group(breed_input):
        breed = str(breed_input)
        if (' ' in breed) == False:
            br =  breed 
        else:
            breed_list = breed.split()
            try:
                br = breed_list[2] 
            except:
                br = breed_list[1] 
        if (br == "Mix"):
            return 0
        else:
            return 1
        return 1

    shelter_train["Breed"] = shelter_train["Breed"].apply(breed_group)
    shelter_test["Breed"] = shelter_test["Breed"].apply(breed_group)
  

    def color_group(color):
        try:
            color_type = color.split()
        except:
            return "unknown"
        return str(color_type[0])

    shelter_train["Color"] = shelter_train["Color"].apply(color_group)
    shelter_test["Color"] = shelter_test["Color"].apply(color_group)

    intval, label = pd.factorize(shelter_train["Color"], sort=True)
    shelter_train["Color"] = pd.DataFrame(intval)
    del intval, label
    intval, label = pd.factorize(shelter_test["Color"], sort=True)
    shelter_test["Color"] = pd.DataFrame(intval)
    del intval, label


    return shelter_train, shelter_test

dog_train, dog_test = pre_processing(dog_train, dog_test, "Dog")
cat_train, cat_test = pre_processing(cat_train, cat_test, "Cat")


In [4]:
from sklearn.model_selection import train_test_split
dog_X_train, dog_X_val, dog_y_train, dog_y_val = train_test_split(dog_train, dog_train_outcome, test_size=0.3)
cat_X_train, cat_X_val, cat_y_train, cat_y_val = train_test_split(cat_train, cat_train_outcome, test_size=0.3)



In [5]:

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, log_loss
from lightgbm import LGBMClassifier


classifiers = [
    LGBMClassifier()
]
print("DOG")
for classifier in classifiers:
    dog_log = classifier 
    dog_log.fit(dog_X_train, dog_y_train)

show_validation = True

if (show_validation == True):
    dog_y_probs = dog_log.predict_proba(dog_X_val)
    dog_y_pred = dog_log.predict(dog_X_val)
    print(type(classifier))
    print("accuracy_score:", accuracy_score(dog_y_val, dog_y_pred))
    print("log_loss:", log_loss(dog_y_val, dog_y_probs))
elif (show_validation == False):
    dog_y_probs = dog_log.predict_proba(dog_X_train)
    dog_y_pred = dog_log.predict(dog_X_train)
    print(type(classifier))
    print("accuracy_score:", accuracy_score(dog_y_train, dog_y_pred))
    print("log_loss:", log_loss(dog_y_train, dog_y_probs))

print("CAT")
for classifier in classifiers:
    cat_log = classifier
    cat_log.fit(cat_X_train, cat_y_train)

show_validation = True
# log knows how many classes are there idn y_train
if (show_validation == True):
    cat_y_probs = cat_log.predict_proba(cat_X_val)
    cat_y_pred = cat_log.predict(cat_X_val)
    print(type(classifier))
    print("accuracy_score:", accuracy_score(cat_y_val, cat_y_pred))
    print("log_loss:", log_loss(cat_y_val, cat_y_probs))
elif (show_validation == False):
    print(type(classifier))
    print("accuracy_score:", accuracy_score(cat_y_train, cat_y_pred))
    print("log_loss:", log_loss(cat_y_train, cat_y_probs))




DOG
<class 'lightgbm.sklearn.LGBMClassifier'>
accuracy_score: 0.592220559948707
log_loss: 0.9828002464506919
CAT
<class 'lightgbm.sklearn.LGBMClassifier'>
accuracy_score: 0.7994612391499551
log_loss: 0.5586455356985166


In [6]:

dog_log.fit(dog_train, dog_train_outcome)

dog_y_probs = dog_log.predict_proba(dog_test)
dog_test_result = np.append(dog_test_ID, dog_y_probs, axis=1)

In [7]:

cat_log.fit(cat_train, cat_train_outcome)

cat_y_probs = cat_log.predict_proba(cat_test)
cat_test_result = np.append(cat_test_ID, cat_y_probs, axis=1)



In [8]:
y_probs = np.append(dog_test_result, cat_test_result, axis=0)
y_probs = y_probs[y_probs[:,0].argsort()]
y_probs = y_probs[:,1:]
print(y_probs)

results = pd.read_csv("../input/sample_submission.csv")

results["Adoption"] = y_probs[:,0]
results["Died"] = y_probs[:,1]
results["Euthanasia"] = y_probs[:,2]
results["Return_to_owner"] = y_probs[:,3]
results["Transfer"] = y_probs[:,4]

results.to_csv("split_animal_lgb.csv",index = False)




[[1.47580156e-02 1.58578655e-04 2.80726340e-02 2.55386032e-01
  7.01624740e-01]
 [5.54500109e-01 2.29494511e-06 8.18982297e-03 3.12929878e-01
  1.24377896e-01]
 [6.37293170e-01 3.19854000e-04 8.20012426e-03 1.67567689e-01
  1.86619163e-01]
 ...
 [3.75273689e-04 1.09298285e-04 1.22221700e-03 5.88536277e-05
  9.98234357e-01]
 [3.71797463e-01 1.46932924e-04 3.10252072e-02 5.11562810e-01
  8.54675865e-02]
 [9.08807051e-02 1.92075852e-05 1.90797578e-01 5.17754070e-01
  2.00548439e-01]]
