In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")


In [3]:
train_df.head(3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White


In [4]:
test_df.head(3)

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby


In [5]:
#Separating target variable
train_outcome = train_df["OutcomeType"]
train_df.drop(["OutcomeType"], axis=1, inplace=True)

In [6]:
# Delete DateTime (for now)
train_df.drop(["DateTime"], axis=1, inplace=True)
test_df.drop(["DateTime"], axis=1, inplace=True)

In [7]:
#Separating IDs
train_id = train_df[["AnimalID"]]
test_id = test_df[["ID"]]
train_df.drop(["AnimalID"], axis=1, inplace=True)
test_df.drop(["ID"], axis=1, inplace=True)

In [8]:
#Name and OutcomeSubtype are deleted
train_df.drop(["Name", "OutcomeSubtype"], axis=1, inplace=True)
test_df.drop(["Name"], axis=1, inplace=True)

In [9]:
train_df.shape

(26729, 5)

In [10]:
conjunto = pd.concat([train_df, test_df])

In [17]:
print(train_df.shape)
print(test_df.shape)
print(conjunto.shape)
#print(conjunto.head(3))

(26729, 5)
(11456, 5)
(38185, 5)


In [12]:
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html

#Encode the categorical data of conjunto, columns by columns due memory restrictions
AnimalType_encoded = pd.get_dummies(conjunto['AnimalType'], columns='AnimalType')
SexuponOutcome_encoded = pd.get_dummies(conjunto['SexuponOutcome'], columns='SexuponOutcome')
AgeuponOutcome_encoded = pd.get_dummies(conjunto['AgeuponOutcome'], columns='AgeuponOutcome')
Breed_encoded = pd.get_dummies(conjunto['Breed'], columns='Breed')
Color_encoded = pd.get_dummies(conjunto['Color'], columns='Color')

In [13]:
# http://pandas.pydata.org/pandas-docs/stable/merging.html

conjunto_encoded = pd.concat([AnimalType_encoded, SexuponOutcome_encoded, AgeuponOutcome_encoded,
                 Breed_encoded, Color_encoded], axis=1)

In [14]:
#Split again for train and test
train = conjunto_encoded[:train_df.shape[0]]

In [15]:
test = conjunto_encoded[train_df.shape[0]:]

In [21]:
print(train.shape)
print(test.shape)

(26729, 2141)
(11456, 2141)


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

In [32]:
#Training a RF to get some metrics
X_train, X_val, y_train, y_val = train_test_split(train, train_outcome, test_size=0.3)
forest = RandomForestClassifier(n_estimators=250, n_jobs=2)
forest.fit(X_train, y_train)
y_pred_val = forest.predict(X_val)

In [33]:
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(y_val, y_pred_val))
print(accuracy_score(y_val, y_pred_val))

             precision    recall  f1-score   support

   Adoption       0.63      0.79      0.70      3152
       Died       0.33      0.03      0.06        61
 Euthanasia       0.22      0.08      0.11       484
Return_to_owner       0.42      0.37      0.39      1456
   Transfer       0.69      0.63      0.66      2866

avg / total       0.59      0.61      0.59      8019

0.607307644345


In [22]:
test.head()

Unnamed: 0,Cat,Dog,Intact Female,Intact Male,Neutered Male,Spayed Female,Unknown,0 years,1 day,1 month,...,Yellow,Yellow Brindle,Yellow Brindle/Blue,Yellow Brindle/Tan,Yellow Brindle/White,Yellow/Black,Yellow/Gray,Yellow/Tan,Yellow/White,Yellow/Yellow
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
#Training a RF with the complete training set
forest = RandomForestClassifier(n_estimators=500, n_jobs=2)
forest.fit(train, train_outcome)
y_pred = forest.predict_proba(test)

In [24]:
results = pd.read_csv("submissions/sample_submission.csv")

In [25]:
results

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,1,0,0,0,0
1,2,1,0,0,0,0
2,3,1,0,0,0,0
3,4,1,0,0,0,0
4,5,1,0,0,0,0
5,6,1,0,0,0,0
6,7,1,0,0,0,0
7,8,1,0,0,0,0
8,9,1,0,0,0,0
9,10,1,0,0,0,0


In [26]:
results['Adoption'], results['Died'], results['Euthanasia'], results['Return_to_owner'], results['Transfer'] = y_pred[:,0], y_pred[:,1], y_pred[:,2], y_pred[:,3], y_pred[:,4]
results.to_csv("submissions/submission_RuiMendes.csv", index=False)