In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

#the max number of rows in data, 0 = no cap
LIMIT_ROWS = 5000

# Loads the data from the various csv files
def loadData():
    breed_labels = pd.read_csv("breed_labels.csv")
    color_labels = pd.read_csv("color_labels.csv")
    state_labels = pd.read_csv("state_labels.csv")
    testData = pd.read_csv("test/test.csv")
    trainData = pd.read_csv("train/train.csv")
    return breed_labels, color_labels, state_labels, testData, trainData

# Prepare data to be used for one hot encoding, replace id's with the names
def oneHotEncoding(breed_labels, color_labels, state_labels, trainData):
    breedIdList = breed_labels.loc[:, "BreedID"].tolist()
    breedNameList = breed_labels.loc[:, "BreedName"].tolist()
    colorIdList = color_labels.loc[:, "ColorID"].tolist()
    colorNameList = color_labels.loc[:, "ColorName"].tolist()
    stateIdList = state_labels.loc[:, "StateID"].tolist()
    stateNameList = state_labels.loc[:, "StateName"].tolist()
    features = trainData[["Type", "Age", "Breed1", "Breed2", "Gender", "Color1", "Color2", "Color3", "MaturitySize", "FurLength", "Vaccinated", "Dewormed", "Sterilized", "Health", "Quantity", "Fee", "State", "VideoAmt", "PhotoAmt", "AdoptionSpeed"]]
    features.Type.replace([1, 2], ['dog', 'cat'], inplace=True)
    features.Gender.replace([1, 2, 3], ['Male', 'Female', 'Mixed'], inplace=True)
    features.MaturitySize.replace([1, 2, 3, 4, 0], ['Small', 'Medium', 'Large', 'Extra Large', 'Not Specified'], inplace=True)
    features.FurLength.replace([1, 2, 3, 0], ['Short', 'Medium', 'Long', 'Not Specified'], inplace=True)
    features.Vaccinated.replace([1, 2, 3], ['Yes', 'No', 'Not Sure'])
    features.Dewormed.replace([1, 2, 3], ['Yes', 'No', 'Not Sure'])
    features.Sterilized.replace([1, 2, 3], ['Yes', 'No', 'Not Sure'])
    features.Health.replace([1, 2, 3, 0], ['Healthy', 'Minor Injury', 'Serious Injury', 'Not Specified'])
    features.Breed1.replace(breedIdList, breedNameList, inplace=True)
    features.Breed2.replace(breedIdList, breedNameList, inplace=True)
    features.Color1.replace(colorIdList, colorNameList, inplace=True)
    features.Color2.replace(colorIdList, colorNameList, inplace=True)
    features.Color3.replace(colorIdList, colorNameList, inplace=True)
    features.State.replace(stateIdList, stateNameList, inplace=True)
    features = pd.get_dummies(features)
    if LIMIT_ROWS > 0:
        features = features.head(LIMIT_ROWS)
    return features
    
def createFeaturesLabels(features):
    labels = np.array(features['AdoptionSpeed'])
    features = features.drop('AdoptionSpeed', axis = 1)
    feature_list = list(features.columns)
    return features, labels

def trainAndPredict(features, labels):
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)
    print('Training Features Shape:', train_features.shape)
    print('Training Labels Shape:', train_labels.shape)
    print('Testing Features Shape:', test_features.shape)
    print('Testing Labels Shape:', test_labels.shape)

    rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
    rf.fit(train_features, train_labels)
    feature_importances = pd.DataFrame(rf.feature_importances_, index = features.columns, columns=['importance']).sort_values('importance', ascending=False)
    print(feature_importances)
    predictions = rf.predict(test_features)
    errors = abs((predictions + 1) - (test_labels + 1))
    print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
    mape = 100 * (errors / (test_labels + 1))
    accuracy = 100 - np.mean(mape)
    print('Accuracy:', round(accuracy, 2), '%.')
    
def main():
    breed_labels, color_labels, state_labels, testData, trainData = loadData()
    features = oneHotEncoding(breed_labels, color_labels, state_labels, trainData)
    features, labels = createFeaturesLabels(features)
    trainAndPredict(features, labels)
    
main()


  from numpy.core.umath_tests import inner1d
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Training Features Shape: (3750, 366)
Training Labels Shape: (3750,)
Testing Features Shape: (1250, 366)
Testing Labels Shape: (1250,)
                                           importance
Age                                          0.103855
PhotoAmt                                     0.103775
Quantity                                     0.034888
Dewormed                                     0.034471
Sterilized                                   0.032809
Vaccinated                                   0.032215
Fee                                          0.028809
Color1_Black                                 0.022173
State_Selangor                               0.020508
Breed2_0                                     0.020474
Color1_Brown                                 0.020039
Color2_0                                     0.019849
FurLength_Short                              0.019773
FurLength_Medium                             0.019259
Gender_Female                                0.019010
Co