In [126]:
import ml_metrics
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


In [27]:
df_train = pd.read_csv("../input/train/train.csv")
df_test = pd.read_csv("../input/test/test.csv")
df_breeds = pd.read_csv("../input/breed_labels.csv")
df_colors = pd.read_csv("../input/color_labels.csv")

In [110]:
colors = df_colors['ColorID']
breeds = df_breeds['BreedID']

In [111]:
def apply_word_flags(df, words):
    for word in words:
        df[word] = 0
    for i, desc in df['Description'].items():
        try:
            for word in desc.split():
                if word in words:
                    df.at[i,word] = 1
        except AttributeError:
            continue
    df = df.drop(columns=['Description'])
    return df

In [None]:
keywords = ['home', 'good' , 'adopt', 'loving', 'give', 'looking', 'playful', 'rescued', 'cat', 'contact']

In [102]:
def apply_color_flags(df, colors):
    for c in colors:
        df[f'C{c}'] = 0
    for i,colors in df[['Color1', 'Color2', 'Color3']].iterrows():
        for c in colors:
            if c != 0:
                df.at[i,f'C{c}'] = 1
    df = df.drop(columns=['Color1', 'Color2', 'Color3'])
    return df

In [103]:
def apply_breed_flags(df, breeds):
    for b in breeds:
        df[f'B{b}'] = 0
    for i,breeds in df[['Breed1', 'Breed2']].iterrows():
        for b in breeds:
            if b != 0:
                df.at[i,f'B{b}'] = 1
    df = df.drop(columns=['Breed1', 'Breed2'])
    return df

In [128]:
X_train = apply_word_flags(df_train.copy(), keywords)
X_train = apply_color_flags(X_train, colors)
X_train = apply_breed_flags(X_train, breeds)
X_train = pd.get_dummies(X_train, columns=['Gender', 
                                           'Vaccinated', 'Dewormed', 'Sterilized', 
                                           'State'])
X_train_petID = X_train['PetID']
y_train_all = X_train['AdoptionSpeed']
X_train_all = X_train.drop(columns=['Name', 'RescuerID', 'PetID', 'AdoptionSpeed'])

In [138]:
X_train, X_test, y_train, y_test = train_test_split(X_train_all, y_train_all, test_size=0.20).copy()

In [139]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
prediction = rfc.predict(X_test)
rfc.score(X_test, y_test)

0.41313771257085696

In [140]:
ml_metrics.quadratic_weighted_kappa(rater_a=prediction, rater_b=y_test)

0.33427860592999925

In [141]:
feature_importance = rfc.feature_importances_
features = []
for i,feature in enumerate(feature_importance):
    features.append((feature, X_train.columns[i]))

In [143]:
sorted(features, reverse=True)

[(0.09727385108803624, 'PhotoAmt'),
 (0.09614502671879052, 'Age'),
 (0.033017342142982485, 'FurLength'),
 (0.03271812169236386, 'MaturitySize'),
 (0.03208768200099475, 'C1'),
 (0.03157235256165939, 'Quantity'),
 (0.031114359869362437, 'C7'),
 (0.02770077141526943, 'Fee'),
 (0.026884251467362487, 'C2'),
 (0.023790932539266462, 'home'),
 (0.0205990444835727, 'adopt'),
 (0.020310648926964093, 'good'),
 (0.020149481512637783, 'C5'),
 (0.020135035108620355, 'give'),
 (0.01852711319701429, 'Gender_2'),
 (0.018518968563122386, 'State_41326'),
 (0.018060932836491322, 'contact'),
 (0.01793823485444758, 'loving'),
 (0.0173970423982136, 'Gender_1'),
 (0.01728304952572498, 'looking'),
 (0.01691189268579405, 'rescued'),
 (0.01654125662690421, 'State_41401'),
 (0.016128474373286038, 'C3'),
 (0.015792112886220802, 'C6'),
 (0.015248322055972945, 'C4'),
 (0.015130176223073779, 'playful'),
 (0.013182603762026757, 'B307'),
 (0.012913919521614485, 'cat'),
 (0.012403449318261628, 'B266'),
 (0.0123991930891

In [130]:
# X_test = apply_word_flags(df_test.copy(), keywords)
# X_test = apply_color_flags(X_test, colors)
# X_test = apply_breed_flags(X_test, breeds)
# X_test = pd.get_dummies(X_test, columns=['Gender', 
#                                          'Vaccinated', 'Dewormed', 'Sterilized', 
#                                          'State'])
# X_test_petID = X_test['PetID']
# X_test_all = X_test.drop(columns=['Name', 'RescuerID', 'PetID'])