In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing as pre
from sklearn.metrics import log_loss
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [24]:
#Clean-up some of the features
all_train = pd.read_csv('data/train.csv')
all_train = all_train.reindex(np.random.permutation(all_train.index))
all_train['Breed'] = all_train['Breed'].str.replace('Black/Tan', 'BlackTan')
all_train['Breed'] = all_train['Breed'].str.replace('/Unknown', ' Unknown')
all_train['Breed'] = all_train['Breed'].str.replace('St. Bernard Rough Coat', 'StBernard')
all_train['Breed'] = all_train['Breed'].str.replace('St. Bernard Smooth Coat', 'StBernard')
all_train['Breed'] = all_train['Breed'].str.replace('German Shorthair Pointer', 'Pointer')
all_train['Breed'] = all_train['Breed'].str.replace('German Wirehaired Pointer', 'Pointer')
all_train['Breed'] = all_train['Breed'].str.replace('Dachshund Longhair', 'Dachshund')
all_train['Breed'] = all_train['Breed'].str.replace('Dachshund Wirehair', 'Dachshund')
all_train['Breed'] = all_train['Breed'].str.replace('English Pointer', 'Pointer')
all_train['Breed'] = all_train['Breed'].str.replace('Chihuahua Shorthair', 'Chihuahua')
all_train['Breed'] = all_train['Breed'].str.replace('Chihuahua Longhair', 'Chihuahua')
all_train['Breed'] = all_train['Breed'].str.replace('Alaskan Husky', 'Husky')
all_train['Breed'] = all_train['Breed'].str.replace('Siberian Husky', 'Husky')

all_train['Breed'] = all_train['Breed'].str.replace(' ', '')
all_train['Breed'] = all_train['Breed'].str.replace('PitBull', ' PitBull ')
all_train['Breed'] = all_train['Breed'].str.replace('Rottweiler', ' Rottweiler ')
all_train['Breed'] = all_train['Breed'].str.replace('/', ' known ')
all_train['Breed'] = all_train['Breed'].str.replace('Mix', ' Mix ')
#all_train['Breed'] = all_train['Breed'].str.replace('Mix', '')
all_train['Breed'] = all_train['Breed'].str.replace('Unknown', '')
all_train['Breed'] = all_train['Breed'].str.replace('Australian', '')
all_train['Breed'] = all_train['Breed'].str.replace('American', '')
all_train['Breed'] = all_train['Breed'].str.replace('German', '')
all_train['Breed'] = all_train['Breed'].str.replace('YorkshireTerrier', ' Yorkshire ')

In [47]:
##Breed feature has many different combinations this function breaks it down to extract the most significant features
##and renames non-significant ones to "other"

def best_feat_breed(all_train):
    best_0 = pd.DataFrame()
    best_1 = pd.DataFrame()
    best_2 = pd.DataFrame()
    best_3 = pd.DataFrame()
    best_4 = pd.DataFrame()

    for trial in range(100):
        ##Randomize 100 times 
        all_train = all_train.reindex(np.random.permutation(all_train.index))

        split = all_train.shape[0] // 20
        dev = all_train[:split]
        train = all_train[split:]
        
        ##Use count vectorizer
        cv = CountVectorizer()
        train_corpus = cv.fit_transform(train.Breed)
        dev_corpus = cv.transform(dev.Breed)
        features = np.array(cv.get_feature_names())
        
        ##Apply logistic regression to measure determine best features
        clf = LogisticRegression(penalty='l1', C=0.5)
        clf.fit(train_corpus, train.OutcomeType)
        preds = clf.predict_proba(dev_corpus)
        
        ##Store the best features across all outcome types
        for a in range(5):
            if a == 0:
                best_0 = best_0.append(pd.DataFrame(features[[np.argsort(clf.coef_[a])[-20:]]]))
            elif a == 1:
                best_1 = best_1.append(pd.DataFrame(features[[np.argsort(clf.coef_[a])[-20:]]]))
            elif a == 2:
                best_2 = best_2.append(pd.DataFrame(features[[np.argsort(clf.coef_[a])[-20:]]]))
            elif a == 3:
                best_3 = best_3.append(pd.DataFrame(features[[np.argsort(clf.coef_[a])[-20:]]]))
            elif a == 4:
                best_4 = best_4.append(pd.DataFrame(features[[np.argsort(clf.coef_[a])[-20:]]]))

    best_0.columns = ['feature']
    best_1.columns = ['feature']
    best_2.columns = ['feature']
    best_3.columns = ['feature']
    best_4.columns = ['feature']
    
    ##Combine and get unique set of best features from the 100 trials which appeared at least 80% of the time
    best = list(best_0.groupby('feature').size()[best_0.groupby('feature').size() > 80].index)
    best.extend(list(best_1.groupby('feature').size()[best_1.groupby('feature').size() > 80].index))
    best.extend(list(best_2.groupby('feature').size()[best_2.groupby('feature').size() > 80].index))
    best.extend(list(best_3.groupby('feature').size()[best_3.groupby('feature').size() > 80].index))
    best.extend(list(best_4.groupby('feature').size()[best_4.groupby('feature').size() > 80].index))
    best = list(set(best))
    
    ##Replace non-significant words with the word other
    cv = CountVectorizer()
    all_train_corpus = cv.fit_transform(all_train.Breed)
    all_corpus = np.array(cv.get_feature_names())
    
    other = list(set(all_corpus) - set(best))

    all_train['Breed'] = [x.lower() for x in all_train['Breed']]

    for x in other:
        all_train['Breed'] = all_train['Breed'].str.replace(x, ' other ')
        
    return(all_train)

In [26]:
##Reduce DateTime feature to month and day of week

def calendar_date(all_train):

    dayOfWeek={0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
    dayOfWeek_alt={0:'Monday', 1:'Tues-Fri', 2:'Tues-Fri', 3:'Tues-Fri', 4:'Tues-Fri', 5:'Sat-Sun', 6:'Sat-Sun'}
    MonthName={1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep',10:'Oct',11:'Nov',12:'Dec'} 
    MonthName_alt={1:'Jan-Feb', 2:'Jan-Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul',
                   8:'Aug', 9:'Sep-Nov',10:'Sep-Nov',11:'Sep-Nov',12:'Dec'}

    all_train['DateTime'] = pd.to_datetime(all_train['DateTime'])
    all_train['weekday'] = all_train['DateTime'].dt.dayofweek.map(dayOfWeek_alt)
    all_train['month'] = all_train['DateTime'].dt.month.map(MonthName_alt)
    
    return (all_train)

In [27]:
##Combine features and return combined numpy array which will be used as inputs into sklearn models

def combine_array(dev, train):
    ##Convert calendar date features to dummy variables
    day_of_week_dev = np.array(pd.get_dummies(dev[['weekday','month']]))
    day_of_week_train = np.array(pd.get_dummies(train[['weekday','month']]))

    ##Determine common vocab set from dev and test
    vectorizer = CountVectorizer(min_df=1)
    all_vocab = vectorizer.fit_transform(pd.concat([train.Breed, dev.Breed]))
    all_vocab = vectorizer.get_feature_names()
    dev_vocab = vectorizer.fit_transform(dev.Breed)
    dev_vocab = vectorizer.get_feature_names()
    common = set(all_vocab).intersection(dev_vocab)
    
    ##Convert dev and train corpus to dense from sparse matrix
    cv = CountVectorizer(vocabulary=common)
    train_corpus = cv.fit_transform(train.Breed).todense()
    dev_corpus = cv.transform(dev.Breed).todense()
    
    ##Finally combine
    dev_combine = np.concatenate((dev_corpus, day_of_week_dev),axis=1)
    train_combine = np.concatenate((train_corpus, day_of_week_train),axis=1)
    
    return dev_combine, train_combine

In [48]:
##Separate to cats and dogs
all_train_cat = all_train[(all_train['AnimalType']=='Cat')]
all_train_dog = all_train[(all_train['AnimalType']=='Dog')]

all_train_cat = best_feat_breed(all_train_cat)
all_train_cat = calendar_date(all_train_cat)
all_train_dog = best_feat_breed(all_train_dog)
all_train_dog = calendar_date(all_train_dog)

split_cat = all_train_cat.shape[0] // 20
split_dog = all_train_dog.shape[0] // 20

dev_cat = all_train_cat[:split_cat]
train_cat = all_train_cat[split_cat:]

dev_dog = all_train_cat[:split_dog]
train_dog = all_train_cat[split_dog:]

In [49]:
dev_dog_combine, train_dog_combine = combine_array(dev_dog, train_dog)
dev_cat_combine, train_cat_combine = combine_array(dev_cat, train_cat)

In [50]:
def Logreg(train_combine, dev_combine, train, dev):
    clf = LogisticRegression(penalty='l1', C=0.1)
    clf.fit(train_combine, train.OutcomeType)
    preds = clf.predict_proba(dev_combine)
    print 'Logistc Regression Log Loss: {:.3f}'.format(log_loss(dev.OutcomeType, preds))
    print 'Logistc Regression Accuracy: {:.3f}'.format(clf.score(dev_combine, dev.OutcomeType)), '\n'

Logreg(train_dog_combine, dev_dog_combine, train_dog, dev_dog)
Logreg(train_cat_combine, dev_cat_combine, train_cat, dev_cat)

Logistc Regression Log Loss: 0.996
Logistc Regression Accuracy: 0.555 

Logistc Regression Log Loss: 0.990
Logistc Regression Accuracy: 0.572 



In [51]:
def rf(train_combine, dev_combine, train, dev):
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(train_combine, train.OutcomeType)

    print 'Accuracy (a random forest):', rfc.score(dev_combine, dev.OutcomeType)

rf(train_dog_combine, dev_dog_combine, train_dog, dev_dog)
rf(train_cat_combine, dev_cat_combine, train_cat, dev_cat)

Accuracy (a random forest): 0.553273427471
Accuracy (a random forest): 0.561151079137
