In [None]:
from pandas import read_csv
# Load the raw dataset; the path is relative to the notebook's working directory.
data = read_csv("../input/horse.csv")

In [None]:
def get_feature_lists_by_dtype(data):
    """Group the column names of ``data`` by the string name of their dtype.

    Parameters
    ----------
    data : object exposing ``columns`` and ``data[name].dtype``
        Typically a pandas DataFrame.

    Returns
    -------
    dict
        Maps ``str(dtype)`` to the list of column names having that dtype,
        with names kept in column order.
    """
    columns_by_dtype = {}
    for column in data.columns:
        dtype_name = str(data[column].dtype)
        # Create the bucket on first sight of a dtype, then append to it.
        columns_by_dtype.setdefault(dtype_name, []).append(column)
    return columns_by_dtype

In [None]:
# Name of the label column; it is passed as `y` when models are scored below.
target = "outcome"

In [None]:
# Mapping of dtype name -> list of column names for the loaded frame.
dtype = get_feature_lists_by_dtype(data)

In [None]:
# Object-dtype feature columns, excluding the label column.
# A new list is built instead of calling `.remove()` on `dtype["object"]`:
# that left the shared list mutated, so re-running this cell raised
# ValueError because the label had already been removed.
categories = [column for column in dtype["object"] if column != target]

In [None]:
# Presumably the columns to be treated as count-like features.
# NOTE(review): `counts` is not referenced anywhere else in the visible notebook — confirm it is still needed.
counts = ["lesion_2", "lesion_3"]

In [None]:
# Numeric feature columns: every float64 column followed by every int64 column.
numerics = [*dtype["float64"], *dtype["int64"]]

----------


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from pandas import get_dummies, concat

In [None]:
# Model input: numeric columns as-is plus one-hot encoded categorical columns,
# joined column-wise, with remaining missing values replaced by 0.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it
# positionally in `concat`.
transformed = concat([data[numerics], get_dummies(data[categories])], axis=1).fillna(0)

In [None]:
# Baseline: mean score of a default XGBClassifier over 10 cross-validation folds.
# NOTE(review): `data[target]` is passed as-is; recent xgboost releases appear to
# require integer-encoded class labels — confirm against the installed version.
cross_val_score(XGBClassifier(), transformed, data[target],cv=10).mean()

## With default settings, XGBClassifier reaches a mean accuracy of 71.9% under 10-fold cross-validation (CV=10).

----------


In [None]:
def get_results(model, X, y):
    """Cross-validate ``model`` on ``X``/``y`` and summarise the fold scores.

    Runs ``cross_val_score`` with ``cv=10`` while every warning is suppressed.

    Parameters
    ----------
    model : estimator passed straight to ``cross_val_score``.
    X : feature data.
    y : label data.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-fold scores.
    """
    import warnings

    with warnings.catch_warnings():
        # Silence anything emitted while the folds are fitted and scored.
        warnings.simplefilter("ignore")
        from sklearn.model_selection import cross_val_score

        fold_scores = cross_val_score(model, X, y, cv=10)
        return fold_scores.mean(), fold_scores.std()

def display_classifier_results(X,y):
    """Cross-validate every estimator in the global ``models`` list and display a ranking.

    Each estimator is scored with ``get_results``; the table shown has one row
    per estimator class name with the columns "Mean Accuracy" and
    "Standard Deviation", rounded to two decimals and sorted by
    "Mean Accuracy" in descending order.

    Scoring is best effort: an estimator that raises is skipped, but a
    ``RuntimeWarning`` naming it is emitted instead of dropping it silently.
    Only ``Exception`` is caught, so interrupting the kernel still stops the loop.

    Parameters
    ----------
    X : feature data forwarded to ``get_results``.
    y : label data forwarded to ``get_results``.

    Returns
    -------
    None
        The table is rendered with ``IPython.display.display``.
    """
    import warnings

    from pandas import DataFrame
    from IPython.display import display

    output = {}

    for m in models:
        model_name = type(m).__name__
        try:
            mean, std = get_results(m, X, y)
        except Exception as error:
            warnings.warn(
                "{} skipped during cross-validation: {!r}".format(model_name, error),
                RuntimeWarning,
            )
            continue
        # Estimators sharing a class name overwrite each other, as before.
        output[model_name] = {"Mean Accuracy": mean, "Standard Deviation": std}

    if not output:
        # No estimator succeeded: show an empty table rather than failing on
        # the missing "Mean Accuracy" column in sort_values.
        display(DataFrame(columns=["Mean Accuracy", "Standard Deviation"]))
        return

    display(DataFrame(data=output).T.round(2).sort_values("Mean Accuracy", ascending=False))

In [None]:
# Candidate classifiers, each built with its default constructor arguments.
# The list is rebuilt from scratch on every run of this cell.
# NOTE(review): no estimator is given a random seed, so scores of the
# randomised ones may differ between runs — confirm whether that is acceptable.
from sklearn.neighbors import KNeighborsClassifier
models = [KNeighborsClassifier()]

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
models += [GaussianNB(), MultinomialNB(), BernoulliNB()]

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier#, VotingClassifier
models += [RandomForestClassifier(), AdaBoostClassifier(), GradientBoostingClassifier(), ExtraTreesClassifier()]

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
models += [LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis()]

from sklearn.svm import SVC, LinearSVC
models += [SVC(),LinearSVC()]

# Imported from the public `sklearn.neighbors` namespace: the private
# `sklearn.neighbors.nearest_centroid` module path was deprecated and later
# removed from scikit-learn.
from sklearn.neighbors import NearestCentroid
models += [NearestCentroid()]

from xgboost import XGBClassifier
models += [XGBClassifier()]

In [None]:
# Rank every estimator in `models` on the encoded features.
# When cells run top to bottom this calls the two-argument definition above.
display_classifier_results(transformed,data[target])

In [None]:
def get_score(actuals, predictions):
    """Compute the fraction of correct predictions for each label.

    ``actuals`` and ``predictions`` are paired by position, not by index
    label, so ``predictions`` may be a list, a NumPy array, or a pandas
    Series with any index.

    Parameters
    ----------
    actuals : pandas.Series
        True labels.
    predictions : sequence
        Predicted labels, in the same order as ``actuals``. Entries beyond
        ``len(actuals)`` are ignored.

    Returns
    -------
    dict
        One entry per distinct label in ``actuals`` (correct predictions for
        that label divided by its number of occurrences), plus
        "Mean Accuracy": the unweighted mean of those per-label fractions.

    Raises
    ------
    IndexError
        If ``predictions`` has fewer entries than ``actuals``.
    """
    # Local import so this function does not depend on an import cell that
    # appears later in the notebook.
    from pandas import Series

    # Occurrences of each label in the true labels.
    actuals_counts = actuals.value_counts()

    truth = actuals.tolist()
    # Materialise as a plain list so pairing is positional even when
    # `predictions` is a Series whose index does not start at 0.
    guesses = list(predictions)
    if len(guesses) < len(truth):
        raise IndexError(
            "predictions has {} entries but actuals has {}".format(len(guesses), len(truth))
        )

    # Correct-prediction counter per label, starting at 0 for every label seen.
    hits = {label: 0 for label in actuals.unique()}
    for expected, guessed in zip(truth, guesses):
        if expected == guessed:
            hits[expected] += 1

    # Correct count divided by how often the label occurs.
    score_dictionary = {label: hits[label] / actuals_counts[label] for label in hits}

    # Computed before the key is inserted, so it averages the labels only.
    score_dictionary["Mean Accuracy"] = Series(score_dictionary).mean()

    return score_dictionary

In [None]:
from pandas import DataFrame, Series
from IPython.display import display
from sklearn.model_selection import StratifiedKFold

def get_cross_validation_mean_score(full_data,category,model,folds):
    """Run stratified k-fold cross-validation and average the per-label scores.

    For each fold a fresh, unfitted copy of ``model`` is trained on the
    training rows and scored on the held-out rows with ``get_score``. The
    estimator passed in is never fitted by this function.

    Parameters
    ----------
    full_data : pandas.DataFrame
        Feature columns together with the label column.
    category : str
        Name of the label column in ``full_data``.
    model : estimator
        Template estimator exposing ``fit`` and ``predict``.
    folds : int
        Number of splits passed to ``StratifiedKFold``.

    Returns
    -------
    dict
        Mean over all folds of every key returned by ``get_score``
        (one per label plus "Mean Accuracy").
    """
    # Local import: `clone` is not among the notebook's existing imports.
    from sklearn.base import clone

    splitter = StratifiedKFold(n_splits=folds)

    # `columns=` is used instead of a positional axis, which pandas 2.0 no
    # longer accepts in `drop`.
    feature_data = full_data.drop(columns=category)
    label_data = full_data[category]

    scores = []

    for train_indices, test_indices in splitter.split(feature_data, label_data):

        # A plain assignment would alias the same object; clone gives a real
        # unfitted copy. safe=False falls back to a deep copy for objects
        # that are not scikit-learn estimators.
        fold_model = clone(model, safe=False)

        fold_model.fit(feature_data.iloc[train_indices], label_data.iloc[train_indices])

        predictions = fold_model.predict(feature_data.iloc[test_indices])

        scores.append(get_score(label_data.iloc[test_indices], predictions))

    # Column-wise mean over the per-fold score dictionaries.
    mean_score = DataFrame(scores).mean().to_dict()

    return mean_score

In [None]:
def display_classifier_results(full_data,category,models,folds):
    """Display per-label and mean cross-validation scores for each model.

    Note: this reuses the name of the two-argument
    ``display_classifier_results(X, y)`` defined earlier in the notebook and
    replaces it once this cell has run.

    Each model is scored with ``get_cross_validation_mean_score``; the table
    shown has one row per estimator class name, rounded to two decimals and
    sorted by "Mean Accuracy" in descending order.

    Scoring is best effort: a model that raises is skipped, but a
    ``RuntimeWarning`` naming it is emitted instead of dropping it silently.
    Only ``Exception`` is caught, so interrupting the kernel still stops the loop.

    Parameters
    ----------
    full_data : pandas.DataFrame
        Feature columns together with the label column.
    category : str
        Name of the label column.
    models : iterable
        Estimators to evaluate.
    folds : int
        Number of cross-validation folds.

    Returns
    -------
    None
        The table is rendered with ``IPython.display.display``.
    """
    import warnings

    from pandas import DataFrame
    from IPython.display import display

    output = {}

    for m in models:
        model_name = type(m).__name__
        try:
            row = get_cross_validation_mean_score(full_data,category,m,folds)
        except Exception as error:
            warnings.warn(
                "{} skipped during cross-validation: {!r}".format(model_name, error),
                RuntimeWarning,
            )
            continue
        output[model_name] = row

    if not output:
        # No model succeeded: show an empty table rather than failing on the
        # missing "Mean Accuracy" column in sort_values.
        display(DataFrame())
        return

    display(DataFrame(data=output).T.round(2).sort_values("Mean Accuracy", ascending=False))

In [None]:
# Encoded features plus the label column, combined in a new frame so that
# `transformed` itself is left unchanged.
full_data = transformed.assign(**{target: data[target]})

In [None]:
# Per-label and mean scores for every estimator in `models`, using 10 folds.
display_classifier_results(full_data, target, models, 10)