# Machine Learning
---

## 1. What will happen to a dog in the shelter?

In [46]:
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score


import warnings
warnings.filterwarnings('ignore')

In [47]:
# Read the shelter CSV and index the rows by the animal_id column.
unique_df = pd.read_csv("../data/unique_austin_shelter.csv").set_index("animal_id")

In [48]:
# Candidate feature subsets compared during feature selection.
# `features` is the full set; every numbered variant is the full set minus the
# columns named in its exclusion set (original column order is preserved).
features = [
    'is_mixed', 'intake_condition', 'intake_type', 'sex', 'fixed',
    'time_in_shelter', 'age_in',
]
features1 = [col for col in features if col not in {'is_mixed'}]
features2 = [col for col in features if col not in {'intake_condition', 'intake_type'}]
features3 = [col for col in features if col not in {'is_mixed', 'intake_type', 'fixed'}]
features4 = [col for col in features if col not in {'intake_condition', 'time_in_shelter'}]
features5 = [col for col in features
             if col not in {'intake_condition', 'intake_type', 'time_in_shelter'}]

In [96]:
# Keep only the rows whose in_shelter flag is "No", restricted to the modelling
# columns plus the outcome label. `.copy()` makes this an independent frame:
# later cells overwrite time_in_shelter and drop rows on it, and without the copy
# those edits would be chained assignments on a slice of unique_df.
new_unique_df = unique_df.loc[
    unique_df.in_shelter == "No",
    ['is_mixed', 'intake_condition', 'intake_type', 'name', 'sex', 'fixed',
     'time_in_shelter', 'has_name', 'age_in', 'outcome_type'],
].copy()

In [97]:
# Convert time_in_shelter to elapsed minutes in a single vectorised pass instead
# of two row-wise apply() calls. The arithmetic is unchanged: whole days plus the
# remaining whole seconds, divided by 60 (sub-second precision is ignored).
stay = pd.to_timedelta(new_unique_df.time_in_shelter)
new_unique_df["time_in_shelter"] = (stay.dt.days * 24 * 60 * 60 + stay.dt.seconds) / 60
new_unique_df.time_in_shelter.dtype


dtype('float64')

In [98]:
# Drop every row that has a missing value in any selected column.
# NOTE(review): `name` is one of the selected columns, so if unnamed dogs have a
# missing name they are removed here and has_name becomes constant — confirm
# against the data.
new_unique_df = new_unique_df.dropna()

In [99]:
new_unique_df.head()

Unnamed: 0_level_0,is_mixed,intake_condition,intake_type,name,sex,fixed,time_in_shelter,has_name,age_in,outcome_type
animal_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A786884,1,Normal,Stray,Brock,Male,Yes,7132.0,1,2.0,Transfer
A706918,0,Normal,Stray,Belle,Female,Yes,134.0,1,8.005479,Return to Owner
A724273,1,Normal,Stray,Runster,Male,No,9994.0,1,0.994521,Return to Owner
A778404,1,Normal,Owner Surrender,Max,Male,No,4784.0,1,4.00274,Adoption
A682524,1,Normal,Stray,Rio,Male,Yes,4538.0,1,4.00274,Return to Owner


## Defining Training Data

## Model: KNeighborsClassifier 
### Feature Selection

In [28]:
# Baseline KNN pipeline: dict records -> dense one-hot matrix -> standardise -> 30-NN.
vec = DictVectorizer(sparse=False)
scaler = StandardScaler()
model = KNeighborsClassifier(n_neighbors=30, n_jobs=-1)

pipeline = Pipeline(steps=[("vectorizer", vec), ("scaler", scaler), ("model", model)])

# Score each candidate feature subset with 5-fold cross-validated accuracy.
y_train = new_unique_df['outcome_type']
for feat in (features, features1, features2, features3, features4, features5):
    x_train_dict = new_unique_df[feat].to_dict(orient="records")
    fold_scores = cross_val_score(pipeline, x_train_dict, y_train, cv=5, scoring="accuracy")
    print(feat, fold_scores.mean())

['is_mixed', 'intake_condition', 'intake_type', 'sex', 'fixed', 'time_in_shelter', 'age_in'] 0.6971530504115181
['intake_condition', 'intake_type', 'sex', 'fixed', 'time_in_shelter', 'age_in'] 0.70163558915865
['is_mixed', 'sex', 'fixed', 'time_in_shelter', 'age_in'] 0.6404590170368781
['intake_condition', 'sex', 'time_in_shelter', 'age_in'] 0.647650275690558
['is_mixed', 'intake_type', 'sex', 'fixed', 'age_in'] 0.5567718064738866
['is_mixed', 'sex', 'fixed', 'age_in'] 0.5119780145296502


### Scaler testing

In [36]:
# features1 had the highest accuracy in the KNN feature selection, so it is used here.
x_train_dict = new_unique_df.loc[:, features1].to_dict("records")
y_train = new_unique_df.loc[:, 'outcome_type']

In [37]:
# One default-configured instance of each scaler to compare.
scalers = [scaler_cls() for scaler_cls in
           (StandardScaler, Normalizer, MinMaxScaler, MaxAbsScaler, RobustScaler)]

In [38]:
# Swap the pipeline's "scaler" step for each candidate and compare with 5-fold CV.
grid_kn = GridSearchCV(
    estimator=pipeline,
    param_grid={"scaler": scalers},
    cv=5,
    n_jobs=-1,
    verbose=5,
)
grid_kn.fit(x_train_dict, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   39.7s
[Parallel(n_jobs=-1)]: Done  16 out of  25 | elapsed:  1.7min remaining:   58.8s
[Parallel(n_jobs=-1)]: Done  22 out of  25 | elapsed:  2.0min remaining:   16.5s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  2.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=30, p=2,
           weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True), Normalizer(copy=True, norm='l2'), MinMaxScaler(copy=True, feature_range=(0, 1)), MaxAbsScaler(copy=True), RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [39]:
# Best mean CV score, then the steps of the winning pipeline.
print(grid_kn.best_score_, grid_kn.best_estimator_.steps, sep="\n")

0.7065734832371113
[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('scaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('model', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=30, p=2,
           weights='uniform'))]


In [40]:
pd.DataFrame(grid_kn.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,5.052805,2.826535,7.965209,3.085577,"StandardScaler(copy=True, with_mean=True, with...","{'scaler': StandardScaler(copy=True, with_mean...",0.69919,0.706582,0.702925,0.702545,...,0.701636,0.003318,2,0.71529,0.713788,0.716094,0.715525,0.717116,0.715563,0.001088
1,2.966998,0.564824,5.674741,0.882539,"Normalizer(copy=True, norm='l2')","{'scaler': Normalizer(copy=True, norm='l2')}",0.69881,0.707215,0.69938,0.699253,...,0.700344,0.003535,3,0.713928,0.711731,0.715145,0.71448,0.716388,0.714334,0.001539
2,3.812916,1.190793,4.673145,1.449552,"MinMaxScaler(copy=True, feature_range=(0, 1))","{'scaler': MinMaxScaler(copy=True, feature_ran...",0.696786,0.699873,0.695327,0.698493,...,0.696647,0.002479,4,0.711301,0.709673,0.711568,0.711537,0.712654,0.711347,0.000959
3,5.145111,0.846617,7.430025,0.390906,MaxAbsScaler(copy=True),{'scaler': MaxAbsScaler(copy=True)},0.696786,0.699747,0.694821,0.69862,...,0.696495,0.002604,5,0.711206,0.709452,0.711505,0.711537,0.71259,0.711258,0.001018
4,2.040941,0.65437,3.469866,1.585345,"RobustScaler(copy=True, quantile_range=(25.0, ...","{'scaler': RobustScaler(copy=True, quantile_ra...",0.703493,0.711013,0.706597,0.706091,...,0.706573,0.00246,1,0.71874,0.71784,0.719639,0.72119,0.721389,0.71976,0.001374


RobustScaler gives the best mean cross-validated accuracy (0.7066), so it is used for the remaining KNN experiments.

### K value testing

In [43]:
# Build the scaled design matrix once, outside any pipeline, for the k search.
# Note: the scaler is fitted on all rows before cross-validation splits them.
y_train = new_unique_df['outcome_type']
x_train_dict = new_unique_df[features1].to_dict(orient="records")

vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train_dict)

scaler = RobustScaler()
x_train_sc = scaler.fit_transform(x_train)

model = KNeighborsClassifier(n_jobs=-1)

In [105]:
# Neighbour counts to try: 5, then 10 through 50 in steps of 10.
ks = [5, *range(10, 51, 10)]

In [45]:
# 5-fold search over n_neighbors on the pre-scaled matrix.
grid_kn_k = GridSearchCV(estimator=model, param_grid={"n_neighbors": ks}, cv=5)
grid_kn_k.fit(x_train_sc, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [5, 10, 20, 30, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [46]:
# Best mean CV score, then the neighbour count that achieved it.
print(grid_kn_k.best_score_, grid_kn_k.best_estimator_.n_neighbors, sep="\n")

0.7071558796718322
40


In [139]:
# Mean CV accuracy as a function of k. Requires grid_kn_k from the search cell
# above, so that cell must have been run in the current kernel session.
k_scores = pd.DataFrame(grid_kn_k.cv_results_).set_index("param_n_neighbors")['mean_test_score']
ax = k_scores.plot.line()
# Label the figure so it can be read on its own.
ax.set(xlabel="n_neighbors", ylabel="mean CV accuracy", title="KNN accuracy vs. k");

NameError: name 'grid_kn_k' is not defined

### Model Results

accuracy = 0.7071558796718322

k = 40

scaler = RobustScaler()

features = features1

model = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=40, p=2,
           weights='uniform')

## Model: RandomForestClassifier

In [54]:
# Baseline random-forest pipeline: dict records -> dense one-hot matrix -> standardise -> forest.
vec = DictVectorizer(sparse=False)
scaler = StandardScaler()
model = RandomForestClassifier(n_jobs=-1)

pipeline = Pipeline(steps=[("vectorizer", vec), ("scaler", scaler), ("model", model)])

# Score each candidate feature subset with 5-fold cross-validated accuracy.
y_train = new_unique_df['outcome_type']
for feat in (features, features1, features2, features3, features4, features5):
    x_train_dict = new_unique_df[feat].to_dict(orient="records")
    fold_scores = cross_val_score(pipeline, x_train_dict, y_train, cv=5, scoring="accuracy")
    print(feat, fold_scores.mean())

['is_mixed', 'intake_condition', 'intake_type', 'sex', 'fixed', 'time_in_shelter', 'age_in'] 0.6530440308125804
['intake_condition', 'intake_type', 'sex', 'fixed', 'time_in_shelter', 'age_in'] 0.6509176501850181
['is_mixed', 'sex', 'fixed', 'time_in_shelter', 'age_in'] 0.5705971276166315
['intake_condition', 'sex', 'time_in_shelter', 'age_in'] 0.5759396983858641
['is_mixed', 'intake_type', 'sex', 'fixed', 'age_in'] 0.5650260612953572
['is_mixed', 'sex', 'fixed', 'age_in'] 0.5228661492148154


### Scaler Testing

In [55]:
# The full `features` set had the highest accuracy in the random-forest feature selection.
x_train_dict = new_unique_df.loc[:, features].to_dict("records")
y_train = new_unique_df.loc[:, 'outcome_type']

In [56]:
# One default-configured instance of each scaler to compare.
scalers = [scaler_cls() for scaler_cls in
           (StandardScaler, Normalizer, MinMaxScaler, MaxAbsScaler, RobustScaler)]

In [57]:
# Swap the pipeline's "scaler" step for each candidate and compare with 5-fold CV.
grid_rf = GridSearchCV(
    estimator=pipeline,
    param_grid={"scaler": scalers},
    cv=5,
    n_jobs=-1,
    verbose=5,
)
grid_rf.fit(x_train_dict, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  16 out of  25 | elapsed:   16.6s remaining:    9.3s
[Parallel(n_jobs=-1)]: Done  22 out of  25 | elapsed:   22.3s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   25.1s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True), Normalizer(copy=True, norm='l2'), MinMaxScaler(copy=True, feature_range=(0, 1)), MaxAbsScaler(copy=True), RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [75]:
# Best mean CV score, then the steps of the winning pipeline.
# NOTE(review): the recorded output under this cell shows a bare tuned
# RandomForestClassifier rather than a list of pipeline steps, so it appears to
# come from a different statement — re-run to refresh.
print(grid_rf.best_score_, grid_rf.best_estimator_.steps, sep="\n")

0.7210574293527803
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=90, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=15,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=10, verbose=0, warm_start=False)


In [59]:
pd.DataFrame(grid_rf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.463739,0.049157,0.153403,0.002868,"StandardScaler(copy=True, with_mean=True, with...","{'scaler': StandardScaler(copy=True, with_mean...",0.644394,0.658228,0.659111,0.648854,...,0.652664,0.005576,4,0.952137,0.952994,0.952018,0.950055,0.951766,0.951794,0.000963
1,0.461668,0.017738,0.153075,0.003597,"Normalizer(copy=True, norm='l2')","{'scaler': Normalizer(copy=True, norm='l2')}",0.678562,0.69038,0.68659,0.683804,...,0.684468,0.003923,1,0.907249,0.906274,0.907169,0.904763,0.906254,0.906342,0.000896
2,0.44443,0.044151,0.157617,0.008575,"MinMaxScaler(copy=True, feature_range=(0, 1))","{'scaler': MinMaxScaler(copy=True, feature_ran...",0.646925,0.659873,0.659238,0.651387,...,0.653474,0.005174,3,0.952168,0.950842,0.952366,0.950404,0.95145,0.951446,0.000751
3,0.438518,0.040227,0.150395,0.002422,MaxAbsScaler(copy=True),{'scaler': MaxAbsScaler(copy=True)},0.643888,0.660253,0.659491,0.649994,...,0.653575,0.006112,2,0.953973,0.951602,0.953315,0.951828,0.951734,0.95249,0.000967
4,0.391922,0.052007,0.153726,0.00488,"RobustScaler(copy=True, quantile_range=(25.0, ...","{'scaler': RobustScaler(copy=True, quantile_ra...",0.639205,0.659241,0.653413,0.650627,...,0.651296,0.006658,5,0.952422,0.950082,0.952113,0.952682,0.950817,0.951623,0.001002


### Parameter tuning

In [62]:
# Build the design matrix once for the hyper-parameter search. Normalizer ranked
# first in the random-forest scaler comparison.
y_train = new_unique_df['outcome_type']
x_train_dict = new_unique_df[features].to_dict(orient="records")

vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train_dict)

scaler = Normalizer()
x_train_sc = scaler.fit_transform(x_train)

model = RandomForestClassifier(n_jobs=-1)

In [68]:
# Random-forest search space: 1 * 4 * 2 * 1 * 3 * 3 * 4 = 288 combinations.
param_grid = dict(
    bootstrap=[True],
    max_depth=[10, 50, 90, 100],
    max_features=[2, 3],
    random_state=[10],
    min_samples_leaf=[2, 3, 4],
    min_samples_split=[10, 12, 15],
    n_estimators=[100, 200, 300, 1000],
)

In [69]:
# Exhaustive 5-fold search over param_grid (long-running: see the recorded timings).
grid_rf_p = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_rf_p.fit(x_train_sc, y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 64.4min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed: 97.0min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'bootstrap': [True], 'max_depth': [10, 50, 90, 100], 'max_features': [2, 3], 'random_state': [10], 'min_samples_leaf': [2, 3, 4], 'min_samples_split': [10, 12, 15], 'n_estimators': [100, 200, 300, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [73]:
# Best mean CV score, then the refitted best estimator.
print(grid_rf_p.best_score_, grid_rf_p.best_estimator_, sep="\n")

0.7210574293527803
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=90, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=15,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=10, verbose=0, warm_start=False)


### Model Results

accuracy = 0.7210574293527803

scaler = Normalizer()

features = features

model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=90, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=15,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=10, verbose=0, warm_start=False)

## Model: LinearSVC

In [78]:
# Baseline LinearSVC pipeline: dict records -> dense one-hot matrix -> standardise -> linear SVM.
vec = DictVectorizer(sparse=False)
scaler = StandardScaler()
model = LinearSVC(verbose=2)

pipeline = Pipeline(steps=[("vectorizer", vec), ("scaler", scaler), ("model", model)])

# Score each candidate feature subset with 5-fold cross-validated accuracy.
y_train = new_unique_df['outcome_type']
for feat in (features, features1, features2, features3, features4, features5):
    x_train_dict = new_unique_df[feat].to_dict(orient="records")
    fold_scores = cross_val_score(pipeline, x_train_dict, y_train, cv=5, scoring="accuracy")
    print(feat, fold_scores.mean())

### Scaler Testing

In [79]:
# Training inputs for the LinearSVC scaler comparison (full feature set).
x_train_dict = new_unique_df.loc[:, features].to_dict("records")
y_train = new_unique_df.loc[:, 'outcome_type']

In [80]:
# One default-configured instance of each scaler to compare.
scalers = [scaler_cls() for scaler_cls in
           (StandardScaler, Normalizer, MinMaxScaler, MaxAbsScaler, RobustScaler)]

In [81]:
# Swap the pipeline's "scaler" step for each candidate and compare with 5-fold CV.
grid_ls = GridSearchCV(
    estimator=pipeline,
    param_grid={"scaler": scalers},
    cv=5,
    n_jobs=-1,
    verbose=5,
)
grid_ls.fit(x_train_dict, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done  16 out of  25 | elapsed:  1.7min remaining:   59.0s
[Parallel(n_jobs=-1)]: Done  22 out of  25 | elapsed:  2.0min remaining:   16.6s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  2.3min finished


[LibLinear]

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=2))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True), Normalizer(copy=True, norm='l2'), MinMaxScaler(copy=True, feature_range=(0, 1)), MaxAbsScaler(copy=True), RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [82]:
# Best mean CV score, then the steps of the winning pipeline.
print(grid_ls.best_score_, grid_ls.best_estimator_.steps, sep="\n")

0.6214423174313785
[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('scaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('model', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=2))]


### Parameter tuning

In [83]:
# Build the design matrix once for the LinearSVC hyper-parameter search.
x_train_dict = new_unique_df[features].to_dict(orient="records")
y_train = new_unique_df['outcome_type']

vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train_dict)

# RobustScaler is the scaler selected by the LinearSVC scaler comparison
# (grid_ls.best_estimator_), so tune on data scaled with it rather than with
# the Normalizer carried over from the random-forest section.
scaler = RobustScaler()
x_train_sc = scaler.fit_transform(x_train)

model = LinearSVC()

In [68]:
# LinearSVC search space: 5 regularisation strengths x 3 iteration caps = 15 combinations.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10],
    max_iter=[500, 1000, 1500],
)

In [69]:
# Exhaustive 5-fold search over the LinearSVC param_grid.
grid_ls_p = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_ls_p.fit(x_train_sc, y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 17.5min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 37.5min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 64.4min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed: 97.0min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'bootstrap': [True], 'max_depth': [10, 50, 90, 100], 'max_features': [2, 3], 'random_state': [10], 'min_samples_leaf': [2, 3, 4], 'min_samples_split': [10, 12, 15], 'n_estimators': [100, 200, 300, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [73]:
# Best mean CV score, then the refitted best estimator.
# NOTE(review): the recorded output under this cell shows a RandomForestClassifier,
# not a LinearSVC, so it looks stale — re-run to refresh.
print(grid_ls_p.best_score_, grid_ls_p.best_estimator_, sep="\n")

0.7210574293527803
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=90, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=15,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=10, verbose=0, warm_start=False)


### Model Results

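accuracy = 0.6214423174313785 (best cross-validated score from scaler testing with the default LinearSVC; the parameter-tuning output recorded above is a stale RandomForestClassifier result, so no tuned score is available yet)

scaler = RobustScaler()

features = features

model = LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001) (defaults from scaler testing; TODO: replace with the tuned estimator once the parameter search is re-run)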

## Model: BaggingClassifier

In [134]:
# Baseline bagging pipeline: dict records -> dense one-hot matrix -> standardise -> bagged ensemble.
vec = DictVectorizer(sparse=False)
scaler = StandardScaler()
model = BaggingClassifier(n_jobs=-1)

pipeline = Pipeline(steps=[("vectorizer", vec), ("scaler", scaler), ("model", model)])

# Score each candidate feature subset with 5-fold cross-validated accuracy.
y_train = new_unique_df['outcome_type']
for feat in (features, features1, features2, features3, features4, features5):
    x_train_dict = new_unique_df[feat].to_dict(orient="records")
    fold_scores = cross_val_score(pipeline, x_train_dict, y_train, cv=5, scoring="accuracy")
    print(feat, fold_scores.mean())

### Scaler Testing

In [135]:
# Training inputs for the bagging scaler comparison (full feature set).
x_train_dict = new_unique_df.loc[:, features].to_dict("records")
y_train = new_unique_df.loc[:, 'outcome_type']

In [136]:
# One default-configured instance of each scaler to compare.
scalers = [scaler_cls() for scaler_cls in
           (StandardScaler, Normalizer, MinMaxScaler, MaxAbsScaler, RobustScaler)]

In [137]:
# Swap the pipeline's "scaler" step for each candidate and compare with 5-fold CV.
grid_bc = GridSearchCV(
    estimator=pipeline,
    param_grid={"scaler": scalers},
    cv=5,
    n_jobs=-1,
    verbose=5,
)
grid_bc.fit(x_train_dict, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  16 out of  25 | elapsed:   23.0s remaining:   12.9s
[Parallel(n_jobs=-1)]: Done  22 out of  25 | elapsed:   30.7s remaining:    4.1s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   34.3s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
         verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True), Normalizer(copy=True, norm='l2'), MinMaxScaler(copy=True, feature_range=(0, 1)), MaxAbsScaler(copy=True), RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [88]:
# Best mean CV score, then the steps of the winning pipeline.
print(grid_bc.best_score_, grid_bc.best_estimator_.steps, sep="\n")

0.6745670009115771
[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('scaler', Normalizer(copy=True, norm='l2')), ('model', BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
         verbose=0, warm_start=False))]


In [89]:
pd.DataFrame(grid_bc.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.554864,0.002476,0.152806,0.002147,"StandardScaler(copy=True, with_mean=True, with...","{'scaler': StandardScaler(copy=True, with_mean...",0.666667,0.680506,0.675066,0.672914,...,0.673554,0.004458,2,0.958341,0.958154,0.958569,0.959994,0.958096,0.958631,0.000701
1,0.943844,0.059353,0.161845,0.010232,"Normalizer(copy=True, norm='l2')","{'scaler': Normalizer(copy=True, norm='l2')}",0.667426,0.678734,0.677219,0.677219,...,0.674567,0.004192,1,0.913454,0.913712,0.913531,0.914069,0.916065,0.914166,0.000973
2,0.564024,0.022012,0.151201,0.000739,"MinMaxScaler(copy=True, feature_range=(0, 1))","{'scaler': MinMaxScaler(copy=True, feature_ran...",0.667046,0.679367,0.67494,0.674053,...,0.672567,0.004712,4,0.960779,0.958692,0.959044,0.958253,0.958444,0.959042,0.000908
3,0.550593,0.00957,0.15077,0.00158,MaxAbsScaler(copy=True),{'scaler': MaxAbsScaler(copy=True)},0.667553,0.680127,0.676586,0.672281,...,0.67287,0.004914,3,0.959481,0.958945,0.958506,0.958569,0.958286,0.958757,0.000419
4,0.5625,0.006365,0.151941,0.001624,"RobustScaler(copy=True, quantile_range=(25.0, ...","{'scaler': RobustScaler(copy=True, quantile_ra...",0.665781,0.676203,0.675446,0.668735,...,0.670389,0.004574,5,0.960051,0.960401,0.956924,0.960215,0.960913,0.959701,0.001418


### Parameter tuning

In [99]:
# Build the design matrix once for the bagging hyper-parameter search. Normalizer
# is the scaler selected by the bagging scaler comparison.
y_train = new_unique_df['outcome_type']
x_train_dict = new_unique_df[features].to_dict(orient="records")

vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train_dict)

scaler = Normalizer()
x_train_sc = scaler.fit_transform(x_train)

model = BaggingClassifier(n_jobs=-1)

In [104]:
# BaggingClassifier search space.
# max_samples entries are integers, i.e. absolute numbers of rows drawn per estimator.
# max_features entries are fractions of the feature columns, so they must all be
# floats: scikit-learn reads an int as an absolute count, and the previous `1`
# meant "one feature per estimator" rather than "all features".
param_grid = {
    'max_samples': [600, 800, 1000],
    'max_features': [0.5, 0.75, 1.0],
    'random_state': [10],
    'n_estimators': [100, 200, 300, 500]
}

In [105]:
# Exhaustive 5-fold search over the bagging param_grid.
grid_bc_p = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_bc_p.fit(x_train_sc, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  20 | elapsed:   11.1s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   12.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
         verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_samples': [1600], 'max_features': [1], 'random_state': [10], 'n_estimators': [100, 200, 300, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [106]:
# Best mean CV score, then the refitted best estimator.
print(grid_bc_p.best_score_, grid_bc_p.best_estimator_, sep="\n")

0.6166565380330193
BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1, max_samples=1600,
         n_estimators=100, n_jobs=-1, oob_score=False, random_state=10,
         verbose=0, warm_start=False)


### Model Results

accuracy = 0.7185505925250684

scaler = Normalizer()

features = features

model = BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=0.75, max_samples=1000,
         n_estimators=200, n_jobs=-1, oob_score=False, random_state=10,
         verbose=0, warm_start=False)

## Predicting

In [53]:
# Fit the final model: the tuned random forest on Normalizer-scaled features.
x_train_dict = new_unique_df[features].to_dict(orient="records")
y_train = new_unique_df['outcome_type']

vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train_dict)

scaler = Normalizer()
x_train_sc = scaler.fit_transform(x_train)

# Only the tuned (non-default) hyper-parameters are passed. The earlier call
# spelled out every default copied from the estimator repr, including
# min_impurity_split, which newer scikit-learn releases no longer accept.
model = RandomForestClassifier(
    n_estimators=1000,
    max_depth=90,
    max_features=2,
    min_samples_leaf=2,
    min_samples_split=15,
    n_jobs=-1,
    random_state=10,
)
model.fit(x_train_sc, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=90, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=15,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=10, verbose=0, warm_start=False)

In [58]:
features = ['is_mixed','intake_condition', 'intake_type', 'sex', 'fixed', 'time_in_shelter', 'age_in']

# Sanity check on a known row: take one training record and see whether the model
# reproduces its recorded outcome. A hand-built dog can be substituted with a
# one-row DataFrame holding the same `features` columns, e.g. is_mixed=1,
# intake_condition='Normal', intake_type='Stray', sex='Male', fixed='Yes',
# time_in_shelter=10000, age_in=4.0.
# Only the model's feature columns are kept, so the label and name never reach
# the vectorizer.
new_dog = new_unique_df.iloc[[100]][features]

new_dog = new_dog.to_dict(orient="records")
new_dog = vec.transform(new_dog)
new_dog_sc = scaler.transform(new_dog)

# The model was fitted on scaled data, so predict on the scaled row
# (previously the unscaled `new_dog` was passed).
model.predict(new_dog_sc)

array(['Return to Owner'], dtype=object)

In [59]:
new_unique_df.iloc[100]

is_mixed                          0
intake_condition             Normal
intake_type                   Stray
sex                          Female
fixed                            No
time_in_shelter                3005
age_in                      1.49315
outcome_type        Return to Owner
Name: A675255, dtype: object

In [75]:
# Build a one-row feature frame for a specific animal, with time_in_shelter
# recomputed as minutes elapsed between its date_in and today.
# .copy() ensures the assignment below edits a private Series rather than a row
# taken from unique_df.
unadopted_dog = unique_df.loc["A787254"].copy()
time_in_shelter = pd.to_datetime('today') - pd.to_datetime(unadopted_dog.date_in)
# Same minutes formula as used for the training data (whole days + whole seconds).
time_in_shelter = (time_in_shelter.days*24*60*60 + time_in_shelter.seconds)/60
unadopted_dog["time_in_shelter"] = time_in_shelter

unadopted_dog = pd.DataFrame(unadopted_dog[features]).T
unadopted_dog

Unnamed: 0,is_mixed,intake_condition,intake_type,sex,fixed,time_in_shelter,age_in
A787254,1,Normal,Stray,Female,No,93915.1,0.00821918


In [76]:
# Vectorise and scale the row with the transformers fitted on the training data,
# then predict its outcome.
# Note: `unadopted_dog` is rebound from a DataFrame to an array here, so the cell
# that builds it must be re-run before this one is run again.
unadopted_dog_dict = unadopted_dog.to_dict("records")
unadopted_dog = vec.transform(unadopted_dog_dict)
unadopted_dog_sc = scaler.transform(unadopted_dog)

model.predict(X=unadopted_dog_sc)

array(['Adoption'], dtype=object)

In [77]:
# Predictions for the same rows the model was fitted on, so the metrics computed
# from them are training-set figures, not held-out estimates.
preds = model.predict(X=x_train_sc)

In [78]:
# Fraction of rows whose predicted outcome equals the label in y_train
# (all outcome classes together).
accuracy_score(y_train, preds)

0.7789646880141754

In [79]:
# Precision for "Adoption" only: both arrays are collapsed to boolean
# Adoption-vs-everything-else before scoring.
precision_score(y_train=="Adoption",preds=="Adoption")

0.7279596977329975

In [80]:
# Recall for "Adoption" only, using the same Adoption-vs-everything-else
# boolean encoding as the precision cell.
recall_score(y_train=="Adoption",preds=="Adoption")

0.9632263081879792

## 2. Will your dog get adopted?

In [95]:
# Preview the first rows of the frame that the models in this section are
# trained on.
new_unique_df.head()

Unnamed: 0_level_0,is_mixed,intake_condition,intake_type,sex,fixed,time_in_shelter,has_name,age_in,outcome_type
animal_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A786884,1,Normal,Stray,Male,Yes,7132.0,1,2.0,Transfer
A706918,0,Normal,Stray,Female,Yes,134.0,1,8.005479,Return to Owner
A724273,1,Normal,Stray,Male,No,9994.0,1,0.994521,Return to Owner
A778404,1,Normal,Owner Surrender,Male,No,4784.0,1,4.00274,Adoption
A682524,1,Normal,Stray,Male,Yes,4538.0,1,4.00274,Return to Owner


In [102]:
# Candidate feature subsets for the models in this section. None of them
# contains 'time_in_shelter'.
# NOTE(review): your_features1 includes the free-text 'name' column;
# DictVectorizer one-hot encodes string values, so this subset may produce a
# very wide matrix - confirm that is intended.
your_features1 = [
    'is_mixed', 'intake_condition', 'intake_type', 'sex',
    'fixed', 'age_in', 'name', 'has_name',
]
your_features2 = [
    'intake_condition', 'intake_type', 'sex',
    'fixed', 'age_in', 'has_name',
]
your_features3 = [
    'is_mixed', 'intake_type', 'sex',
    'fixed', 'age_in', 'has_name',
]
your_features4 = ['is_mixed', 'sex', 'fixed', 'age_in', 'has_name']
your_features5 = ['intake_condition', 'sex', 'age_in', 'has_name']
your_features6 = ['is_mixed', 'sex', 'fixed', 'age_in']

# Ordered collection iterated by the feature-selection loop.
your_features = [
    your_features1,
    your_features2,
    your_features3,
    your_features4,
    your_features5,
    your_features6,
]

## Model: KNeighborsClassifier 
### Feature Selection

In [None]:
# Pipeline pieces: dict records -> dense matrix -> standardised features ->
# 30-nearest-neighbour classifier.
vec = DictVectorizer(sparse=False)
scaler = StandardScaler()
model = KNeighborsClassifier(n_neighbors=30, n_jobs=-1)

pipeline = Pipeline(steps=[
    ("vectorizer", vec),
    ("scaler", scaler),
    ("model", model),
])

# The target is the same for every candidate subset, so build it once.
y_train = new_unique_df['outcome_type']

# Print the mean 5-fold cross-validated accuracy for each feature subset.
for feat in your_features:
    x_train_dict = new_unique_df[feat].to_dict(orient="records")
    fold_scores = cross_val_score(pipeline, x_train_dict, y_train, cv=5, scoring="accuracy")
    print(feat, fold_scores.mean())

### Scaler testing

In [36]:
# Inputs for the scaler comparison: feature records plus the outcome labels.
# NOTE(review): this uses `features1` (which includes 'time_in_shelter')
# rather than one of the `your_features*` lists defined for this section -
# confirm that is intended.
x_train_dict = new_unique_df[features1].to_dict(orient="records")
y_train = new_unique_df['outcome_type']

In [37]:
# Preprocessing candidates to swap into the pipeline's "scaler" step.
# Note: Normalizer rescales each sample (row) to unit norm, whereas the
# other four rescale each feature (column).
scalers = [StandardScaler(), Normalizer(), MinMaxScaler(), MaxAbsScaler(), RobustScaler()]

In [38]:
# 5-fold grid search over the pipeline's "scaler" step only; every other
# pipeline setting stays as configured.
grid_kn = GridSearchCV(
    pipeline,
    param_grid={"scaler": scalers},
    cv=5,
    verbose=5,
    n_jobs=-1,
)
grid_kn.fit(x_train_dict, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   39.7s
[Parallel(n_jobs=-1)]: Done  16 out of  25 | elapsed:  1.7min remaining:   58.8s
[Parallel(n_jobs=-1)]: Done  22 out of  25 | elapsed:  2.0min remaining:   16.5s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  2.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=30, p=2,
           weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True), Normalizer(copy=True, norm='l2'), MinMaxScaler(copy=True, feature_range=(0, 1)), MaxAbsScaler(copy=True), RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [39]:
# Best mean CV score, then the steps of the winning pipeline (this shows
# which scaler was selected).
for summary in (grid_kn.best_score_, grid_kn.best_estimator_.steps):
    print(summary)

0.7065734832371113
[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('scaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('model', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=30, p=2,
           weights='uniform'))]


In [40]:
# Full cross-validation table for the scaler search: one row per candidate
# scaler, with timings, per-split scores and rank.
pd.DataFrame(grid_kn.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,5.052805,2.826535,7.965209,3.085577,"StandardScaler(copy=True, with_mean=True, with...","{'scaler': StandardScaler(copy=True, with_mean...",0.69919,0.706582,0.702925,0.702545,...,0.701636,0.003318,2,0.71529,0.713788,0.716094,0.715525,0.717116,0.715563,0.001088
1,2.966998,0.564824,5.674741,0.882539,"Normalizer(copy=True, norm='l2')","{'scaler': Normalizer(copy=True, norm='l2')}",0.69881,0.707215,0.69938,0.699253,...,0.700344,0.003535,3,0.713928,0.711731,0.715145,0.71448,0.716388,0.714334,0.001539
2,3.812916,1.190793,4.673145,1.449552,"MinMaxScaler(copy=True, feature_range=(0, 1))","{'scaler': MinMaxScaler(copy=True, feature_ran...",0.696786,0.699873,0.695327,0.698493,...,0.696647,0.002479,4,0.711301,0.709673,0.711568,0.711537,0.712654,0.711347,0.000959
3,5.145111,0.846617,7.430025,0.390906,MaxAbsScaler(copy=True),{'scaler': MaxAbsScaler(copy=True)},0.696786,0.699747,0.694821,0.69862,...,0.696495,0.002604,5,0.711206,0.709452,0.711505,0.711537,0.71259,0.711258,0.001018
4,2.040941,0.65437,3.469866,1.585345,"RobustScaler(copy=True, quantile_range=(25.0, ...","{'scaler': RobustScaler(copy=True, quantile_ra...",0.703493,0.711013,0.706597,0.706091,...,0.706573,0.00246,1,0.71874,0.71784,0.719639,0.72119,0.721389,0.71976,0.001374


### K value testing

In [43]:
# Labels and feature records for the k search.
y_train = new_unique_df['outcome_type']
x_train_dict = new_unique_df[features1].to_dict(orient="records")

# Vectorise the records into a dense matrix (fit and transform in one call).
vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train_dict)

# Scale with RobustScaler. The scaler is fitted on all rows here, before the
# grid search below splits the data into folds.
scaler = RobustScaler()
x_train_sc = scaler.fit_transform(x_train)

# Base estimator; n_neighbors is varied by the grid search.
model = KNeighborsClassifier(n_jobs=-1)

In [104]:
# Candidate neighbour counts tried by the n_neighbors grid search.
ks = [5,10,20,30,40,50]

In [45]:
# 5-fold grid search over n_neighbors on the pre-scaled training matrix.
grid_kn_k = GridSearchCV(
    model,
    param_grid={"n_neighbors": ks},
    cv=5,
)
grid_kn_k.fit(x_train_sc, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [5, 10, 20, 30, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [46]:
# Best mean CV score, then the n_neighbors value that achieved it.
for summary in (grid_kn_k.best_score_, grid_kn_k.best_estimator_.n_neighbors):
    print(summary)

0.7071558796718322
40


In [139]:
# Line plot of mean cross-validated score against n_neighbors.
# Requires `grid_kn_k` from the K-value grid-search cell to exist in the
# current kernel; re-run that cell first after a restart.
pd.DataFrame(grid_kn_k.cv_results_).set_index("param_n_neighbors")['mean_test_score'].plot.line()

NameError: name 'grid_kn_k' is not defined

### Model Results

accuracy = 0.7071558796718322

k = 40

scaler = RobustScaler()

features = features1

model = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=40, p=2,
           weights='uniform')