# Machine Learning

### Will a dog get adopted?
* Predict the outcome type (e.g. adoption or transfer) of a shelter dog

In [28]:
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score


import warnings
warnings.filterwarnings('ignore')

In [29]:
# Load the shelter records and key them by animal id.
unique_df = (
    pd.read_csv("../data/unique_austin_shelter.csv")
    .set_index("animal_id")
)

In [30]:
# Reduced feature set (no breed / intake descriptors).
features1 = ['sex', 'fixed', 'time_in_shelter', 'age_in']
# Full feature set: intake descriptors followed by the reduced set.
# NOTE(review): the example row near the end of the notebook passes
# time_in_shelter as a timedelta string; if the CSV column is a string too,
# DictVectorizer will one-hot encode every distinct value -- confirm.
features = ['breed', 'intake_condition', 'intake_type'] + features1

In [31]:
# Keep only dogs that are no longer in the shelter, restricted to the
# model features plus the target column.
left_shelter = unique_df["in_shelter"].eq("No")
new_unique_df = unique_df.loc[left_shelter, features + ['outcome_type']]

In [32]:
# Drop every row that has a missing value in any selected column.
new_unique_df = new_unique_df.dropna()

**Hyperparameter Tuning**
1. Features
2. Scaling
3. K (number of neighbors)

## Defining Training Data

In [70]:
# Training set: the first 3000 rows, features as a list of per-row dicts
# (the input format DictVectorizer expects) and the outcome as the target.
train_rows = new_unique_df.iloc[:3000]
x_train_dict = train_rows[features].to_dict(orient="records")
y_train = train_rows['outcome_type']

In [71]:
# Encode the feature dicts as a dense numeric matrix.
vec = DictVectorizer(sparse=False)
x_train = vec.fit_transform(x_train_dict)

# Fit the scaler and keep the scaled matrix: a later cell predicts on
# x_train_sc, which was previously left commented out and so undefined.
scaler = MinMaxScaler()
x_train_sc = scaler.fit_transform(x_train)

# Show the fitted scaler as the cell output.
scaler

MinMaxScaler(copy=True, feature_range=(0, 1))

In [72]:
# Pipeline used for the grid search: vectorize -> scale -> classify.
# The steps are created inline on purpose:
#   * `model` is not defined yet at this point of the notebook, so referring
#     to it here fails on a fresh kernel;
#   * rebinding `vec` / `scaler` to new unfitted objects would discard the
#     fitted ones that the prediction cells at the end of the notebook use.
pipeline = Pipeline([
    ("vectorizer", DictVectorizer(sparse=False)),
    ("scaler", StandardScaler()),
    ("model", KNeighborsClassifier()),
])

## Model: KNeighborsClassifier

In [93]:
# Estimator and the candidate values for the two grid searches below.
model = KNeighborsClassifier()
# k = 10, 20, 30, 40, 50
n_neighbors = np.arange(10, 60, 10)
# None means "no scaling step".
scalers = [Normalizer(), StandardScaler(), MinMaxScaler(), None]

In [74]:
# Names of every parameter the pipeline exposes, including nested
# `<step>__<param>` entries.
pipeline.get_params(deep=True).keys()

dict_keys(['memory', 'steps', 'vectorizer', 'scaler', 'model', 'vectorizer__dtype', 'vectorizer__separator', 'vectorizer__sort', 'vectorizer__sparse', 'scaler__copy', 'scaler__with_mean', 'scaler__with_std', 'model__algorithm', 'model__leaf_size', 'model__metric', 'model__metric_params', 'model__n_jobs', 'model__n_neighbors', 'model__p', 'model__weights'])

In [75]:
# 5-fold grid search that swaps the pipeline's "scaler" step for each
# candidate in `scalers`.
grid = GridSearchCV(
    pipeline,
    param_grid={"scaler": scalers},
    cv=5,
)
grid.fit(x_train_dict, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'scaler': [Normalizer(copy=True, norm='l2'), StandardScaler(copy=True, with_mean=True, with_std=True), MinMaxScaler(copy=True, feature_range=(0, 1)), None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [78]:
# Best mean cross-validated score and the pipeline steps that achieved it.
print(grid.best_score_, grid.best_estimator_.steps, sep="\n")

0.517
[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('model', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]


In [79]:
# Per-candidate timing and score table from the scaler search.
pd.DataFrame.from_dict(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_scaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.331439,0.018648,3.665929,0.035891,"Normalizer(copy=True, norm='l2')","{'scaler': Normalizer(copy=True, norm='l2')}",0.500829,0.482587,0.520868,0.500835,...,0.51,0.021352,3,0.640801,0.653317,0.657643,0.642232,0.628952,0.644589,0.010109
1,0.45741,0.029438,5.674942,0.031828,"StandardScaler(copy=True, with_mean=True, with...","{'scaler': StandardScaler(copy=True, with_mean...",0.253731,0.472637,0.479132,0.252087,...,0.341333,0.110037,4,0.371297,0.585732,0.727613,0.313203,0.274958,0.454561,0.173744
2,0.375438,0.017527,5.689499,0.08648,"MinMaxScaler(copy=True, feature_range=(0, 1))","{'scaler': MinMaxScaler(copy=True, feature_ran...",0.507463,0.504146,0.51419,0.520868,...,0.517,0.012193,1,0.638298,0.644973,0.64015,0.642232,0.63228,0.639587,0.004275
3,0.270427,0.017448,2.811606,0.021588,,{'scaler': None},0.504146,0.495854,0.527546,0.527546,...,0.516333,0.01365,2,0.644139,0.641218,0.639733,0.636401,0.632696,0.638837,0.003955


Skipping scaling from here on: it slows the search down, and the best scaled score (0.517 with MinMaxScaler) is barely better than no scaling at all (0.516).

In [96]:
# 5-fold grid search over k on the unscaled, vectorized training matrix.
grid = GridSearchCV(
    model,
    param_grid={"n_neighbors": n_neighbors},
    cv=5,
)
grid.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': array([10, 20, 30, 40, 50])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [97]:
# Best mean cross-validated score and the k that achieved it.
print(grid.best_score_, grid.best_estimator_.n_neighbors, sep="\n")

0.55
30


In [98]:
# Per-candidate timing and score table from the k search.
pd.DataFrame.from_dict(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.344832,0.006469,3.488303,0.034671,10,{'n_neighbors': 10},0.548922,0.525705,0.559265,0.537563,...,0.543667,0.01134,5,0.60242,0.598665,0.596418,0.592253,0.595674,0.597086,0.003369
1,0.340429,0.004528,3.62747,0.040013,20,{'n_neighbors': 20},0.542289,0.542289,0.550918,0.540902,...,0.545667,0.004757,3,0.572799,0.570713,0.571429,0.566847,0.576123,0.571582,0.00301
2,0.337189,0.002294,3.757156,0.034959,30,{'n_neighbors': 30},0.542289,0.529022,0.564274,0.554257,...,0.55,0.012892,1,0.565707,0.566542,0.556435,0.561849,0.559484,0.562003,0.003787
3,0.342544,0.006542,3.841673,0.031839,40,{'n_neighbors': 40},0.542289,0.525705,0.560935,0.540902,...,0.545667,0.012936,3,0.554443,0.556946,0.554352,0.554352,0.5599,0.555999,0.002189
4,0.339431,0.002726,3.909382,0.027603,50,{'n_neighbors': 50},0.547264,0.533997,0.560935,0.54424,...,0.548333,0.009297,2,0.554026,0.559449,0.55102,0.551853,0.553245,0.553919,0.002957


## Model: RandomForestClassifier

In [None]:
# Random forest with a fixed seed so the metrics below are reproducible
# across kernel restarts (the forest is otherwise randomly initialised).
model = RandomForestClassifier(random_state=42)
0.5436666666666666
10

In [None]:
# Scale the encoded training matrix, fit the classifier on it, then predict
# on the same rows to obtain training-set predictions.
# The scaled matrix is computed and the model is fitted in this cell so that
# it runs on a fresh kernel: predicting with an unfitted estimator raises
# NotFittedError.
x_train_sc = scaler.fit_transform(x_train)
model.fit(x_train_sc, y_train)
y_train_pred = model.predict(x_train_sc)

**TRAINING**

In [None]:
# Fraction of training rows whose predicted outcome matches the true one.
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy", train_accuracy, sep="\n")

In [None]:
# Macro-averaged precision: the unweighted mean of the per-class precisions.
# `pos_label` was dropped because scikit-learn ignores it unless
# average='binary', so it wrongly suggested an "Adoption"-only score;
# average="macro" replaces the manual `average=None` + `.mean()`.
print("Training Precision")
print(precision_score(y_train, y_train_pred, average="macro"))

In [None]:
# Macro-averaged recall: the unweighted mean of the per-class recalls.
# `pos_label` was dropped because scikit-learn ignores it unless
# average='binary', so it wrongly suggested an "Adoption"-only score;
# average="macro" replaces the manual `average=None` + `.mean()`.
print("Training Recall")
print(recall_score(y_train, y_train_pred, average="macro"))

In [None]:
# Hand-computed precision and recall for the "Transfer" class only.
predicted_transfer = y_train_pred == "Transfer"
actual_transfer = y_train == "Transfer"

# Rows that are both predicted and truly "Transfer".
true_positives = (predicted_transfer & actual_transfer).sum()

precision = true_positives / predicted_transfer.sum()
recall = true_positives / actual_transfer.sum()

precision, recall

In [None]:
# Rebuild the pipeline around the current `model` and report its mean
# 10-fold cross-validated accuracy on the training data.
pipeline_steps = [
    ("vectorizer", vec),
    ("scaler", scaler),
    ("model", model),
]
pipeline = Pipeline(steps=pipeline_steps)

cv_scores = cross_val_score(
    pipeline, x_train_dict, y_train, scoring="accuracy", cv=10
)
print(cv_scores.mean())

In [None]:
# One hypothetical dog, with the same feature columns used for training.
new_dog = pd.DataFrame({
    'breed': ['German Shepherd Mix'],
    'intake_condition': ['Normal'],
    'intake_type': ['Owner Surrender'],
    'sex': ['Male'],
    'fixed': ['No'],
    'time_in_shelter': ['3 days 07:44:00.000000000'],
    'age_in': [4.0],
})

In [None]:
# Push the example dog through the same steps as the training data:
# records -> vectorized matrix -> scaled matrix -> prediction.
# Each stage gets its own name so `new_dog` stays a DataFrame and the cell
# can be re-run (rebinding `new_dog` to an array made a second run fail on
# `.to_dict`).
new_dog_records = new_dog.to_dict(orient="records")
new_dog_vec = vec.transform(new_dog_records)
new_dog_sc = scaler.transform(new_dog_vec)

preds = model.predict(new_dog_sc)


In [None]:
# Show the model's predicted outcome for the example dog.
preds

In [None]:
# Preview the first ten cleaned rows.
new_unique_df.head(10)