`conda install scipy` / `pip install scipy`

In [152]:
import pandas as pd
import numpy as np
import scipy
import altair as alt
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics.pairwise import euclidean_distances

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

In [153]:
cities_df = pd.read_csv('canada_usa_cities.csv')

In [154]:
cities_df.head()

Unnamed: 0,longitude,latitude,country
0,-130.0437,55.9773,USA
1,-134.4197,58.3019,USA
2,-123.078,48.9854,USA
3,-122.7436,48.9881,USA
4,-122.2691,48.9951,USA


In [155]:
train_df, test_df =train_test_split(cities_df, test_size=0.2)

In [156]:
X = cities_df.drop(columns=['country'])

y = cities_df['country']

In [157]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=123)

In [158]:
shape_dict2={"Data portion":['X','y','X_train','y_train','X_test','y_test'],
            'Shape':[X.shape, y.shape,
                     X_train.shape, y_train.shape,
                     X_test.shape, y_test.shape]}

shape_df = pd.DataFrame(shape_dict2)
shape_df

Unnamed: 0,Data portion,Shape
0,X,"(209, 2)"
1,y,"(209,)"
2,X_train,"(167, 2)"
3,y_train,"(167,)"
4,X_test,"(42, 2)"
5,y_test,"(42,)"


In [159]:
one_city = X_train.sample(1,random_state=44)
one_city

Unnamed: 0,longitude,latitude
188,-73.2533,45.3057


In [160]:
chart_cities = alt.Chart(train_df).mark_circle(size=20,opacity=0.6).encode(
    alt.X('longitude:Q', scale=alt.Scale(domain=[-140, -40])),
    alt.Y('latitude:Q', scale=alt.Scale(domain=[20, 60])),
    alt.Color('country:N', scale=alt.Scale(domain=['Canada', 'USA'],
                                           range=['red', 'blue'])))
chart_cities

In [161]:
model = DecisionTreeClassifier(max_depth=4)
model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=4)

In [162]:
model.score(X_train, y_train)

0.9041916167664671

### cross_val_score & cross_validate

In [163]:
dt_cv_score5 = cross_val_score(model, X_train, y_train, cv=5)
dt_cv_score5

array([0.76470588, 0.82352941, 0.78787879, 0.78787879, 0.84848485])

In [164]:
pd.DataFrame(dt_cv_score5)

Unnamed: 0,0
0,0.764706
1,0.823529
2,0.787879
3,0.787879
4,0.848485


In [165]:
dt_cv_score5.mean()

0.8024955436720143

In [166]:
cv_score10 = cross_val_score(model, X_train, y_train, cv=10)

cv_score10

array([0.76470588, 0.82352941, 0.70588235, 0.94117647, 0.82352941,
       0.82352941, 0.70588235, 0.9375    , 0.9375    , 0.9375    ])

In [167]:
cv_score10.mean()

0.8400735294117647

In [168]:
pd.DataFrame(cv_score10)

Unnamed: 0,0
0,0.764706
1,0.823529
2,0.705882
3,0.941176
4,0.823529
5,0.823529
6,0.705882
7,0.9375
8,0.9375
9,0.9375


In [169]:
scores = cross_validate(model, X_train, y_train, cv=10, return_train_score=True)
scores

{'fit_time': array([0.00300908, 0.0009973 , 0.00100112, 0.00100279, 0.00200105,
        0.00099826, 0.00200176, 0.00199866, 0.0009973 , 0.00197959]),
 'score_time': array([0.00099802, 0.00099778, 0.00100255, 0.0019958 , 0.        ,
        0.00100112, 0.0009954 , 0.00099492, 0.00102568, 0.00099635]),
 'test_score': array([0.76470588, 0.82352941, 0.70588235, 0.94117647, 0.82352941,
        0.82352941, 0.70588235, 0.9375    , 0.9375    , 0.9375    ]),
 'train_score': array([0.91333333, 0.90666667, 0.90666667, 0.9       , 0.90666667,
        0.91333333, 0.92      , 0.90066225, 0.90066225, 0.90066225])}

In [170]:
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.003009,0.000998,0.764706,0.913333
1,0.000997,0.000998,0.823529,0.906667
2,0.001001,0.001003,0.705882,0.906667
3,0.001003,0.001996,0.941176,0.9
4,0.002001,0.0,0.823529,0.906667
5,0.000998,0.001001,0.823529,0.913333
6,0.002002,0.000995,0.705882,0.92
7,0.001999,0.000995,0.9375,0.900662
8,0.000997,0.001026,0.9375,0.900662
9,0.00198,0.000996,0.9375,0.900662


In [171]:
pd.DataFrame(scores).mean()

fit_time       0.001599
score_time     0.001001
test_score     0.840074
train_score    0.906865
dtype: float64

In [172]:
model.score(X_test, y_test)

0.8095238095238095

### KNN

In [173]:
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X_train, y_train)
neigh.predict(one_city)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


array(['Canada'], dtype=object)

In [174]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)
neigh.predict(one_city)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


array(['Canada'], dtype=object)

In [175]:
neigh = KNeighborsClassifier(n_neighbors=9)
neigh.fit(X_train, y_train)
neigh.predict(one_city)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


array(['Canada'], dtype=object)

In [176]:
model = KNeighborsClassifier(n_neighbors=1)
model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=1)

In [177]:
model.score(X_train,y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


1.0

In [178]:
model.score(X_test,y_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.7142857142857143

In [179]:
two_cities = X_train.sample(2)
two_cities

Unnamed: 0,longitude,latitude
61,-87.9225,43.035
86,-102.548,49.0014


In [180]:
euclidean_distances(two_cities)

array([[ 0.        , 15.79566963],
       [15.79566963,  0.        ]])

In [181]:
dists = euclidean_distances(train_df[['latitude','longitude']])
dists

array([[ 0.        , 23.8573571 ,  4.75685081, ..., 13.46426488,
        49.54689522, 37.28038511],
       [23.8573571 ,  0.        , 28.01738934, ..., 37.13499021,
        25.95750982, 13.54130335],
       [ 4.75685081, 28.01738934,  0.        , ...,  9.12907261,
        53.8803735 , 41.53213679],
       ...,
       [13.46426488, 37.13499021,  9.12907261, ...,  0.        ,
        62.95813108, 50.63570038],
       [49.54689522, 25.95750982, 53.8803735 , ..., 62.95813108,
         0.        , 12.4413698 ],
       [37.28038511, 13.54130335, 41.53213679, ..., 50.63570038,
        12.4413698 ,  0.        ]])

In [182]:
dists.shape

(167, 167)

In [183]:
pd.DataFrame(dists).loc[:5,:5]

Unnamed: 0,0,1,2,3,4,5
0,0.0,23.857357,4.756851,20.979913,1.705389,62.059471
1,23.857357,0.0,28.017389,29.478103,25.514353,38.217214
2,4.756851,28.017389,0.0,24.297373,3.772331,66.088215
3,20.979913,29.478103,24.297373,0.0,20.86778,63.233724
4,1.705389,25.514353,3.772331,20.86778,0.0,63.725136
5,62.059471,38.217214,66.088215,63.233724,63.725136,0.0


In [184]:
np.fill_diagonal(dists, np.inf)

In [185]:
pd.DataFrame(dists).loc[:5,:5]

Unnamed: 0,0,1,2,3,4,5
0,inf,23.857357,4.756851,20.979913,1.705389,62.059471
1,23.857357,inf,28.017389,29.478103,25.514353,38.217214
2,4.756851,28.017389,inf,24.297373,3.772331,66.088215
3,20.979913,29.478103,24.297373,inf,20.86778,63.233724
4,1.705389,25.514353,3.772331,20.86778,inf,63.725136
5,62.059471,38.217214,66.088215,63.233724,63.725136,inf


In [186]:
X_train.iloc[0]

longitude   -76.4813
latitude     44.2307
Name: 160, dtype: float64

In [187]:
dists[0][:5]

array([        inf, 23.8573571 ,  4.75685081, 20.97991333,  1.70538937])

In [188]:
dists[0]

array([        inf, 23.8573571 ,  4.75685081, 20.97991333,  1.70538937,
       62.05947091, 26.51303439, 24.09498015, 48.42292928,  0.24116129,
        1.46543112, 10.12120762,  1.97922441,  6.26408326,  2.07800984,
        9.98341907,  7.06667619,  5.97201112,  7.39896101, 23.7821647 ,
       21.94511908,  1.27908011, 48.96018258, 14.51919091,  2.11371932,
        4.74035343, 40.33108522,  0.13182815, 40.40852415, 49.78786569,
       26.94383646, 48.71549832,  6.22779409, 21.16303027,  1.16334477,
        7.09264451, 10.97368481,  0.27997473,  5.51046862, 46.0891427 ,
       19.62760975,  9.20219122, 33.6285267 ,  5.9515144 ,  0.20020475,
       46.99701   , 49.44754177,  2.70263554, 16.22759105, 19.97014548,
       29.7044918 , 49.07400144,  6.72083645, 21.12945645, 40.61781897,
        6.02546485, 11.11014766,  6.22263174, 19.95760564, 49.52164288,
        6.43445785,  5.578431  , 14.19485283,  5.29397578,  7.22927943,
       12.71095494,  5.27911951,  1.26061499,  9.17190265, 31.32

In [189]:
pd.DataFrame(dists[0]).describe()

Unnamed: 0,0
count,167.0
mean,inf
std,
min,0.131828
25%,5.930076
50%,10.131797
75%,37.866203
max,inf


In [190]:
np.argmin(dists[0])

27

In [191]:
X_train.iloc[[80]]

Unnamed: 0,longitude,latitude
121,-79.8729,43.2561


In [192]:
dists[0][80]

6.606397736285599

Query point

In [193]:
query_point = [[-80, 25]]
dists = euclidean_distances(X_train[['longitude','latitude']], query_point)
dists[0:5]

array([[19.54996348],
       [18.02706204],
       [24.60912622],
       [21.39718237],
       [25.24111312]])

In [194]:
np.argmin(dists)

147

In [195]:
dists[np.argmin(dists)]

array([3.83839229])

In [196]:
dists[np.argmin(dists)].item()

3.8383922936564634

In [197]:
dists[np.argmin(dists)][0]

3.8383922936564634

### depth 

In [198]:
param_grid = {
    'max_depth':scipy.stats.uniform(1, 20)}

model = DecisionTreeClassifier()

gs = RandomizedSearchCV(model, param_grid, n_jobs =-1, return_train_score=True, n_iter=10)
gs.fit(X_train, y_train)

RandomizedSearchCV(estimator=DecisionTreeClassifier(), n_jobs=-1,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x0000028CCD69EB80>},
                   return_train_score=True)

In [199]:
gs.best_params_

{'max_depth': 7.069329843278988}

In [200]:
gs.score(X_train, y_train)

0.9401197604790419

 ### Exhaustive grid search 

In [201]:
param_grid = { 
    'gamma':[0.1, 1.0, 10,100],
    'C': [0.1, 1.0, 10,100]
}

svc = SVC()
grid_search = GridSearchCV(svc, param_grid, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.1, 1.0, 10, 100],
                         'gamma': [0.1, 1.0, 10, 100]},
             verbose=1)

With Pipeline

In [202]:
pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy='median')),
        ("scaler", StandardScaler()),
        ("svc", SVC())])

param_grid = {
    "svc__gamma": [0.1, 1.0, 10, 100],
    "svc__C": [0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipe, param_grid, cv=5, return_train_score=True, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('imputer',
                                        SimpleImputer(strategy='median')),
                                       ('scaler', StandardScaler()),
                                       ('svc', SVC())]),
             n_jobs=-1,
             param_grid={'svc__C': [0.1, 1.0, 10, 100],
                         'svc__gamma': [0.1, 1.0, 10, 100]},
             return_train_score=True, verbose=1)

In [203]:
grid_search.best_params_

{'svc__C': 10, 'svc__gamma': 1.0}

In [204]:
grid_search.best_score_

0.8208556149732621

In [205]:
best_model = grid_search.best_estimator_

In [206]:
best_model.score(X_test, y_test)

0.8333333333333334

In [207]:
grid_search.score(X_test, y_test)

0.8333333333333334

In [208]:
best_model.predict(X_test)

array(['Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'USA', 'USA', 'Canada', 'Canada',
       'Canada', 'Canada', 'USA', 'Canada', 'USA', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'USA', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'USA', 'USA', 'Canada',
       'Canada', 'Canada'], dtype=object)

In [209]:
grid_search.predict(X_test)

array(['Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'USA', 'USA', 'Canada', 'Canada',
       'Canada', 'Canada', 'USA', 'Canada', 'USA', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'Canada', 'USA', 'Canada',
       'Canada', 'Canada', 'Canada', 'Canada', 'USA', 'USA', 'Canada',
       'Canada', 'Canada'], dtype=object)

Randomized Search

In [210]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'svc__gamma':[0.1, 1.0, 10, 100],
    'svc__C':[0.1, 1.0, 10, 100]
}

random_search = RandomizedSearchCV(pipe, param_grid, cv=5, verbose=1, n_jobs=-1, n_iter=10)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('imputer',
                                              SimpleImputer(strategy='median')),
                                             ('scaler', StandardScaler()),
                                             ('svc', SVC())]),
                   n_jobs=-1,
                   param_distributions={'svc__C': [0.1, 1.0, 10, 100],
                                        'svc__gamma': [0.1, 1.0, 10, 100]},
                   verbose=1)

In [211]:
random_search.score(X_test, y_test)

0.8095238095238095

Search over a range of continuous values 

In [212]:
import scipy

param_grid = {
    "svc__C": scipy.stats.uniform(0, 100),
    "svc__gamma": scipy.stats.uniform(0, 100)}

random_gs = RandomizedSearchCV(pipe, param_grid, n_jobs=-1, cv=10, return_train_score=True, n_iter=10)
random_gs.fit(X_train, y_train)

RandomizedSearchCV(cv=10,
                   estimator=Pipeline(steps=[('imputer',
                                              SimpleImputer(strategy='median')),
                                             ('scaler', StandardScaler()),
                                             ('svc', SVC())]),
                   n_jobs=-1,
                   param_distributions={'svc__C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x0000028CD4A7BAC0>,
                                        'svc__gamma': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x0000028CD4944310>},
                   return_train_score=True)

In [213]:
random_gs.best_params_

{'svc__C': 86.00505771632295, 'svc__gamma': 9.633008428390234}

In [214]:
random_gs.best_score_

0.7691176470588236

In [215]:
random_gs.score(X_test, y_test)

0.7380952380952381

How different do they score?

In [216]:
grid_search.score(X_test, y_test)

0.8333333333333334

In [217]:
random_search.score(X_test, y_test)

0.8095238095238095

In [218]:
random_gs.score(X_test, y_test)

0.7380952380952381

compare all the test score
graph