## Cross Validation

In [54]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [4]:
cars = pd.read_csv('./used_cars.csv')
cars.columns = [c.lower() for c in cars]
cars.set_index('id', inplace = True)

print('{} rows x {} columns'.format(*cars.shape)) #* is for keyword argument to unpack cars
cars.head()


297899 rows x 8 columns


Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience


In [6]:
cars['year'].value_counts()

2014    57700
2015    53490
2016    40818
2017    30297
2013    26022
2012    18205
2011    15845
2010    10807
2008     9828
2007     7975
2009     7242
2006     5778
2005     4194
2004     3050
2003     2038
2002     1421
2001      998
2000      730
2018      461
1999      455
1998      327
1997      218
Name: year, dtype: int64

In [7]:
cars['make'].value_counts()

Ford             48187
Chevrolet        45636
BMW              32415
Nissan           29595
Dodge            15188
Kia              13532
Audi             12618
GMC              12396
Jeep             11983
Acura            11049
Mercedes-Benz    10600
Lexus             8997
Chrysler          7319
Honda             6778
Cadillac          6569
Mazda             6123
Buick             5637
Lincoln           3086
Mitsubishi        1845
MINI              1782
Land              1651
Porsche           1547
Pontiac            849
FIAT               805
Mercury            518
Bentley            367
Maserati           361
Aston              149
Ferrari             74
Oldsmobile          55
Alfa                44
Lamborghini         37
Genesis             32
AM                  19
Plymouth            16
Lotus               14
McLaren             13
Fisker               5
Freightliner         4
Maybach              3
Geo                  1
Name: make, dtype: int64

In [11]:
cars['state'].value_counts()

 TX    36754
 FL    27848
 CA    27537
 GA    14240
 IL    13283
 NC    12504
 VA    11170
 PA     9405
 NY     9380
 NJ     8940
 OH     8605
 AZ     8363
 CO     8240
 WA     7829
 TN     6368
 MA     5922
 MD     5902
 IN     5668
 MO     5538
 KY     5474
 MI     4285
 AL     4157
 MN     4045
 WI     3586
 OK     3552
 OR     3537
 UT     3487
 SC     3285
 CT     3011
 KS     2788
 LA     2770
 NV     2477
 AR     2214
 MS     1752
 NM     1673
 IA     1630
 NH     1612
 NE     1607
 ID     1183
 DE      903
 RI      739
 HI      670
 ME      637
 MT      618
 WV      471
 AK      467
 SD      331
 Fl      315
 VT      305
 ND      297
 WY      188
 Va       91
 ga       68
 Md       67
 Ga       62
 Oh       23
 Az       12
 DC        7
 Ca        7
Name: state, dtype: int64

In [12]:
cars.isnull().sum()

price      0
year       0
mileage    0
city       0
state      0
vin        0
make       0
model      0
dtype: int64

#### Create a new variable that holds the calculated average sale price based on the same make, model and year.

#### Create a new binary variable to demonstrate if the actual price is above the average price.

In [13]:
cars['avg_saleprice'] = cars.groupby(['year', 'make', 'model']).price.transform('mean')
cars['gt_avg'] = (cars.price > cars.avg_saleprice).astype(int)

In [15]:
cars.head(10)

Unnamed: 0_level_0,price,year,mileage,city,state,vin,make,model,avg_saleprice,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,16472,2015,18681,Jefferson City,MO,KL4CJBSBXFB267643,Buick,EncoreConvenience,17291.768786,0
2,15749,2015,27592,Highland,IN,KL4CJASB5FB245057,Buick,EncoreFWD,16721.350598,0
3,16998,2015,13650,Boone,NC,KL4CJCSB0FB264921,Buick,EncoreLeather,19080.632911,0
4,15777,2015,25195,New Orleans,LA,KL4CJASB4FB217542,Buick,EncoreFWD,16721.350598,0
5,16784,2015,22800,Las Vegas,NV,KL4CJBSB3FB166881,Buick,EncoreConvenience,17291.768786,0
6,17020,2016,16877,Grand Island,NY,KL4CJASB7GB536760,Buick,EncoreFWD,18122.519084,0
7,15950,2015,27885,West Covina,CA,KL4CJASB3FB241802,Buick,EncoreFWD,16721.350598,0
8,17091,2016,24008,Little Rock,AR,KL4CJASBXGB565542,Buick,EncoreFWD,18122.519084,0
9,16995,2015,8624,Punta Gorda,FL,KL4CJFSB7FB173565,Buick,EncoreConvenience,17291.768786,0
10,17700,2015,13807,Jacksonville,NC,KL4CJBSB8FB068543,Buick,EncoreConvenience,17291.768786,1


In [16]:
cars.drop(columns=['city', 'vin', 'avg_saleprice'], inplace=True)

In [17]:
cars.head()

Unnamed: 0_level_0,price,year,mileage,state,make,model,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,16472,2015,18681,MO,Buick,EncoreConvenience,0
2,15749,2015,27592,IN,Buick,EncoreFWD,0
3,16998,2015,13650,NC,Buick,EncoreLeather,0
4,15777,2015,25195,LA,Buick,EncoreFWD,0
5,16784,2015,22800,NV,Buick,EncoreConvenience,0


#### Automatically encode strings with unique identifiers

In [20]:
from sklearn.preprocessing import LabelEncoder

for col in ['state', 'make', 'model', 'year']:
    le = LabelEncoder().fit(cars[col])
    cars[col] = le.transform(cars[col])

In [23]:
cars.head()

Unnamed: 0_level_0,price,year,mileage,state,make,model,gt_avg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,16472,18,18681,28,7,523,0
2,15749,18,27592,19,7,525,0
3,16998,18,13650,32,7,526,0
4,15777,18,25195,22,7,525,0
5,16784,18,22800,38,7,523,0


#### Drop gt_avg from X and use it as y. Then, split into train and test for X and y.

In [25]:
X, y = cars.drop(columns='gt_avg'), cars.gt_avg

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [32]:
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score

In [62]:
near =  KNeighborsClassifier()

cross_val_score(near, X_train, y_train, cv=10)

array([0.59718026, 0.59483048, 0.59864888, 0.59331991, 0.59718026,
       0.59655086, 0.59898456, 0.59953004, 0.60003357, 0.59540095])

In [56]:
from sklearn.model_selection import GridSearchCV

params = {'n_neighbors': [2, 3, 5, 7, 11, 13],
          'weights': ['uniform', 'distance'],
         'metric': ['euclidean', 'manhattan']}

In [57]:
neighbor = KNeighborsClassifier()

In [55]:
sorted(sklearn.neighbors.VALID_METRICS['brute'])

['braycurtis',
 'canberra',
 'chebyshev',
 'cityblock',
 'correlation',
 'cosine',
 'cosine',
 'dice',
 'euclidean',
 'hamming',
 'haversine',
 'jaccard',
 'kulsinski',
 'l1',
 'l2',
 'mahalanobis',
 'manhattan',
 'matching',
 'minkowski',
 'precomputed',
 'rogerstanimoto',
 'russellrao',
 'seuclidean',
 'sokalmichener',
 'sokalsneath',
 'sqeuclidean',
 'wminkowski',
 'yule']

In [58]:
grid = GridSearchCV(neighbor, params, verbose = 1, cv=3, n_jobs =-1)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  1.9min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [2, 3, 5, 7, 11, 13],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [59]:
results = grid.cv_results_
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_metric', 'param_n_neighbors', 'param_weights', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [63]:
fit_time = results['mean_fit_time']
fit_time

array([0.65503677, 0.39578938, 0.29795138, 0.27857327, 0.28761236,
       0.29119968, 0.31698465, 0.343527  , 0.27823965, 0.29637965,
       0.35768572, 0.30922405, 0.30719129, 0.29123036, 0.32702231,
       0.35729766, 0.29314137, 0.27627103, 0.2952563 , 0.29008404,
       0.28966689, 0.27623367, 0.28400334, 0.28185662])

In [69]:
params = results['params']
params

[{'metric': 'euclidean', 'n_neighbors': 2, 'weights': 'uniform'},
 {'metric': 'euclidean', 'n_neighbors': 2, 'weights': 'distance'},
 {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'},
 {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'},
 {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'},
 {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'},
 {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'uniform'},
 {'metric': 'euclidean', 'n_neighbors': 7, 'weights': 'distance'},
 {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'uniform'},
 {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'},
 {'metric': 'euclidean', 'n_neighbors': 13, 'weights': 'uniform'},
 {'metric': 'euclidean', 'n_neighbors': 13, 'weights': 'distance'},
 {'metric': 'manhattan', 'n_neighbors': 2, 'weights': 'uniform'},
 {'metric': 'manhattan', 'n_neighbors': 2, 'weights': 'distance'},
 {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'uniform'},

In [70]:
test_scores = results['mean_test_score']
test_scores

array([0.5804405 , 0.57313097, 0.58625624, 0.58498064, 0.59466513,
       0.5931084 , 0.5995955 , 0.59857166, 0.60464755, 0.60480281,
       0.60607841, 0.60685468, 0.58777521, 0.58190073, 0.5967254 ,
       0.59551693, 0.60400975, 0.60329223, 0.60828553, 0.60810091,
       0.6137488 , 0.61434464, 0.61476005, 0.61668184])

In [72]:
for p, s in zip(params, test_scores):
    p['score'] = s
pd.DataFrame(params).sort_values(by='score', ascending = False)

Unnamed: 0,metric,n_neighbors,score,weights
23,manhattan,13,0.616682,distance
22,manhattan,13,0.61476,uniform
21,manhattan,11,0.614345,distance
20,manhattan,11,0.613749,uniform
18,manhattan,7,0.608286,uniform
19,manhattan,7,0.608101,distance
11,euclidean,13,0.606855,distance
10,euclidean,13,0.606078,uniform
9,euclidean,11,0.604803,distance
8,euclidean,11,0.604648,uniform
