## RandomSearchCV

In [1]:
import pandas as pd
import itertools
import time
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn import datasets
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# load data
iris = datasets.load_iris()

# define model and search space
logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200,
                              random_state=0)
distributions = dict(C=uniform(loc=0, scale=4),
                     penalty=['l2', 'l1'])
clf = RandomizedSearchCV(logistic, distributions, n_iter=5, cv=5, random_state=0)

In [4]:
search = clf.fit(iris.data, iris.target)

In [5]:
search.cv_results_

{'mean_fit_time': array([0.00468955, 0.00311785, 0.00306005, 0.00278425, 0.00272245]),
 'std_fit_time': array([0.0015846 , 0.00027642, 0.00044899, 0.00025228, 0.00025534]),
 'mean_score_time': array([0.0053915 , 0.00026622, 0.00027857, 0.00023961, 0.00023375]),
 'std_score_time': array([1.01584193e-02, 5.38972818e-05, 2.90685345e-05, 4.14867510e-05,
        3.56770441e-05]),
 'param_C': masked_array(data=[2.195254015709299, 3.3770629943240693,
                    2.1795327319875875, 2.4942547871438894,
                    1.75034884505077],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_penalty': masked_array(data=['l1', 'l1', 'l1', 'l2', 'l2'],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 2.195254015709299, 'penalty': 'l1'},
  {'C': 3.3770629943240693, 'penalty': 'l1'},
  {'C': 2.1795327319875875, 'penalty': 'l1'},
  {'C': 2.494254787143

## Manual Implementation

In [6]:
df = pd.DataFrame(iris.data)
df.columns = iris.feature_names
df = df.join(pd.DataFrame(iris.target).rename(columns={0 : "target"}))

In [7]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [8]:
cv_result_dict = {'params' : [], # list(itertools.product(*[parameters[k] for k in parameters]))
                 'mean_fit_times' : [],
                 'fold_0_scores' : [],
                 'fold_1_scores' : [],
                 'fold_2_scores' : [],
                 'fold_3_scores' : [],
                 'fold_4_scores' : [],
                 'mean_scores' : [],
                 'scores_std' : []}

In [9]:
for param in range(20):
    # generate the random parameters for this iteration
    rand_C = uniform.rvs(loc=0, scale=4)
    rand_penalty = np.random.choice(['l2', 'l1'])
    
    this_model = LogisticRegression(solver='saga', tol=1e-2, max_iter=200,
                                    C=rand_C, penalty=rand_penalty, random_state=0)
    this_param_fit_times = []
    this_param_scores = []
    for fold in range(5):
        curr_split_test_df = df[df['target']==0][fold*10 : (fold+1) * 10].append(\
                   df[df['target']==1][fold*10 : (fold+1) * 10]).append(\
                   df[df['target']==2][fold*10 : (fold+1) * 10])
        # relative complement: df \ curr_split_test_df
        curr_split_train_df = df[~df.index.isin(curr_split_test_df.index)] 
        
        start = time.clock()
        this_model.fit(curr_split_train_df.drop(columns=['target']),
                       curr_split_train_df['target'])
        end = time.clock()
        this_param_fit_times.append(end - start)
        pred = this_model.predict(curr_split_test_df.drop(columns=['target']))
        
        # append score to the respective parameter-fold records
        score = round(accuracy_score(curr_split_test_df['target'], pred), 2)
        this_param_scores.append(score)
        cv_result_dict['fold_{}_scores'.format(fold)].append(score)
        
    cv_result_dict['params'].append((rand_C, rand_penalty))
    cv_result_dict['mean_fit_times'].append(round(np.array(this_param_fit_times).mean(), 5))
    cv_result_dict['mean_scores'].append(round(np.array(this_param_scores).mean(), 3))
    cv_result_dict['scores_std'].append(round(np.array(this_param_scores).std(), 3))

In [10]:
# makes it look better with Jupyter Notebook
for key in cv_result_dict:
    if key == "params":
        continue
    else:
        cv_result_dict[key] = np.array(cv_result_dict[key])

In [11]:
cv_result_dict

{'params': [(3.6327304745227678, 'l2'),
  (1.9598684174613061, 'l1'),
  (2.1023852441807342, 'l1'),
  (3.130910174464849, 'l2'),
  (2.323004382622702, 'l1'),
  (3.724729382721508, 'l1'),
  (2.139763843198466, 'l1'),
  (0.9310307713046209, 'l2'),
  (0.0389955037406815, 'l1'),
  (1.7782125513785636, 'l2'),
  (2.769594516494197, 'l2'),
  (2.206096847913987, 'l2'),
  (1.4751289239689114, 'l1'),
  (0.4846622346594436, 'l2'),
  (0.6737487995125924, 'l1'),
  (1.1498937131325744, 'l2'),
  (2.645468659900834, 'l2'),
  (0.07203715058823157, 'l1'),
  (2.4552082841919045, 'l2'),
  (0.7889607060151747, 'l1')],
 'mean_fit_times': array([0.00534, 0.00462, 0.00447, 0.00388, 0.0042 , 0.00397, 0.00398,
        0.00358, 0.00354, 0.00355, 0.00364, 0.00364, 0.00384, 0.00335,
        0.00363, 0.00349, 0.00366, 0.00415, 0.00367, 0.00389]),
 'fold_0_scores': array([1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 0.97, 0.77, 1.  , 1.  ,
        1.  , 1.  , 0.93, 1.  , 1.  , 1.  , 0.77, 1.  , 1.  ]),
 'fold_1_scores'