# SKLearn, joblib и joblibspark

In [1]:
#!pip install findspark

In [1]:
import findspark
findspark.init()
findspark.find()

'/usr/lib/spark'

In [2]:
#! pip install joblibspark

In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats

from joblibspark import register_spark

from pyspark.ml.functions import vector_to_array

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.utils import parallel_backend

In [3]:
data = pd.read_csv('mob_price_data/train.csv')
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
# split data into features and target 

X = data.drop("price_range", axis=1).values 
y = data.price_range.values

In [5]:
# standardize the feature variables 

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# Define the hyperparameter search space
space = {
  "n_estimators": stats.randint(50, 150),
  "criterion": ["gini", "entropy"],
  "min_samples_leaf": stats.randint(1, 20),
  "min_samples_split": stats.uniform(0, 1)
}

n_evals = 200

##  Подбор гиперпараметров на одном узле

In [7]:
model = RandomForestClassifier()

search = RandomizedSearchCV(
  estimator=model,
  param_distributions=space,
  n_iter=n_evals,
  n_jobs=1,
  cv=2,
  verbose=2
)
search.fit(X_scaled, y)

Fitting 2 folds for each of 200 candidates, totalling 400 fits
[CV] criterion=gini, min_samples_leaf=13, min_samples_split=0.11269087426231938, n_estimators=74 
[CV]  criterion=gini, min_samples_leaf=13, min_samples_split=0.11269087426231938, n_estimators=74, total=   0.1s
[CV] criterion=gini, min_samples_leaf=13, min_samples_split=0.11269087426231938, n_estimators=74 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  criterion=gini, min_samples_leaf=13, min_samples_split=0.11269087426231938, n_estimators=74, total=   0.1s
[CV] criterion=entropy, min_samples_leaf=17, min_samples_split=0.3788483876280958, n_estimators=72 
[CV]  criterion=entropy, min_samples_leaf=17, min_samples_split=0.3788483876280958, n_estimators=72, total=   0.1s
[CV] criterion=entropy, min_samples_leaf=17, min_samples_split=0.3788483876280958, n_estimators=72 
[CV]  criterion=entropy, min_samples_leaf=17, min_samples_split=0.3788483876280958, n_estimators=72, total=   0.1s
[CV] criterion=entropy, min_samples_leaf=14, min_samples_split=0.6979710474461183, n_estimators=105 
[CV]  criterion=entropy, min_samples_leaf=14, min_samples_split=0.6979710474461183, n_estimators=105, total=   0.1s
[CV] criterion=entropy, min_samples_leaf=14, min_samples_split=0.6979710474461183, n_estimators=105 
[CV]  criterion=entropy, min_samples_leaf=14, min_samples_split=0.6979710474461183, n_estimators=105, total=   0.1s
[CV] criterion=gini, mi

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:   55.7s finished


RandomizedSearchCV(cv=2, estimator=RandomForestClassifier(), n_iter=200,
                   n_jobs=1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8b3916aaf0>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8b3916a5e0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8b09ac3fd0>},
                   verbose=2)

In [8]:
best = search.best_params_
best["CV Score"] = search.best_score_
display(pd.DataFrame(best, index=[0]))

Unnamed: 0,criterion,min_samples_leaf,min_samples_split,n_estimators,CV Score
0,entropy,6,0.002961,139,0.865


In [9]:
model = RandomForestClassifier()

search = RandomizedSearchCV(
  estimator=model,
  param_distributions=space,
  n_iter=n_evals,
  n_jobs=-1,
  cv=2,
  verbose=2
)
search.fit(X_scaled, y)

Fitting 2 folds for each of 200 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:   26.0s finished


RandomizedSearchCV(cv=2, estimator=RandomForestClassifier(), n_iter=200,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8b3916aaf0>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8b3916a5e0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8b09ac3fd0>},
                   verbose=2)

In [10]:
best = search.best_params_
best["CV Score"] = search.best_score_
display(pd.DataFrame(best, index=[0]))

Unnamed: 0,criterion,min_samples_leaf,min_samples_split,n_estimators,CV Score
0,entropy,7,0.007087,147,0.856


## Подбор гиперпараметров на кластере

In [11]:
register_spark()

parallelism = 4
with parallel_backend("spark", n_jobs=parallelism):
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=space,
        n_iter=n_evals,
        cv=2,
        verbose=2
    )
    search.fit(X_scaled, y)

[Parallel(n_jobs=4)]: Using backend SparkDistributedBackend with 4 concurrent workers.


Fitting 2 folds for each of 200 candidates, totalling 400 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    9.1s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:   19.3s
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:   30.3s
[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:   32.6s finished


In [12]:
best = search.best_params_
best["CV Score"] = search.best_score_
display(pd.DataFrame(best, index=[0]))

Unnamed: 0,criterion,min_samples_leaf,min_samples_split,n_estimators,CV Score
0,entropy,3,0.010258,72,0.8665


In [11]:
register_spark()

In [12]:
model = RandomForestClassifier()

In [None]:
parallelism = 16
with parallel_backend("spark", n_jobs=parallelism):
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=space,
        n_iter=n_evals,
        cv=2,
        verbose=2
    )
    search.fit(X_scaled, y)

In [None]:
spark.stop()