<h2>Import Libraries</h2>

In [1]:
import pandas as pd

from sklearn.model_selection import GridSearchCV, cross_val_score
import dask_ml.model_selection as dcv
from dask.distributed import Client
import joblib

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

%load_ext autoreload
%autoreload 2

<p><br>
<h2>Load TEST/TRAIN Data</h2>

In [2]:
SEED = 42

In [3]:
data_train = pd.read_csv('data-train-final.csv', index_col=0)
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47520 entries, 3607 to 56422
Columns: 1017 entries, installer_crety to construction_year
dtypes: float64(1017)
memory usage: 369.1 MB


In [4]:
y_train = pd.read_csv('labels-train-final.csv', index_col=0)
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47520 entries, 3607 to 56422
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   status_group  47520 non-null  object
dtypes: object(1)
memory usage: 742.5+ KB


In [5]:
classes = y_train.status_group.unique()

In [6]:
data_test = pd.read_csv('data-test-final.csv', index_col=0)
data_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11880 entries, 2980 to 41264
Columns: 1017 entries, installer_crety to construction_year
dtypes: float64(1017)
memory usage: 92.3 MB


In [7]:
y_test = pd.read_csv('labels-test-final.csv', index_col=0)
y_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11880 entries, 2980 to 41264
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   status_group  11880 non-null  object
dtypes: object(1)
memory usage: 185.6+ KB


<p><br>
<h2>Models</h2>

<h3>Random Forest Classifier</h3>
<h4>Build/Train</h4>

In [8]:
# GridSearchCV hyperparameters TUNING
param_grid = {
  'bootstrap': [True, False],
  'criterion': ['entropy', 'gini'],
  'max_features': ['auto', 'sqrt', 'log2'],
  'max_depth': [50, 100, None],
  'n_estimators': [100, 500, 1000], # 100 is default
}

K = 3

clf = RandomForestClassifier()

In [9]:
dask_client = Client(n_workers=2, threads_per_worker=8, memory_limit='8GB') #spawns a local cluster; memory_limit is per worker
dask_client

0,1
Client  Scheduler: tcp://127.0.0.1:51851  Dashboard: http://127.0.0.1:51850/status,Cluster  Workers: 2  Cores: 16  Memory: 16.00 GB


In [10]:
# # Dask Grid Search
# grid_clf = dcv.GridSearchCV(
#     clf, 
#     param_grid,
#     cv=K
# )
# with joblib.parallel_backend('dask', scatter=[data_train, y_train]):
#     %time _ = grid_clf.fit(data_train, y_train)
#     best_parameters = grid_clf.best_params_

# with dask backend, sklearn GridSearch CV appears to perform "better" (faster) than dask's GridSearchCV; dask's GridSearchCV also appears to use more memory and is sensitive to the local cluster's workers' memory_limit setting
# # COMMENT OUT EVERYTHING BELOW TO GO WITH LAST GridSearchCV RESULT
# grid_clf = GridSearchCV(
#     clf, 
#     param_grid, 
#     cv=K, 
#     n_jobs=-1, 
#     verbose=20
# )
# with joblib.parallel_backend('dask', scatter=[data_train, y_train]):
#     %time _ = grid_clf.fit(data_train, y_train)
#     best_parameters = grid_clf.best_params_
# # COMMENT OUT EVERYTHING ABOVE TO GO WITH LAST GridSearchCV RESULT

# last run:
# Grid Search found the following optimal parameters: 
# 	bootstrap: True
# 	criterion: 'gini'
# 	max_depth: 50
# 	max_features: 'log2'
# 	n_estimators: 1000

# UNCOMMENT OUT EVERYTHING BELOW TO GO WITH LAST GridSearchCV RESULT
best_parameters = {}
best_parameters['bootstrap'] = True
best_parameters['criterion'] = 'entropy'
best_parameters['max_depth'] = 50
best_parameters['max_features'] = 'auto'
best_parameters['n_estimators'] = 1000
# UNCOMMENT OUT EVERYTHING ABOVE TO GO WITH LAST GridSearchCV RESULT

In [11]:
print("Grid Search found the following optimal parameters: ")
for param_name in sorted(best_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))# # GridSearchCV hyperparameters TUNING

Grid Search found the following optimal parameters: 
	bootstrap: True
	criterion: 'gini'
	max_depth: 50
	max_features: 'auto'
	n_estimators: 1000


In [12]:
clf = RandomForestClassifier(
    oob_score = True,
    bootstrap=best_parameters['bootstrap'],
    criterion=best_parameters['criterion'],
    max_depth=best_parameters['max_depth'],
    max_features=best_parameters['max_features'],
    n_estimators=best_parameters['n_estimators'],
    random_state=SEED,
    n_jobs=-1,
    verbose=1
)
with joblib.parallel_backend('dask', scatter=[data_train, y_train]):
    %time clf.fit(data_train, y_train)

[Parallel(n_jobs=-1)]: Using backend DaskDistributedBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  2.9min finished
CPU times: user 2min 14s, sys: 20.9 s, total: 2min 35s
Wall time: 4min 15s


<h4>Predict</h4>
<h5>Training Data</h5>

In [13]:
with joblib.parallel_backend('dask', scatter=[data_train, y_train]):
    pred_train = clf.predict(data_train)
    print("\nscore: ", clf.score(data_train, y_train), "\n")
    print("\noob score: ", clf.oob_score_, "\n")
    print(classification_report(y_train, pred_train, target_names=classes))

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    1.2s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    2.8s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    5.2s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    6.6s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    1.2s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    2.8s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    5.1s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    6.6s finished

score:  0.9949074074074075 


oob score:  0.7973905723905724 

                         precision    recall  f1-score   support

             functional       0.99      1.00      1.00     25802

<h5>Testing Data</h5>

In [14]:
with joblib.parallel_backend('dask', scatter=[data_train, y_train]):
    pred_test = clf.predict(data_test)
    print("\nscore: ", clf.score(data_test, y_test), "\n")
    print(classification_report(y_test, pred_test, target_names=classes))

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.5s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    1.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    1.2s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.3s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.7s
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:    1.3s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed:    1.7s finished

score:  0.7978956228956229 

                         precision    recall  f1-score   support

             functional       0.81      0.87      0.84      6457
         non functional       0.5

In [15]:
# placeholder