##### Jupyter Notebook, Step 4 - Build Model
- Implement your final model
- (Optionally) use the entire data set

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline
from tqdm import tqdm
import pickle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
# from sklearn.feature_selection import SelectPercentile, SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import (precision_score, 
                             accuracy_score, 
                             roc_auc_score, 
                             roc_curve, 
                             precision_recall_curve, 
                             recall_score
#                              make_scorer,
#                              auc,
#                              classification_report,
#                              confusion_matrix
                            )
# !conda install psycopg2 --yes
# import psycopg2 as pg2
# from psycopg2.extras import RealDictCursor
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [None]:
# draw all the data for just our 20 features

# run the models above a certain threshold on these, with a held-out test set

In [None]:
# import UCI madelon data and set the nan column to labels

def import_and_labels(data, labels):
    """Read a feature file and a label file and combine them into one frame.

    Both files are parsed with ``pd.read_csv`` as space-delimited text with
    no header row, so the resulting columns are numbered from 0.

    Parameters
    ----------
    data : path of the feature file.
    labels : path of the label file.

    Returns
    -------
    pandas.DataFrame
        The feature frame with the labels stored under column ``500``.
    """
    read_options = {'delimiter': ' ', 'header': None}
    features = pd.read_csv(data, **read_options)
    targets = pd.read_csv(labels, **read_options)
    # NOTE(review): presumably every feature row ends with a trailing
    # delimiter, so column 500 parses as all-NaN before being replaced by the
    # labels here -- confirm against the data file.
    features[500] = targets
    return features

# Load the UCI Madelon training features and labels; both CSV paths are
# relative, so the files must sit in the notebook's working directory.
madelon_train = import_and_labels('madelon_train.data.csv','madelon_train.labels.csv')

In [None]:
# Columns kept from the UCI frame: 20 integer feature columns plus column 500,
# which holds the labels.
UCIsample_list = [
    28, 48, 64, 105, 128, 153, 241,
    281, 318, 336, 338, 378, 433, 442,
    451, 453, 455, 472, 475, 493, 500,
]

# Columns kept from the database table: 20 named features plus 'target'.
DBsample_list = [
    'feat_257', 'feat_269', 'feat_308', 'feat_315',
    'feat_336', 'feat_341', 'feat_395', 'feat_504',
    'feat_526', 'feat_639', 'feat_681', 'feat_701',
    'feat_724', 'feat_736', 'feat_769', 'feat_808',
    'feat_829', 'feat_867', 'feat_920', 'feat_956',
    'target',
]

def drop_noise(df, signal_list):
    """Return a copy of ``df`` holding only the columns named in ``signal_list``.

    The input frame is left untouched: the result is an independent copy, so
    re-running a cell that calls this function always starts from the same
    unmodified input. Column order follows ``df``, not ``signal_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both signal and noise columns.
    signal_list : container of column labels
        Labels to keep; labels that do not occur in ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        New frame with every column whose label is in ``signal_list``.
    """
    # A boolean mask (rather than a list of labels) keeps the selection
    # positional, so it behaves sensibly even with repeated column labels.
    keep_mask = [column in signal_list for column in df.columns]
    return df.loc[:, keep_mask].copy()

In [None]:
# Reduce the UCI frame to the columns listed in UCIsample_list, then preview
# the first rows of the result.
UCIfull_clean = drop_noise(madelon_train, UCIsample_list)
UCIfull_clean.head()

In [None]:
# Cache the reduced UCI frame on disk. The context manager closes the file
# handle as soon as the dump finishes instead of leaving it open.
with open('UCIfull_clean', "wb") as pickle_file:
    pickle.dump(UCIfull_clean, pickle_file)


In [None]:
def con_cur_to_class_db(host='34.211.227.227', dbname='postgres', user='postgres'):
    """Open a PostgreSQL connection and return it together with a dict cursor.

    Parameters
    ----------
    host, dbname, user : str
        Connection settings passed to ``psycopg2.connect``. The defaults are
        the values this notebook has always used; no password is supplied.

    Returns
    -------
    tuple
        ``(con, cur)`` where ``cur`` was created with
        ``cursor_factory=RealDictCursor``. The caller is responsible for
        closing the connection.
    """
    # Imported locally because the psycopg2 lines in the notebook's import
    # cell are commented out; without these the names pg2 / RealDictCursor
    # are undefined and the function fails with NameError.
    import psycopg2 as pg2
    from psycopg2.extras import RealDictCursor

    # NOTE(review): the default host is a hard-coded IP address -- consider
    # reading connection settings from the environment instead.
    con = pg2.connect(host=host,
                      dbname=dbname,
                      user=user)
    cur = con.cursor(cursor_factory=RealDictCursor)
    return con, cur

def draw_sample():
    """Fetch the 20 selected feature columns plus ``target`` from ``madelon``.

    Opens a connection through ``con_cur_to_class_db``, runs a single SELECT
    over the whole table and returns the fetched rows as a DataFrame. The
    connection is closed even when the query or the fetch raises.

    Returns
    -------
    pandas.DataFrame
        Built directly from the rows returned by ``cur.fetchall()``.
    """
    con, cur = con_cur_to_class_db()
    try:
        cur.execute('SELECT feat_257,feat_269,feat_308,feat_315,feat_336,feat_341,feat_395,feat_504,feat_526,feat_639,feat_681,feat_701,feat_724,feat_736,feat_769,feat_808,feat_829,feat_867,feat_920,feat_956,target FROM madelon;')
        mad_db = cur.fetchall()
    finally:
        # Release the connection on the error path as well as on success.
        con.close()
    return pd.DataFrame(mad_db)

In [None]:
DBfull_clean = draw_sample()
# Cache the database sample on disk so later cells can reload it without
# querying again. The context manager closes the file handle after the dump.
with open('DBfull_clean', "wb") as pickle_file:
    pickle.dump(DBfull_clean, pickle_file)

In [None]:
# Show the (rows, columns) dimensions of the frame drawn from the database.
DBfull_clean.shape

In [None]:
# Reload the cached database sample. The context manager closes the file
# handle once the load completes.
# NOTE: pickle.load executes arbitrary code from the file -- only load
# pickles this notebook wrote itself.
with open("DBfull_clean", "rb") as pickle_file:
    DBfull_clean = pickle.load(pickle_file)

# drop the seemingly unneeded ID column
# DBfull_clean.drop('_id', axis=1, inplace=True)
DBfull_clean.head()

In [8]:
# define my controlling function and pipelines to gridsearch

# Reload both cached samples. Each file handle is closed as soon as its load
# finishes instead of being left open.
# NOTE: pickle.load executes arbitrary code from the file -- only load
# pickles this notebook wrote itself.
with open("DBfull_clean", "rb") as pickle_file:
    DBfull_clean = pickle.load(pickle_file)
with open("UCIfull_clean", "rb") as pickle_file:
    UCIfull_clean = pickle.load(pickle_file)

def score_pipelines(sample_list, model_zip):
    """Fit every model on every sample and tabulate hold-out metrics.

    For each sample the last column is used as the target and all preceding
    columns as features. The sample is split 75/25 (stratified on the target,
    ``random_state=42``); each model is fitted on the training part and
    scored on both parts.

    Parameters
    ----------
    sample_list : iterable of pandas.DataFrame
        Frames to evaluate. A ``name`` attribute, when present, is recorded
        in the ``'sample'`` column; otherwise ``None`` is recorded.
    model_zip : list of (str, estimator)
        Display name and estimator pairs. Each estimator must provide
        ``fit``, ``predict`` and ``score``. The estimators themselves are
        fitted in place, once per sample, so after the call they hold the fit
        for the last sample.

    Returns
    -------
    pandas.DataFrame
        One row per (sample, model) with the keys ``sample``, ``name``,
        ``model``, ``best_params``, ``train_accuracy``, ``test_accuracy``,
        ``recall`` and ``precision``.
    """
    import copy

    results = []
    for sample in sample_list:
        y = sample.iloc[:, -1]
        X = sample.iloc[:, 0:-1]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=42, test_size=0.25, stratify=y)
        # Ad-hoc attributes on a DataFrame are easily lost (e.g. after a
        # pickle round trip), so fall back to None instead of raising.
        sample_name = getattr(sample, 'name', None)
        for model_name, model in tqdm(model_zip):
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            recall = recall_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            results.append({
                    'sample': sample_name,
                    'name': '{}'.format(model_name),
                    # Store a snapshot: the same estimator object is refitted
                    # for the next sample, which would otherwise make every
                    # earlier row point at the last sample's fit.
                    'model': copy.deepcopy(model),
                    # Only search objects expose best_params_.
                    'best_params': getattr(model, 'best_params_', None),
                    'train_accuracy': model.score(X_train, y_train),
                    'test_accuracy': model.score(X_test, y_test),
                    'recall': recall,
                    'precision': precision,
            })
    return pd.DataFrame(results)

def _scaled_pca_pipeline(step_name, estimator):
    """Build a StandardScaler -> PCA(n_components=5) -> estimator pipeline.

    ``step_name`` names the final step; the parameter grids address the
    estimator's settings through that prefix (``classifier__...`` or
    ``clf__...``). Every call creates fresh scaler and PCA instances.
    """
    return Pipeline([
        ('scaling', StandardScaler()),
        ('pca', PCA(n_components=5)),
        (step_name, estimator)])


# All five pipelines share the same preprocessing and differ only in the
# final estimator and in the name of its step.
knn = _scaled_pca_pipeline('classifier', KNeighborsClassifier())

svc = _scaled_pca_pipeline('classifier', SVC())

bag = _scaled_pca_pipeline(
    'clf',
    BaggingClassifier(DecisionTreeClassifier(random_state=42), max_samples=.8, random_state=42))

rfc = _scaled_pca_pipeline('clf', RandomForestClassifier(random_state=42))

etc = _scaled_pca_pipeline('clf', ExtraTreesClassifier(random_state=42))

# KNN grid: distance-weighted votes over the odd neighbourhood sizes 3..11.
knn_params = dict(
    classifier__weights=['distance'],
    classifier__n_neighbors=np.arange(3, 13, 2),
)

# RBF-kernel SVC grid: gamma and C each take seven log-spaced values
# from 1e-3 to 1e3.
svc_params = dict(
    classifier__gamma=np.logspace(-3, 3, 7),
    classifier__C=np.logspace(-3, 3, 7),
    classifier__kernel=['rbf'],
)

# Bagging grid: base trees of depth 5, 7, 10 or unlimited, crossed with
# three ensemble sizes.
# NOTE(review): the 'base_estimator' parameter name and the 'auto' value for
# max_features below are tied to the scikit-learn release this notebook was
# run with -- confirm before upgrading.
bag_params = dict(
    clf__base_estimator=[
        DecisionTreeClassifier(max_depth=depth, random_state=42)
        for depth in (5, 7, 10, None)
    ],
    clf__n_estimators=[10, 50, 100],
)

# Random-forest grid: ensemble size crossed with the per-split feature rule.
rfc_params = dict(
    clf__n_estimators=[10, 50, 100, 200],
    clf__max_features=['auto', 'log2'],
)

# Extra-trees grid: with and without bootstrap, crossed with ensemble size.
etc_params = dict(
    clf__bootstrap=[True, False],
    clf__n_estimators=[10, 50, 100, 200],
)

# Every search uses the same settings: cv=5, n_jobs=-1 and verbose=1.
_search_options = dict(cv=5, n_jobs=-1, verbose=1)

bag_gs = GridSearchCV(bag, bag_params, **_search_options)
rfc_gs = GridSearchCV(rfc, rfc_params, **_search_options)
etc_gs = GridSearchCV(etc, etc_params, **_search_options)
knn_gs = GridSearchCV(knn, knn_params, **_search_options)
svc_gs = GridSearchCV(svc, svc_params, **_search_options)

# Tag each frame with a label; score_pipelines reads it back as `sample.name`.
setattr(UCIfull_clean, 'name', 'UCIfull_clean')
setattr(DBfull_clean, 'name', 'DBfull_clean')

# Searches kept for the final run. The pipes that did poorly in grid
# searching after tuning (logr, dct) are left out.
model_zip = [
    ('knn_gs', knn_gs),
    ('svc_gs', svc_gs),
    ('bag_gs', bag_gs),
    ('rfc_gs', rfc_gs),
    ('etc_gs', etc_gs),
]
pipe_names = [label for label, _ in model_zip]
pipe_list = [search for _, search in model_zip]


In [9]:
# Evaluate every grid search on the UCI sample only, then show the results
# table with the highest test accuracy first.
sample_list = [UCIfull_clean]
results = score_pipelines(sample_list, model_zip)
display(results.sort_values('test_accuracy', ascending=False))

  0%|          | 0/5 [00:00<?, ?it/s]

Fitting 5 folds for each of 5 candidates, totalling 25 fits





JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/opt/conda/lib/python3.6/runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/opt/conda/lib/python3.6/runpy.py in _run_code(code=<code object <module> at 0x7f79def11ed0, file "/...3.6/site-packages/ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/opt/conda/lib/python3.6/site-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/opt/conda/lib/python3.6/site-packages/ipykernel/kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x7f79def11ed0, file "/...3.6/site-packages/ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/opt/conda/lib/python3.6/site-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/opt/conda/lib/python3.6/site-packages/ipykernel/kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()
     17 
     18 
     19 
     20 

...........................................................................
/opt/conda/lib/python3.6/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/opt/conda/lib/python3.6/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    472             return self.subapp.start()
    473         if self.poller is not None:
    474             self.poller.start()
    475         self.kernel.start()
    476         try:
--> 477             ioloop.IOLoop.instance().start()
    478         except KeyboardInterrupt:
    479             pass
    480 
    481 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/opt/conda/lib/python3.6/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    172             )
    173         return loop
    174     
    175     def start(self):
    176         try:
--> 177             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    178         except ZMQError as e:
    179             if e.errno == ETERM:
    180                 # quietly return on ETERM
    181                 pass

...........................................................................
/opt/conda/lib/python3.6/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    883                 self._events.update(event_pairs)
    884                 while self._events:
    885                     fd, events = self._events.popitem()
    886                     try:
    887                         fd_obj, handler_func = self._handlers[fd]
--> 888                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    889                     except (OSError, IOError) as e:
    890                         if errno_from_exception(e) == errno.EPIPE:
    891                             # Happens when the client closes the connection
    892                             pass

...........................................................................
/opt/conda/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
/opt/conda/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/opt/conda/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/opt/conda/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/opt/conda/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
/opt/conda/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/opt/conda/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': "sample_list = [UCIfull_clean]\nresults = score_pi...ts.sort_values('test_accuracy', ascending=False))", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 11, 11, 1, 10, 26, 734374, tzinfo=tzlocal()), 'msg_id': 'A0341C46063B4393B8F8B29085A5CB59', 'msg_type': 'execute_request', 'session': 'B7857C7A06174F88859E10F31B92B67D', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'A0341C46063B4393B8F8B29085A5CB59', 'msg_type': 'execute_request', 'parent_header': {}})
    230             self.log.warn("Unknown message type: %r", msg_type)
    231         else:
    232             self.log.debug("%s: %s", msg_type, msg)
    233             self.pre_handler_hook()
    234             try:
--> 235                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'B7857C7A06174F88859E10F31B92B67D']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': "sample_list = [UCIfull_clean]\nresults = score_pi...ts.sort_values('test_accuracy', ascending=False))", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 11, 11, 1, 10, 26, 734374, tzinfo=tzlocal()), 'msg_id': 'A0341C46063B4393B8F8B29085A5CB59', 'msg_type': 'execute_request', 'session': 'B7857C7A06174F88859E10F31B92B67D', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'A0341C46063B4393B8F8B29085A5CB59', 'msg_type': 'execute_request', 'parent_header': {}}
    236             except Exception:
    237                 self.log.error("Exception in message handler:", exc_info=True)
    238             finally:
    239                 self.post_handler_hook()

...........................................................................
/opt/conda/lib/python3.6/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'B7857C7A06174F88859E10F31B92B67D'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': "sample_list = [UCIfull_clean]\nresults = score_pi...ts.sort_values('test_accuracy', ascending=False))", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 11, 11, 1, 10, 26, 734374, tzinfo=tzlocal()), 'msg_id': 'A0341C46063B4393B8F8B29085A5CB59', 'msg_type': 'execute_request', 'session': 'B7857C7A06174F88859E10F31B92B67D', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': 'A0341C46063B4393B8F8B29085A5CB59', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
/opt/conda/lib/python3.6/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code="sample_list = [UCIfull_clean]\nresults = score_pi...ts.sort_values('test_accuracy', ascending=False))", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    191 
    192         self._forward_input(allow_stdin)
    193 
    194         reply_content = {}
    195         try:
--> 196             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = "sample_list = [UCIfull_clean]\nresults = score_pi...ts.sort_values('test_accuracy', ascending=False))"
        store_history = True
        silent = False
    197         finally:
    198             self._restore_input()
    199 
    200         if res.error_before_exec is not None:

...........................................................................
/opt/conda/lib/python3.6/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=("sample_list = [UCIfull_clean]\nresults = score_pi...ts.sort_values('test_accuracy', ascending=False))",), **kwargs={'silent': False, 'store_history': True})
    528             )
    529         self.payload_manager.write_payload(payload)
    530 
    531     def run_cell(self, *args, **kwargs):
    532         self._last_traceback = None
--> 533         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ("sample_list = [UCIfull_clean]\nresults = score_pi...ts.sort_values('test_accuracy', ascending=False))",)
        kwargs = {'silent': False, 'store_history': True}
    534 
    535     def _showtraceback(self, etype, evalue, stb):
    536         # try to preserve ordering of tracebacks and print statements
    537         sys.stdout.flush()

...........................................................................
/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="sample_list = [UCIfull_clean]\nresults = score_pi...ts.sort_values('test_accuracy', ascending=False))", store_history=True, silent=False, shell_futures=True)
   2723                 self.displayhook.exec_result = result
   2724 
   2725                 # Execute the user code
   2726                 interactivity = "none" if silent else self.ast_node_interactivity
   2727                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2728                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2729                 
   2730                 self.last_execution_succeeded = not has_raised
   2731                 self.last_execution_result = result
   2732 

...........................................................................
/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>], cell_name='<ipython-input-9-83908fb7087f>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 7f79a0993898, executi..._before_exec=None error_in_exec=None result=None>)
   2845 
   2846         try:
   2847             for i, node in enumerate(to_run_exec):
   2848                 mod = ast.Module([node])
   2849                 code = compiler(mod, cell_name, "exec")
-> 2850                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7f79a0ae6ae0, file "<ipython-input-9-83908fb7087f>", line 2>
        result = <ExecutionResult object at 7f79a0993898, executi..._before_exec=None error_in_exec=None result=None>
   2851                     return True
   2852 
   2853             for i, node in enumerate(to_run_interactive):
   2854                 mod = ast.Interactive([node])

...........................................................................
/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7f79a0ae6ae0, file "<ipython-input-9-83908fb7087f>", line 2>, result=<ExecutionResult object at 7f79a0993898, executi..._before_exec=None error_in_exec=None result=None>)
   2905         outflag = True  # happens in more places, so it's easier as default
   2906         try:
   2907             try:
   2908                 self.hooks.pre_run_code_hook()
   2909                 #rprint('Running code', repr(code_obj)) # dbg
-> 2910                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7f79a0ae6ae0, file "<ipython-input-9-83908fb7087f>", line 2>
        self.user_global_ns = {'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'BaggingClassifier': <class 'sklearn.ensemble.bagging.BaggingClassifier'>, 'DBfull_clean':         feat_257  feat_269  feat_308  feat_315  ...0 -0.559682       1  

[200000 rows x 21 columns], 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', 'import pandas as pd\nimport numpy as np\nimport se...ictCursor\nfrom sklearn.linear_model import Linear', 'import pandas as pd\nimport numpy as np\nimport se...DictCursor\nfrom sklearn.linear_model import Lasso', '# define my controlling function and pipelines t..._gs]\nmodel_zip = list(zip(pipe_names, pipe_list))', '# define my controlling function and pipelines t..._gs]\nmodel_zip = list(zip(pipe_names, pipe_list))', 'import pandas as pd\nimport numpy as np\nimport se... sklearn.feature_selection import SelectFromModel', '# define my controlling function and pipelines t..._gs]\nmodel_zip = list(zip(pipe_names, pipe_list))', '# define my controlling function and pipelines t..._gs]\nmodel_zip = list(zip(pipe_names, pipe_list))', '# define my controlling function and pipelines t..._gs]\nmodel_zip = list(zip(pipe_names, pipe_list))', "sample_list = [UCIfull_clean]\nresults = score_pi...ts.sort_values('test_accuracy', ascending=False))"], 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'Lasso': <class 'sklearn.linear_model.coordinate_descent.Lasso'>, 'LinearSVC': <class 'sklearn.svm.classes.LinearSVC'>, ...}
        self.user_ns = {'AdaBoostClassifier': <class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>, 'BaggingClassifier': <class 'sklearn.ensemble.bagging.BaggingClassifier'>, 'DBfull_clean':         feat_257  feat_269  feat_308  feat_315  ...0 -0.559682       1  

[200000 rows x 21 columns], 'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', 'import pandas as pd\nimport numpy as np\nimport se...ictCursor\nfrom sklearn.linear_model import Linear', 'import pandas as pd\nimport numpy as np\nimport se...DictCursor\nfrom sklearn.linear_model import Lasso', '# define my controlling function and pipelines t..._gs]\nmodel_zip = list(zip(pipe_names, pipe_list))', '# define my controlling function and pipelines t..._gs]\nmodel_zip = list(zip(pipe_names, pipe_list))', 'import pandas as pd\nimport numpy as np\nimport se... sklearn.feature_selection import SelectFromModel', '# define my controlling function and pipelines t..._gs]\nmodel_zip = list(zip(pipe_names, pipe_list))', '# define my controlling function and pipelines t..._gs]\nmodel_zip = list(zip(pipe_names, pipe_list))', '# define my controlling function and pipelines t..._gs]\nmodel_zip = list(zip(pipe_names, pipe_list))', "sample_list = [UCIfull_clean]\nresults = score_pi...ts.sort_values('test_accuracy', ascending=False))"], 'KNeighborsClassifier': <class 'sklearn.neighbors.classification.KNeighborsClassifier'>, 'Lasso': <class 'sklearn.linear_model.coordinate_descent.Lasso'>, 'LinearSVC': <class 'sklearn.svm.classes.LinearSVC'>, ...}
   2911             finally:
   2912                 # Reset our crash handler in place
   2913                 sys.excepthook = old_excepthook
   2914         except SystemExit as e:

...........................................................................
/home/jovyan/work/project_3/<ipython-input-9-83908fb7087f> in <module>()
      1 
----> 2 
      3 
      4 
      5 sample_list = [UCIfull_clean]
      6 results = score_pipelines(sample_list, model_zip)
      7 display(results.sort_values('test_accuracy', ascending=False))
      8 
      9 
     10 

...........................................................................
/home/jovyan/work/project_3/<ipython-input-8-d78be5dfc6fc> in score_pipelines(sample_list=[      28   48   64   105  128  153  241  281  31...2  487  560  449    1  

[2000 rows x 21 columns]], model_zip=[('knn_gs', GridSearchCV(cv=5, error_score='raise',
       e...train_score=True,
       scoring=None, verbose=1)), ('svc_gs', GridSearchCV(cv=5, error_score='raise',
       e...train_score=True,
       scoring=None, verbose=1)), ('bag_gs', GridSearchCV(cv=5, error_score='raise',
       e...train_score=True,
       scoring=None, verbose=1)), ('rfc_gs', GridSearchCV(cv=5, error_score='raise',
       e...train_score=True,
       scoring=None, verbose=1)), ('etc_gs', GridSearchCV(cv=5, error_score='raise',
       e...train_score=True,
       scoring=None, verbose=1))])
      8     for sample in sample_list:
      9         y = sample.iloc[:,-1]
     10         X = sample.iloc[:,0:-1]
     11         X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size=0.25, stratify=y)
     12         for model_name, model in tqdm(model_zip):
---> 13             model.fit(X_train, y_train)
     14             y_pred = model.predict(X_test)
     15             recall = recall_score(y_test, y_pred)
     16             precision = precision_score(y_test, y_pred)
     17             results.append({

...........................................................................
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self=GridSearchCV(cv=5, error_score='raise',
       e...train_score=True,
       scoring=None, verbose=1), X=      28   48   64   105  128  153  241  281  31...7  452  503  537  612  

[1500 rows x 20 columns], y=1082   -1
1771   -1
164     1
23      1
506    -...    1
578     1
1067    1
Name: 500, dtype: int64, groups=None)
    940 
    941         groups : array-like, with shape (n_samples,), optional
    942             Group labels for the samples used while splitting the dataset into
    943             train/test set.
    944         """
--> 945         return self._fit(X, y, groups, ParameterGrid(self.param_grid))
        self._fit = <bound method BaseSearchCV._fit of GridSearchCV(...rain_score=True,
       scoring=None, verbose=1)>
        X =       28   48   64   105  128  153  241  281  31...7  452  503  537  612  

[1500 rows x 20 columns]
        y = 1082   -1
1771   -1
164     1
23      1
506    -...    1
578     1
1067    1
Name: 500, dtype: int64
        groups = None
        self.param_grid = {'classifier__n_neighbors': array([ 3,  5,  7,  9, 11]), 'classifier__weights': ['distance']}
    946 
    947 
    948 class RandomizedSearchCV(BaseSearchCV):
    949     """Randomized search on hyper parameters.

...........................................................................
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _fit(self=GridSearchCV(cv=5, error_score='raise',
       e...train_score=True,
       scoring=None, verbose=1), X=      28   48   64   105  128  153  241  281  31...7  452  503  537  612  

[1500 rows x 20 columns], y=1082   -1
1771   -1
164     1
23      1
506    -...    1
578     1
1067    1
Name: 500, dtype: int64, groups=None, parameter_iterable=<sklearn.model_selection._search.ParameterGrid object>)
    559                                   fit_params=self.fit_params,
    560                                   return_train_score=self.return_train_score,
    561                                   return_n_test_samples=True,
    562                                   return_times=True, return_parameters=True,
    563                                   error_score=self.error_score)
--> 564           for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.model_selection._search.ParameterGrid object>
    565           for train, test in cv_iter)
    566 
    567         # if one choose to see train score, "out" will contain train score info
    568         if self.return_train_score:

...........................................................................
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseSearchCV._fit.<locals>.<genexpr>>)
    763             if pre_dispatch == "all" or n_jobs == 1:
    764                 # The iterable was consumed all at once by the above for loop.
    765                 # No need to wait for async callbacks to trigger to
    766                 # consumption.
    767                 self._iterating = False
--> 768             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    769             # Make sure that we get a last message telling us we are done
    770             elapsed_time = time.time() - self._start_time
    771             self._print('Done %3i out of %3i | elapsed: %s finished',
    772                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Sat Nov 11 01:10:26 2017
PID: 170                                Python 3.6.2: /opt/conda/bin/python
...........................................................................
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (Pipeline(steps=[('scaling', StandardScaler(copy=...ighbors=3, p=2,
           weights='distance'))]),       28   48   64   105  128  153  241  281  31...7  452  503  537  612  

[1500 rows x 20 columns], 1082   -1
1771   -1
164     1
23      1
506    -...    1
578     1
1067    1
Name: 500, dtype: int64, <function _passthrough_scorer>, array([ 283,  286,  287, ..., 1497, 1498, 1499]), array([  0,   1,   2,   3,   4,   5,   6,   7,  ...99, 300, 301, 307, 308, 311, 315, 316, 317, 318]), 1, {'classifier__n_neighbors': 3, 'classifier__weights': 'distance'}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/opt/conda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (Pipeline(steps=[('scaling', StandardScaler(copy=...ighbors=3, p=2,
           weights='distance'))]),       28   48   64   105  128  153  241  281  31...7  452  503  537  612  

[1500 rows x 20 columns], 1082   -1
1771   -1
164     1
23      1
506    -...    1
578     1
1067    1
Name: 500, dtype: int64, <function _passthrough_scorer>, array([ 283,  286,  287, ..., 1497, 1498, 1499]), array([  0,   1,   2,   3,   4,   5,   6,   7,  ...99, 300, 301, 307, 308, 311, 315, 316, 317, 318]), 1, {'classifier__n_neighbors': 3, 'classifier__weights': 'distance'})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=Pipeline(steps=[('scaling', StandardScaler(copy=...ighbors=3, p=2,
           weights='distance'))]), X=      28   48   64   105  128  153  241  281  31...7  452  503  537  612  

[1500 rows x 20 columns], y=1082   -1
1771   -1
164     1
23      1
506    -...    1
578     1
1067    1
Name: 500, dtype: int64, scorer=<function _passthrough_scorer>, train=array([ 283,  286,  287, ..., 1497, 1498, 1499]), test=array([  0,   1,   2,   3,   4,   5,   6,   7,  ...99, 300, 301, 307, 308, 311, 315, 316, 317, 318]), verbose=1, parameters={'classifier__n_neighbors': 3, 'classifier__weights': 'distance'}, fit_params={}, return_train_score=True, return_parameters=True, return_n_test_samples=True, return_times=True, error_score='raise')
    233 
    234     try:
    235         if y_train is None:
    236             estimator.fit(X_train, **fit_params)
    237         else:
--> 238             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method Pipeline.fit of Pipeline(steps=[('...ghbors=3, p=2,
           weights='distance'))])>
        X_train =       28   48   64   105  128  153  241  281  31...7  452  503  537  612  

[1200 rows x 20 columns]
        y_train = 569    -1
13     -1
1654   -1
1218   -1
1890   -...    1
578     1
1067    1
Name: 500, dtype: int64
        fit_params = {}
    239 
    240     except Exception as e:
    241         # Note fit time as time until error
    242         fit_time = time.time() - start_time

...........................................................................
/opt/conda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self=Pipeline(steps=[('scaling', StandardScaler(copy=...ighbors=3, p=2,
           weights='distance'))]), X=      28   48   64   105  128  153  241  281  31...7  452  503  537  612  

[1200 rows x 20 columns], y=569    -1
13     -1
1654   -1
1218   -1
1890   -...    1
578     1
1067    1
Name: 500, dtype: int64, **fit_params={})
    265         self : Pipeline
    266             This estimator
    267         """
    268         Xt, fit_params = self._fit(X, y, **fit_params)
    269         if self._final_estimator is not None:
--> 270             self._final_estimator.fit(Xt, y, **fit_params)
        self._final_estimator.fit = <bound method SupervisedIntegerMixin.fit of KNei...neighbors=3, p=2,
           weights='distance')>
        Xt = array([], shape=(1200, 0), dtype=float64)
        y = 569    -1
13     -1
1654   -1
1218   -1
1890   -...    1
578     1
1067    1
Name: 500, dtype: int64
        fit_params = {}
    271         return self
    272 
    273     def fit_transform(self, X, y=None, **fit_params):
    274         """Fit the model and transform with the final estimator

...........................................................................
/opt/conda/lib/python3.6/site-packages/sklearn/neighbors/base.py in fit(self=KNeighborsClassifier(algorithm='auto', leaf_size..._neighbors=3, p=2,
           weights='distance'), X=array([], shape=(1200, 0), dtype=float64), y=569    -1
13     -1
1654   -1
1218   -1
1890   -...    1
578     1
1067    1
Name: 500, dtype: int64)
    756         y : {array-like, sparse matrix}
    757             Target values of shape = [n_samples] or [n_samples, n_outputs]
    758 
    759         """
    760         if not isinstance(X, (KDTree, BallTree)):
--> 761             X, y = check_X_y(X, y, "csr", multi_output=True)
        X = array([], shape=(1200, 0), dtype=float64)
        y = 569    -1
13     -1
1654   -1
1218   -1
1890   -...    1
578     1
1067    1
Name: 500, dtype: int64
    762 
    763         if y.ndim == 1 or y.ndim == 2 and y.shape[1] == 1:
    764             if y.ndim != 1:
    765                 warnings.warn("A column-vector y was passed when a 1d array "

...........................................................................
/opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py in check_X_y(X=array([], shape=(1200, 0), dtype=float64), y=569    -1
13     -1
1654   -1
1218   -1
1890   -...    1
578     1
1067    1
Name: 500, dtype: int64, accept_sparse='csr', dtype='numeric', order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=True, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, warn_on_dtype=False, estimator=None)
    516     y_converted : object
    517         The converted and validated y.
    518     """
    519     X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
    520                     ensure_2d, allow_nd, ensure_min_samples,
--> 521                     ensure_min_features, warn_on_dtype, estimator)
        ensure_min_features = 1
        warn_on_dtype = False
        estimator = None
    522     if multi_output:
    523         y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
    524                         dtype=None)
    525     else:

...........................................................................
/opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array=array([], shape=(1200, 0), dtype=float64), accept_sparse=['csr'], dtype=None, order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1, warn_on_dtype=False, estimator=None)
    419         n_features = array.shape[1]
    420         if n_features < ensure_min_features:
    421             raise ValueError("Found array with %d feature(s) (shape=%s) while"
    422                              " a minimum of %d is required%s."
    423                              % (n_features, shape_repr, ensure_min_features,
--> 424                                 context))
        context = ''
    425 
    426     if warn_on_dtype and dtype_orig is not None and array.dtype != dtype_orig:
    427         msg = ("Data with input dtype %s was converted to %s%s."
    428                % (dtype_orig, array.dtype, context))

ValueError: Found array with 0 feature(s) (shape=(1200, 0)) while a minimum of 1 is required.
___________________________________________________________________________

In [None]:
# Show the best hyper-parameters recorded in `results`, one entry per row,
# limited to the first five rows. Iterating over `.head(5)` instead of
# indexing with a hard-coded `range(0, 5)` avoids an IndexError when
# `results` holds fewer than five rows (e.g. when fewer pipelines were scored).
for row_best_params in results.best_params.head(5):
    display(row_best_params)

# using some of these lessons, modify for the full DB run

In [None]:
# Grids for the full-database run. The ranges were adjusted according to
# which parameter settings crash AWS and which do not.

# Support-vector classifier: RBF kernel, gamma and C each in {0.01, 1, 100}.
svc_params = {
    'classifier__gamma': np.logspace(-2, 2, 3),
    'classifier__C': np.logspace(-2, 2, 3),
    'classifier__kernel': ['rbf'],
}
svc_gs = GridSearchCV(estimator=svc, param_grid=svc_params,
                      cv=3, n_jobs=-1, verbose=10)

# k-nearest neighbours: distance-weighted, k in {5, 7, 9}.
knn_params = {
    'classifier__weights': ['distance'],
    'classifier__n_neighbors': np.arange(5, 11, 2),
}
knn_gs = GridSearchCV(estimator=knn, param_grid=knn_params,
                      cv=3, n_jobs=-1, verbose=10)

# Random forest: a single configuration (100 trees, 'auto' max_features).
# NOTE(review): this grid uses the `clf__` step prefix while the two grids
# above use `classifier__` -- confirm it matches the step name in `rfc`.
rfc_params = {
    'clf__n_estimators': [100],
    'clf__max_features': ['auto'],
}
rfc_gs = GridSearchCV(estimator=rfc, param_grid=rfc_params,
                      cv=3, n_jobs=-1, verbose=10)

# Pair each search object with its label, in the order they will be fitted.
pipe_list = [svc_gs, knn_gs, rfc_gs]
pipe_names = ['svc_gs', 'knn_gs', 'rfc_gs']
model_zip = list(zip(pipe_names, pipe_list))

# Score every search on the full database sample; best test accuracy first.
sample_list = [DBfull_clean]
results = score_pipelines(sample_list, model_zip)
display(results.sort_values(by='test_accuracy', ascending=False))

In [None]:
# Display the best hyper-parameters stored in the first row of `results`.
i = 0
display(results.best_params.iloc[i])

In [None]:
# Follow-up run on the full database with the random forest only, now
# comparing 100 against 200 trees.
#
# The extra-trees variant is kept here, disabled:
# etc_params = {
# #     'clf__bootstrap':[True, False],
#     'clf__n_estimators':[100,200]}
# etc_gs = GridSearchCV(etc, etc_params, cv=3, n_jobs=-1, verbose=10)

rfc_params = {
    'clf__n_estimators': [100, 200],
    'clf__max_features': ['auto'],
}
rfc_gs = GridSearchCV(estimator=rfc, param_grid=rfc_params,
                      cv=3, n_jobs=-1, verbose=10)

# Single-entry model list, in the (name, search) form score_pipelines is given.
pipe_list = [rfc_gs]
pipe_names = ['rfc_gs']
model_zip = list(zip(pipe_names, pipe_list))

# Score on the full database sample; best test accuracy first.
sample_list = [DBfull_clean]
results = score_pipelines(sample_list, model_zip)
display(results.sort_values(by='test_accuracy', ascending=False))