## Error Detection Challenge

In [60]:
#importing necessary libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder, LabelBinarizer #for data transformations
%matplotlib inline

In [61]:
os.getcwd()

'C:\\Users\\User\\Desktop\\competitive-programming\\sea_turtle_challenge'

In [62]:
path = 'sea_turtle_challenge/data/'

#### Reading in the data files.
Here i use **latin-1** as my encoding to walk-around the 
'can't decode'unicode error that occurs on usage of the usual 
pd.read_csv('**<'dataset'>.csv**') method. The reason is that the files may not be 
in real csv format but instead html format.
the **cp1252** encoding could as well solve the issue

In [63]:
dirty_data = pd.read_csv('../sea_turtle_challenge/data/dirty_data.csv', encoding='latin-1')
clean_data = pd.read_csv('../sea_turtle_challenge/data/cleaned_data.csv', encoding='latin-1')
test_data = pd.read_csv('../sea_turtle_challenge/data/test_data.csv', encoding='latin-1')

In [64]:
# previewing the first few dirty data records
dirty_data.head()

Unnamed: 0,Rescue_ID,Date_Caught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,ReleaseSite,...,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,Date_Release,Release_Admiss_Notes,SpecialRemarks,PCVNumber,Expenditure
0,1998_RE_0001,4/14/1998,researcher_12,site_110,creek,net,fisher_619,site_58,species_1,site_80,...,58.5,,Unknown,,Released,4/14/1998,,,,
1,1998_RE_0002,7/7/1998,researcher_4,not_recorded,ocean,longline,fisher_522,site_22,species_1,not_recorded,...,37.0,,,1B-1LLS,,,,,,
2,1998_RE_0003,8/3/1998,,site_12,creek,net,fisher_1254,not_recorded,species_1,site_109,...,33.0,,,,,,,,,
3,1998_RE_0004,8/7/1998,researcher_12,site_110,creek,net,fisher_360,not_recorded,species_2,site_113,...,31.5,,,,,,,,,
4,1998_RE_0005,9/25/1998,researcher_17,not_recorded,creek,collected floater,fisher_865,site_8,species_3,site_109,...,63.5,,,Bs on C+ old panga wounds,,,Found trapped in mangroves,,,


In [65]:
# previewing the first few cleaned data records
clean_data.head()

Unnamed: 0,Rescue_ID,Date_Caught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,ReleaseSite,...,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,Date_Release,Release_Admiss_Notes,SpecialRemarks,PCVNumber,Expenditure
0,1998_RE_0001,4/17/1998,researcher_19,site_110,creek,net,fisher_619,site_58,species_1,site_80,...,58.42,,Unknown,,Released,4/17/1998,,,,
1,1998_RE_0002,7/7/1998,researcher_15,site_15,creek,longline,fisher_522,site_22,species_1,not_recorded,...,36.83,,Unknown,1B-1LLS,,,,,,
2,1998_RE_0003,8/3/1998,not_recorded,site_12,creek,not_recorded,fisher_1254,not_recorded,species_1,site_109,...,33.0,,Unknown,,Released,8/3/1998,,,,
3,1998_RE_0004,8/7/1998,researcher_19,site_110,creek,not_recorded,fisher_360,not_recorded,species_2,site_108,...,31.75,,Unknown,There was pillings on carapace.,Released,8/7/1998,,,,
4,1998_RE_0005,9/25/1998,researcher_17,site_8,creek,collected floater,fisher_865,site_8,species_3,site_121,...,63.5,,Unknown,Bs on C+ old panga wounds,Released,9/25/1998,Found trapped in mangroves,,,


In [66]:
#filling the NaN columns with 1's to indicate no errors
dirty_data.fillna(0, inplace=True)
clean_data.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)

### Target variables
Since the model is going to be trained on both clean and dirty data, we have to generate the target variables *(errors of the respective columns)* by stacking the clean and dirty dataframes side by side

In [67]:
#generating targets values (errors)
targets = dirty_data.where(dirty_data.values==clean_data.values)
targets.head()

Unnamed: 0,Rescue_ID,Date_Caught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,ReleaseSite,...,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,Date_Release,Release_Admiss_Notes,SpecialRemarks,PCVNumber,Expenditure
0,1998_RE_0001,,,site_110,creek,net,fisher_619,site_58,species_1,site_80,...,,0.0,Unknown,0,Released,,0,0,0.0,0.0
1,1998_RE_0002,7/7/1998,,,,longline,fisher_522,site_22,species_1,not_recorded,...,,0.0,,1B-1LLS,0,0.0,0,0,0.0,0.0
2,1998_RE_0003,8/3/1998,,site_12,creek,,fisher_1254,not_recorded,species_1,site_109,...,33.0,0.0,,0,,,0,0,0.0,0.0
3,1998_RE_0004,8/7/1998,,site_110,creek,,fisher_360,not_recorded,species_2,,...,,0.0,,,,,0,0,0.0,0.0
4,1998_RE_0005,9/25/1998,researcher_17,,creek,collected floater,fisher_865,site_8,species_3,,...,63.5,0.0,,Bs on C+ old panga wounds,,,Found trapped in mangroves,0,0.0,0.0


In [68]:
#filling the new NaN targets to represent 1 (error)
targets.fillna(1, inplace=True)

#dropping the rescue_ID column as it tells nothing about the error
targets.drop('Rescue_ID', axis=1, inplace=True)

*replacing non-1 entries in the targets dataframe as 0 to indicate no error since they match in both the dirty and cleaned data*

In [69]:
targets = targets.replace(dirty_data.where(dirty_data.values==clean_data.values), 0)

In [70]:
targets.head()

Unnamed: 0,Date_Caught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,ReleaseSite,Tag_1,...,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,Date_Release,Release_Admiss_Notes,SpecialRemarks,PCVNumber,Expenditure
0,1,1,0,0,0,0,0,0,0,1,...,1.0,0.0,0,0,0,1,0,0,0.0,0.0
1,0,1,1,1,0,0,0,0,0,0,...,1.0,0.0,1,0,0,0,0,0,0.0,0.0
2,0,1,0,0,1,0,0,0,0,0,...,0.0,0.0,1,0,1,1,0,0,0.0,0.0
3,0,1,0,0,1,0,0,0,1,0,...,1.0,0.0,1,1,1,1,0,0,0.0,0.0
4,0,0,1,0,0,0,0,0,1,0,...,0.0,0.0,1,0,1,1,0,0,0.0,0.0


In [71]:
dirty_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4292 entries, 0 to 4291
Data columns (total 26 columns):
Rescue_ID                4292 non-null object
Date_Caught              4292 non-null object
Researcher               4292 non-null object
CaptureSite              4292 non-null object
ForagingGround           4292 non-null object
CaptureMethod            4292 non-null object
Fisher                   4292 non-null object
LandingSite              4292 non-null object
Species                  4292 non-null object
ReleaseSite              4292 non-null object
Tag_1                    4292 non-null object
Tag_2                    4292 non-null object
Tag_3                    4292 non-null float64
Lost_Tags                4292 non-null object
T_Number                 4292 non-null object
CCL_cm                   4292 non-null float64
CCW_cm                   4292 non-null float64
Weight_Kg                4292 non-null float64
Sex                      4292 non-null object
TurtleCharacter

In [72]:
dirty_data["Date_Caught"] = pd.to_datetime(dirty_data.Date_Caught)
test_data["Date_Caught"] = pd.to_datetime(test_data.Date_Caught)

In [73]:
dirty_data["year"] = dirty_data["Date_Caught"].dt.year
test_data["year"] = test_data["Date_Caught"].dt.year

In [74]:
#dropping the Id columns in the train and test data
test_data.drop(["Rescue_ID", "Date_Caught"], axis=1, inplace=True)
dirty_data.drop(["Rescue_ID", "Date_Caught"], axis=1, inplace=True)

#training features and labels
features = dirty_data
labels = targets

In [75]:
features.head()

Unnamed: 0,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,ReleaseSite,Tag_1,Tag_2,...,Weight_Kg,Sex,TurtleCharacteristics,Status,Date_Release,Release_Admiss_Notes,SpecialRemarks,PCVNumber,Expenditure,year
0,researcher_12,site_110,creek,net,fisher_619,site_58,species_1,site_80,Missing data,,...,0.0,Unknown,0,Released,4/14/1998,0,0,0.0,0.0,1998
1,researcher_4,not_recorded,ocean,longline,fisher_522,site_22,species_1,not_recorded,NotTagged_0002,,...,0.0,0,1B-1LLS,0,0,0,0,0.0,0.0,1998
2,0,site_12,creek,net,fisher_1254,not_recorded,species_1,site_109,NotTagged_0003,,...,0.0,0,0,0,0,0,0,0.0,0.0,1998
3,researcher_12,site_110,creek,net,fisher_360,not_recorded,species_2,site_113,NotTagged_0004,,...,0.0,0,0,0,0,0,0,0.0,0.0,1998
4,researcher_17,not_recorded,creek,collected floater,fisher_865,site_8,species_3,site_109,NotTagged_0005,,...,0.0,0,Bs on C+ old panga wounds,0,0,Found trapped in mangroves,0,0.0,0.0,1998


In [86]:
labels.head()

Unnamed: 0,Date_Caught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,ReleaseSite,Tag_1,...,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,Date_Release,Release_Admiss_Notes,SpecialRemarks,PCVNumber,Expenditure
0,1,1,0,0,0,0,0,0,0,1,...,1.0,0.0,0,0,0,1,0,0,0.0,0.0
1,0,1,1,1,0,0,0,0,0,0,...,1.0,0.0,1,0,0,0,0,0,0.0,0.0
2,0,1,0,0,1,0,0,0,0,0,...,0.0,0.0,1,0,1,1,0,0,0.0,0.0
3,0,1,0,0,1,0,0,0,1,0,...,1.0,0.0,1,1,1,1,0,0,0.0,0.0
4,0,0,1,0,0,0,0,0,1,0,...,0.0,0.0,1,0,1,1,0,0,0.0,0.0


In [76]:
""" applying transformations to the dirty and test data i.e. text to int conversion """
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
"""learning vocabulary of the training data"""
vect.fit(features)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [77]:
#examining fitted vocabulary
vect.get_feature_names()

['capturemethod',
 'capturesite',
 'ccl_cm',
 'ccw_cm',
 'date_release',
 'expenditure',
 'fisher',
 'foragingground',
 'landingsite',
 'lost_tags',
 'pcvnumber',
 'release_admiss_notes',
 'releasesite',
 'researcher',
 'sex',
 'specialremarks',
 'species',
 'status',
 't_number',
 'tag_1',
 'tag_2',
 'tag_3',
 'turtlecharacteristics',
 'weight_kg',
 'year']

In [78]:
#transforming training data into a document-term matrix
feats = vect.transform(features)
feats

<25x25 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [79]:
#converting the sparse-matrix to a dense-matrix
feats.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
      

In [80]:
#examining the vocabulary and document-term matrix together
new_features = pd.DataFrame(feats.toarray(), columns = vect.get_feature_names())
new_features.head()

Unnamed: 0,capturemethod,capturesite,ccl_cm,ccw_cm,date_release,expenditure,fisher,foragingground,landingsite,lost_tags,...,specialremarks,species,status,t_number,tag_1,tag_2,tag_3,turtlecharacteristics,weight_kg,year
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Training the model

In [81]:
#importing the model and scoring metrics from the sklearn library
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [82]:
model = RandomForestClassifier(n_estimators=100, criterion='mae', n_jobs=-1, random_state=2)

In [83]:
model.fit(new_features, labels)

JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
C:\Users\User\Anaconda3\lib\runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...nda3\\lib\\site-packages\\ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
C:\Users\User\Anaconda3\lib\runpy.py in _run_code(code=<code object <module> at 0x01846968, file "C:\Us...lib\site-packages\ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': r'C:\Users\User\Anaconda3\lib\site-packages\__pycache__\ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': r'C:\Users\User\Anaconda3\lib\site-packages\ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...nda3\\lib\\site-packages\\ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from 'C:\\Users\\U...a3\\lib\\site-packages\\ipykernel\\kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...nda3\\lib\\site-packages\\ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x01846968, file "C:\Us...lib\site-packages\ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': r'C:\Users\User\Anaconda3\lib\site-packages\__pycache__\ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': r'C:\Users\User\Anaconda3\lib\site-packages\ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...nda3\\lib\\site-packages\\ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from 'C:\\Users\\U...a3\\lib\\site-packages\\ipykernel\\kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\traitlets\config\application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\ipykernel\kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    481         if self.poller is not None:
    482             self.poller.start()
    483         self.kernel.start()
    484         self.io_loop = ioloop.IOLoop.current()
    485         try:
--> 486             self.io_loop.start()
        self.io_loop.start = <bound method BaseAsyncIOLoop.start of <tornado.platform.asyncio.AsyncIOMainLoop object>>
    487         except KeyboardInterrupt:
    488             pass
    489 
    490 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\tornado\platform\asyncio.py in start(self=<tornado.platform.asyncio.AsyncIOMainLoop object>)
    122         except (RuntimeError, AssertionError):
    123             old_loop = None
    124         try:
    125             self._setup_logging()
    126             asyncio.set_event_loop(self.asyncio_loop)
--> 127             self.asyncio_loop.run_forever()
        self.asyncio_loop.run_forever = <bound method BaseEventLoop.run_forever of <_Win...EventLoop running=True closed=False debug=False>>
    128         finally:
    129             asyncio.set_event_loop(old_loop)
    130 
    131     def stop(self):

...........................................................................
C:\Users\User\Anaconda3\lib\asyncio\base_events.py in run_forever(self=<_WindowsSelectorEventLoop running=True closed=False debug=False>)
    417             sys.set_asyncgen_hooks(firstiter=self._asyncgen_firstiter_hook,
    418                                    finalizer=self._asyncgen_finalizer_hook)
    419         try:
    420             events._set_running_loop(self)
    421             while True:
--> 422                 self._run_once()
        self._run_once = <bound method BaseEventLoop._run_once of <_Windo...EventLoop running=True closed=False debug=False>>
    423                 if self._stopping:
    424                     break
    425         finally:
    426             self._stopping = False

...........................................................................
C:\Users\User\Anaconda3\lib\asyncio\base_events.py in _run_once(self=<_WindowsSelectorEventLoop running=True closed=False debug=False>)
   1427                         logger.warning('Executing %s took %.3f seconds',
   1428                                        _format_handle(handle), dt)
   1429                 finally:
   1430                     self._current_handle = None
   1431             else:
-> 1432                 handle._run()
        handle._run = <bound method Handle._run of <Handle BaseAsyncIOLoop._handle_events(724, 1)>>
   1433         handle = None  # Needed to break cycles when an exception occurs.
   1434 
   1435     def _set_coroutine_wrapper(self, enabled):
   1436         try:

...........................................................................
C:\Users\User\Anaconda3\lib\asyncio\events.py in _run(self=<Handle BaseAsyncIOLoop._handle_events(724, 1)>)
    140             self._callback = None
    141             self._args = None
    142 
    143     def _run(self):
    144         try:
--> 145             self._callback(*self._args)
        self._callback = <bound method BaseAsyncIOLoop._handle_events of <tornado.platform.asyncio.AsyncIOMainLoop object>>
        self._args = (724, 1)
    146         except Exception as exc:
    147             cb = _format_callback_source(self._callback, self._args)
    148             msg = 'Exception in callback {}'.format(cb)
    149             context = {

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\tornado\platform\asyncio.py in _handle_events(self=<tornado.platform.asyncio.AsyncIOMainLoop object>, fd=724, events=1)
    112             self.writers.remove(fd)
    113         del self.handlers[fd]
    114 
    115     def _handle_events(self, fd, events):
    116         fileobj, handler_func = self.handlers[fd]
--> 117         handler_func(fileobj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fileobj = <zmq.sugar.socket.Socket object>
        events = 1
    118 
    119     def start(self):
    120         try:
    121             old_loop = asyncio.get_event_loop()

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    271         # Fast path when there are no active contexts.
    272         def null_wrapper(*args, **kwargs):
    273             try:
    274                 current_state = _state.contexts
    275                 _state.contexts = cap_contexts[0]
--> 276                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    277             finally:
    278                 _state.contexts = current_state
    279         null_wrapper._wrapped = True
    280         return null_wrapper

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    445             return
    446         zmq_events = self.socket.EVENTS
    447         try:
    448             # dispatch events:
    449             if zmq_events & zmq.POLLIN and self.receiving():
--> 450                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    451                 if not self.socket:
    452                     return
    453             if zmq_events & zmq.POLLOUT and self.sending():
    454                 self._handle_send()

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    475             else:
    476                 raise
    477         else:
    478             if self._recv_callback:
    479                 callback = self._recv_callback
--> 480                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    481         
    482 
    483     def _handle_send(self):
    484         """Handle a send event."""

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\zmq\eventloop\zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    427         close our socket."""
    428         try:
    429             # Use a NullContext to ensure that all StackContexts are run
    430             # inside our blanket exception handler rather than outside.
    431             with stack_context.NullContext():
--> 432                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    433         except:
    434             gen_log.error("Uncaught exception in ZMQStream callback",
    435                           exc_info=True)
    436             # Re-raise the exception so that IOLoop.handle_callback_exception

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    271         # Fast path when there are no active contexts.
    272         def null_wrapper(*args, **kwargs):
    273             try:
    274                 current_state = _state.contexts
    275                 _state.contexts = cap_contexts[0]
--> 276                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    277             finally:
    278                 _state.contexts = current_state
    279         null_wrapper._wrapped = True
    280         return null_wrapper

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'model.fit(new_features, labels)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2019, 2, 27, 20, 41, 14, 641915, tzinfo=tzutc()), 'msg_id': '42c53ce232bd4a3a8ae7e51678e8fee6', 'msg_type': 'execute_request', 'session': 'fbdf0eac2f3e4cf289d09201b3cd9f38', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '42c53ce232bd4a3a8ae7e51678e8fee6', 'msg_type': 'execute_request', 'parent_header': {}})
    228             self.log.warn("Unknown message type: %r", msg_type)
    229         else:
    230             self.log.debug("%s: %s", msg_type, msg)
    231             self.pre_handler_hook()
    232             try:
--> 233                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'fbdf0eac2f3e4cf289d09201b3cd9f38']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'model.fit(new_features, labels)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2019, 2, 27, 20, 41, 14, 641915, tzinfo=tzutc()), 'msg_id': '42c53ce232bd4a3a8ae7e51678e8fee6', 'msg_type': 'execute_request', 'session': 'fbdf0eac2f3e4cf289d09201b3cd9f38', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '42c53ce232bd4a3a8ae7e51678e8fee6', 'msg_type': 'execute_request', 'parent_header': {}}
    234             except Exception:
    235                 self.log.error("Exception in message handler:", exc_info=True)
    236             finally:
    237                 self.post_handler_hook()

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\ipykernel\kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'fbdf0eac2f3e4cf289d09201b3cd9f38'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'model.fit(new_features, labels)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2019, 2, 27, 20, 41, 14, 641915, tzinfo=tzutc()), 'msg_id': '42c53ce232bd4a3a8ae7e51678e8fee6', 'msg_type': 'execute_request', 'session': 'fbdf0eac2f3e4cf289d09201b3cd9f38', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '42c53ce232bd4a3a8ae7e51678e8fee6', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\ipykernel\ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='model.fit(new_features, labels)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    203 
    204         self._forward_input(allow_stdin)
    205 
    206         reply_content = {}
    207         try:
--> 208             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = 'model.fit(new_features, labels)'
        store_history = True
        silent = False
    209         finally:
    210             self._restore_input()
    211 
    212         if res.error_before_exec is not None:

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\ipykernel\zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=('model.fit(new_features, labels)',), **kwargs={'silent': False, 'store_history': True})
    532             )
    533         self.payload_manager.write_payload(payload)
    534 
    535     def run_cell(self, *args, **kwargs):
    536         self._last_traceback = None
--> 537         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ('model.fit(new_features, labels)',)
        kwargs = {'silent': False, 'store_history': True}
    538 
    539     def _showtraceback(self, etype, evalue, stb):
    540         # try to preserve ordering of tracebacks and print statements
    541         sys.stdout.flush()

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='model.fit(new_features, labels)', store_history=True, silent=False, shell_futures=True)
   2657         -------
   2658         result : :class:`ExecutionResult`
   2659         """
   2660         try:
   2661             result = self._run_cell(
-> 2662                 raw_cell, store_history, silent, shell_futures)
        raw_cell = 'model.fit(new_features, labels)'
        store_history = True
        silent = False
        shell_futures = True
   2663         finally:
   2664             self.events.trigger('post_execute')
   2665             if not silent:
   2666                 self.events.trigger('post_run_cell', result)

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in _run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='model.fit(new_features, labels)', store_history=True, silent=False, shell_futures=True)
   2780                 self.displayhook.exec_result = result
   2781 
   2782                 # Execute the user code
   2783                 interactivity = 'none' if silent else self.ast_node_interactivity
   2784                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2785                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2786                 
   2787                 self.last_execution_succeeded = not has_raised
   2788                 self.last_execution_result = result
   2789 

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Expr object>], cell_name='<ipython-input-83-285ba71256e1>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 556cb90, execution_co...rue silent=False shell_futures=True> result=None>)
   2904                     return True
   2905 
   2906             for i, node in enumerate(to_run_interactive):
   2907                 mod = ast.Interactive([node])
   2908                 code = compiler(mod, cell_name, "single")
-> 2909                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x0E775B20, file "<ipython-input-83-285ba71256e1>", line 1>
        result = <ExecutionResult object at 556cb90, execution_co...rue silent=False shell_futures=True> result=None>
   2910                     return True
   2911 
   2912             # Flush softspace
   2913             if softspace(sys.stdout, 0):

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x0E775B20, file "<ipython-input-83-285ba71256e1>", line 1>, result=<ExecutionResult object at 556cb90, execution_co...rue silent=False shell_futures=True> result=None>)
   2958         outflag = True  # happens in more places, so it's easier as default
   2959         try:
   2960             try:
   2961                 self.hooks.pre_run_code_hook()
   2962                 #rprint('Running code', repr(code_obj)) # dbg
-> 2963                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x0E775B20, file "<ipython-input-83-285ba71256e1>", line 1>
        self.user_global_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'In': ['', "#importing necessary libraries\nimport pandas as ..._ipython().run_line_magic('matplotlib', 'inline')", 'os.getcwd()', "path = 'sea_turtle_challenge/data'", "dirty_data = pd.read_csv(path/'dirty_data.csv', ...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv(path/dirty_data.csv, en...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv('path/dirty_data.csv', ...ead_csv(path/'test_data.csv', encoding='latin-1')", "get_ipython().run_line_magic('pinfo', 'os.path.join')", "dirty_data = pd.read_csv(os.path.join(path, 'dir...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv(os.path.join(path, 'dir...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv(os.path.join(path, dirt...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv(os.path.join(path, ../d...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv(os.path.join(path, '../...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv('../dirty_data.csv', en...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv('path/dirty_data.csv', ...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv('path/dirty_data.csv', ...ead_csv(path/'test_data.csv', encoding='latin-1')", "path = 'sea_turtle_challenge/data/'", "dirty_data = pd.read_csv('path/dirty_data.csv', ...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv('../dirty_data.csv', en...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv(path+'../dirty_data.csv...ead_csv(path/'test_data.csv', encoding='latin-1')", ...], 'LabelBinarizer': <class 'sklearn.preprocessing.label.LabelBinarizer'>, 'LabelEncoder': <class 'sklearn.preprocessing.label.LabelEncoder'>, 'Out': {2: r'C:\Users\User\Desktop\competitive-programming\sea_turtle_challenge', 27: <module 'ntpath' from 'C:\\Users\\User\\Anaconda3\\lib\\ntpath.py'>, 30:       Rescue_ID Date_Caught     Researcher   Cap...     0.0  
4         0.0  

[5 rows x 26 columns], 31:       Rescue_ID Date_Caught     Researcher Captu...
4       0.0         0.0  

[5 rows x 26 columns], 33:       Rescue_ID Date_Caught     Researcher Captu...     0.0  
4         0.0  

[5 rows x 26 columns], 36:    Date_Caught  Researcher  CaptureSite  Foragin...        0.0          0.0  

[5 rows x 25 columns], 37:       Rescue_ID Date_Caught     Researcher   Cap... 0       0.0         0.0  

[5 rows x 51 columns], 42:       Researcher   CaptureSite ForagingGround   ...   0.0         0.0  1998  

[5 rows x 25 columns], 50: CountVectorizer(analyzer='word', binary=False, d...\w+\\b',
        tokenizer=None, vocabulary=None), 51: ['capturemethod', 'capturesite', 'ccl_cm', 'ccw_cm', 'date_release', 'expenditure', 'fisher', 'foragingground', 'landingsite', 'lost_tags', 'pcvnumber', 'release_admiss_notes', 'releasesite', 'researcher', 'sex', 'specialremarks', 'species', 'status', 't_number', 'tag_1', ...], ...}, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, '_':    capturemethod  capturesite  ccl_cm  ccw_cm  d...  0  
4          0     0  

[5 rows x 25 columns], '_2': r'C:\Users\User\Desktop\competitive-programming\sea_turtle_challenge', '_27': <module 'ntpath' from 'C:\\Users\\User\\Anaconda3\\lib\\ntpath.py'>, '_30':       Rescue_ID Date_Caught     Researcher   Cap...     0.0  
4         0.0  

[5 rows x 26 columns], ...}
        self.user_ns = {'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'In': ['', "#importing necessary libraries\nimport pandas as ..._ipython().run_line_magic('matplotlib', 'inline')", 'os.getcwd()', "path = 'sea_turtle_challenge/data'", "dirty_data = pd.read_csv(path/'dirty_data.csv', ...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv(path/dirty_data.csv, en...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv('path/dirty_data.csv', ...ead_csv(path/'test_data.csv', encoding='latin-1')", "get_ipython().run_line_magic('pinfo', 'os.path.join')", "dirty_data = pd.read_csv(os.path.join(path, 'dir...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv(os.path.join(path, 'dir...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv(os.path.join(path, dirt...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv(os.path.join(path, ../d...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv(os.path.join(path, '../...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv('../dirty_data.csv', en...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv('path/dirty_data.csv', ...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv('path/dirty_data.csv', ...ead_csv(path/'test_data.csv', encoding='latin-1')", "path = 'sea_turtle_challenge/data/'", "dirty_data = pd.read_csv('path/dirty_data.csv', ...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv('../dirty_data.csv', en...ead_csv(path/'test_data.csv', encoding='latin-1')", "dirty_data = pd.read_csv(path+'../dirty_data.csv...ead_csv(path/'test_data.csv', encoding='latin-1')", ...], 'LabelBinarizer': <class 'sklearn.preprocessing.label.LabelBinarizer'>, 'LabelEncoder': <class 'sklearn.preprocessing.label.LabelEncoder'>, 'Out': {2: r'C:\Users\User\Desktop\competitive-programming\sea_turtle_challenge', 27: <module 'ntpath' from 'C:\\Users\\User\\Anaconda3\\lib\\ntpath.py'>, 30:       Rescue_ID Date_Caught     Researcher   Cap...     0.0  
4         0.0  

[5 rows x 26 columns], 31:       Rescue_ID Date_Caught     Researcher Captu...
4       0.0         0.0  

[5 rows x 26 columns], 33:       Rescue_ID Date_Caught     Researcher Captu...     0.0  
4         0.0  

[5 rows x 26 columns], 36:    Date_Caught  Researcher  CaptureSite  Foragin...        0.0          0.0  

[5 rows x 25 columns], 37:       Rescue_ID Date_Caught     Researcher   Cap... 0       0.0         0.0  

[5 rows x 51 columns], 42:       Researcher   CaptureSite ForagingGround   ...   0.0         0.0  1998  

[5 rows x 25 columns], 50: CountVectorizer(analyzer='word', binary=False, d...\w+\\b',
        tokenizer=None, vocabulary=None), 51: ['capturemethod', 'capturesite', 'ccl_cm', 'ccw_cm', 'date_release', 'expenditure', 'fisher', 'foragingground', 'landingsite', 'lost_tags', 'pcvnumber', 'release_admiss_notes', 'releasesite', 'researcher', 'sex', 'specialremarks', 'species', 'status', 't_number', 'tag_1', ...], ...}, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, '_':    capturemethod  capturesite  ccl_cm  ccw_cm  d...  0  
4          0     0  

[5 rows x 25 columns], '_2': r'C:\Users\User\Desktop\competitive-programming\sea_turtle_challenge', '_27': <module 'ntpath' from 'C:\\Users\\User\\Anaconda3\\lib\\ntpath.py'>, '_30':       Rescue_ID Date_Caught     Researcher   Cap...     0.0  
4         0.0  

[5 rows x 26 columns], ...}
   2964             finally:
   2965                 # Reset our crash handler in place
   2966                 sys.excepthook = old_excepthook
   2967         except SystemExit as e:

...........................................................................
C:\Users\User\Desktop\competitive-programming\sea_turtle_challenge\<ipython-input-83-285ba71256e1> in <module>()
----> 1 model.fit(new_features, labels)

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in fit(self=RandomForestClassifier(bootstrap=True, class_wei...lse, random_state=2, verbose=0, warm_start=False), X=array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., ... 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32), y=array([[1., 1., 0., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]]), sample_weight=None)
    323             trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
    324                              backend="threading")(
    325                 delayed(_parallel_build_trees)(
    326                     t, self, X, y, sample_weight, i, len(trees),
    327                     verbose=self.verbose, class_weight=self.class_weight)
--> 328                 for i, t in enumerate(trees))
        i = 99
    329 
    330             # Collect newly grown trees
    331             self.estimators_.extend(trees)
    332 

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseForest.fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Wed Feb 27 23:41:16 2019
PID: 4904                  Python 3.6.5: C:\Users\User\Anaconda3\python.exe
...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _parallel_build_trees>, (DecisionTreeClassifier(class_weight=None, criter...        random_state=1872583848, splitter='best'), RandomForestClassifier(bootstrap=True, class_wei...lse, random_state=2, verbose=0, warm_start=False), array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., ... 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32), array([[1., 1., 0., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]]), None, 0, 100), {'class_weight': None, 'verbose': 0})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _parallel_build_trees>
        args = (DecisionTreeClassifier(class_weight=None, criter...        random_state=1872583848, splitter='best'), RandomForestClassifier(bootstrap=True, class_wei...lse, random_state=2, verbose=0, warm_start=False), array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., ... 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32), array([[1., 1., 0., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]]), None, 0, 100)
        kwargs = {'class_weight': None, 'verbose': 0}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in _parallel_build_trees(tree=DecisionTreeClassifier(class_weight=None, criter...        random_state=1872583848, splitter='best'), forest=RandomForestClassifier(bootstrap=True, class_wei...lse, random_state=2, verbose=0, warm_start=False), X=array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., ... 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32), y=array([[1., 1., 0., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]]), sample_weight=None, tree_idx=0, n_trees=100, verbose=0, class_weight=None)
    116                 warnings.simplefilter('ignore', DeprecationWarning)
    117                 curr_sample_weight *= compute_sample_weight('auto', y, indices)
    118         elif class_weight == 'balanced_subsample':
    119             curr_sample_weight *= compute_sample_weight('balanced', y, indices)
    120 
--> 121         tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
        tree.fit = <bound method DecisionTreeClassifier.fit of Deci...       random_state=1872583848, splitter='best')>
        X = array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., ... 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)
        y = array([[1., 1., 0., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])
        sample_weight = None
        curr_sample_weight = array([0., 1., 0., 1., 0., 0., 2., 1., 0., 2., 0..., 0., 3.,
       1., 3., 2., 1., 1., 2., 0., 1.])
    122     else:
    123         tree.fit(X, y, sample_weight=sample_weight, check_input=False)
    124 
    125     return tree

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\sklearn\tree\tree.py in fit(self=DecisionTreeClassifier(class_weight=None, criter...        random_state=1872583848, splitter='best'), X=array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., ... 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32), y=array([[1., 1., 0., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]]), sample_weight=array([0., 1., 0., 1., 0., 0., 2., 1., 0., 2., 0..., 0., 3.,
       1., 3., 2., 1., 1., 2., 0., 1.]), check_input=False, X_idx_sorted=None)
    785 
    786         super(DecisionTreeClassifier, self).fit(
    787             X, y,
    788             sample_weight=sample_weight,
    789             check_input=check_input,
--> 790             X_idx_sorted=X_idx_sorted)
        X_idx_sorted = None
    791         return self
    792 
    793     def predict_proba(self, X, check_input=True):
    794         """Predict class probabilities of the input samples X.

...........................................................................
C:\Users\User\Anaconda3\lib\site-packages\sklearn\tree\tree.py in fit(self=DecisionTreeClassifier(class_weight=None, criter...        random_state=1872583848, splitter='best'), X=array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., ... 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32), y=array([[1., 1., 0., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]]), sample_weight=array([0., 1., 0., 1., 0., 0., 2., 1., 0., 2., 0..., 0., 3.,
       1., 3., 2., 1., 1., 2., 0., 1.]), check_input=False, X_idx_sorted=None)
    231 
    232         self.max_features_ = max_features
    233 
    234         if len(y) != n_samples:
    235             raise ValueError("Number of labels=%d does not match "
--> 236                              "number of samples=%d" % (len(y), n_samples))
        y = array([[1., 1., 0., ..., 0., 0., 0.],
       [0...., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])
        n_samples = 25
    237         if not 0 <= self.min_weight_fraction_leaf <= 0.5:
    238             raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")
    239         if max_depth <= 0:
    240             raise ValueError("max_depth must be greater than zero. ")

ValueError: Number of labels=4292 does not match number of samples=25
___________________________________________________________________________

In [47]:
""" reading the sample submission file and checking its shape """

sample_sub = pd.read_csv('submission_example.csv')
sample_sub.shape

(34050, 2)

In [49]:
sample_sub.head()

Unnamed: 0,ID,error
0,2011_RE_0001 x Date_Caught,0.0
1,2011_RE_0001 x Researcher,1.0
2,2011_RE_0001 x CaptureSite,1.0
3,2011_RE_0001 x ForagingGround,0.0
4,2011_RE_0001 x CaptureMethod,1.0


In [48]:
""" checking the shape of our test data """

test_data.shape

(1362, 26)

In [40]:
test_data.head()

Unnamed: 0,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,ReleaseSite,Tag_1,Tag_2,...,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,Date_Release,Release_Admiss_Notes,SpecialRemarks,PCVNumber,Expenditure
0,researcher_4,site_38,ocean,net,fisher_360,site_118,species_1,site_80,KEB7543,KEB7544,...,92.4,0.0,0.0,Looks healthy& well fed. Small barnacles on bo...,0,0,0,0,0.0,0.0
1,researcher_4,site_36,creek,longline,fisher_1118,site_58,species_1,site_80,KE1970,,...,53.7,23.5,0.0,Green algae on the carapce. A small hole of 1c...,0,0,0,0,0.0,0.0
2,researcher_4,site_53,creek,net,fisher_703,site_8,species_1,site_80,KE5881,,...,53.6,25.0,0.0,A V-notch between 1st and 2nd inner scales of ...,0,0,0,0,0.0,0.0
3,researcher_4,site_53,creek,net,fisher_861,site_8,species_1,site_80,KE7514,,...,53.9,25.5,0.0,Thin green algae on the carapace. Clean plastr...,0,0,0,0,0.0,0.0
4,researcher_4,site_53,creek,net,fisher_242,site_8,species_1,site_80,KE5972,,...,48.5,20.0,0.0,Flaking on the shell. Additional small CS betw...,0,0,caught a day before,0,0.0,0.0
