## Error Detection Challenge

In [1]:
#importing necessary libraries

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder, LabelBinarizer #for data transformations

#### Reading in the data files.
Here I use **latin-1** as the encoding to work around the
"can't decode" unicode error that is raised by the usual
`pd.read_csv('<dataset>.csv')` call. The reason may be that the files are not
in true CSV format but in HTML format instead.
The **cp1252** encoding could solve the issue as well.

In [3]:
# Load the three competition files. latin-1 is passed explicitly for every
# file because reading them with the default encoding raised a unicode error.
dirty_data, clean_data, test_data = (
    pd.read_csv(csv_name, encoding='latin-1')
    for csv_name in ('dirty_data.csv', 'cleaned_data.csv', 'test_data.csv')
)

In [4]:
# Preview the first rows of the dirty (uncorrected) records.
dirty_data.head()

Unnamed: 0,Rescue_ID,Date_Caught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,ReleaseSite,...,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,Date_Release,Release_Admiss_Notes,SpecialRemarks,PCVNumber,Expenditure
0,1998_RE_0001,4/14/1998,researcher_12,site_110,creek,net,fisher_619,site_58,species_1,site_80,...,58.5,,Unknown,,Released,4/14/1998,,,,
1,1998_RE_0002,7/7/1998,researcher_4,not_recorded,ocean,longline,fisher_522,site_22,species_1,not_recorded,...,37.0,,,1B-1LLS,,,,,,
2,1998_RE_0003,8/3/1998,,site_12,creek,net,fisher_1254,not_recorded,species_1,site_109,...,33.0,,,,,,,,,
3,1998_RE_0004,8/7/1998,researcher_12,site_110,creek,net,fisher_360,not_recorded,species_2,site_113,...,31.5,,,,,,,,,
4,1998_RE_0005,9/25/1998,researcher_17,not_recorded,creek,collected floater,fisher_865,site_8,species_3,site_109,...,63.5,,,Bs on C+ old panga wounds,,,Found trapped in mangroves,,,


In [5]:
# Preview the first rows of the cleaned (corrected) records for comparison
# with the dirty preview.
clean_data.head()

Unnamed: 0,Rescue_ID,Date_Caught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,ReleaseSite,...,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,Date_Release,Release_Admiss_Notes,SpecialRemarks,PCVNumber,Expenditure
0,1998_RE_0001,4/17/1998,researcher_19,site_110,creek,net,fisher_619,site_58,species_1,site_80,...,58.42,,Unknown,,Released,4/17/1998,,,,
1,1998_RE_0002,7/7/1998,researcher_15,site_15,creek,longline,fisher_522,site_22,species_1,not_recorded,...,36.83,,Unknown,1B-1LLS,,,,,,
2,1998_RE_0003,8/3/1998,not_recorded,site_12,creek,not_recorded,fisher_1254,not_recorded,species_1,site_109,...,33.0,,Unknown,,Released,8/3/1998,,,,
3,1998_RE_0004,8/7/1998,researcher_19,site_110,creek,not_recorded,fisher_360,not_recorded,species_2,site_108,...,31.75,,Unknown,There was pillings on carapace.,Released,8/7/1998,,,,
4,1998_RE_0005,9/25/1998,researcher_17,site_8,creek,collected floater,fisher_865,site_8,species_3,site_121,...,63.5,,Unknown,Bs on C+ old panga wounds,Released,9/25/1998,Found trapped in mangroves,,,


In [6]:
# Replace every NaN with the placeholder 0 in all three frames (in place).
# Note the placeholder is 0, not 1: the value 1 is used further down to mark
# errors in the targets.
for frame in (dirty_data, clean_data, test_data):
    frame.fillna(0, inplace=True)

### Target variables
Since the model is going to be trained on both clean and dirty data, we have to generate the target variables *(errors of the respective columns)* by lining up the clean and dirty dataframes and comparing them cell by cell.

In [7]:
# Element-wise comparison of the dirty and cleaned frames.
cells_match = dirty_data.values == clean_data.values

# Keep the dirty value wherever both frames agree; disagreeing cells become NaN.
targets = dirty_data.where(cells_match)
targets.head()

Unnamed: 0,Rescue_ID,Date_Caught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,ReleaseSite,...,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,Date_Release,Release_Admiss_Notes,SpecialRemarks,PCVNumber,Expenditure
0,1998_RE_0001,,,site_110,creek,net,fisher_619,site_58,species_1,site_80,...,,0.0,Unknown,0,Released,,0,0,0.0,0.0
1,1998_RE_0002,7/7/1998,,,,longline,fisher_522,site_22,species_1,not_recorded,...,,0.0,,1B-1LLS,0,0.0,0,0,0.0,0.0
2,1998_RE_0003,8/3/1998,,site_12,creek,,fisher_1254,not_recorded,species_1,site_109,...,33.0,0.0,,0,,,0,0,0.0,0.0
3,1998_RE_0004,8/7/1998,,site_110,creek,,fisher_360,not_recorded,species_2,,...,,0.0,,,,,0,0,0.0,0.0
4,1998_RE_0005,9/25/1998,researcher_17,,creek,collected floater,fisher_865,site_8,species_3,,...,63.5,0.0,,Bs on C+ old panga wounds,,,Found trapped in mangroves,0,0.0,0.0


In [8]:
# Cells that differ between dirty and clean data are NaN at this point:
# mark them with 1 (error).
targets.fillna(value=1, inplace=True)

# Rescue_ID is an identifier and says nothing about the error, so remove it.
targets.drop(labels=['Rescue_ID'], axis=1, inplace=True)

*Replacing the non-1 entries in the targets dataframe with 0 to indicate no error, since those values match in both the dirty and the cleaned data.*

In [9]:
# Build the final 0/1 error labels directly from the dirty-vs-clean comparison.
#
# The previous approach passed a whole DataFrame as `to_replace`, which is not a
# documented form of DataFrame.replace, and a value-based replacement cannot tell
# the error sentinel 1 apart from a genuine field value of 1 that matched.
# Comparing the two frames again is unambiguous and yields one integer dtype
# for every label column.
label_columns = targets.columns  # every original column except Rescue_ID
cells_differ = dirty_data[label_columns].values != clean_data[label_columns].values
targets = pd.DataFrame(
    cells_differ.astype(int),  # 1 = dirty value differs from clean (error), 0 = match
    index=targets.index,
    columns=label_columns,
)

In [10]:
# Preview the labels: 1 marks a cell that differs from the cleaned data, 0 a match.
targets.head()

Unnamed: 0,Date_Caught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,ReleaseSite,Tag_1,...,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,Date_Release,Release_Admiss_Notes,SpecialRemarks,PCVNumber,Expenditure
0,1,1,0,0,0,0,0,0,0,1,...,1.0,0.0,0,0,0,1,0,0,0.0,0.0
1,0,1,1,1,0,0,0,0,0,0,...,1.0,0.0,1,0,0,0,0,0,0.0,0.0
2,0,1,0,0,1,0,0,0,0,0,...,0.0,0.0,1,0,1,1,0,0,0.0,0.0
3,0,1,0,0,1,0,0,0,1,0,...,1.0,0.0,1,1,1,1,0,0,0.0,0.0
4,0,0,1,0,0,0,0,0,1,0,...,0.0,0.0,1,0,1,1,0,0,0.0,0.0


Merging the target variables with dirty_data in order to create labels for each of the features

In [11]:
# Place the error labels to the right of the dirty features (column-wise concat).
# NOTE(review): targets reuses the feature column names, so train_set ends up
# with duplicated column labels — confirm that is acceptable before selecting
# columns by name.
train_set = pd.concat([dirty_data,targets], axis=1)
train_set.head()

Unnamed: 0,Rescue_ID,Date_Caught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,ReleaseSite,...,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,Date_Release,Release_Admiss_Notes,SpecialRemarks,PCVNumber,Expenditure
0,1998_RE_0001,4/14/1998,researcher_12,site_110,creek,net,fisher_619,site_58,species_1,site_80,...,1.0,0.0,0,0,0,1,0,0,0.0,0.0
1,1998_RE_0002,7/7/1998,researcher_4,not_recorded,ocean,longline,fisher_522,site_22,species_1,not_recorded,...,1.0,0.0,1,0,0,0,0,0,0.0,0.0
2,1998_RE_0003,8/3/1998,0,site_12,creek,net,fisher_1254,not_recorded,species_1,site_109,...,0.0,0.0,1,0,1,1,0,0,0.0,0.0
3,1998_RE_0004,8/7/1998,researcher_12,site_110,creek,net,fisher_360,not_recorded,species_2,site_113,...,1.0,0.0,1,1,1,1,0,0,0.0,0.0
4,1998_RE_0005,9/25/1998,researcher_17,not_recorded,creek,collected floater,fisher_865,site_8,species_3,site_109,...,0.0,0.0,1,0,1,1,0,0,0.0,0.0


Checking the column information

In [12]:
# Summarise the dtypes and non-null counts of the dirty data columns.
dirty_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4292 entries, 0 to 4291
Data columns (total 26 columns):
Rescue_ID                4292 non-null object
Date_Caught              4292 non-null object
Researcher               4292 non-null object
CaptureSite              4292 non-null object
ForagingGround           4292 non-null object
CaptureMethod            4292 non-null object
Fisher                   4292 non-null object
LandingSite              4292 non-null object
Species                  4292 non-null object
ReleaseSite              4292 non-null object
Tag_1                    4292 non-null object
Tag_2                    4292 non-null object
Tag_3                    4292 non-null float64
Lost_Tags                4292 non-null object
T_Number                 4292 non-null object
CCL_cm                   4292 non-null float64
CCW_cm                   4292 non-null float64
Weight_Kg                4292 non-null float64
Sex                      4292 non-null object
TurtleCharacter

In [13]:
# Parse the Date_Caught text column into datetimes for both the training
# (dirty) and test frames.
# NOTE(review): NaNs were replaced by 0 earlier; confirm Date_Caught had no
# missing values, since a 0 placeholder would not be parsed as a date string.
for frame in (dirty_data, test_data):
    frame["Date_Caught"] = pd.to_datetime(frame["Date_Caught"])

In [14]:
# Derive a numeric "year" feature from the parsed capture date in both frames.
for frame in (dirty_data, test_data):
    frame["year"] = frame["Date_Caught"].dt.year

In [15]:
# Remove the identifier and the raw capture date from the test and training
# frames (in place); the "year" column derived earlier stays.
id_columns = ["Rescue_ID", "Date_Caught"]
for frame in (test_data, dirty_data):
    frame.drop(labels=id_columns, axis=1, inplace=True)

# Training inputs and outputs. `features` is the same object as dirty_data,
# and `labels` is the targets frame (which only had Rescue_ID removed).
features, labels = dirty_data, targets

In [22]:
# Preview the training features after dropping the id/date columns and adding "year".
features.head()

Unnamed: 0,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,ReleaseSite,Tag_1,Tag_2,...,Weight_Kg,Sex,TurtleCharacteristics,Status,Date_Release,Release_Admiss_Notes,SpecialRemarks,PCVNumber,Expenditure,year
0,researcher_12,site_110,creek,net,fisher_619,site_58,species_1,site_80,Missing data,,...,0.0,Unknown,0,Released,4/14/1998,0,0,0.0,0.0,1998
1,researcher_4,not_recorded,ocean,longline,fisher_522,site_22,species_1,not_recorded,NotTagged_0002,,...,0.0,0,1B-1LLS,0,0,0,0,0.0,0.0,1998
2,0,site_12,creek,net,fisher_1254,not_recorded,species_1,site_109,NotTagged_0003,,...,0.0,0,0,0,0,0,0,0.0,0.0,1998
3,researcher_12,site_110,creek,net,fisher_360,not_recorded,species_2,site_113,NotTagged_0004,,...,0.0,0,0,0,0,0,0,0.0,0.0,1998
4,researcher_17,not_recorded,creek,collected floater,fisher_865,site_8,species_3,site_109,NotTagged_0005,,...,0.0,0,Bs on C+ old panga wounds,0,0,Found trapped in mangroves,0,0.0,0.0,1998


In [None]:
""" applying transformations to the dirty and test data i.e. text to int conversion """

# Convert the text columns of the training and test features to integer codes.
#
# LabelEncoder accepts a single 1-D array, so it is applied one column at a
# time instead of to the whole DataFrame. Each column's encoder is fitted once
# on the combined train + test values so that the same text maps to the same
# integer in both frames (refitting on the test data alone would assign
# unrelated codes). Values are cast to str first because the earlier fillna(0)
# left numeric placeholders mixed in with the text.
#
# The result names are kept as-is; note they hold label (integer) codes,
# not a one-hot encoding.
dirty_1hot = features.copy()
test_1hot = test_data.copy()

for column in features.columns:
    if features[column].dtype != object and test_data[column].dtype != object:
        continue  # purely numeric column in both frames: nothing to encode

    train_text = features[column].astype(str)
    test_text = test_data[column].astype(str)

    encoder = LabelEncoder()
    encoder.fit(pd.concat([train_text, test_text]))
    dirty_1hot[column] = encoder.transform(train_text)
    test_1hot[column] = encoder.transform(test_text)

### Training the model

In [23]:
#importing the model and scoring metrics from the sklearn library
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [24]:
# 'mae' is a regression criterion and is not accepted by RandomForestClassifier;
# a classifier needs an impurity criterion such as 'gini' (the default) or 'entropy'.
# random_state is fixed so the forest is reproducible across runs.
model = RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1, random_state=2)

In [None]:
# Fit the random forest on the training features and the per-column error labels.
# NOTE(review): `features` is the raw dirty_data frame, which still contains text
# columns (e.g. researcher_12, site_110); the encoded frames built in the
# transformation cell are not used here — confirm which input is intended.
model.fit(features, labels)

In [47]:
""" reading the sample submission file and checking its shape """

# Load the example submission; its shape is displayed as the cell output.
# The recorded shape (34050, 2) equals 1362 test rows x 25 columns, which
# suggests one submission row per record/column pair.
sample_sub = pd.read_csv('submission_example.csv')
sample_sub.shape

(34050, 2)

In [49]:
# Preview the submission layout: an ID column and an error column.
sample_sub.head()

Unnamed: 0,ID,error
0,2011_RE_0001 x Date_Caught,0.0
1,2011_RE_0001 x Researcher,1.0
2,2011_RE_0001 x CaptureSite,1.0
3,2011_RE_0001 x ForagingGround,0.0
4,2011_RE_0001 x CaptureMethod,1.0


In [48]:
""" checking the shape of our test data """

# (rows, columns) of the test data.
# NOTE(review): the recorded output lists 26 columns and the preview below still
# shows Rescue_ID and Date_Caught, which an earlier cell drops — the saved outputs
# appear to come from a different execution order; re-run top to bottom to confirm.
test_data.shape

(1362, 26)

In [73]:
# Preview the first rows of the test data.
test_data.head()

Unnamed: 0,Rescue_ID,Date_Caught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,ReleaseSite,...,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,Date_Release,Release_Admiss_Notes,SpecialRemarks,PCVNumber,Expenditure
0,2011_RE_0001,1/1/2011,researcher_4,site_38,ocean,net,fisher_360,site_118,species_1,site_80,...,92.4,,,Looks healthy& well fed. Small barnacles on bo...,,,,,,
1,2011_RE_0002,1/2/2011,researcher_4,site_36,creek,longline,fisher_1118,site_58,species_1,site_80,...,53.7,23.5,,Green algae on the carapce. A small hole of 1c...,,,,,,
2,2011_RE_0003,1/3/2011,researcher_4,site_53,creek,net,fisher_703,site_8,species_1,site_80,...,53.6,25.0,,A V-notch between 1st and 2nd inner scales of ...,,,,,,
3,2011_RE_0004,1/3/2011,researcher_4,site_53,creek,net,fisher_861,site_8,species_1,site_80,...,53.9,25.5,,Thin green algae on the carapace. Clean plastr...,,,,,,
4,2011_RE_0005,1/3/2011,researcher_4,site_53,creek,net,fisher_242,site_8,species_1,site_80,...,48.5,20.0,,Flaking on the shell. Additional small CS betw...,,,caught a day before,,,
