In [1]:
# import directories
%matplotlib notebook
import os
import csv
import pandas as pd

# Build the input file paths relative to the notebook's working directory.
data_csv = os.path.join('data', 'human_trafficking.csv')
country_csv = os.path.join('data', 'CountryCodes.csv')

# pandas' default NA markers include the literal string "NA", which is also the
# ISO alpha-2 code for Namibia. The ISO code is the merge key used later in this
# notebook, so only empty fields are treated as missing here.
# NOTE(review): assumes neither file relies on other default NA tokens
# (e.g. "NULL", "N/A") to encode missing values - confirm against the raw data.
csv_na_options = {'keep_default_na': False, 'na_values': ['']}

# Load both CSV files into data frames.
data_df = pd.read_csv(data_csv, low_memory=False, **csv_na_options)
country_df = pd.read_csv(country_csv, **csv_na_options)

Clean Country Code CSV
 1. shift the rows that had commas in the Country name one column back to the left
 2. update the Country name of the shifted rows
 3. drop the leftover empty column

In [2]:
# Put the string '0' into every missing cell of the trailing unnamed column so
# that rows carrying a real value there can be selected with `!= '0'`.
overflow_col = 'Unnamed: 16'
country_df[overflow_col] = country_df[overflow_col].fillna(value='0')

In [3]:
# Rows whose trailing unnamed column is not the '0' placeholder are the ones to
# realign: move the listed span of columns one position to the left for them.
shifted_rows = country_df['Unnamed: 16'] != '0'
shift_span = [
    'Country or Area',
    'M49 Code',
    'ISO-alpha2 Code',
    'ISO-alpha3 Code',
    'Least Developed Countries (LDC)',
    'Land Locked Developing Countries (LLDC)',
    'Small Island Developing States (SIDS)',
    'Developed / Developing Countries',
    'Unnamed: 16',
]
realigned = country_df.loc[shifted_rows, shift_span].shift(periods=-1, axis="columns")
country_df.loc[shifted_rows, shift_span] = realigned

In [4]:
# Set the country name explicitly for the BQ, HK and MO entries, selected by
# their ISO alpha-2 code.
# Fix: the BQ name was misspelled 'Bonarie'; the island is Bonaire.
country_df.loc[country_df['ISO-alpha2 Code'] == 'BQ', 'Country or Area'] = 'Bonaire; Sint Eustatius and Saba'
country_df.loc[country_df['ISO-alpha2 Code'] == 'HK', 'Country or Area'] = 'Hong Kong'
country_df.loc[country_df['ISO-alpha2 Code'] == 'MO', 'Country or Area'] = 'Macao'

# A bare `country_df.loc[...]` inspection expression used to sit here; it was
# not the last expression of the cell, so it displayed nothing and was removed.

# The trailing unnamed column is no longer needed.
country_df = country_df.drop(columns='Unnamed: 16')

In [5]:
# Keep only the region hierarchy, the country name and the two ISO codes, then
# give the alpha-2 code the shorter name 'ISO Code' (used as the merge key below).
kept_country_columns = [
    'Region Name',
    'Sub-region Name',
    'Intermediate Region Name',
    'Country or Area',
    'ISO-alpha2 Code',
    'ISO-alpha3 Code',
]
country_df = country_df[kept_country_columns].rename(columns={'ISO-alpha2 Code': 'ISO Code'})

In [6]:
# Destination path for the cleaned country code table.
cleanCountry_csv = os.path.join('Cleandata','CleanCountryCodes.csv')

# Create the output folder when it is missing so the export also works on a
# fresh checkout (to_csv does not create parent directories).
os.makedirs(os.path.dirname(cleanCountry_csv), exist_ok=True)

# Write the table to CSV with a header row and without the index column.
country_df.to_csv(cleanCountry_csv, index=False, header=True)

Clean Trafficking Data

In [7]:
# data_df.groupby('yearOfRegistration').count()

In [8]:
# 2019 only has 28 records, so keep just the rows registered before 2019.
registered_before_2019 = data_df['yearOfRegistration'].lt(2019)
data_df = data_df.loc[registered_before_2019]

In [9]:
# The source data encodes missing values as -99. The steps below run in order:
#   1. the string '-99' becomes 'unknown' in every column;
#   2. citizenship additionally uses '00', which is not a valid country code,
#      so it is mapped to 'unknown' as well;
#   3. the integer -99 becomes 0 (not 'unknown').
# NOTE(review): after step 3 a missing value can no longer be told apart from
# a genuine 0 in the affected columns - confirm that this is intended.
data_df = (
    data_df
    .replace('-99', 'unknown')
    .assign(citizenship=lambda frame: frame['citizenship'].replace('00', 'unknown'))
    .replace(-99, 0)
)

In [10]:
# Tidy the typeOfLabourConcatenated labels with a single whole-value mapping:
#   - '<type>;Not specified' collapses to '<type>'
#   - 'Domestic work;Other' collapses to 'Domestic work'
#   - a bare 'Not specified' becomes 'unknown'
labour_label_map = {
    'Other;Not specified': 'Other',
    'Domestic work;Not specified': 'Domestic work',
    'Construction;Not specified': 'Construction',
    'Agriculture;Not specified': 'Agriculture',
    'Domestic work;Other': 'Domestic work',
    'Not specified': 'unknown',
}
data_df['typeOfLabourConcatenated'] = data_df['typeOfLabourConcatenated'].replace(labour_label_map)

In [11]:
# Tidy the typeOfExploitConcatenated labels: drop the ';Other' suffix from
# forced labour and shorten the combined labour/sexual exploitation label.
exploit_label_map = {
    'Forced labour;Other': 'Forced labour',
    'Forced labour;Sexual exploitation;Combined sexual and labour exploitation': 'Forced labour and Sexual exploitation',
}
data_df['typeOfExploitConcatenated'] = data_df['typeOfExploitConcatenated'].replace(exploit_label_map)

In [12]:
# Attach region/country details for the victim's citizenship (left join on the
# ISO code) and prefix the added columns with 'Citizenship'.
citizenship_column_names = {
    'Region Name': 'Citizenship Region',
    'Sub-region Name': 'Citizenship Sub-Region',
    'Intermediate Region Name': 'Citizenship Intermediate Region',
    'Country or Area': 'Citizenship Country',
    'ISO Code': 'Citizenship ISO Code',
    'ISO-alpha3 Code': 'Citizenship ISO3 Code',
}
data_df = (
    data_df
    .merge(country_df, how='left', left_on='citizenship', right_on='ISO Code')
    .rename(columns=citizenship_column_names)
)

In [13]:
# Attach region/country details for the country of exploitation (left join on
# the ISO code) and prefix the added columns with 'Exploit'.
exploit_column_names = {
    'Region Name': 'Exploit Region',
    'Sub-region Name': 'Exploit Sub-Region',
    'Intermediate Region Name': 'Exploit Intermediate Region',
    'Country or Area': 'Exploit Country',
    'ISO Code': 'Exploit ISO Code',
    'ISO-alpha3 Code': 'Exploit ISO3 Code',
}
data_df = (
    data_df
    .merge(country_df, how='left', left_on='CountryOfExploitation', right_on='ISO Code')
    .rename(columns=exploit_column_names)
)

In [14]:
# Where a country code is 'unknown', label the matching region, sub-region,
# intermediate region and country columns 'unknown' too. The ISO3 code columns
# are not touched. Citizenship is handled first, then country of exploitation.
unknown_fill_targets = {
    'citizenship': [
        'Citizenship Region',
        'Citizenship Sub-Region',
        'Citizenship Intermediate Region',
        'Citizenship Country',
    ],
    'CountryOfExploitation': [
        'Exploit Region',
        'Exploit Sub-Region',
        'Exploit Intermediate Region',
        'Exploit Country',
    ],
}
for code_column, geography_columns in unknown_fill_targets.items():
    data_df.loc[data_df[code_column] == 'unknown', geography_columns] = 'unknown'

In [15]:
# The ISO code columns brought in by the two merges are not kept in the output.
merge_key_columns = ['Citizenship ISO Code', 'Exploit ISO Code']
data_df = data_df.drop(columns=merge_key_columns)

In [16]:
# Destination path for the full cleaned data set, including the merged
# country/region columns.
cleanData_csv = os.path.join('Cleandata','CleanFullDataSet.csv')

# Create the output folder when it is missing so the export also works on a
# fresh checkout (to_csv does not create parent directories).
os.makedirs(os.path.dirname(cleanData_csv), exist_ok=True)

# Write the frame to CSV with a header row and without the index column.
data_df.to_csv(cleanData_csv, index=False, header=True)

In [17]:
# Labour subset for export: the record/geography columns, followed by the
# exploitation flags and the individual type-of-labour columns.
labor_record_cols = [
    'yearOfRegistration', 'Datasource', 'gender', 'ageBroad',
    'majorityStatus', 'majorityStatusAtExploit', 'majorityEntry',
    'citizenship', 'Citizenship Region', 'Citizenship Sub-Region',
    'Citizenship Intermediate Region', 'Citizenship Country',
    'CountryOfExploitation', 'Exploit Region', 'Exploit Sub-Region',
    'Exploit Intermediate Region', 'Exploit Country',
]
labor_exploit_cols = [
    'typeOfLabourConcatenated', 'isForcedLabour', 'isSexualExploit',
    'isOtherExploit', 'isSexAndLabour',
    'isForcedMarriage', 'isForcedMilitary', 'isOrganRemoval',
    'isSlaveryAndPractices', 'typeOfExploitConcatenated',
]
labor_type_cols = [
    'typeOfLabourAgriculture', 'typeOfLabourAquafarming',
    'typeOfLabourBegging', 'typeOfLabourConstruction',
    'typeOfLabourDomesticWork', 'typeOfLabourHospitality',
    'typeOfLabourIllicitActivities', 'typeOfLabourManufacturing',
    'typeOfLabourMiningOrDrilling', 'typeOfLabourPeddling',
    'typeOfLabourTransportation', 'typeOfLabourOther', 'typeOfLabourNotSpecified',
]
labor_df = data_df[labor_record_cols + labor_exploit_cols + labor_type_cols]


In [18]:
# Sexual-exploitation subset for export: the record/geography columns followed
# by the type-of-sex columns.
sex_record_cols = [
    'yearOfRegistration', 'Datasource', 'gender', 'ageBroad',
    'majorityStatus', 'majorityStatusAtExploit', 'majorityEntry',
    'citizenship', 'Citizenship Region', 'Citizenship Sub-Region',
    'Citizenship Intermediate Region', 'Citizenship Country',
    'CountryOfExploitation', 'Exploit Region', 'Exploit Sub-Region',
    'Exploit Intermediate Region', 'Exploit Country',
]
sex_type_cols = [
    'typeOfSexConcatenated',
    'typeOfSexProstitution',
    'typeOfSexPornography',
    'typeOfSexRemoteInteractiveServices',
    'typeOfSexPrivateSexualServices',
]
sex_df = data_df[sex_record_cols + sex_type_cols]
                  

In [19]:
# Exploitation-type subset for export: the record/geography columns followed
# by the concatenated exploitation label and its individual flags.
exploit_record_cols = [
    'yearOfRegistration', 'Datasource', 'gender', 'ageBroad',
    'majorityStatus', 'majorityStatusAtExploit', 'majorityEntry',
    'citizenship', 'Citizenship Region', 'Citizenship Sub-Region',
    'Citizenship Intermediate Region', 'Citizenship Country',
    'CountryOfExploitation', 'Exploit Region', 'Exploit Sub-Region',
    'Exploit Intermediate Region', 'Exploit Country',
]
exploit_flag_cols = [
    'typeOfExploitConcatenated',
    'isForcedLabour', 'isSexualExploit', 'isOtherExploit',
    'isSexAndLabour', 'isForcedMarriage',
    'isForcedMilitary', 'isOrganRemoval', 'isSlaveryAndPractices',
]
exploit_df = data_df[exploit_record_cols + exploit_flag_cols]

In [20]:
# Means-of-control subset for export: the record/geography columns followed by
# the concatenated means-of-control label and its individual columns.
control_record_cols = [
    'yearOfRegistration', 'Datasource', 'gender', 'ageBroad',
    'majorityStatus', 'majorityStatusAtExploit', 'majorityEntry',
    'citizenship', 'Citizenship Region', 'Citizenship Sub-Region',
    'Citizenship Intermediate Region', 'Citizenship Country',
    'CountryOfExploitation', 'Exploit Region', 'Exploit Sub-Region',
    'Exploit Intermediate Region', 'Exploit Country',
]
control_means_cols = [
    'meansOfControlConcatenated',
    'meansOfControlDebtBondage', 'meansOfControlTakesEarnings',
    'meansOfControlRestrictsFinancialAccess', 'meansOfControlThreats',
    'meansOfControlPsychologicalAbuse', 'meansOfControlPhysicalAbuse',
    'meansOfControlSexualAbuse', 'meansOfControlFalsePromises',
    'meansOfControlPsychoactiveSubstances',
    'meansOfControlRestrictsMovement', 'meansOfControlRestrictsMedicalCare',
    'meansOfControlExcessiveWorkingHours', 'meansOfControlUsesChildren',
    'meansOfControlThreatOfLawEnforcement', 'meansOfControlWithholdsNecessities',
    'meansOfControlWithholdsDocuments', 'meansOfControlOther', 'meansOfControlNotSpecified',
]
control_df = data_df[control_record_cols + control_means_cols]

In [21]:
# Destination paths for the four themed subsets.
LaborData_csv = os.path.join('Cleandata','CleanLaborSet.csv')
SexData_csv = os.path.join('Cleandata','CleanSexSet.csv')
ExploitData_csv = os.path.join('Cleandata','CleanExploitSet.csv')
ControlData_csv = os.path.join('Cleandata','CleanControlSet.csv')

# Create the output folder when it is missing so the exports also work on a
# fresh checkout (to_csv does not create parent directories).
os.makedirs('Cleandata', exist_ok=True)

# Write each subset to CSV with a header row and without the index column.
labor_df.to_csv(LaborData_csv, index=False, header=True)
sex_df.to_csv(SexData_csv, index=False, header=True)
exploit_df.to_csv(ExploitData_csv, index=False, header=True)
control_df.to_csv(ControlData_csv, index=False, header=True)

In [22]:
# Show the records for which all four concatenated descriptors are 'unknown'.
concatenated_cols = [
    'typeOfLabourConcatenated',
    'meansOfControlConcatenated',
    'typeOfSexConcatenated',
    'typeOfExploitConcatenated',
]
all_unknown = data_df[concatenated_cols].eq('unknown').all(axis=1)
data_df.loc[all_unknown]

Unnamed: 0,yearOfRegistration,Datasource,gender,ageBroad,majorityStatus,majorityStatusAtExploit,majorityEntry,citizenship,meansOfControlDebtBondage,meansOfControlTakesEarnings,...,Citizenship Region,Citizenship Sub-Region,Citizenship Intermediate Region,Citizenship Country,Citizenship ISO3 Code,Exploit Region,Exploit Sub-Region,Exploit Intermediate Region,Exploit Country,Exploit ISO3 Code
11,2002,Case Management,Female,18--20,Adult,unknown,unknown,MD,0,0,...,Europe,Eastern Europe,,Republic of Moldova,MDA,unknown,unknown,unknown,unknown,
12,2002,Case Management,Female,18--20,Adult,unknown,unknown,MD,0,0,...,Europe,Eastern Europe,,Republic of Moldova,MDA,unknown,unknown,unknown,unknown,
13,2002,Case Management,Female,18--20,Adult,unknown,unknown,MD,0,0,...,Europe,Eastern Europe,,Republic of Moldova,MDA,unknown,unknown,unknown,unknown,
14,2002,Case Management,Female,18--20,Adult,unknown,unknown,MD,0,0,...,Europe,Eastern Europe,,Republic of Moldova,MDA,unknown,unknown,unknown,unknown,
15,2002,Case Management,Female,18--20,Adult,unknown,unknown,MD,0,0,...,Europe,Eastern Europe,,Republic of Moldova,MDA,unknown,unknown,unknown,unknown,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44349,2018,Case Management,Male,9--17,Minor,unknown,unknown,UA,0,0,...,Europe,Eastern Europe,,Ukraine,UKR,Europe,Eastern Europe,,Ukraine,UKR
44350,2018,Case Management,Male,9--17,Minor,unknown,unknown,UA,0,0,...,Europe,Eastern Europe,,Ukraine,UKR,Europe,Eastern Europe,,Ukraine,UKR
44351,2018,Case Management,Male,9--17,Minor,unknown,unknown,UA,0,0,...,Europe,Eastern Europe,,Ukraine,UKR,Europe,Eastern Europe,,Ukraine,UKR
44352,2018,Case Management,Male,9--17,Minor,unknown,unknown,UA,0,0,...,Europe,Eastern Europe,,Ukraine,UKR,Europe,Eastern Europe,,Ukraine,UKR
