In [1]:
import os
import pandas as pd
import numpy as np
import datetime
import yaml
import re

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import datetime
from datetime import datetime
from datetime import date

import nltk
from nltk import word_tokenize

# Google Cloud Language Translation API
# We're using the basic version here == "v2" 
from google.cloud import translate_v2

import timeit


## Read in Files & Data

In [2]:
## Google Cloud API Credential
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = r'sixth-topic-347322-4297c4a3d919.json'

In [3]:
## Addendum with language identified
addendum_untranslated = pd.read_csv("addendum_wdetails_cleaned.csv")
                             

## Make Sure that Google Cloud API Work

In [4]:

translate_client = translate_v2.Client()

text = "hola como estas"
target = "en"

output = translate_client.translate(text, target_language=target)

print(output)

{'translatedText': 'Hello how are you', 'detectedSourceLanguage': 'es', 'input': 'hola como estas'}


## Function to Run Translation on Dataset

In [5]:
def google_translate_messages(one_row):
    '''
    Pass in a df row.
    Find the message under the 'SECTION_DETAILS column'
    '''
    one_message = one_row.loc['SECTION_DETAILS']
    
    # initialize the Google Cloud translation client
    translate_client = translate_v2.Client()
    
    # set the target language
    target = 'en'
    
    
    try:
    # apply the translation 
        output = translate_client.translate(one_message, 
                                            format_='html',
                                            target_language=target)
    except:
        output = {'translatedText': 'translation_error', 
                  'detectedSourceLanguage': 'translation_error', 
                  'orig_content':'translation_error'}

    return list(output.values())

## Data Manipulation and Cleaning Before Translating

In [6]:
# Check if there is duplicated rows because google translation takes a lot of time
# it is good to prevent running translation twice
len(addendum_untranslated)
addendum_untranslated[['SECTION_DETAILS']].drop_duplicates()
len(addendum_untranslated)


99538

Unnamed: 0,SECTION_DETAILS
0,Employer will furnish free and convenient cook...
1,Three (3) months experience with references re...
2,Living & laundry facilities available. Housing...
3,Should the Employers worker’s compensation ins...
4,The employer shall provide transportation in t...
...,...
99528,California Tax ID Number: 062-1432-4
99532,Persons seeking employment as experienced Agri...
99534,Other Job Specifications Include:\n1.\tThe wor...
99535,may experience occasional exposure to hazards ...


99538

In [32]:
# Filter out non English Rows
nonEnglish = addendum_untranslated.loc[(addendum_untranslated['lang1']!="en") | (addendum_untranslated['lang2'].notnull()) | (addendum_untranslated['lang3'].notnull())]
English = addendum_untranslated.loc[(addendum_untranslated['lang1']=="en") & (addendum_untranslated['lang2'].isnull()) & (addendum_untranslated['lang2'].isnull())]

assert len(addendum_untranslated)==len(nonEnglish)+len(English)

In [26]:
# Split df into 20 dataframes, so that we can revisit if code breaks + internet crashes 
split_df = np.array_split(nonEnglish, 10)

# N rows/columns per df. 
for i in range(len(split_df)):
    print('df', i+1, ':', 
        split_df[i].shape)

df 1 : (1670, 15)
df 2 : (1670, 15)
df 3 : (1670, 15)
df 4 : (1670, 15)
df 5 : (1670, 15)
df 6 : (1670, 15)
df 7 : (1669, 15)
df 8 : (1669, 15)
df 9 : (1669, 15)
df 10 : (1669, 15)


## Translate Job Postings

In [9]:
# Set up to run translation

# output path
filename = 'translated_msgs_'
ext = '.csv'


# For every dataframe in split_df, 
# run through the translation, unpack the results, 
# and save as csv

for i in range(len(split_df)):
    one_df = split_df[i]
    
    start_translation_time = timeit.default_timer() #time start
    
    # run translation
    one_df['output_list'] = one_df.apply(google_translate_messages, axis = 1)

    stop_translation_time = timeit.default_timer() #time end
    
    time_lapse = stop_translation_time - start_translation_time
    print("took " + str(time_lapse) + " seconds to run")
    
    # unpack the translation results into their own columns
    one_df[['translatedText', 'detectedSourceLanguage', 'orig_content']] = \
        pd.DataFrame(one_df.output_list.to_list(),  
                     index = one_df.index)

    one_df.to_csv(filename + str(i) + ext)
    
    print("wrote results for df ", i + 1)

took 889.24137314 seconds to run
wrote results for df  1
took 928.3444729280002 seconds to run
wrote results for df  2
took 851.3610782659998 seconds to run
wrote results for df  3
took 838.3814368590001 seconds to run
wrote results for df  4
took 848.9665148869999 seconds to run
wrote results for df  5
took 851.0585088010002 seconds to run
wrote results for df  6
took 857.8521529560003 seconds to run
wrote results for df  7
took 912.0228779660001 seconds to run
wrote results for df  8
took 891.6887572940004 seconds to run
wrote results for df  9
took 886.3544101749994 seconds to run
wrote results for df  10


In [11]:
## Read all the translated files

# output path
filename = 'translated_msgs_'
ext = '.csv'

# init first df
translated_msgs_init = pd.read_csv(filename + '0' + ext)
print(translated_msgs_init.shape)
# initialize list to store our pickles
csv = []

# For every file in the folder, read it in as a dataframe, 
# then append to the pickles list
for i in range(1, len(split_df)):
    df = pd.read_csv( filename + str(i) + ext)
    print(df.shape)
    csv.append(df)

(1670, 20)
(1670, 20)
(1670, 20)
(1670, 20)
(1670, 20)
(1670, 20)
(1669, 20)
(1669, 20)
(1669, 20)
(1669, 20)


In [20]:
## Combine the 10 datasets together
translated_data = pd.concat([translated_msgs_init, *csv])
# Look at the languages detected
translated_msgs.detectedSourceLanguage.value_counts()
# Check if the combined translated datasets match the number of rows of the original nonEnglish datasets
assert len(translated_data)==len(nonEnglish)

es                   16432
en                     257
id                       3
it                       1
gu                       1
de                       1
translation_error        1
Name: detectedSourceLanguage, dtype: int64

16696

16696

## Merge back with English Job Postings

In [55]:
## Merge back the Job Posting with purely english
## ?? Does it matter if the order is messed up?? 
translated_postings = English.append(translated_data, ignore_index=True)
assert len(translated_postings)==len(addendum_untranslated)

In [56]:
## Clean the columns 
## 1) replace translatedText and orig_content of the original english addendum with Section_Details
translated_postings.orig_content.fillna(translated_postings.SECTION_DETAILS, inplace=True)
translated_postings.translatedText.fillna(translated_postings.SECTION_DETAILS, inplace=True)
## 2) get rid of irrelvant columns
translated_postings_cleaned = translated_postings.loc[:, ~translated_postings.columns.isin(['Unnamed: 0.1', 'output_list',"all","is_missing_all"])]
translated_postings_cleaned.head()


Unnamed: 0.1,Unnamed: 0,level_0,level_1,CASE_NUMBER,SECTION_NAME,SECTION_NUMBER,SECTION_DETAILS,lang1,lang1_prob,lang2,lang2_prob,lang3,lang3_prob,translatedText,detectedSourceLanguage,orig_content
0,0,FOIA_2021-F-05932_FY2020,0,H-300-19274-066174,Meal Provision,E.1,Employer will furnish free and convenient cook...,en,0.999999,,,,,Employer will furnish free and convenient cook...,,Employer will furnish free and convenient cook...
1,1,FOIA_2021-F-05932_FY2020,1,H-300-19274-066174,Job Requirements,B.6,Three (3) months experience with references re...,en,0.999996,,,,,Three (3) months experience with references re...,,Three (3) months experience with references re...
2,2,FOIA_2021-F-05932_FY2020,2,H-300-19274-066174,Daily Transportation,F.1,Living & laundry facilities available. Housing...,en,0.999996,,,,,Living & laundry facilities available. Housing...,,Living & laundry facilities available. Housing...
3,3,FOIA_2021-F-05932_FY2020,3,H-300-19274-066174,Job Requirements,B.6,Should the Employers worker’s compensation ins...,en,0.999994,,,,,Should the Employers worker’s compensation ins...,,Should the Employers worker’s compensation ins...
4,4,FOIA_2021-F-05932_FY2020,4,H-300-19274-066174,Daily Transportation,F.1,The employer shall provide transportation in t...,en,0.999998,,,,,The employer shall provide transportation in t...,,The employer shall provide transportation in t...


## Preprocessing

## Job Postings + Violation Data

In [45]:
violations = pd.read_csv("whd_violations.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [50]:
violations["CASE_NUMBER"].size
violations["CASE_NUMBER"].nunique()
translated_postings["CASE_NUMBER"].size
translated_postings["CASE_NUMBER"].nunique()

95169

68852

99538

13530