In [2]:
import os
import pandas as pd
import numpy as np
import datetime
import yaml
import re

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import datetime
from datetime import datetime
from datetime import date

import nltk
from nltk import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')


# Google Cloud Language Translation API
# We're using the basic version here == "v2" 
from google.cloud import translate_v2

import timeit
import time

## nltk imports
# ! pip install gensim
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics

[nltk_data] Downloading package punkt to /Users/euniceliu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/euniceliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Major tasks accomplished in this notebook
- 1) Classify the outcome variables (CASE_STATUS) into binary 0,1
- 2) Pre-process the job addendums
- 3) Create a Movie Reviews Classifier Using TF-IDF 
- 4) Create and run a Multinomial Naive Bayes model to see the prediction of translation status by the text features

## Read in data

In [4]:
translated_job_postings=pd.read_csv("translated_job_postings.csv")
translated_job_postings.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,level_0,level_1,CASE_NUMBER,SECTION_NAME,SECTION_NUMBER,SECTION_DETAILS,lang1,lang1_prob,lang2,lang2_prob,lang3,lang3_prob,translatedText,orig_content,translated
0,0,0,FOIA_2021-F-05932_FY2020,0,H-300-19274-066174,Meal Provision,E.1,Employer will furnish free and convenient cook...,en,0.999999,,,,,Employer will furnish free and convenient cook...,Employer will furnish free and convenient cook...,NO
1,1,1,FOIA_2021-F-05932_FY2020,1,H-300-19274-066174,Job Requirements,B.6,Three (3) months experience with references re...,en,0.999996,,,,,Three (3) months experience with references re...,Three (3) months experience with references re...,NO
2,2,2,FOIA_2021-F-05932_FY2020,2,H-300-19274-066174,Daily Transportation,F.1,Living & laundry facilities available. Housing...,en,0.999996,,,,,Living & laundry facilities available. Housing...,Living & laundry facilities available. Housing...,NO
3,3,3,FOIA_2021-F-05932_FY2020,3,H-300-19274-066174,Job Requirements,B.6,Should the Employers worker’s compensation ins...,en,0.999994,,,,,Should the Employers worker’s compensation ins...,Should the Employers worker’s compensation ins...,NO
4,4,4,FOIA_2021-F-05932_FY2020,4,H-300-19274-066174,Daily Transportation,F.1,The employer shall provide transportation in t...,en,0.999998,,,,,The employer shall provide transportation in t...,The employer shall provide transportation in t...,NO


## Create classifer to show whether an addendum is translated or not

### Concatenate addendums of the same case number

In [4]:
## join the job postings of the same case number together
concatenate_job=translated_job_postings.groupby(["CASE_NUMBER"])['translatedText'].apply(lambda x: ''.join(x))
# concatenate_job.head()
## change to dataframe
concatenate_job_data= concatenate_job.to_frame().rename(columns={'translatedText':'combined_job_postings'}).reset_index()
concatenate_job_data.head()

Unnamed: 0,CASE_NUMBER,combined_job_postings
0,H-300-19274-066154,Operating large farm tractors and equipment as...
1,H-300-19274-066174,Employer will furnish free and convenient cook...
2,H-300-19274-066180,INBOUND 1) Advance the worker the required tra...
3,H-300-19274-066199,"For workers who are provided housing, transpor..."
4,H-300-19274-066237,The inbound transportation will be reimbursed ...


### Create a column translated_2 for (binary variables 0 as not translated and 1 as translated)
- here what i am trying to do is to see if a particular case_number's job posting has EVER being translated or not. since getting the max value (in this case would indicate if the case number has translated job postings). It is important to note that it would be the case that some job postings would have some part that is not translated.

In [7]:
## code translated column into numeric for groupby
translated_job_postings["translated_2"]=np.where(translated_job_postings.translated=="yes",1,0)
## check
translated_job_postings.translated.value_counts()
translated_job_postings.translated_2.value_counts()

NO     82842
yes    16696
Name: translated, dtype: int64

0    82842
1    16696
Name: translated_2, dtype: int64

In [8]:
case_translated = translated_job_postings.groupby('CASE_NUMBER')['translated_2'].max()
case_translated= case_translated.to_frame().rename(columns={'':'ever_translated'}).reset_index()
case_translated.head()
case_translated.shape

Unnamed: 0,CASE_NUMBER,translated_2
0,H-300-19274-066154,0
1,H-300-19274-066174,0
2,H-300-19274-066180,0
3,H-300-19274-066199,1
4,H-300-19274-066237,0


(13530, 2)

In [9]:
## merge back with the concatenate_job 
concatenate_job_data.shape
combined_certificate_postings_translated= concatenate_job_data.merge(case_translated, on='CASE_NUMBER', how='inner')
combined_certificate_postings_translated.shape
combined_certificate_postings_translated.head()
combined_certificate_postings_translated.translated_2.value_counts()

(13530, 2)

(13530, 3)

Unnamed: 0,CASE_NUMBER,combined_job_postings,translated_2
0,H-300-19274-066154,Operating large farm tractors and equipment as...,0
1,H-300-19274-066174,Employer will furnish free and convenient cook...,0
2,H-300-19274-066180,INBOUND 1) Advance the worker the required tra...,0
3,H-300-19274-066199,"For workers who are provided housing, transpor...",1
4,H-300-19274-066237,The inbound transportation will be reimbursed ...,0


0    11749
1     1781
Name: translated_2, dtype: int64

## Run Text Processing

### Define Constants

In [12]:
other_stopwords = ["after", "before", "employer", "employ", "job", "although", "provide", "complete","hour","time",
                  "begin","list","require","task","transportation","worker","workers","working","work","worked","works"]

list_stopwords = stopwords.words("english")+ other_stopwords

stopwords_complete = list_stopwords + other_stopwords
    
porter = PorterStemmer()

### Convert characters to lower case

In [48]:
## lower case
start_time = time.time()
combined_certificate_postings_translated['postings_lower']= combined_certificate_postings_translated['combined_job_postings'].apply(lambda x: x.lower())
print("--- %s seconds ---" % (time.time() - start_time))


--- 1.1177070140838623 seconds ---


### Tokenize the words


In [49]:
## tokenized
start_time = time.time()
combined_certificate_postings_translated['postings_tokenized'] = combined_certificate_postings_translated['postings_lower'].apply(word_tokenize)
print("--- %s seconds ---" % (time.time() - start_time))

--- 169.00208806991577 seconds ---


### Remove stopwords

In [51]:
## remove stopwords
start_time = time.time()
combined_certificate_postings_translated['posting_without_stopwords']=combined_certificate_postings_translated['postings_tokenized'].apply(lambda x: [word for word in x if word not in stopwords_complete])
print("--- %s seconds ---" % (time.time() - start_time))

--- 56.1389901638031 seconds ---


### Perform Stemming

In [52]:
## stemming 
start_time = time.time()
combined_certificate_postings_translated['stemmed'] = combined_certificate_postings_translated['posting_without_stopwords'].apply(lambda x: [porter.stem(y)for y in x]) # Stem every word.
print("--- %s seconds ---" % (time.time() - start_time))

--- 314.5942647457123 seconds ---


### Remove words that is less than 3 characters and punctuation

In [53]:
## keep isalpha() and the length of the word that is greater than 3
combined_certificate_postings_translated['cleaned']=combined_certificate_postings_translated['stemmed'].apply(lambda x: [word for word in x if word.isalpha() and len(word)>3])
combined_certificate_postings_translated['cleaned']

0        [oper, larg, farm, tractor, equip, well, oper,...
1        [furnish, free, conveni, cook, kitchen, facil,...
2        [inbound, advanc, requir, subsist, cost, other...
3        [provid, hous, cost, provid, hous, worksit, pr...
4        [inbound, reimburs, basi, less, requir, econom...
                               ...                        
13525    [inbound, subsist, reimburs, first, week, empl...
13526    [econom, reason, charg, distanc, involv, compl...
13527    [task, descript, constitut, assign, differ, ta...
13528    [falsifi, identif, personnel, medic, product, ...
13529    [clarif, assur, compens, agre, renew, compens,...
Name: cleaned, Length: 13530, dtype: object

### Join back each word

In [54]:
combined_certificate_postings_translated['cleaned']=combined_certificate_postings_translated['cleaned'].apply(lambda x: " ".join(x))
combined_certificate_postings_translated['cleaned']

0        oper larg farm tractor equip well oper motor v...
1        furnish free conveni cook kitchen facil cost o...
2        inbound advanc requir subsist cost otherwis co...
3        provid hous cost provid hous worksit provid ac...
4        inbound reimburs basi less requir econom reaso...
                               ...                        
13525    inbound subsist reimburs first week employ off...
13526    econom reason charg distanc involv complet con...
13527    task descript constitut assign differ task mul...
13528    falsifi identif personnel medic product record...
13529    clarif assur compens agre renew compens prior ...
Name: cleaned, Length: 13530, dtype: object

## Modeling 

### Split the data into training and testing


In [56]:
from sklearn.model_selection import train_test_split
train,test=train_test_split(combined_certificate_postings_translated,test_size=0.2)


In [57]:
vectorizer = TfidfVectorizer(min_df=5, max_df=0.9)

## Create a job postings classifier using TF-IDF

In [58]:
def tfidf_topwords(df,kind):
    if kind=="test":
        tf_idf = vectorizer.transform(df['cleaned'])
    elif kind=="train":
        tf_idf = vectorizer.fit_transform(df['cleaned'])
        tf_idf = vectorizer.transform(df['cleaned'])
    else:
        return ("wrong input")
    return tf_idf

In [59]:
X_train_tf=tfidf_topwords(train,"train")
X_test_tf=tfidf_topwords(test,"test")

## Create and run a Multinomial Naive Bayes model+ Modeling Result 

In [61]:
# y value
train_y=train["translated_2"]
test_y=test["translated_2"]

In [64]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_y)
y_pred = naive_bayes_classifier.predict(X_test_tf)
print(metrics.classification_report(test_y, y_pred, target_names=['Translated', 'NotTranslated']))

MultinomialNB()

               precision    recall  f1-score   support

   Translated       0.95      0.99      0.97      2347
NotTranslated       0.92      0.69      0.79       359

     accuracy                           0.95      2706
    macro avg       0.94      0.84      0.88      2706
 weighted avg       0.95      0.95      0.95      2706



In [65]:
print("Confusion matrix:")
print(metrics.confusion_matrix(test_y, y_pred))

Confusion matrix:
[[2325   22]
 [ 112  247]]


#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------#

## Findings + Future Works

### The model works pretty well as the accuracy and weighted average of precision, recall and f1-score all are 0.95. This can infer that there are some specific vocabularies associated with translated and untranslated text. Notably, although the model works quite well, the data is still quite imbalanced; thus, future works should definitely tackle this shortcoming. Moreover, future research can also look into whether a text is translated or not is associated with whether the case is certified or denied.