In [1]:
import os
import pandas as pd
import numpy as np
import datetime
import yaml
import re
import time


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import datetime
from datetime import datetime
from datetime import date

import nltk
from nltk import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')


# Google Cloud Language Translation API
# We're using the basic version here == "v2" 
from google.cloud import translate_v2

import timeit

import os
from os import listdir


## nltk imports
# ! pip install gensim
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import sklearn.metrics as metrics
from sklearn.feature_extraction.text import CountVectorizer




[nltk_data] Downloading package punkt to /Users/euniceliu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/euniceliu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Dataset used in this notebook
- read in: combined_certificate_postings.csv (the merged data that contains the job addendums and the CASE_STATUS variables)


# Major tasks accomplished in this notebook
- 1) Classify the outcome variables (CASE_STATUS) into binary 0,1
- 2) Pre-process the job addendums
- 3) Create a job posting classifier Using TF-IDF 
- 4) Create and run a Multinomial Naive Bayes model to see the prediction of CASE_STATUS by the text features
- 5) Compare the result of Multinomial Naive Bayes model that use TF-IDF score vs count
- 6) Explore the Bernoulli Naive Bayes model

# Why are these tasks important
- Provide insights on how the job addendums help predict the Case_status  
- Showcase the strength of incorporating TF-IDF score into my model which support that my model (creating a job posting classifier using TF-IDF scores and create and run a Multinomial Naive Bayes model) is the better choice

# Directory

In [2]:
# dropbox
dropbox_general = "/Users/euniceliu/Dropbox (Dartmouth College)/"
DROPBOX_DATA_PATH = os.path.join(dropbox_general,
                                "qss20_finalproj_rawdata/summerwork/")
DATA_RAW_DIR = os.path.join(DROPBOX_DATA_PATH, "raw/")
DATA_ID_DIR = os.path.join(DROPBOX_DATA_PATH, "intermediate/")
WRITEFOLDER = os.path.join(DATA_ID_DIR)
# github
GITHUB_DATA_PATH = "../data/raw_data/"

## Pre-Modeling

In [6]:
## Read in the combined dataset
combined_certificate_postings= pd.read_csv(DATA_ID_DIR+'combined_certificate_postings.csv')
combined_certificate_postings.head()

Unnamed: 0.1,Unnamed: 0,CASE_NUMBER,CASE_STATUS,combined_job_postings
0,0,H-300-20265-835437,Determination Issued - Certification,the most economical and reasonable charges for...
1,1,H-300-20260-827678,Determination Issued - Certification,Incoming transportation and subsistence advanc...
2,2,H-300-20260-827308,Determination Issued - Certification,An employee may be terminated for just cause. ...
3,3,H-300-20258-821801,Determination Issued - Certification,"ELECTRONIC COMMUNICATION\nCell phones, along w..."
4,4,H-300-20258-821682,Determination Issued - Certification,Employee Expectations and Behavior Continued:\...


### Classify the outcome variables CASE_STATUS into 0 or 1 binary outcome 

In [7]:
## 1) check the category
combined_certificate_postings.CASE_STATUS.unique()
combined_certificate_postings['CASE_STATUS'].value_counts()


array(['Determination Issued - Certification',
       'Determination Issued - Denied',
       'Determination Issued - Partial Certification',
       'Determination Issued - Certification (Expired)',
       'Determination Issued - Partial Certification (Expired)'],
      dtype=object)

Determination Issued - Certification                      11165
Determination Issued - Certification (Expired)             1812
Determination Issued - Denied                               168
Determination Issued - Partial Certification                 80
Determination Issued - Partial Certification (Expired)       25
Name: CASE_STATUS, dtype: int64

In [8]:
## 2) Classify the CASE_STATUS into binary variables (0 and 1)
## if certification +  certification (expired) >> 1 (approved cases)
## if partial certification + partial certification (expired) + denied (no approved cases)
combined_certificate_postings['CASE_OUTCOME'] = np.where(combined_certificate_postings['CASE_STATUS'].str.contains('Denied|Partial'), 0, 1)


In [9]:
## 3) check if coding to binary vairable work
combined_certificate_postings['CASE_OUTCOME'].unique()
combined_certificate_postings['CASE_STATUS'].value_counts()
combined_certificate_postings['CASE_OUTCOME'].value_counts()
# the number does add up!

array([1, 0])

Determination Issued - Certification                      11165
Determination Issued - Certification (Expired)             1812
Determination Issued - Denied                               168
Determination Issued - Partial Certification                 80
Determination Issued - Partial Certification (Expired)       25
Name: CASE_STATUS, dtype: int64

1    12977
0      273
Name: CASE_OUTCOME, dtype: int64

###  Randomly Select 1000 Our Certified Case (Labelled as One)

In [10]:
## subset to certified case (label as 1) and not certified case (label as 0)
certified = combined_certificate_postings[combined_certificate_postings["CASE_OUTCOME"]==1]
notcertified = combined_certificate_postings[combined_certificate_postings["CASE_OUTCOME"]==0]
## show be 12977 rows
certified.shape
## show be 273 rows
notcertified.shape

(12977, 5)

(273, 5)

In [11]:
## randomly select 1000 positive cases
certified_1000 = certified.sample(n = 1000)
certified_1000.shape
certified_1000.head()

(1000, 5)

Unnamed: 0.1,Unnamed: 0,CASE_NUMBER,CASE_STATUS,combined_job_postings,CASE_OUTCOME
9758,9758,H-300-19330-167821,Determination Issued - Certification,Employer will furnish free and convenient cook...,1
418,418,H-300-20212-743475,Determination Issued - Certification,The employer will also provide advance subsist...,1
4021,4021,H-300-20064-374924,Determination Issued - Certification,All workers will be subject to a trial period ...,1
4286,4286,H-300-20057-353443,Determination Issued - Certification,"Stake, string, tie and thin crops, as instruct...",1
10339,10339,H-300-19312-136148,Determination Issued - Certification,The employer retains possession and control of...,1


In [12]:
## rowbind back to our notcertified dataset
combined_selected = pd.concat([certified_1000, notcertified])
combined_selected.shape

(1273, 5)

## Run Text Processing


### Convert characters to lower case

In [13]:
## lower case
start_time = time.time()
combined_selected['postings_lower']= combined_selected['combined_job_postings'].apply(lambda x: x.lower())
print("%s seconds" % (time.time() - start_time))


0.08867788314819336 seconds


### Tokenize the words

In [14]:
## tokenized
start_time = time.time()
combined_selected['postings_tokenized'] = combined_selected['postings_lower'].apply(word_tokenize)
print("%s seconds" % (time.time() - start_time))

14.528738975524902 seconds


### Remove stopwords

In [15]:
## define stopwords
other_stopwords = ["after", "before", "employer", "employ", "job", "although", "provide", "complete","hour","time",
                  "begin","list","require","task","transportation","worker","workers","working","work","worked","works"]

list_stopwords = stopwords.words("english")+ other_stopwords

stopwords_complete = list_stopwords + other_stopwords
start_time = time.time()
## remove those
combined_selected['posting_without_stopwords']=combined_selected['postings_tokenized'].apply(lambda x: [word for word in x if word not in stopwords_complete])

print("%s seconds" % (time.time() - start_time))

4.894984245300293 seconds


### Perform Stemming

In [16]:
## stemming 
start_time = time.time()
porter = PorterStemmer()
combined_selected['stemmed'] = combined_selected['posting_without_stopwords'].apply(lambda x: [porter.stem(y)for y in x]) # Stem every word.
print("%s seconds" % (time.time() - start_time))

23.47009301185608 seconds


### Remove words that is less than 3 characters and punctuation

In [14]:
## keep isalpha() and the length of the word that is greater than 3
combined_selected['cleaned']=combined_selected['stemmed'].apply(lambda x: [word for word in x if word.isalpha() and len(word)>3])
combined_selected['cleaned']

7479     [inbound, cost, first, workweek, employ, exten...
12714    [provid, airplan, charter, oper, vehicl, commo...
6546     [offer, seven, week, farmwork, must, perform, ...
9490     [shall, form, farm, vehicl, common, mean, plac...
9142     [qualifi, inbound, outbound, travel, reimburs,...
                               ...                        
13245              [paid, done, arizona, done, california]
13246    [rule, guidanc, regard, accept, conduct, stand...
13247    [upon, complet, contract, dismiss, earlier, re...
13248    [april, middl, irrig, detail, around, farm, us...
13249    [hous, offer, hous, provid, hous, clean, compl...
Name: cleaned, Length: 1273, dtype: object

### Join back each word


In [15]:
combined_selected['cleaned']=combined_selected['cleaned'].apply(lambda x: " ".join(x))
combined_selected['cleaned']

7479     inbound cost first workweek employ extent shif...
12714    provid airplan charter oper vehicl common mean...
6546     offer seven week farmwork must perform seven w...
9490     shall form farm vehicl common mean place provi...
9142     qualifi inbound outbound travel reimburs entit...
                               ...                        
13245                    paid done arizona done california
13246    rule guidanc regard accept conduct standard ge...
13247    upon complet contract dismiss earlier reason c...
13248    april middl irrig detail around farm usag plum...
13249    hous offer hous provid hous clean complianc ap...
Name: cleaned, Length: 1273, dtype: object

## Modeling 

### Split the data into training and testing

In [25]:
combined_selected.head()
combined_selected['CASE_OUTCOME'].value_counts()

Unnamed: 0.1,Unnamed: 0,CASE_NUMBER,CASE_STATUS,combined_job_postings,CASE_OUTCOME,postings_lower,postings_tokenized,posting_without_stopwords,stemmed,cleaned
6283,6283,H-300-20020-263904,Determination Issued - Certification,All workers are required to follow common sani...,1,all workers are required to follow common sani...,"[all, workers, are, required, to, follow, comm...","[required, follow, common, sanitary, practices...","[requir, follow, common, sanitari, practic, ti...",requir follow common sanitari practic time par...
1350,1350,H-300-20163-646331,Determination Issued - Certification,Incoming transportation and subsistence advanc...,1,incoming transportation and subsistence advanc...,"[incoming, transportation, and, subsistence, a...","[incoming, subsistence, advanced/paid, 50, %, ...","[incom, subsist, advanced/paid, 50, %, complet...",incom subsist complet contract deduct accord a...
9862,9862,H-300-19326-162101,Determination Issued - Certification (Expired),H-2A workers must depart the United States at ...,1,h-2a workers must depart the united states at ...,"[h-2a, workers, must, depart, the, united, sta...","[h-2a, must, depart, united, states, completio...","[h-2a, must, depart, unit, state, complet, con...",must depart unit state complet contract period...
8840,8840,H-300-19350-199601,Determination Issued - Certification,ELECTRONIC COMMUNICATION\nCell phones along wi...,1,electronic communication\ncell phones along wi...,"[electronic, communication, cell, phones, alon...","[electronic, communication, cell, phones, alon...","[electron, commun, cell, phone, along, suffici...",electron commun cell phone along suffici minut...
10637,10637,H-300-19302-115364,Determination Issued - Certification,Incoming transportation and subsistence advanc...,1,incoming transportation and subsistence advanc...,"[incoming, transportation, and, subsistence, a...","[incoming, subsistence, advanced/paid, 50, %, ...","[incom, subsist, advanced/paid, 50, %, complet...",incom subsist complet contract deduct accord a...


1    1000
0     273
Name: CASE_OUTCOME, dtype: int64

In [22]:
from sklearn.model_selection import train_test_split
train,test=train_test_split(combined_selected,test_size=0.2)

In [27]:
train.head()

Unnamed: 0.1,Unnamed: 0,CASE_NUMBER,CASE_STATUS,combined_job_postings,CASE_OUTCOME,postings_lower,postings_tokenized,posting_without_stopwords,stemmed,cleaned
6191,6191,H-300-20021-265051,Determination Issued - Partial Certification (...,Employer will train workers. Training will inc...,0,employer will train workers. training will inc...,"[employer, will, train, workers, ., training, ...","[train, ., training, include, limited, safety,...","[train, ., train, includ, limit, safeti, train...",train train includ limit safeti train protect ...
10450,10450,H-300-19309-126181,Determination Issued - Certification (Expired),Harassment: The employer committed to providin...,1,harassment: the employer committed to providin...,"[harassment, :, the, employer, committed, to, ...","[harassment, :, committed, providing, safe, ,,...","[harass, :, commit, provid, safe, ,, flexibl, ...",harass commit provid safe flexibl respect envi...
10419,10419,H-300-19310-129885,Determination Issued - Certification,...incurred by the worker for transportation a...,1,...incurred by the worker for transportation a...,"[..., incurred, by, the, worker, for, transpor...","[..., incurred, daily, subsistence, place, com...","[..., incur, daili, subsist, place, come, ,, w...",incur daili subsist place come whether abroad ...
10724,10724,H-300-19298-110492,Determination Issued - Certification (Expired),No deductions except those required by law wil...,1,no deductions except those required by law wil...,"[no, deductions, except, those, required, by, ...","[deductions, except, required, law, made, brin...","[deduct, except, requir, law, made, bring, 's,...",deduct except requir made bring earn feder min...
12747,12747,H-300-20329-926801,Determination Issued - Certification,TERMINATIONS: The employer may terminate the ...,1,terminations: the employer may terminate the ...,"[terminations, :, the, employer, may, terminat...","[terminations, :, may, terminate, notification...","[termin, :, may, termin, notif, appropri, stat...",termin termin notif appropri state feder agenc...


## Create a job postings classifier using TF-IDF

In [28]:
vectorizer = TfidfVectorizer(min_df=5, max_df=0.9)

### Write transform and fit into a function so the code looks neater

In [29]:
def tfidf_topwords(df,kind):
    if kind=="test":
        tf_idf = vectorizer.transform(df['cleaned'])
    elif kind=="train":
        tf_idf = vectorizer.fit_transform(df['cleaned'])
        tf_idf = vectorizer.transform(df['cleaned'])
    else:
        return ("wrong input")
    return tf_idf

In [30]:
X_train_tf=tfidf_topwords(train,"train")
X_test_tf=tfidf_topwords(test,"test")

In [31]:
X_train_tf.shape

(1018, 2696)

In [32]:
X_test_tf.shape

(255, 2696)

## Create and run a Multinomial Naive Bayes model + Modeling Result 

In [34]:
# y value
train_y=train["CASE_OUTCOME"]
test_y=test["CASE_OUTCOME"]

In [35]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_y)
y_pred = naive_bayes_classifier.predict(X_test_tf)
print(metrics.classification_report(test_y, y_pred, target_names=['Certified', 'Denied']))

MultinomialNB()

              precision    recall  f1-score   support

   Certified       0.86      0.11      0.19        55
      Denied       0.80      0.99      0.89       200

    accuracy                           0.80       255
   macro avg       0.83      0.55      0.54       255
weighted avg       0.81      0.80      0.74       255



In [36]:
print("Confusion matrix:")
print(metrics.confusion_matrix(test_y, y_pred))

Confusion matrix:
[[  6  49]
 [  1 199]]


#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------#

## Findings using TF-IDF to create a job posting classifier & run a Multinomial Naive Bayes model


### As presented in the cells above, the data is extremely imbalanced (with 12977 positive cases (fully certified) and 273 negative cases (not fully certified & denied)). In this case, extra wrangling of the data is required or else the result of the predictive model would not be informative because the predictors will highly likely to predict any given test sample to the positive group which also means that the precision and recall will be very high. On the other hand, in this situation, the precision and recall would be very low for the negative cases. Thus, I randomly sampled the postive cases (case status that is fully certified) to only include 1000 cases to miminize the effect of imbalanced dataset. The model works better  as the accuracy is 0.8 and the weighted average for precision, recall and f1-score is relatively high which is respectively 0.81, 0.80 and 0.74.

# Extra exploration using the Bernoulli Naive Bayes model and Count Vectorizer + Multinomial Naive Bayes model

## Bernoulli Naive Bayes 

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
CVectorizer=CountVectorizer(ngram_range=(1,3),min_df=5)
uni_bigrams=CVectorizer.fit(train['cleaned'])
X_train_vecotrized=uni_bigrams.transform(train['cleaned'])
X_test_vecotrized=CVectorizer.transform(test['cleaned'])


In [28]:
train_y=train["CASE_OUTCOME"]
test_y=test["CASE_OUTCOME"]

In [29]:
from sklearn.naive_bayes import BernoulliNB
BernoulliNB_classifier = BernoulliNB()
BernoulliNB_classifier.fit(X_train_vecotrized, train_y)
y_pred = BernoulliNB_classifier.predict(X_test_vecotrized)
print(metrics.classification_report(test_y, y_pred, target_names=['Certified', 'Denied']))


BernoulliNB()

              precision    recall  f1-score   support

   Certified       0.34      0.89      0.49        55
      Denied       0.95      0.53      0.68       200

    accuracy                           0.60       255
   macro avg       0.64      0.71      0.58       255
weighted avg       0.82      0.60      0.64       255



## Count Vectorizer + Multinomial Naive Bayes model

In [30]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_vecotrized, train_y)
y_pred = naive_bayes_classifier.predict(X_test_vecotrized)
print(metrics.classification_report(test_y, y_pred, target_names=['Certified', 'Denied']))

MultinomialNB()

              precision    recall  f1-score   support

   Certified       0.35      0.80      0.49        55
      Denied       0.92      0.60      0.73       200

    accuracy                           0.64       255
   macro avg       0.64      0.70      0.61       255
weighted avg       0.79      0.64      0.67       255



### After running the Bernoulli Naive Bayes model, I recognize that it would not make sense to make direct comparison of that with the Multinomial Naive Bayes model since I not only use different model but also the vectorizer is different. In this case, it is hard to analyze whether the better/worse performance of the model is due to the vectorizer choice or the model choice. Thus, I run another Multinomial Naive Bayes model but instead of using the TF-IDF score for words, I decided to use the count of the words for text classification because I am interested to know whether my initial choice of going for term frequency-inverse document frequency (TF-IDF) is the correct choice. We can see that the performace of model is better in the first model (the one using TF-IDF) as the accuracy score is 0.80 wheras in the second model (when using the count of the words) the accuracy score is 0.60. This makes sense because TF-IDF help to capture both the importance and relevance of a word whereas simply counting the occurance of words might include words that are too common across job postings which makes the prediction less accurate.


## Future Works

### Indeed, there might be some bias generated from only include the 1000 positive cases. I choice this way of dealing with the imbalanced data issue because it is the most intuitive and easiest way to quickly fix the issue. Future works can include performing other methods that tackle the imbalanced data issue such performing data augmentation, which can allow further verification of the results. Moreover, I also think it would be interesting see how having a addendum or not (as some of the cases do not have addendums--those cases are dropped in the very beggining) predict the case status.