### Testing out model generation
Use the cleaned out description to create n-grams/lemmatize/tfidf to get information about the features in the values
- Use LDA or ngram frequency to get the immigration markers
- Use collocation finder to get the years/experience markers*
- Create a model with tfidf to see if the job provides immigration or not

In [8]:
import job_description_features as jdf
import job_postings as jp
import pandas as pd

In [2]:
#data = pd.read_excel('Training Postings.xlsx', sheet_name='Postings')
#data['Description']
#data = pd.read_csv('output-today.csv')

data = pd.read_excel('Training Postings.xlsx', sheet_name='Sheet1')
data.head()

Unnamed: 0,phrases,Result
0,security clearance required,False
1,we are unable to offer sponsorship,False
2,US citizens and green card holders,False
3,not authorized to work in the United States wi...,False
4,not authorized to work in the US without spons...,False


In [3]:
def text_normalization_cleaning(text):
    value = jdf.Description_Features(text)
    clean_value = value.clean_description_text()
    return clean_value

In [4]:
data['clean_desc'] = data.phrases.apply(text_normalization_cleaning)
data.head()

Unnamed: 0,phrases,Result,clean_desc
0,security clearance required,False,security clearance required
1,we are unable to offer sponsorship,False,we are unable to offer sponsorship
2,US citizens and green card holders,False,us citizen and green card holder
3,not authorized to work in the United States wi...,False,not authorized to work in the united state wit...
4,not authorized to work in the US without spons...,False,not authorized to work in the us without spons...


In [1]:
#collocations
key_phrases = ['EEO', 'visa sponsorship is available for this postion' +
               ' is an equal opportunity employer and is committed to' + 
               'providing a work environment that is free of discrimination and harassment.' +
               'It does not discriminate against applicants or employees with respect to any' +
               'terms or conditions of employment on account of race, color, religion, creed,' +
               'national origin, ancestry, sex, sexual orientation, age, genetic information,' +
               'physical or mental disability (actual or perceived), medical condition including' + 
               'genetic characteristics, marital status, citizenship status, military service' +
               'status, gender, gender identity, registered domestic partner status, or any other' +
               'characteristic protected by applicable federal, state or local laws.']

## Need to try cross validation since we have a small data size

## Testing Classification
Try a classification model on the common phrases to see if it works better than clustering. When we tried clustering, the prediction was not good.

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords

class Classification_Model:
    
    def __init__(self, X_docs, y_targets, ngrams=(1,1)):
        self.y_names = [True, False]
        self.y_values = y_targets
        self.X_values = X_docs
        self.vectorizer = TfidfVectorizer(ngram_range=ngrams,
            max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
        self.X_vectors = self.vectorizer.fit_transform(self.X_values).toarray()
        self.scaler = MinMaxScaler()
        
    def preprocessing_data(self, model):
        #train test split
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X_vectors, self.y_values, test_size=0.20, random_state=42)
        #scale data
        self.scaler.fit(self.X_train)
        self.X_train_scaled = self.scaler.transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)
        self.model = model
        self.model.fit(self.X_train_scaled, self.y_train) 
        self.y_pred = self.model.predict(self.X_test_scaled)
        
    def model_metrics(self):
        #confusion matrix, index = self.y_names, columns=self.y_names
        self.conf_matrix = pd.DataFrame(confusion_matrix(self.y_test, self.y_pred))
        #classification report with recall, precision, accuracy, target_names=self.y_names
        print(classification_report(self.y_test, self.y_pred))
        print('Accuracy score of ensemble classifier:', accuracy_score(self.y_test, self.y_pred))
        return self.conf_matrix        


In [12]:
classification_model = Classification_Model(data.clean_desc, data.Result, ngrams=(2,2))

In [14]:
modelNB = GaussianNB()
classification_model.preprocessing_data(modelNB)
classification_model.model_metrics()

              precision    recall  f1-score   support

       False       0.83      1.00      0.91         5
        True       1.00      0.50      0.67         2

    accuracy                           0.86         7
   macro avg       0.92      0.75      0.79         7
weighted avg       0.88      0.86      0.84         7

Accuracy score of ensemble classifier: 0.8571428571428571


Unnamed: 0,0,1
0,5,0
1,1,1


In [16]:
model_tree = DecisionTreeClassifier()
classification_model.preprocessing_data(model_tree)
classification_model.model_metrics()

              precision    recall  f1-score   support

       False       0.83      1.00      0.91         5
        True       1.00      0.50      0.67         2

    accuracy                           0.86         7
   macro avg       0.92      0.75      0.79         7
weighted avg       0.88      0.86      0.84         7

Accuracy score of ensemble classifier: 0.8571428571428571


Unnamed: 0,0,1
0,5,0
1,1,1


In [17]:
model_rf = RandomForestClassifier(n_estimators=1000, random_state=0)
classification_model.preprocessing_data(model_rf)
classification_model.model_metrics()

              precision    recall  f1-score   support

       False       0.83      1.00      0.91         5
        True       1.00      0.50      0.67         2

    accuracy                           0.86         7
   macro avg       0.92      0.75      0.79         7
weighted avg       0.88      0.86      0.84         7

Accuracy score of ensemble classifier: 0.8571428571428571


Unnamed: 0,0,1
0,5,0
1,1,1


### Testing Latent Sentiment Analysis 
See if we can get the most common phrases and use those as a base to decide/determine job condition

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [15]:
vectorizer_ct = CountVectorizer(ngram_range=(6,6))
data_ct = vectorizer_ct.fit_transform(data['clean_desc'])

In [16]:
lsa = TruncatedSVD(n_components=2, n_iter=100)
lsa.fit(data_ct)

TruncatedSVD(algorithm='randomized', n_components=2, n_iter=100,
             random_state=None, tol=0.0)

In [18]:
len(lsa.components_[0])

281

In [22]:
terms = vectorizer_ct.get_feature_names()
for i, comp in enumerate(lsa.components_):
    terms_comp = zip(terms, comp)
    sortedTerms = sorted(terms_comp,key = lambda x: x[1], reverse=True) [:5] #first ten items
    print('\nConcept {}'.format(i))
    for term in sortedTerms: print(term[0])


Concept 0
or any other characteristic protected by
status or any other characteristic protected
is an equal opportunity employer and
account of race color religion creed
actual or perceived medical condition including

Concept 1
age color national origin citizenship status
all person regardless of age color
and or expression genetic information marital
any other characteristic protected by federal
assistance veteran status or any other


### Testing Clustering
Using clustering to see if the descriptions naturally separate by the categories we want. We should try to perform clustering on n-grams instead of on unigrams
- vectorizer_tdidf = looks for trigrams

In [70]:
#clusters /trying to see if there is an unsupervised thingy going
from sklearn.cluster import KMeans
class Cluster_Model:
        
    def __init__(self, cleaned_corpus, k_clusters = 2, ngrams=(3,3)):
        self.corpus = cleaned_corpus
        self.vectorizer = TfidfVectorizer(ngram_range=ngrams)
        self.tfidf_vectors = self.vectorizer.fit_transform(self.corpus)
        self.true_k = k_clusters
        self.model = KMeans(n_clusters=self.true_k, init='k-means++', max_iter=100, n_init=1, n_jobs=-1)

    def get_top_cluster_terms(self, n_results):
        print("Top terms per cluster:")
        self.model.fit(self.tfidf_vectors)
        order_centroids = self.model.cluster_centers_.argsort()[:, ::-1]
        terms = self.vectorizer.get_feature_names()
        for i in range(self.true_k):
            print("\nCluster {}:".format(i)),
            values = [terms[ind] for ind in order_centroids[i, :n_results]]
            print("Terms ({}):".format(len(values)), values)
            #for ind in order_centroids[i, :30]:
                #print('{}'.format(terms[ind]))
                
    def predict_text_cluster(self, text):
        print("Prediction")
        Y = self.vectorizer.transform([text])
        self.prediction = self.model.predict(Y)
        print(self.prediction)

In [64]:
model = Cluster_Model(data.clean_desc, ngrams=(2,2))

In [69]:
model.get_top_cluster_terms(10)
print()
model.predict_text_cluster('no sponsorship or visa at this time')

Top terms per cluster:

Cluster 0:
Terms (10): ['this position', 'active dod', 'citizenship status', 'in the', 'regardless of', 'equal employment', 'for this', 'employment regardless', 'of citizenship', 'to work']

Cluster 1:
Terms (10): ['clearance required', 'security clearance', 'ts sci', 'us citizen', 'citizen and', 'card holder', 'and green', 'green card', 'ability to', 'and ability']

Prediction
[0]


### LDA topic/feature extraction
May help us find common traits in text so that we can classify the descriptions based on topics of interest (immigration, health industry)

In [28]:
corpus_A = list(data.desc_visa_tokens[data['desc_visa_tokens']!=''])

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.preprocessing import Normalizer

In [34]:
class LDA_Topic_Model():
    def __init__(self):
        self.model = Pipeline([
            ('vect', TfidfVectorizer()),
            ('model', LatentDirichletAllocation(n_components=2, n_jobs=-1)),
        ])
        
    def fit_transform(self, documents):
        self.model.fit_transform(documents)
        return self.model
    
    def get_topics(self, n = 25):
        vectorizer = self.model.named_steps['vect']
        model = self.model.steps[-1][1]
        model.n_jobs = -1
        names = vectorizer.get_feature_names()
        topics = dict()
        for idx, topic in enumerate(model.components_):
            features = topic.argsort()[:-(n - 1): -1]
            tokens = [names[i] for i in features]
            topics[idx] = tokens
        return topics

In [35]:
from time import time
t = time()
if __name__ == '__main__':
    documents = corpus_A
    lda = LDA_Topic_Model()
    lda.fit_transform(documents)
    topics = lda.get_topics()
    for topic, terms in topics.items():
        print("Topic #{}:".format(topic+1))
        print(terms)
print('Time to process: {}:{:02d}'.format(round((time() - t)/60), round((time() - t)%60)))

Topic #1:
['data', 'status', 'experience', 'work', 'equal', 'opportunity', 'protected', 'employment', 'disability', 'applicant', 'information', 'team', 'national', 'employer', 'without', 'veteran', 'business', 'must', 'origin', 'knowledge', 'race', 'orientation', 'gender']
Topic #2:
['job', 'business', 'science', 'experience', 'degree', 'ability', 'skill', 'practice', 'work', 'required', 'computer', 'field', 'robert', 'opening', 'half', 'authorized', 'company', 'not', 'mathematics', 'related', 'solution', 'problem', 'eligible']
Time to process: 0:01
