### Testing out model generation
Use the cleaned out description to create n-grams/lemmatize/tfidf to get information about the features in the values
- Use LDA or ngram frequency to get the immigration markers
- Use collocation finder to get the years/experience markers*
- Create a model with tfidf to see if the job provides immigration or not

In [4]:
import job_description_features as jdf
import job_postings as jp
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer

In [8]:
data = pd.read_csv('output-today.csv')
wn = WordNetLemmatizer()

In [9]:
data['desc_visa_tokens'] = data['desc_visa'].apply(tokenize)
data.head()

Unnamed: 0,title,company,location,desc_raw,desc_visa,sponsor,desc_visa_tokens
0,Data Scientist,Warner Bros. Entertainment,"Burbank, CA",The JobWarner Bros. Entertainment Inc. seeks a...,check company stance,Unknown,
1,Scientist (Level 1),"Arima Genomics, Inc.","San Diego, CA",Role: Scientist I – Product DevelopmentCompany...,role scientist product development company des...,Unknown,"[role, scientist, product, development, compan..."
2,Data Scientist,CGG,"Houston, TX",Company DescriptionCGG is a fully integrated G...,process seismic data peta scale warehouse info...,Unknown,"[process, seismic, data, peta, scale, warehous..."
3,Data Scientist,"Prudent Technologies and Consulting, Inc.","Grapevine, TX",Position Title: Data Scientist Location: G...,check company stance,Unknown,
4,Data Scientist,Seismic,"San Diego, CA","DescriptionAbout Seismic Data Team:At Seismic,...",check company stance,Unknown,


#### Testing the collocation finder 
- So that we can use it to see what are the common phrases. We will then decide whether or not to employ n-grams and cluster them.

In [31]:
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

def tokenize(text):
        if text == 'check company stance': tokens = ''
        else: tokens = [wn.lemmatize(word) for word in word_tokenize(text)]
        return tokens
    
def collocation_finder(text, n_gram_total, n_gram_filter_word):
        cf = TrigramCollocationFinder.from_words(tokenize(text)) 
        #checking what words appear frequently with 'word' in this case it is 'work'
        n_filter = lambda *words: n_gram_filter_word not in words
        cf.apply_ngram_filter(n_filter)
        #apply frq filter removes occurences that happened less than x times
        collocation_scores = cf.nbest(TrigramAssocMeasures.likelihood_ratio, n_gram_total)
        return collocation_scores

In [41]:
texts = list(data.desc_visa[data['desc_visa_tokens'] != ''])
for text in texts:
    grams = collocation_finder(text, 5, 'not')
    #if len(grams) > 0:   print(grams)

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [97]:
vectorizer_tfidf = TfidfVectorizer(ngram_range=(3,3))
vectorizer_ct = CountVectorizer(ngram_range=(3,6))
data_tfidf = vectorizer_ct.fit_transform(
    data.desc_visa[data['desc_visa'] != 'check company stance'])

In [98]:
lsa = TruncatedSVD(n_components=2, n_iter=100)
lsa.fit(data_tfidf)

TruncatedSVD(algorithm='randomized', n_components=2, n_iter=100,
             random_state=None, tol=0.0)

In [99]:
lsa.components_[0]

array([ 1.02936346e-09,  1.02936325e-09,  1.02936332e-09, ...,
       -5.16280175e-19, -5.16280175e-19, -5.16280175e-19])

In [100]:
terms = vectorizer_ct.get_feature_names()
for i, comp in enumerate(lsa.components_):
    terms_comp = zip(terms, comp)
    sortedTerms = sorted(terms_comp,key = lambda x: x[1], reverse=True) [:15] #first ten items
    print('\nConcept {}'.format(i))
    for term in sortedTerms: print(term[0])


Concept 0
business analyst ii
description business analyst
problem solving skills
ici enc ies
prof ici enc
prof ici enc ies
various business units
ability act effective
ability act effective leader
ability act effective leader among
ability act effective leader among business
ability authoring analyzing
ability authoring analyzing process
ability authoring analyzing process flow
ability authoring analyzing process flow diagrams

Concept 1
us citizenship required
national security space
national security space programs
security space programs
apache mxnet tensor
apache mxnet tensor flow
apache mxnet tensor flow caffe2
apache mxnet tensor flow caffe2 keras
caffe2 keras microsoft
caffe2 keras microsoft cognitive
caffe2 keras microsoft cognitive toolkit
caffe2 keras microsoft cognitive toolkit torch
cognitive toolkit torch
cognitive toolkit torch no
deep learning frameworks


In [None]:
#clusters /trying to see if there is an unsupervised thingy going
from sklearn.cluster import KMeans
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, n_jobs=-1)
model.fit(data_tfidf)

In [None]:
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster {}:".format(i)),
    terms = [terms[ind] for ind in order_centroids[i, :20]]
    print("Terms:", terms)
    #for ind in order_centroids[i, :30]:
        #print('{}'.format(terms[ind]))

### LDA topic/feature extraction
May help us find common traits in text so that we can classify the descriptions based on topics of interest (immigration, health industry)

In [28]:
corpus_A = list(data.desc_visa_tokens[data['desc_visa_tokens']!=''])

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.preprocessing import Normalizer

In [34]:
class LDA_Topic_Model():
    def __init__(self):
        self.model = Pipeline([
            ('vect', TfidfVectorizer()),
            ('model', LatentDirichletAllocation(n_components=2, n_jobs=-1)),
        ])
        
    def fit_transform(self, documents):
        self.model.fit_transform(documents)
        return self.model
    
    def get_topics(self, n = 25):
        vectorizer = self.model.named_steps['vect']
        model = self.model.steps[-1][1]
        model.n_jobs = -1
        names = vectorizer.get_feature_names()
        topics = dict()
        for idx, topic in enumerate(model.components_):
            features = topic.argsort()[:-(n - 1): -1]
            tokens = [names[i] for i in features]
            topics[idx] = tokens
        return topics

In [35]:
from time import time
t = time()
if __name__ == '__main__':
    documents = corpus_A
    lda = LDA_Topic_Model()
    lda.fit_transform(documents)
    topics = lda.get_topics()
    for topic, terms in topics.items():
        print("Topic #{}:".format(topic+1))
        print(terms)
print('Time to process: {}:{:02d}'.format(round((time() - t)/60), round((time() - t)%60)))

Topic #1:
['data', 'status', 'experience', 'work', 'equal', 'opportunity', 'protected', 'employment', 'disability', 'applicant', 'information', 'team', 'national', 'employer', 'without', 'veteran', 'business', 'must', 'origin', 'knowledge', 'race', 'orientation', 'gender']
Topic #2:
['job', 'business', 'science', 'experience', 'degree', 'ability', 'skill', 'practice', 'work', 'required', 'computer', 'field', 'robert', 'opening', 'half', 'authorized', 'company', 'not', 'mathematics', 'related', 'solution', 'problem', 'eligible']
Time to process: 0:01
