#### Testing filtering out sentences
- Split a description into sentence tokens = **done!**
- Parse through each sentence to see if it has any of the key words for immigration pointers = **done!**
- If there is no flag word in the description, add a placeholder description instead, e.g. 'no immigration flag', or one that denotes nothing was found, such as 'check company stance' = **done!**
- Use the top 100 H1B list iff the original pass through returns 'check company stance' = **done!**

In [1]:
import pandas as pd
import job_postings as jp
import job_description_features as jdf
from time import time

In [2]:
# Get today's postings (last 24 hrs), filtering out links we don't want.
t = time()  # start timestamp passed to jp.print_time below
postings = jp.New_Postings()
jp.print_time('Time to retrieve postings:', t)

14 search terms: 
--------------------------------

United States: 350 result links for June 16, 2020
Time to retrieve postings: 4:09 mins


In [3]:
# Create the dataframe with cleaned columns from the retrieved postings.
t = time()  # restart the timer for this step
data = postings.get_job_postings()
jp.print_time('Time to create dataframe:', t)

Total remaining job postings: 290
Time to create dataframe: 0:11 mins


In [4]:
# Split the postings on the 'sponsor' column: rows equal to 'Yes' vs. every other value.
visa_yes = data[data['sponsor'] == 'Yes']
# presumably the postings whose sponsorship status still has to be determined — TODO confirm
visa_find = data[data['sponsor'] != 'Yes']

In [5]:
# (rows, columns) of the postings whose 'sponsor' value is 'Yes'
visa_yes.shape

(13, 6)

In [6]:
# (rows, columns) of the postings whose 'sponsor' value is anything other than 'Yes'
visa_find.shape

(277, 6)

In [7]:
# Write the full postings dataframe to CSV, leaving out the index column.
data.to_csv('output-today.csv', index = False)

In [3]:
#method to remove stopwords, punctuation, lemmatize job descriptions
from nltk import WordNetLemmatizer
from collections import defaultdict

# Running token -> count tally; it lives at module level, so counts accumulate across calls.
word_freq = defaultdict(int)


def process_job_descriptions(descs):
    """Lemmatize tokenized job descriptions and tally token frequencies.

    Each description is iterated token by token; tokens longer than one
    character (plus the single-letter token 'r') are lemmatized with NLTK's
    WordNetLemmatizer and counted in the module-level ``word_freq``.
    Descriptions equal to the placeholder 'check company stance' are skipped.

    Args:
        descs: iterable of descriptions. Presumably each one is a list of
            word tokens (a plain string would be iterated character by
            character) — TODO confirm against the caller.

    Returns:
        The last description after processing: a list of lemmas, or the
        placeholder string if the last entry was skipped. An empty list is
        returned when ``descs`` is empty (previously this raised
        UnboundLocalError).

    NOTE(review): only the final description is returned, which looks
    unintentional; the contract is kept as-is so existing callers are not
    affected.
    """
    wn = WordNetLemmatizer()
    processed = []  # stays empty when there is nothing to process
    for desc in descs:
        if desc == 'check company stance':
            # Placeholder for "no flag words found": nothing to lemmatize or count.
            processed = desc
            continue
        # Keep multi-character tokens, plus 'r' (the programming language).
        processed = [wn.lemmatize(w) for w in desc if len(w) > 1 or w == 'r']
        for word in processed:
            word_freq[word] += 1
    print('Total unique tokens from list:', len(word_freq))
    return processed

#### Testing Recursion 
May be helpful to save processing time for later

In [None]:
def meth(n):
    """Recursively accumulate ``n`` down to a base case.

    Base cases: 0 is returned unchanged and 1 yields ``n + n``. Any other
    ``n`` adds itself to the result for ``n - 1``.
    """
    if n == 0:
        return n
    # For n == 1 the "previous" term is n itself, which gives n + n.
    previous = n if n == 1 else meth(n - 1)
    return previous + n

In [None]:
#synonyms: might help find equivalents of the visa/sponsorship words,
#so that we don't have to store a large corpus; could also be used to look up synonyms for phrases
from nltk.corpus import wordnet

def get_word_lemmas(word):
    """Collect WordNet definitions, synonyms and antonyms for ``word``.

    Args:
        word: the word to look up with ``wordnet.synsets``.

    Returns:
        A three-element list ``[lemma_defs, synonyms, antonyms]``:
        - lemma_defs: one ``(first lemma name, definition)`` tuple per synset;
        - synonyms: unique lemma names across all synsets, in first-seen order;
        - antonyms: unique antonym lemma names, in first-seen order.
        All three are empty when WordNet has no synsets for ``word``
        (previously this raised IndexError on ``syn[0]``).
    """
    synsets = wordnet.synsets(word)
    if not synsets:
        print('No WordNet synsets found for:', word)
        return [[], [], []]
    print('Word of Interest:', synsets[0].lemmas()[0].name())

    lemma_defs = []
    synonyms, antonyms = [], []
    # Sets give O(1) duplicate checks; the lists preserve first-seen order.
    seen_synonyms, seen_antonyms = set(), set()
    for synset in synsets:
        lemmas = synset.lemmas()
        lemma_defs.append((lemmas[0].name(), synset.definition()))
        for lemma in lemmas:
            name = lemma.name()
            if name not in seen_synonyms:
                seen_synonyms.add(name)
                synonyms.append(name)
            for antonym in lemma.antonyms():
                antonym_name = antonym.name()
                if antonym_name not in seen_antonyms:
                    seen_antonyms.add(antonym_name)
                    antonyms.append(antonym_name)
    return [lemma_defs, synonyms, antonyms]

In [None]:
# [lemma definitions, synonyms, antonyms] for 'work', as returned by get_word_lemmas
wordsyns = get_word_lemmas('work')

In [None]:
#similarities between words (scores between 0 and 1)
w = wordnet.synset('work.n.01') #n = noun
v = wordnet.synset('visa.n.01')
a = wordnet.synset('authorization.n.01')
v.wup_similarity(a)

In [None]:
# Clean each entry with the Description_Features helper. The module is imported
# as `jdf` at the top of the notebook; the old `defe` alias is never defined.
# NOTE(review): `words` is not defined in any cell shown here — it must be
# created before this cell is run.
descs = [jdf.Description_Features(word).clean_description_text() for word in words]
descs

### Saving copy of data in excel (if needed)
We started on this, but maybe we can build a web display of the data with Streamlit instead!

In [None]:
import csv

t = time()
# Build the description list before reporting its length, so the count reflects
# the current postings rather than a stale `descs` left over from an earlier cell.
descs = [posting.description for posting in postings]
# NOTE(review): `links` is not defined in any cell shown here, and this cell
# expects `postings` to be a sized iterable of posting objects — confirm both.
print('Links:', len(links), '\tPostings:', len(postings))
print('Descriptions:', len(descs))
#save to file
# newline='' leaves line endings to the csv writer (recommended by the csv docs),
# so embedded newlines in descriptions are not translated by the file object.
with open('Postings.csv', mode='w', encoding='UTF-8', newline='') as posts_file:
    post_writer = csv.writer(posts_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL,
                            lineterminator='\n')
    post_writer.writerow(['Title', 'Location', 'Company', 'Link', 'Description'])
    for posting in postings:
        post_writer.writerow([posting.job_title, posting.job_location, posting.company_name,
                              posting.job_link, posting.description])
# The with-block closes the file on exit; no explicit close() is needed.
# print_time lives in the job_postings module (imported as jp), as in the cells above.
jp.print_time('Time to save job postings:', t)

In [16]:
# Quick check: slicing with [:-1] returns everything except the last element.
x = [1,2,3,4,5,6]
x[:-1]

[1, 2, 3, 4, 5]

In [20]:
 # NOTE(review): this looks like a method body pasted out of its class. The
 # inconsistent indentation is what raises the IndentationError recorded below,
 # and `self` / `Posting` are not defined in any cell visible here.
 # As written, it walks self.links in slices and appends a Posting per link.
 start = 0
        cut = len(self.postings) + 10
        end = len(self.links)
        # The loop only runs while the slice end is within bounds: if the initial
        # cut already exceeds end, no links are processed at all.
        while start <= end and cut <= end:
            for link in self.links[start: cut]:
                post = Posting(link=link)
                if post: self.postings.append(post)
            # NOTE(review): the slice above excludes index `cut`, so restarting at
            # cut + 1 skips the link at index `cut` — confirm whether that is intended.
            start = cut + 1
            cut+= 10      
            if cut > end: cut = end

IndentationError: unexpected indent (<ipython-input-20-f4310e79e484>, line 2)

 ### Gensim Models

In [None]:
#modules
from collections import defaultdict
from gensim.models import Word2Vec
import numpy as np
import pandas as pd

# Corpus for the Word2Vec cells below: the desc_raw column of `data`.
# NOTE(review): Word2Vec expects each item to be a list of tokens — confirm
# desc_raw holds token lists rather than raw strings.
descs = data.desc_raw

In [None]:
#gensim model
#size: The number of dimensions of the embeddings and the default is 100.
#window: The maximum distance between a target word and words around the target word. The default window is 5.
#min_count: The minimum count of words to consider when training the model; words with occurrence less than this count will be ignored. The default for min_count is 5.
#workers: The number of worker threads used for training and the default workers is 3.
#sg: The training algorithm, either CBOW(0) or skip gram(1). The default training algorithm is CBOW.
#alpha / min_alpha: The initial learning rate and the value it decays to linearly during training.
model = Word2Vec(min_count=1, size=50, workers=3, window=3, sg=0, alpha=0.03, min_alpha=0.0007)

from time import time
t = time()
# The vocabulary is built from every description, including the 20% not trained on.
model.build_vocab(descs, progress_per=1000)
jp.print_time('Time to build vocab:', t)

t = time()
size = int(len(descs)*0.8)
train_descs = descs[:size]  # first 80% of the descriptions

# total_examples must be the number of sentences actually passed to train().
# model.corpus_count counts the full corpus given to build_vocab, which would
# skew the learning-rate decay and the progress logging for this 80% subset.
model.train(train_descs, total_examples=len(train_descs), epochs=30, report_delay=1)
jp.print_time('Time to train model:', t)

In [None]:
#make model more memory efficient
# NOTE(review): per the gensim 3.x docs, replace=True discards the original
# (un-normalized) vectors, so the model cannot be trained further — confirm that is acceptable.
model.init_sims(replace=True)
# NOTE: a notebook cell only displays the value of its last expression, so the
# first three results below are computed but not shown.
#what words are most similar to this
model.wv.most_similar(positive=['data'])
#how similar are these two
model.wv.similarity('python', 'spark')
#odd one out
model.wv.doesnt_match(['master', 'python', 'r'])
#master is to degree as python is to....?
model.wv.most_similar(positive=['master', 'python'], negative=['degree'], topn=5)