# Push to postings
In this notebook we write the logic that pushes job records from our scraped-jobs collection in the database into the postings collection.

The postings collection is a mock of a job-posting site; we use it to keep track of which jobs have already been posted.

In [97]:
#export
import os
import re
from datetime import date, timedelta

import numpy as np # best match
import pandas as pd
from pymongo import MongoClient

In [2]:
# setting our mongoclient
# The connection string is taken from the MONGODB_URI environment variable so that
# real credentials never have to be committed with the notebook. The fallback is the
# original demo-cluster URI, kept only so existing runs behave exactly as before.
# NOTE(review): the fallback still embeds a username/password - rotate it and drop
# the default once every environment sets MONGODB_URI.
MONGODB_URI = os.environ.get(
    'MONGODB_URI',
    'mongodb+srv://test:test@cluster0-ehci6.mongodb.net/test?retryWrites=true&w=majority',
)
client = MongoClient(MONGODB_URI)
db = client.get_database('Job_Scraper')

# pulling from records
records = db.Scraped_Jobs

# pushing to postings
postings = db.Postings

In [99]:
#export
# compare distances between sequence of words
def levenshtein_ratio_and_distance(s, t):
    """Return the Levenshtein similarity ratio between two strings.

    A dynamic-programming table is filled in which ``distance[i, j]`` holds the
    edit distance between the first ``i`` characters of ``s`` and the first
    ``j`` characters of ``t``. Insertions and deletions cost 1; substitutions
    cost 2 (the same as one deletion plus one insertion), which aligns the
    ratio with the one reported by the Python Levenshtein package.

    Parameters
    ----------
    s, t : str
        The two strings to compare. The comparison is case-sensitive.

    Returns
    -------
    float
        ``(len(s) + len(t) - distance) / (len(s) + len(t))``: 1.0 for
        identical strings, 0.0 when the strings share no characters in order.
        Two empty strings are treated as identical (1.0); one empty string
        against a non-empty one yields 0.0.
    """
    total_len = len(s) + len(t)
    if total_len == 0:
        # Nothing to compare; avoid dividing by zero below.
        return 1.0

    rows = len(s) + 1
    cols = len(t) + 1
    distance = np.zeros((rows, cols), dtype=int)

    # Border cells: turning a prefix into the empty string (or the reverse)
    # costs one edit per character. Filling both borders independently keeps
    # them correct even when one of the strings is empty.
    distance[:, 0] = range(rows)
    distance[0, :] = range(cols)

    # Fill the interior with the cheapest of deletion, insertion, substitution.
    for col in range(1, cols):
        for row in range(1, rows):
            cost = 0 if s[row - 1] == t[col - 1] else 2
            distance[row, col] = min(
                distance[row - 1, col] + 1,         # deletion
                distance[row, col - 1] + 1,         # insertion
                distance[row - 1, col - 1] + cost,  # substitution (or match)
            )

    # The bottom-right cell is the distance between the full strings; indexing
    # it directly (instead of reusing the loop variables) also works when a
    # loop never ran because one string was empty.
    return float((total_len - distance[-1, -1]) / total_len)

In [100]:
#export
def find_values_(user_input, records):
    """Return every record whose title starts with the user's search text.

    Mocks an API call: this is a plain prefix lookup, with no Levenshtein
    ranking applied.

    Parameters
    ----------
    user_input : str
        Search text. Its first character is upper-cased before matching; the
        match itself is case-sensitive and anchored to the start of ``title``.
        An empty string produces the pattern ``^``, i.e. an empty prefix.
    records : collection-like
        Object exposing ``find(filter)``; in this notebook it is the
        ``Scraped_Jobs`` MongoDB collection.

    Returns
    -------
    list
        The documents yielded by ``records.find``, materialised into a list.
    """
    # Slicing (rather than indexing [0]) keeps an empty search string from
    # raising IndexError.
    prefix = user_input[:1].upper() + user_input[1:]
    # Escape regex metacharacters so input such as "C++" or "(Senior" is
    # matched literally instead of being interpreted as a pattern.
    pattern = f'^{re.escape(prefix)}'
    values = records.find({'title': {'$regex': pattern}})
    return list(values)

In [55]:
# 3. finding matches
# find_values_ needs the collection to search as its second argument; without it
# the call fails with a TypeError on a fresh kernel.
inp = 'software'
values = find_values_(inp, records)

In [101]:
#export
def find_best_match(user_input, values):
    """Rank candidate records by how closely their title matches the input.

    Parameters
    ----------
    user_input : str
        The search text each title is compared against.
    values : list of dict (or anything ``pd.DataFrame`` accepts)
        Candidate records; each is expected to carry a ``title`` field.

    Returns
    -------
    pandas.DataFrame
        The records with an added ``match_rating`` column (the value returned
        by ``levenshtein_ratio_and_distance(title, user_input)``), sorted from
        best to worst match. When there are no candidates an empty frame that
        still has a ``match_rating`` column is returned.
    """
    values_df = pd.DataFrame(values)
    if values_df.empty:
        # No rows means there may be no 'title' column to score; hand back an
        # empty frame with the expected column instead of raising KeyError.
        columns = list(values_df.columns)
        if 'match_rating' not in columns:
            columns.append('match_rating')
        return pd.DataFrame(columns=columns)

    values_df['match_rating'] = values_df['title'].apply(
        lambda title: levenshtein_ratio_and_distance(title, user_input)
    )
    # Highest similarity first.
    return values_df.sort_values('match_rating', ascending=False)

In [72]:
# 4. finding best match
# Score every retrieved record against the search text `inp`; the result is a
# DataFrame sorted by `match_rating`, best match first.
matches = find_best_match(inp, values)

In [74]:
# top 5: preview the highest-rated matches (head() shows the first five rows by default)
matches.head()

Unnamed: 0,_id,company,date,location,summary,title,match_rating
57,5e16ad9a4a5e28e6136e67dc,Google,2019-12-09,"New York, NY 10011 (Chelsea area)",Experience with one or more general purpose pr...,Software Engineer,0.56
16,5e16ad984a5e28e6136e6787,Seated,2020-01-06,"New York, NY 10011 (Flatiron District area)",Hands-on development experience in Java or sim...,Software Engineer,0.56
44,5e16ad994a5e28e6136e67c4,Oracle,2019-12-09,"New York, NY","Specify, design and implement modest changes t...",Software Engineer,0.56
36,5e16ad994a5e28e6136e67ad,viagogo,2019-12-13,"New York, NY",You love to code and you're in the final year ...,Software Engineer,0.56
33,5e16ad994a5e28e6136e67a7,Teachable,2019-12-20,"New York, NY",Teachable is looking for a Full-Stack or Back-...,Software Engineer,0.56


In [102]:
#export
def post_to_board(matches, postings, amount=10):
    """Post the best matches into the postings collection (a mock jobs board).

    Each selected row is inserted together with a ``post_date`` field holding
    today's date (``YYYY-MM-DD``), unless the job is already on the board.

    Parameters
    ----------
    matches : pandas.DataFrame
        Ranked matches, best first.
    postings : collection-like
        Object exposing ``find_one(filter)`` and ``insert_one(document)``; in
        this notebook it is the ``Postings`` MongoDB collection.
    amount : int or 'all', default 10
        An int keeps the first ``amount`` rows via ``matches.head(amount)``;
        the string ``'all'`` keeps every row.

    Raises
    ------
    ValueError
        If ``amount`` is neither an int nor the string ``'all'``.
    """
    # condition of amount
    if isinstance(amount, int):
        selected = matches.head(amount)
    elif isinstance(amount, str) and amount == 'all':
        selected = matches  # all matches
    else:
        raise ValueError('amount has wrong value')

    # Fields that change between runs and therefore must not take part in the
    # "is it already posted?" check: the rating depends on the search text and
    # the post date on the day the function runs.
    volatile_fields = ('match_rating', 'post_date')
    post_date = date.today().strftime('%Y-%m-%d')

    # pushing to board
    for job in selected.to_dict('records'):
        if '_id' in job:
            # The record's own id identifies it; checking it first also avoids
            # re-inserting an existing _id, which the database would reject.
            identity = {'_id': job['_id']}
        else:
            identity = {key: value for key, value in job.items()
                        if key not in volatile_fields}

        if postings.find_one(identity) is None:
            job['post_date'] = post_date
            postings.insert_one(job)  # post if it's not there

In [96]:
# post_to_board requires the target collection as its second argument.
post_to_board(matches, postings)

In [103]:
# Convert this notebook into a Python module by running the notebook2script.py helper on it.
# NOTE(review): presumably only the cells tagged `#export` are written out - confirm against notebook2script.py.
!python notebook2script.py 02_push_to_postings.ipynb

Converted 02_push_to_postings.ipynb to exp\nb_02.py
