In [3]:
# library
import matplotlib.pyplot as plt
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import pickle

from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

import urllib

from io import BytesIO

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sambe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sambe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def document_to_text(url):
    """Download a PDF from ``url`` and return its text as one string.

    The document is fetched with ``urllib``, every page is run through
    pdfminer's ``TextConverter``, and newline and zero-width-space
    (``\\u200b``) characters are removed from the extracted text.

    Args:
        url: Location of the PDF, in any form accepted by
            ``urllib.request.urlopen``.

    Returns:
        str: The extracted text. If nothing could be extracted the result is
        an empty string and a notice is printed.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.request`` has been loaded.
    import urllib.request

    # Close the HTTP response as soon as the payload has been read.
    with urllib.request.urlopen(url) as response:
        pdf_bytes = response.read()

    rsrcmgr = PDFResourceManager()
    # The context managers release both buffers even if pdfminer raises.
    with StringIO() as retstr, BytesIO(pdf_bytes) as fp:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # An empty page-number set with maxpages=0 places no restriction
            # on which pages are processed.
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            device.close()
        # Read the text before the StringIO is closed by the ``with`` block.
        parsed = retstr.getvalue()

    # ``getvalue()`` always returns a str, so test for emptiness rather than
    # for None to detect a document that yielded no text.
    if not parsed:
        print("The submitted document cannot be read.")

    return parsed.replace('\n', '').replace('\u200b', '')

In [5]:
def compile_document_text(text):
    """Wrap résumé text in a one-row DataFrame.

    The previous implementation also unpickled the full job-description
    table and then discarded it; that unused load has been removed so the
    function no longer touches the file system.

    Args:
        text: The résumé text to store in the ``description`` column.

    Returns:
        pandas.DataFrame: A single row with ``title`` set to ``'resume'`` and
        ``description`` set to ``text``.
    """
    data = [['resume', text]]
    basic_documentdf = pd.DataFrame(data, columns=['title', 'description'])
    return basic_documentdf

In [6]:
def text_to_bagofwords(df):
    """Add RAKE keyword and bag-of-words columns to ``df``.

    For every row, the ``description`` text is passed to ``Rake`` and the
    keys of ``get_word_degrees()`` are stored as a list in
    ``rake_key_words``. ``bag_of_words`` holds the same words joined by
    single spaces, followed by one trailing space.

    Args:
        df: DataFrame with a ``description`` column. It is modified in place.

    Returns:
        pandas.DataFrame: The same ``df`` object with the two columns added.
    """
    r = Rake()
    keyword_lists = []
    for description in df['description']:
        r.extract_keywords_from_text(description)
        keyword_lists.append(list(r.get_word_degrees().keys()))

    # Assign whole columns instead of writing to the rows yielded by
    # ``iterrows()``: those rows may be copies, in which case the writes are
    # silently lost.
    df['rake_key_words'] = pd.Series(keyword_lists, index=df.index, dtype=object)
    df['bag_of_words'] = pd.Series(
        [' '.join(words) + ' ' for words in keyword_lists],
        index=df.index,
        dtype=object,
    )
    return df

In [7]:
def join_and_condense(df, pkl_path='recs/pkl/job_descriptions.pkl'):
    """Append ``df`` to the pickled job table and keep the matching columns.

    Args:
        df: DataFrame to add after the stored jobs; it must provide ``title``
            and ``bag_of_words`` columns.
        pkl_path: Path of the pickled job-description DataFrame. Defaults to
            the location this function has always read.

    Returns:
        pandas.DataFrame: ``title`` and ``bag_of_words`` for the stored jobs
        followed by the rows of ``df``, with a fresh 0..n-1 index.
    """
    # NOTE(review): pickle.load executes arbitrary code embedded in the
    # file; only point this at pickles produced by a trusted source.
    with open(pkl_path, 'rb') as f:
        job_descriptions = pickle.load(f)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement and produces the same row-wise concatenation.
    combined = pd.concat([job_descriptions, df])
    recommend_df = combined[['title', 'bag_of_words']].reset_index(drop=True)
    return recommend_df

In [8]:
def vectorize_text(df):
    """Return pairwise cosine similarities between the rows of ``df``.

    The ``bag_of_words`` column is turned into a term-count matrix with a
    default ``CountVectorizer``; the similarity of that matrix with itself
    is returned.
    """
    term_counts = CountVectorizer().fit_transform(df['bag_of_words'])
    return cosine_similarity(term_counts, term_counts)

In [9]:
def recommend_100(df, matrix):
    """Return up to 100 job titles most similar to the ``'resume'`` row.

    Args:
        df: DataFrame with a ``title`` column, one row of which is titled
            ``'resume'``. The index label of that row is used as a position
            into ``matrix``, so ``df`` is expected to carry a default
            0..n-1 index.
        matrix: Square similarity matrix whose rows and columns follow the
            row order of ``df``.

    Returns:
        list: Titles ordered from most to least similar, excluding the
        résumé row itself.

    Raises:
        IndexError: If no row is titled ``'resume'``.
    """
    titles = df['title'].tolist()  # built once instead of once per result
    indices = pd.Series(df['title'])
    idx = indices[indices == 'resume'].index[0]

    # Drop the résumé's own entry explicitly. Skipping "the first sorted
    # element" is only correct when the résumé outranks every job, which
    # fails on ties and when the résumé has an all-zero vector.
    scores = pd.Series(matrix[idx]).drop(idx)
    # A stable sort keeps equally-scored jobs in their original order.
    ranked = scores.sort_values(ascending=False, kind='mergesort')

    return [titles[i] for i in ranked.index[:100]]

In [10]:
def format_recommendations(recommendations):
    """Prettify job titles and remove duplicates, keeping the input order.

    Each title is lower-cased, underscores become spaces, and the result is
    title-cased. Two special cases are then patched: ``'Hr Manager'``
    becomes ``'HR Manager'`` and ``'Care Giver / Hha / Cna'`` becomes
    ``'Care Giver'``. Only the first 100 formatted titles are considered.

    Args:
        recommendations: Iterable of raw job-title strings, best match first.

    Returns:
        list: Unique formatted titles in order of first appearance.
    """
    formatted = []
    for job in recommendations:
        job = job.lower().replace("_", " ").title()
        job = job.replace('Hr Manager', 'HR Manager')
        job = job.replace('Care Giver / Hha / Cna', 'Care Giver')
        formatted.append(job)

    # dict.fromkeys removes duplicates while preserving insertion order;
    # going through a set would scramble the ranking of the recommendations.
    return list(dict.fromkeys(formatted[0:100]))

In [11]:
def top_100_categories(recommendations, pkl_path='recs/pkl/job_descriptions.pkl'):
    """Return the category of every stored job whose title was recommended.

    Args:
        recommendations: Collection of job titles, compared verbatim against
            the ``title`` column of the stored table.
        pkl_path: Path of the pickled job-description DataFrame. Defaults to
            the location this function has always read.

    Returns:
        list: One ``category`` value per matching row, in table order.
        Titles that occur on several rows contribute several entries.
    """
    # NOTE(review): pickle.load executes arbitrary code embedded in the
    # file; only point this at pickles produced by a trusted source.
    with open(pkl_path, 'rb') as f:
        df = pickle.load(f)

    matches = df['title'].isin(recommendations)
    return df.loc[matches, 'category'].tolist()

In [12]:
def freq(list_of_categories):
    """Count how often each distinct category occurs.

    Returns a list with one count per distinct value, in the iteration
    order of ``set(list_of_categories)``.
    """
    distinct = set(list_of_categories)
    return [list_of_categories.count(label) for label in distinct]

In [13]:
def viz_data(list_of_categories, frequency_of_categories, min_count=10):
    """Pair categories with their counts and keep the frequent ones.

    Args:
        list_of_categories: List of category labels, duplicates included.
        frequency_of_categories: Counts positioned to match the iteration
            order of ``set(list_of_categories)``, which is the order
            ``freq`` produces them in.
        min_count: Smallest count a category needs to be kept. Defaults to
            10, the threshold that was previously hard-coded.

    Returns:
        tuple: ``(names, size)`` — two lists of equal length holding the
        kept category labels and their counts.
    """
    unique_words = list(set(list_of_categories))
    category_values = dict(zip(unique_words, frequency_of_categories))
    category_dict = {
        key: val for key, val in category_values.items() if val >= min_count
    }
    # Return real lists rather than dict views: plotting libraries convert
    # their input with numpy, which does not treat dict views as sequences.
    names = list(category_dict.keys())
    size = list(category_dict.values())
    return names, size

In [14]:
def make_viz(names_of_categories, size_of_categories):
    """Draw a donut chart titled 'Strength Summary' and show it.

    Args:
        names_of_categories: Iterable of wedge labels.
        size_of_categories: Iterable of wedge sizes, aligned with the labels.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Materialise the inputs: callers may pass dict views, which matplotlib
    # cannot convert to a numeric array.
    labels = list(names_of_categories)
    sizes = list(size_of_categories)

    # Use an explicit figure/axes pair instead of pyplot's implicit
    # "current figure" so the chart never lands on an unrelated figure.
    fig, ax = plt.subplots()
    ax.set_title('Strength Summary')
    ax.pie(sizes, labels=labels)
    # A white circle over the centre turns the pie into a donut.
    ax.add_artist(plt.Circle((0, 0), 0.7, color='white'))
    plt.show()

In [15]:
def analyze(document_path):
    """Run the full pipeline on a résumé PDF and plot its category strengths.

    The résumé is converted to text, turned into a bag of words, compared
    against the stored job descriptions, and the categories of the
    recommended jobs are charted with ``make_viz``.

    Args:
        document_path: Location of the résumé PDF, passed to
            ``document_to_text``.

    Returns:
        None. The chart is displayed as a side effect.
    """
    resume_text = document_to_text(document_path)
    basic_documentdf = compile_document_text(resume_text)
    verbose_documentdf = text_to_bagofwords(basic_documentdf)
    recommend_df = join_and_condense(verbose_documentdf)
    cosine_sim = vectorize_text(recommend_df)
    recommended_jobs = recommend_100(recommend_df, cosine_sim)
    # The raw titles are used for the category lookup; the formatted titles
    # that used to be computed here were never read, so that step is gone.
    category_list = top_100_categories(recommended_jobs)
    frequency = freq(category_list)
    names, size = viz_data(category_list, frequency)
    make_viz(names, size)

In [16]:
def final_rec(document_path):
    """Return formatted job recommendations for a résumé PDF.

    Args:
        document_path: Location of the résumé PDF, passed to
            ``document_to_text``.

    Returns:
        list: Formatted job titles produced by ``format_recommendations``.
    """
    text = document_to_text(document_path)
    basic_documentdf = compile_document_text(text)
    verbose_documentdf = text_to_bagofwords(basic_documentdf)
    recommend_df = join_and_condense(verbose_documentdf)
    cosine_sim = vectorize_text(recommend_df)
    # recommend_100 takes (df, matrix). The previous call passed the document
    # path, the literal 'resume' and the matrix, which raised a TypeError.
    recommended_jobs = recommend_100(recommend_df, cosine_sim)
    recommendations = format_recommendations(recommended_jobs)
    return recommendations

In [17]:
def upload(document_url):
    """Return formatted job recommendations for the résumé at ``document_url``.

    Chains the pipeline stages: text extraction, bag-of-words construction,
    merge with the stored jobs, similarity scoring, ranking and formatting.
    """
    resume_frame = text_to_bagofwords(
        compile_document_text(document_to_text(document_url))
    )
    corpus = join_and_condense(resume_frame)
    similarity = vectorize_text(corpus)
    ranked_titles = recommend_100(corpus, similarity)
    return format_recommendations(ranked_titles)


In [19]:
# Load the job-description table from CSV, using the first column as the index.
# NOTE(review): the functions above read 'recs/pkl/job_descriptions.pkl' instead;
# confirm both paths resolve from the notebook's working directory.
job_descriptions = pd.read_csv('pkl/job_descriptions.csv', index_col=0)

In [20]:
# Display the loaded job-description table.
job_descriptions

Unnamed: 0,title,category,description,location,rake_key_words,bag_of_words
0,training_manager,management,The world's leading private security organizat...,NY,"['provide', 'limited', 'express', 'interest', ...",provide limited express interest additional tr...
1,training_manager,management,The Training Manager is responsible for the de...,NY,"['workshops', 'including', 'schedules', 'integ...",workshops including schedules integrity respon...
2,training_manager,management,The Training Manager of Applied Data Analytics...,NY,"['oversee', 'translate', 'diverse', 'audience'...",oversee translate diverse audience advanced kn...
3,training_manager,management,Unqork is the no-code platform that's pioneeri...,NY,"['public', 'speaking', '•', 'enjoy', 'getting'...",public speaking • enjoy getting previous exper...
4,training_manager,management,The Training Manager is in charge of training ...,NY,"['luxury', 'business', 'pr', 'teams', 'powerpo...",luxury business pr teams powerpoint paced envi...
...,...,...,...,...,...,...
260,dietitian (pool),"sports, fitness, & recreation",The Dietitian plans therapeutic diets and conf...,"phenix city, AL","['productivity', 'standards', 'equipment', 'co...",productivity standards equipment compliance ni...
261,youth services aide,"sports, fitness, & recreation",The Youth Services Aide is a permanent full-ti...,"montgomery, AL","['time', 'position', 'placed', 'may', 'apply',...",time position placed may apply directly locate...
262,nutritionist associate,"sports, fitness, & recreation","The Nutritionist Associate is a permanent, ful...","montgomery, AL","['time', 'position', 'local', 'health', 'agenc...",time position local health agency professional...
263,Licensed Real Estate SalesAgent,real estate,Job Description The Licensed Real Estate Sales...,"Spanish Fort, AL","['integrity', 'career', 'online', 'commission'...",integrity career online commission region pers...


In [25]:
# Load the first batch of additional job descriptions.
# NOTE(review): this is an absolute, user-specific Windows path, so the cell
# only runs on the original author's machine.
to_add1 = pd.read_csv(r"C:\Users\sambe\Projects\rookieplay\data\compile\Job Descriptions 1.csv")

In [27]:
# Raise the display row limit to one more than the number of rows in to_add1
# so the frame is rendered without truncation. This setting is global and
# stays in effect for every later cell.
pd.set_option('display.max_rows', to_add1.shape[0]+1)


In [28]:
# Display the first batch of additional job descriptions.
to_add1

Unnamed: 0,JOB TITLE,JOB DESCRIPTION,JOB CATEGORY,LOCATION
0,Office Clerk,"As a Post Office Clerk, you are responsible fo...",Admin & Office,US
1,Inside Sales Representative,This is an inside sales position.\nThis is our...,Sales & Retail,"Fort Worth, TX"
2,Entry Level Collections Specialist,Why you should be interested in this role!\nTi...,Accounting & Finance,"Bothell, WA 98021"
3,Foreign Language Instructor,"As a Foreign Language Instructor for the CIA, ...",Education,"Washington, DC"
4,QA Tester,Quality Assurance Testers (“QA Testers”) perfo...,Computer & IT,"Baton Rouge, LA 70820"
5,"Human Resources, Full Time Associate",You'’re ready to bring your knowledge from the...,Human Resources,"New York, NY"
6,"Finance, Full Time Analyst",You’re ready to bring your knowledge from the ...,Accounting & Finance,"New York, NY"
7,Functional Game Tester,"With nearly two decades on the books, the Qual...",Computer & IT,"Eden Prairie, MN"
8,Executive Assistant To The City Manager,The Executive Assistant to the City Manager is...,Admin & Office,"City of Poway, CA"
9,Flight Attendants,As our flight attendants have the most in-pers...,Entertainment & Travel,"Miami, FL 33181"


In [29]:
# Load the second batch of additional job descriptions.
# NOTE(review): this is an absolute, user-specific Windows path, so the cell
# only runs on the original author's machine.
to_add2 = pd.read_csv(r"C:\Users\sambe\Projects\rookieplay\data\compile\Job Descriptions 2.csv")

In [30]:
# Display the second batch of additional job descriptions.
to_add2

Unnamed: 0,JOB TITLE,JOB DESCRIPTION,JOB CATEGORY,LOCATION
0,Deputy Operations Program Manager,Assists The Lead Program Manager In The Follow...,Management,"Auburn, NY"
1,Account Manager,The Account Manager’s primary function is to s...,Sales & Retail,"Washington, DC"
2,Junior Key Account Manager,As a Junior Key Account Manager you will be re...,Customer Service,"Denver, CO"
3,Audit Associate - Markets,"The Associate, Senior Auditor position is a st...",Accounting & Finance,"New York, NY"
4,Healthcare Data Analyst,"Software Guidance & Assistance, Inc., (SGA), i...",Computer & IT,"Jacksonville, FL"
5,Municipal Bond Credit Analyst,Clinton Investment Management has an opportuni...,Accounting & Finance,"Stamford, CT"
6,Freelance Resume Writer,"Through our tailor-made platform, you’ll selec...","Media, Communications & Writing","Dallas, TX"
7,Research Analyst,Harvey & Company is searching for Research Ana...,Accounting & Finance,"Newport Beach, CA"
8,"Associate Producer, MSNBC Weekends",\tStraight cut video and work with editors on ...,"Media, Communications & Writing","New York, NY"
9,Graduate Engineer,\tWorking under the supervision of experienced...,Science & Engineering,"Tampa, FL"
