In [52]:
import numpy as np
import pandas as pd

In [53]:
job_df = pd.read_csv("Combined_Jobs_Final.csv")

In [54]:
job_df.head(2)

Unnamed: 0,Job.ID,Provider,Status,Slug,Title,Position,Company,City,State.Name,State.Code,...,Industry,Job.Description,Requirements,Salary,Listing.Start,Listing.End,Employment.Type,Education.Required,Created.At,Updated.At
0,111,1,open,palo-alto-ca-tacolicious-server,Server @ Tacolicious,Server,Tacolicious,Palo Alto,California,CA,...,Food and Beverages,Tacolicious' first Palo Alto store just opened...,,8.0,,,Part-Time,,2013-03-12 02:08:28 UTC,2014-08-16 15:35:36 UTC
1,113,1,open,san-francisco-ca-claude-lane-kitchen-staff-chef,Kitchen Staff/Chef @ Claude Lane,Kitchen Staff/Chef,Claude Lane,San Francisco,California,CA,...,Food and Beverages,\r\n\r\nNew French Brasserie in S.F. Financia...,,0.0,,,Part-Time,,2013-04-12 08:36:36 UTC,2014-08-16 15:35:36 UTC


In [55]:
print(len(job_df))
job_df = job_df[['Status', 'Title', 'Position', 'Company', 'Job.Description']]
print(len(job_df))

84090
84090


In [56]:
job_df.shape

(84090, 5)

In [57]:
job_df['Job.Description'][20]

'Hiring Event Details\r\nStore Associate\r\n\r\n$12.00 / Hour\r\nAdditional $1.00 Per Hour For ALL Sunday Shifts!\r\n50 Cent Wage Increases Beginning At 6 Months - Up to $13.50 At 2 Years\r\n\r\nMonday, December 15, 2014\r\n9am - 11am\r\n\r\nALDI\r\n3133 Market Place Dr\r\nOnalaska, WI 54650\r\n\r\n&nbsp;\r\nFor consideration, please apply in person at the hiring event only. Get started now by downloading our Store Employment Application.\r\n\r\nStore Associate - Retail Sales ( Customer Service )\r\n\r\nIf you are a customer service minded individual with a positive and energetic personality and you&rsquo;re interested in working for one of the best-known grocery stores in the nation, join the ALDI family! We are looking for motivated and reliable individuals to serve as a Store Associate. You will serve as the face of ALDI, providing customers with friendly and efficient check-out services. But that&rsquo;s just the beginning. You will also assist the store manager in a variety of rol

In [58]:
job_df.isnull().sum()
job_df.fillna('',inplace=True)
job_df.isnull().sum()

Status             0
Title              0
Position           0
Company            0
Job.Description    0
dtype: int64

In [59]:
job_df = job_df.sample(n=1000,random_state=42)

In [60]:
job_df.shape

(1000, 5)

# cleaning dataset
keeping all letters and digits                          
lover case                             
removing stopwords                            
tokenization                            
stemming                         

In [61]:
from nltk.corpus import stopwords
import nltk
import re
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [62]:
def cleaning(text):
    # convert to lowercase
    text = text.lower()

    # remove special characters
    text = re.sub(r'\W+', ' ', text)

    # remove extra spaces
    text = re.sub(' +', ' ', text)

    # remove starting and trailing spaces
    text = text.strip()

    return text
    

In [63]:
cleaning("\n\rhelo the master piece is my loving moving dog @9032#%$")

'helo the master piece is my loving moving cat 9032'

In [64]:
job_df['Job.Description'] = job_df['Job.Description'].astype(str).apply(lambda x: cleaning(x))
job_df['Title'] = job_df['Title'].astype(str).apply(lambda x: cleaning(x))
job_df['Position'] = job_df['Position'].astype(str).apply(lambda x: cleaning(x))

In [65]:
job_df['clean_text'] = job_df['Job.Description']+" "+job_df['Title']+job_df['Position']

In [66]:
job_df['clean_text'][64119]

'job summary knowledge universe ku site directors are site leaders who inspire children and teachers alike to learn and grow they are passionate about educational excellence and confident teaching children and adults they use our nationally recognized curriculum as a framework to create unique and engaging classroom experiences they are committed to making their site successful and know that meaningful relationships with children families and their team are important to success they are fully engaged enthusiastic about their work and eager to share their knowledge with others job responsibilities and essential functions these are the basic expectations for site directors of course creative and new ways to meet or exceed expectations are encouraged so long as the required essential functions are also met supervision of children and staff record keeping licensing records and child files lesson planning and implementation maintenance of safe and welcoming classroom environment building of

# vectorizatoin

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [68]:
tfidf = TfidfVectorizer(stop_words='english')
matrix = tfidf.fit_transform(job_df['clean_text'])
similarity = cosine_similarity(matrix)

In [69]:
similarity

array([[1.        , 0.04316713, 0.01584323, ..., 0.03422459, 0.02355197,
        0.05787057],
       [0.04316713, 1.        , 0.02349895, ..., 0.02843237, 0.00650416,
        0.02200366],
       [0.01584323, 0.02349895, 1.        , ..., 0.12147903, 0.11821837,
        0.0849148 ],
       ...,
       [0.03422459, 0.02843237, 0.12147903, ..., 1.        , 0.10231673,
        0.0942541 ],
       [0.02355197, 0.00650416, 0.11821837, ..., 0.10231673, 1.        ,
        0.3645772 ],
       [0.05787057, 0.02200366, 0.0849148 , ..., 0.0942541 , 0.3645772 ,
        1.        ]])

In [70]:
sorted(list(enumerate(similarity[0])), key=lambda x: x[1], reverse=True)[1:20]

[(276, 0.9722978191534403),
 (730, 0.46934556239894065),
 (81, 0.45367410518365264),
 (917, 0.45367410518365264),
 (252, 0.23216213857943935),
 (128, 0.2103412366151445),
 (629, 0.16075217918132761),
 (825, 0.15085839382029045),
 (360, 0.13948993213260905),
 (38, 0.1320341335055304),
 (245, 0.12480868628845905),
 (326, 0.1212848168047901),
 (298, 0.11780183176064217),
 (195, 0.11298471725820095),
 (284, 0.11217975216761909),
 (59, 0.11167276537417432),
 (114, 0.10797827058611764),
 (254, 0.10562131035812325),
 (890, 0.10498251733474127)]

# Recommendation System

In [71]:
# define recommend function
def recommend(title):
    try:
        # find index of input title in job_df
        indx = job_df[job_df['Title'] == title].index[0]
        indx = job_df.index.get_loc(indx)

        # calculate similarity between input title and all other titles
        distances = sorted(list(enumerate(similarity[indx])), key=lambda x: x[1], reverse=True)[1:20]

        # extract top 20 most similar job titles
        jobs = []
        for i in distances:
            jobs.append(job_df.iloc[i[0]].Title)

        return jobs

    except IndexError:
        print(f"Error: The input title '{title}' was not found in the job_df DataFrame.")
        return []

# recommend jobs based on input title
input_title = 'Software Engineer'
recommendations = recommend(input_title)
print(recommendations)
        

Error: The input title 'Software Engineer' was not found in the job_df DataFrame.
[]


In [72]:
recommend(' software tester')

Error: The input title 'site director knowledg univers' was not found in the job_df DataFrame.


[]

In [73]:
import pickle
pickle.dump(job_df,open('df.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))

In [74]:
import pandas as pd
print(pd.__version__)


2.1.4
