In [1]:
import re
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
df = pd.read_csv('/content/drive/My Drive/newdf.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,JobID,WindowID,Title,Description,Requirements,City,State,Country,Zip5,StartDate,EndDate
0,0,1,1,Security Engineer/Technical Lead,<p>Security Clearance Required:&nbsp; Top Secr...,<p>SKILL SET</p>\r<p>&nbsp;</p>\r<p>Network Se...,Washington,DC,US,20531.0,17:01.6,4/6/2012 23:59
1,1,4,1,SAP Business Analyst / WM,<strong>NO Corp. to Corp resumes&nbsp;are bein...,<p><b>WHAT YOU NEED: </b></p>\r<p>Four year co...,Charlotte,NC,US,28217.0,03:44.1,4/20/2012 23:59
2,2,7,1,P/T HUMAN RESOURCES ASSISTANT,<b> <b> P/T HUMAN RESOURCES ASSISTANT</b> <...,Please refer to the Job Description to view th...,Winter Park,FL,US,32792.0,36:55.4,4/1/2012 23:59
3,3,8,1,Route Delivery Drivers,CITY BEVERAGES Come to work for the best in th...,Please refer to the Job Description to view th...,Orlando,FL,US,,01:10.1,4/2/2012 23:59
4,4,9,1,Housekeeping,I make sure every part of their day is magica...,Please refer to the Job Description to view th...,Orlando,FL,US,,01:11.9,4/2/2012 23:59


In [5]:
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered)

In [8]:
def stem_words(text):
    porter = PorterStemmer()
    token_words = word_tokenize(text)
    stem_sentence = []
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

In [9]:
def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()
    token_words = word_tokenize(text)
    lemma_sentence = []
    for word in token_words:
        lemma_sentence.append(lemmatizer.lemmatize(word))
        lemma_sentence.append(" ")
    return "".join(lemma_sentence)

In [37]:
def remove_unnesessary_symbols(df):
    removePattern = r'(<(.*?)>)|(&\w+)'
    addSpacePattern = r'([;:])|(\\r)|(\\n)'
    removeExtraSpaces = r'(\s\s+?)(?=\S)'
    df['DescCleaned'] = df['Description'].str.lower()
    df['DescCleaned'] = df['DescCleaned'].str.replace(removePattern, "")
    df['DescCleaned'] = df['DescCleaned'].str.replace(addSpacePattern, " ")
    df['DescCleaned'] = df['DescCleaned'].str.replace(removeExtraSpaces, " ")
    # Same for Requirements
    df['ReqCleaned'] = df['Requirements'].str.lower()
    df['ReqCleaned'] = df['ReqCleaned'].str.replace(removePattern, "")
    df['ReqCleaned'] = df['ReqCleaned'].str.replace(addSpacePattern, " ")
    df['ReqCleaned'] = df['ReqCleaned'].str.replace(removeExtraSpaces, " ")

    # Normalize Text
    df['DescCleaned'] = df['DescCleaned'].apply(remove_stop_words)
    df['DescCleaned'] = df['DescCleaned'].apply(stem_words)
    df['DescCleaned'] = df['DescCleaned'].apply(lemmatize_words)

    df['ReqCleaned'] = df['ReqCleaned'].apply(remove_stop_words)
    df['ReqCleaned'] = df['ReqCleaned'].apply(stem_words)
    df['ReqCleaned'] = df['ReqCleaned'].apply(lemmatize_words)

    return df

In [11]:
def search_job(df, keywords):
    results = []
    for index, row in df.iterrows():
        if any(keyword in str(row[column]).lower() for column in ['Title', 'DescCleaned', 'ReqCleaned', 'Country'] for keyword in keywords):
            results.append(row['Title'])
    if len(results) == 0:
        return "Tidak ditemukan hasil yang cocok dengan kata kunci yang diberikan."
    else:
        return "Berikut hasil pencarian yang cocok dengan kata kunci '{}':\n{}".format(', '.join(keywords), '\n- '.join(results))


In [38]:
df_copy = df.copy()
useless_col = ['Unnamed: 0', 'JobID', 'WindowID', 'StartDate', 'EndDate', 'Zip5', 'State']
df_copy = df_copy.drop(useless_col, axis = 1)
df_copy = df_copy.dropna()
df_copy.head()

Unnamed: 0,Title,Description,Requirements,City,Country
0,Security Engineer/Technical Lead,<p>Security Clearance Required:&nbsp; Top Secr...,<p>SKILL SET</p>\r<p>&nbsp;</p>\r<p>Network Se...,Washington,US
1,SAP Business Analyst / WM,<strong>NO Corp. to Corp resumes&nbsp;are bein...,<p><b>WHAT YOU NEED: </b></p>\r<p>Four year co...,Charlotte,US
2,P/T HUMAN RESOURCES ASSISTANT,<b> <b> P/T HUMAN RESOURCES ASSISTANT</b> <...,Please refer to the Job Description to view th...,Winter Park,US
3,Route Delivery Drivers,CITY BEVERAGES Come to work for the best in th...,Please refer to the Job Description to view th...,Orlando,US
4,Housekeeping,I make sure every part of their day is magica...,Please refer to the Job Description to view th...,Orlando,US


In [39]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 997 entries, 0 to 999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         997 non-null    object
 1   Description   997 non-null    object
 2   Requirements  997 non-null    object
 3   City          997 non-null    object
 4   Country       997 non-null    object
dtypes: object(5)
memory usage: 46.7+ KB


In [40]:
df_copy = remove_unnesessary_symbols(df_copy)
df_copy.head()

  df['DescCleaned'] = df['DescCleaned'].str.replace(removePattern, "")
  df['DescCleaned'] = df['DescCleaned'].str.replace(addSpacePattern, " ")
  df['DescCleaned'] = df['DescCleaned'].str.replace(removeExtraSpaces, " ")
  df['ReqCleaned'] = df['ReqCleaned'].str.replace(removePattern, "")
  df['ReqCleaned'] = df['ReqCleaned'].str.replace(addSpacePattern, " ")
  df['ReqCleaned'] = df['ReqCleaned'].str.replace(removeExtraSpaces, " ")


Unnamed: 0,Title,Description,Requirements,City,Country,DescCleaned,ReqCleaned
0,Security Engineer/Technical Lead,<p>Security Clearance Required:&nbsp; Top Secr...,<p>SKILL SET</p>\r<p>&nbsp;</p>\r<p>Network Se...,Washington,US,secur clearanc requir top secret job number tm...,skill set network secur tool webdefend web app...
1,SAP Business Analyst / WM,<strong>NO Corp. to Corp resumes&nbsp;are bein...,<p><b>WHAT YOU NEED: </b></p>\r<p>Four year co...,Charlotte,US,corp. corp resum consid direct hire `` perman ...,need four year colleg degre minimum 5 8+ year ...
2,P/T HUMAN RESOURCES ASSISTANT,<b> <b> P/T HUMAN RESOURCES ASSISTANT</b> <...,Please refer to the Job Description to view th...,Winter Park,US,p/t human resourc assist —— 1-2 year experi hr...,plea refer job descript view requir job
3,Route Delivery Drivers,CITY BEVERAGES Come to work for the best in th...,Please refer to the Job Description to view th...,Orlando,US,citi beverag come work best busi ! citi bevera...,plea refer job descript view requir job
4,Housekeeping,I make sure every part of their day is magica...,Please refer to the Job Description to view th...,Orlando,US,make sure everi part day magic . disney . drea...,plea refer job descript view requir job


In [41]:
df_copy.loc[0].DescCleaned

'secur clearanc requir top secret job number tmr-447 locat job washington , dc tmr , inc. equal employ opportun compani job opportun tmr , visit websit www.tmrhq.com send resum hr @ tmrhq2.com job summari lead custom overal cyber secur strategi , formal servic offer consist itil best practic , provid design architectur support . provid secur design / architectur support ojp secur divis ( itsd ) lead secop team day day ojp secur oper support provid direct need secur incid technic issu work concert network oper design /integr best secur postur support busi develop function includ captur manag , propos develop respons , initi includ confer , trade show , webinar , develop white paper like . identifi resourc mentor in-hous talent ensur tmr remain respons grow initi contract qualifi personnel . '

In [48]:
keywords = input("Masukkan kata kunci: ")
keywords = stem_words(keywords)
keywords = lemmatize_words(keywords)
keywords = keywords.split()


search_results = search_job(df_copy, keywords)
print(search_results)

Masukkan kata kunci: software engineer
Berikut hasil pencarian yang cocok dengan kata kunci 'softwar, engin':
Security Engineer/Technical Lead
- ELECTRONIC PRE-PRESS PROFESSIONAL
- CONSTRUCTION PROJECT MGR & PM TRAINEE
- Financial Planner/ParaPlanner
- CIVIL ENGINEER
- MECHANICAL/INDUSTRIAL ENGINEER
- COMPUTER PROFESSIONALS
- CIVIL ENGINEER
- TECH SUPPORT REPRESENTATIVE
- MACHINE SHOP
- BUSINESS ANALYST
- FINANCIAL ACCOUNTANT
- Electronics Repair Technician
- ENGINEER III WATER/WASTEWATER AECOM Technical Services
- Software Engineer
- DATABASE ADMINISTRATOR II
- Process Lab Coordinator
- Business Development
- Manufacturing
- Product Dvlpmnt Eng Team Lead
- Engineer
- Project Engineering
- System Analyst
- Supervisor of Instruction High School
- METAL FABRICATION MACHINISTS & WELDER-FITTERS
- IT Administrator
- Quality Assurance Specialist
- Civil Engineer
- Engineer/Designer/CADD
- MANUFACTURING
- CONSTRUCTION POSITIONS
- SR. SOFTWARE ENGINEER
- IT POSITIONS
- ENGINEER
- DRAFTER
- IT 