In [1]:
import os
import re
import uuid
import warnings

import numpy as np
import pandas as pd


# Silence every warning for the rest of the session: action='ignore' is given
# without a category or message filter, so nothing is exempt.
warnings.filterwarnings(action='ignore')

In [2]:
# Output directories, one per job family. None of them carries a trailing
# separator, so they can all be joined with a file name in the same way.
machine_learning_job_path = 'Job Descriptions/machine learning'
chartered_accountant_job_path = 'Job Descriptions/Chartered accountant'
software_developer_job_path = 'Job Descriptions/software developer'
product_manager_job_path = 'Job Descriptions/product manager'

In [3]:
# Load the postings table; later cells read its Title, FullDescription and
# Category columns.
jobs_df = pd.read_csv("Job Descriptions/Train_Original.csv")

In [4]:
def save_job_descriptions_as_txt(df, path):
    """Write every posting in ``df`` to its own text file under ``path``.

    Each row becomes one ``<uuid4>.txt`` file holding two lines,
    ``Title: ...`` and ``Full Description: ...``. File names are random, so
    calling this again with the same rows adds new files instead of
    overwriting the earlier ones.

    Args:
        df: DataFrame with ``Title`` and ``FullDescription`` columns.
        path: Target directory, with or without a trailing separator. It is
            created (parents included) when it does not exist yet.

    Returns:
        None. The function is used for its side effect on the file system.
    """
    # Create the directory up front; otherwise the first open() raises
    # FileNotFoundError on a fresh checkout.
    os.makedirs(path, exist_ok=True)

    for title, description in zip(df['Title'], df['FullDescription']):
        # os.path.join handles a trailing slash on `path`, so no '//' appears.
        file_path = os.path.join(path, f'{uuid.uuid4()}.txt')

        with open(file_path, 'w') as file:
            file.write(f'Title: {title}\n')
            file.write(f'Full Description: {description}\n')

## Filter Job Descriptions

### Artificial Intelligence / Machine Learning / Data Science

#### Machine Learning

In [5]:
# Keyword to look for, case-insensitively, in the title or the description.
# The pattern has no capture group: str.contains only needs a yes/no answer,
# and a capturing group makes pandas emit a "has match groups" UserWarning.
pattern = r'machine learning'

# na=False turns a missing Title/FullDescription into "no match", so both
# masks are strictly boolean before they are OR-ed together.
title_matches = jobs_df['Title'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)
description_matches = jobs_df['FullDescription'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)

# Keep postings that mention the keyword in either field.
filtered__machine_learning = jobs_df[title_matches | description_matches]


In [6]:
def _is_well_represented(category_rows):
    """Return True when a Category group holds more than 10 matched postings."""
    return len(category_rows) > 10


# Drop every Category that contributes 10 or fewer machine-learning matches.
filtered__machine_learning = filtered__machine_learning.groupby('Category').filter(_is_well_represented)

In [7]:
# Show how many machine-learning matches remain in each Category.
filtered__machine_learning['Category'].value_counts()

Category
IT Jobs    40
Name: count, dtype: int64

In [8]:
# Preview the title and description of the first few machine-learning matches.
filtered__machine_learning[['Title','FullDescription']].head()

Unnamed: 0,Title,FullDescription
11746,Postdoctoral Researchers,An exciting opportunity to carry out world cla...
17262,Analytics Developer,"Software/Analytics Developer (C, C++ or Java) ..."
17377,Tools Developer Data Analytics / Big Data / NLP,Highgrowth startup in the Artificial Intellige...
17507,C++ Quantitative Analyst Oxford Abingdon,C++ Quantitative Analyst Oxford Abingdon My ...
33429,Quantitative Developer,Job Title: Quantitative Developer Salary: ****...


#### Data Science

In [9]:
# Keyword to look for, case-insensitively, in the title or the description.
# The pattern has no capture group: str.contains only needs a yes/no answer,
# and a capturing group makes pandas emit a "has match groups" UserWarning.
pattern = r'data science'

# na=False turns a missing Title/FullDescription into "no match", so both
# masks are strictly boolean before they are OR-ed together.
title_matches = jobs_df['Title'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)
description_matches = jobs_df['FullDescription'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)

# Keep postings that mention the keyword in either field.
filtered__data_science = jobs_df[title_matches | description_matches]

In [10]:
# Show how the data-science matches are spread across Categories.
filtered__data_science['Category'].value_counts()

Category
IT Jobs                             8
PR, Advertising & Marketing Jobs    2
Teaching Jobs                       1
Healthcare & Nursing Jobs           1
Name: count, dtype: int64

In [11]:
# Narrow the data-science matches down to postings filed under 'IT Jobs'.
filtered__data_science = filtered__data_science.loc[
    lambda postings: postings['Category'].eq('IT Jobs')
]

#### Artificial Intelligence

In [12]:
# Keyword to look for, case-insensitively, in the title or the description.
# The pattern has no capture group: str.contains only needs a yes/no answer,
# and a capturing group makes pandas emit a "has match groups" UserWarning.
pattern = r'artificial intelligence'

# na=False turns a missing Title/FullDescription into "no match", so both
# masks are strictly boolean before they are OR-ed together.
title_matches = jobs_df['Title'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)
description_matches = jobs_df['FullDescription'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)

# Keep postings that mention the keyword in either field.
filtered__artificial_intelligence = jobs_df[title_matches | description_matches]


In [13]:
# Show how the artificial-intelligence matches are spread across Categories.
filtered__artificial_intelligence['Category'].value_counts()

Category
IT Jobs                      14
Engineering Jobs              2
Scientific & QA Jobs          2
Accounting & Finance Jobs     1
Teaching Jobs                 1
Other/General Jobs            1
Name: count, dtype: int64

In [14]:
# Narrow the artificial-intelligence matches down to postings filed under 'IT Jobs'.
filtered__artificial_intelligence = filtered__artificial_intelligence.loc[
    lambda postings: postings['Category'].eq('IT Jobs')
]

#### NLP

In [15]:
# Keyword to look for, case-insensitively, in the title or the description.
# The pattern has no capture group: str.contains only needs a yes/no answer,
# and a capturing group makes pandas emit a "has match groups" UserWarning.
pattern = r'natural language processing'

# na=False turns a missing Title/FullDescription into "no match", so both
# masks are strictly boolean before they are OR-ed together.
title_matches = jobs_df['Title'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)
description_matches = jobs_df['FullDescription'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)

# Keep postings that mention the keyword in either field.
filtered__nlp = jobs_df[title_matches | description_matches]

In [16]:
# Show how the natural-language-processing matches are spread across Categories.
filtered__nlp['Category'].value_counts()

Category
IT Jobs                       35
Accounting & Finance Jobs      2
Teaching Jobs                  1
Sales Jobs                     1
Logistics & Warehouse Jobs     1
Graduate Jobs                  1
Name: count, dtype: int64

In [17]:
# Narrow the NLP matches down to postings filed under 'IT Jobs'.
filtered__nlp = filtered__nlp.loc[
    lambda postings: postings['Category'].eq('IT Jobs')
]

In [18]:
# Stack the four keyword-specific selections and drop rows that are exact
# duplicates of an earlier row (e.g. a posting picked up by several keywords).
ai_related_frames = [
    filtered__machine_learning,
    filtered__data_science,
    filtered__artificial_intelligence,
    filtered__nlp,
]
filtered__ai = pd.concat(ai_related_frames).drop_duplicates()

### Software Developer

In [19]:
# Keyword to look for, case-insensitively, in the title or the description.
# The pattern has no capture group: str.contains only needs a yes/no answer,
# and a capturing group makes pandas emit a "has match groups" UserWarning.
pattern = r'software developer'

# na=False turns a missing Title/FullDescription into "no match", so both
# masks are strictly boolean before they are OR-ed together.
title_matches = jobs_df['Title'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)
description_matches = jobs_df['FullDescription'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)

# Keep postings that mention the keyword in either field.
filtered__software_developer = jobs_df[title_matches | description_matches]

In [20]:
# Show how the software-developer matches are spread across Categories.
filtered__software_developer['Category'].value_counts()

Category
IT Jobs                             2362
Engineering Jobs                     113
Other/General Jobs                    52
HR & Recruitment Jobs                 49
Accounting & Finance Jobs             27
Logistics & Warehouse Jobs            17
Graduate Jobs                         13
PR, Advertising & Marketing Jobs      12
Sales Jobs                             8
Scientific & QA Jobs                   8
Creative & Design Jobs                 7
Admin Jobs                             4
Consultancy Jobs                       3
Energy, Oil & Gas Jobs                 3
Manufacturing Jobs                     3
Healthcare & Nursing Jobs              2
Customer Services Jobs                 2
Part time Jobs                         2
Retail Jobs                            1
Teaching Jobs                          1
Legal Jobs                             1
Travel Jobs                            1
Name: count, dtype: int64

In [21]:
# Narrow the software-developer matches down to postings filed under 'IT Jobs'.
filtered__software_developer = filtered__software_developer.loc[
    lambda postings: postings['Category'].eq('IT Jobs')
]

In [22]:
# Re-count per Category to confirm what is left after the category filter.
filtered__software_developer['Category'].value_counts()

Category
IT Jobs    2362
Name: count, dtype: int64

In [23]:
# Remove AI-related postings from the software-developer set so the two
# groups do not overlap.
# Keywords to exclude (the same phrases used to select the AI postings).
keywords = ['machine learning', 'data science', 'artificial intelligence', 'natural language processing']
ai_keyword_pattern = '|'.join(keywords)

# The AI selections match on Title OR FullDescription, so the exclusion checks
# both fields as well. na=False keeps the masks boolean; `~` would fail on a
# mask that contains NaN.
mentions_ai_in_title = filtered__software_developer['Title'].str.contains(ai_keyword_pattern, case=False, na=False)
mentions_ai_in_description = filtered__software_developer['FullDescription'].str.contains(ai_keyword_pattern, case=False, na=False)

# Keep only the postings that mention none of the keywords in either field.
filtered__software_developer = filtered__software_developer[~(mentions_ai_in_title | mentions_ai_in_description)]

### Product Manager

In [24]:
# Keyword to look for, case-insensitively, in the title or the description.
# The pattern has no capture group: str.contains only needs a yes/no answer,
# and a capturing group makes pandas emit a "has match groups" UserWarning.
pattern = r'product manager'

# na=False turns a missing Title/FullDescription into "no match", so both
# masks are strictly boolean before they are OR-ed together.
title_matches = jobs_df['Title'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)
description_matches = jobs_df['FullDescription'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)

# Keep postings that mention the keyword in either field.
filtered__product_manager = jobs_df[title_matches | description_matches]

In [25]:
# Preview the first few product-manager matches, all columns.
filtered__product_manager.head()

Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName
1976,51539851,ACCESSORIES PRODUCT MANAGER,ACCESSORIES PRODUCT MANAGER Location: Basingst...,Basingstoke,Basingstoke,,permanent,,Retail Jobs,25-30K PLUS BONUS,27500,hays.co.uk
2479,55408208,Senior Java Software Engineer,The Role: To work as a member of the Developme...,Berkshire,Berkshire,full_time,permanent,Migration,IT Jobs,"Up to 60,000 per year + 55000.00-60000.00",60000,planetrecruit.com
2542,55408955,Senior Applications Developer,My Client is looking for a Senior Applications...,Warwickshire,Warwickshire,full_time,permanent,JOBG8,IT Jobs,"Up to 45,000 per year + 30000.00-45000.00",45000,planetrecruit.com
2658,55409997,Test Analyst,Test Analyst London ****k plus bonus and excel...,London,London,full_time,permanent,Informatiq Consulting,IT Jobs,"From 30,000 to 35,000 per year",32500,planetrecruit.com
2717,55415212,Product manager,The Product manager/Digital Product Executive ...,London,London,full_time,permanent,JOBG8,Sales Jobs,"Up to 32,000 per year + 30000.00-32000.00",32000,hotrecruit.com


In [26]:
# Keep a match when it is filed under marketing, or when its title is exactly
# "product manager".
is_marketing_category = filtered__product_manager['Category'] == 'PR, Advertising & Marketing Jobs'

# Compare the title case-insensitively and ignore surrounding whitespace, in
# line with the case-insensitive keyword search. A case-sensitive equality
# would skip titles such as "Product manager" or "PRODUCT MANAGER". A missing
# title compares as False.
has_product_manager_title = filtered__product_manager['Title'].str.strip().str.lower() == 'product manager'

filtered__product_manager = filtered__product_manager[is_marketing_category | has_product_manager_title]

### Chartered Accountant

In [27]:
# Keyword to look for, case-insensitively, in the title or the description.
# The pattern has no capture group: str.contains only needs a yes/no answer,
# and a capturing group makes pandas emit a "has match groups" UserWarning.
pattern = r'chartered accountant'

# na=False turns a missing Title/FullDescription into "no match", so both
# masks are strictly boolean before they are OR-ed together.
title_matches = jobs_df['Title'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)
description_matches = jobs_df['FullDescription'].str.contains(pattern, flags=re.IGNORECASE, regex=True, na=False)

# Keep postings that mention the keyword in either field.
filtered__chartered_accountant = jobs_df[title_matches | description_matches]

In [28]:
# Preview the first few chartered-accountant matches, all columns.
filtered__chartered_accountant.head()

Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName
2229,53636956,Commercial Analyst,"My client, a market leading software company, ...",Berkshire,Berkshire,full_time,permanent,CMC Consulting Limited,Accounting & Finance Jobs,"50,000 - 55,000 + Benefits",52500,myjobs.cimaglobal.com
2890,56282015,Senior Financial Accounting Manager,A Senior Financial Accounting Manager is sough...,London,London,full_time,contract,Impart Recruitment,Accounting & Finance Jobs,From 380 to 400 per day,93600,jobsfinancial.com
2891,56282016,Analyst,An Analyst is sought by one of the world's pre...,London,London,full_time,contract,Impart Recruitment,Accounting & Finance Jobs,From 200 to 250 per day,54000,jobsfinancial.com
3368,58067002,Sponsorship Sales Executive,Corporate Sponsorship Sales Executive London U...,London,London,full_time,permanent,JOBG8,Sales Jobs,"From 30,000 to 39,999 per year + ( 30,000 - 39...",34999,hotrecruit.com
5263,62007931,Accountant (Fantastic Opportunity)),My client is a leading firm of Chartered Accou...,Huddersfield,Huddersfield,,,Taskmaster,Accounting & Finance Jobs,"18,000 TO 20,000 pro rota",19000,MyUkJobs


In [29]:
# Show how the chartered-accountant matches are spread across Categories.
filtered__chartered_accountant['Category'].value_counts()

Category
Accounting & Finance Jobs     376
Other/General Jobs             11
Consultancy Jobs                5
Admin Jobs                      5
Sales Jobs                      3
HR & Recruitment Jobs           3
Graduate Jobs                   3
Logistics & Warehouse Jobs      2
Engineering Jobs                2
IT Jobs                         1
Legal Jobs                      1
Property Jobs                   1
Creative & Design Jobs          1
Travel Jobs                     1
Manufacturing Jobs              1
Retail Jobs                     1
Name: count, dtype: int64

In [30]:
# Narrow the chartered-accountant matches down to postings filed under
# 'Accounting & Finance Jobs'.
filtered__chartered_accountant = filtered__chartered_accountant.loc[
    lambda postings: postings['Category'].eq('Accounting & Finance Jobs')
]

### Save the Job Descriptions in Their Respective Categories

In [31]:
# Write the AI-related postings once, from filtered__ai, the de-duplicated
# union of the machine-learning, data-science, artificial-intelligence and NLP
# selections. Saving the four frames one after another would write a posting
# matched by several keywords once per keyword, each under a new random name.
save_job_descriptions_as_txt(filtered__ai, path=machine_learning_job_path)

In [35]:
# Write one text file per software-developer posting into its directory.
save_job_descriptions_as_txt(filtered__software_developer, path=software_developer_job_path)

In [36]:
# Write one text file per product-manager posting into its directory.
save_job_descriptions_as_txt(filtered__product_manager, path=product_manager_job_path)

In [37]:
# Write one text file per chartered-accountant posting into its directory.
save_job_descriptions_as_txt(filtered__chartered_accountant, path=chartered_accountant_job_path)

### Upload the Job Descriptions to the MongoDB Database

In [7]:
import os
from pymongo import MongoClient, ASCENDING
from tqdm import tqdm

In [12]:
def read_txt_file(file_path):
    """Return the text of ``file_path`` as one line.

    Every newline character in the file is replaced by a single space.
    """
    with open(file_path, 'r') as handle:
        lines = handle.read().split('\n')
    return ' '.join(lines)

def connect_mongodb(uri='mongodb://localhost:27017/', db_name='job-resume-db'):
    """Open a MongoDB client and return a handle to one of its databases.

    Args:
        uri: Connection string passed to ``MongoClient``. Defaults to a
            server on localhost, port 27017.
        db_name: Name of the database to select. Defaults to
            ``'job-resume-db'``.

    Returns:
        The database object obtained by indexing the client with ``db_name``.
    """
    client = MongoClient(uri)  # Connect to the MongoDB server
    return client[db_name]  # Choose the database

def delete_all_documents(db):
    """Empty the 'job-descriptions' collection of ``db``.

    An empty filter is passed to ``delete_many``, so every document matches.
    """
    db['job-descriptions'].delete_many({})

def create_document(job_category, job_description, index):
    """Build the dict that is stored for one job description.

    Returns a mapping with the keys 'index', 'category' and 'description',
    in that order, filled from the corresponding arguments.
    """
    return dict(
        index=index,
        category=job_category,
        description=job_description,
    )

def insert_document(db, document):
    """Insert ``document`` into the 'job-descriptions' collection of ``db``."""
    db['job-descriptions'].insert_one(document)

In [13]:
# Connect to the MongoDB database; the handle returned here is reused by the
# delete, insert and index steps below.
jobs_resume_database = connect_mongodb()

In [14]:
# Category label -> directory holding that category's .txt job descriptions.
job_categories_directories = dict([
    ('product manager', 'Job Descriptions/product manager'),
    ('software developer', 'Job Descriptions/software developer'),
    ('machine learning', 'Job Descriptions/machine learning'),
    ('chartered accountant', 'Job Descriptions/Chartered accountant'),
])

In [15]:
# Empty the collection first so re-running this cell does not add a second
# copy of every document.
delete_all_documents(jobs_resume_database)

# Walk each category directory and upload every .txt file as one document.
for job_category, directory_path in tqdm(job_categories_directories.items(), desc="Inserting Job Descriptions in MongoDB: "):
    # os.listdir returns entries in arbitrary order; sorting makes the
    # file -> index assignment the same on every run over the same directory.
    txt_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.txt'))

    # `index` is the file's position within its own category, so it restarts
    # at 0 for every category.
    for index, txt_file in enumerate(txt_files):
        # Read the text file (newlines flattened to spaces by read_txt_file)
        job_description = read_txt_file(os.path.join(directory_path, txt_file))

        # Create the MongoDB document
        document = create_document(job_category, job_description, index)

        # Insert the document into MongoDB
        insert_document(jobs_resume_database, document)

# Create the index on 'job-descriptions', the collection the documents were
# inserted into. Attribute access via `.jobs` would target a different
# collection named 'jobs'.
jobs_resume_database['job-descriptions'].create_index([('index', ASCENDING)])

Inserting Job Descriptions in MongoDB: 100%|██████████| 4/4 [00:01<00:00,  2.38it/s]


'index_1'