In [2]:
# Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For text processing
import textacy
import re
import string

# For machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# To ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Load dataset (CSV with "Text" and "Job Role" columns)
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/Real_Time_Project/job_descriptions.csv")
df.head()

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1615940 entries, 0 to 1615939
Data columns (total 23 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Job Id            1615940 non-null  int64  
 1   Experience        1615940 non-null  object 
 2   Qualifications    1615940 non-null  object 
 3   Salary Range      1615940 non-null  object 
 4   location          1615940 non-null  object 
 5   Country           1615940 non-null  object 
 6   latitude          1615940 non-null  float64
 7   longitude         1615940 non-null  float64
 8   Work Type         1615940 non-null  object 
 9   Company Size      1615940 non-null  int64  
 10  Job Posting Date  1615940 non-null  object 
 11  Preference        1615940 non-null  object 
 12  Contact Person    1615940 non-null  object 
 13  Contact           1615940 non-null  object 
 14  Job Title         1615940 non-null  object 
 15  Role              1615940 non-null  object 
 16  

In [6]:
df.shape

(1615940, 23)

In [7]:
# Check for missing values in each column
missing_values = df.isnull().sum()
print("Missing Values in Each Column:\n", missing_values)

# Percentage of missing data
missing_percentage = (df.isnull().sum() / len(df)) * 100
print("\nPercentage of Missing Data:\n", missing_percentage)


Missing Values in Each Column:
 Job Id                 0
Experience             0
Qualifications         0
Salary Range           0
location               0
Country                0
latitude               0
longitude              0
Work Type              0
Company Size           0
Job Posting Date       0
Preference             0
Contact Person         0
Contact                0
Job Title              0
Role                   0
Job Portal             0
Job Description        0
Benefits               0
skills                 0
Responsibilities       0
Company                0
Company Profile     5478
dtype: int64

Percentage of Missing Data:
 Job Id              0.000000
Experience          0.000000
Qualifications      0.000000
Salary Range        0.000000
location            0.000000
Country             0.000000
latitude            0.000000
longitude           0.000000
Work Type           0.000000
Company Size        0.000000
Job Posting Date    0.000000
Preference          0.000000
Co

### Handling Missing Values in Company Profile Since this column isn't critical for job recommendations, we have two options:

  * 1.Drop the missing values in Company Profile.

  *2.Fill the missing values with a placeholder like "Not Provided".



In [8]:
# Fill missing values in 'Company Profile' with 'Not Provided'
df['Company Profile'].fillna('Not Provided', inplace=True)

# Confirm that there are no more missing values
print("Missing values in 'Company Profile':", df['Company Profile'].isnull().sum())

Missing values in 'Company Profile': 0


In [9]:
### Display all column names to check for typos or extra spaces

print(df.columns.tolist())

['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location', 'Country', 'latitude', 'longitude', 'Work Type', 'Company Size', 'Job Posting Date', 'Preference', 'Contact Person', 'Contact', 'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits', 'skills', 'Responsibilities', 'Company', 'Company Profile']


In [10]:
# Standardize column names: lowercase and remove extra spaces
df.columns = df.columns.str.strip().str.lower()

# Check the updated column names
print(df.columns.tolist())


['job id', 'experience', 'qualifications', 'salary range', 'location', 'country', 'latitude', 'longitude', 'work type', 'company size', 'job posting date', 'preference', 'contact person', 'contact', 'job title', 'role', 'job portal', 'job description', 'benefits', 'skills', 'responsibilities', 'company', 'company profile']


### Drop unnecessary columns

In [11]:
# Drop irrelevant columns
columns_to_drop = ['contact person', 'contact', 'job portal',
                   'latitude', 'longitude', 'job posting date', 'job id']
df.drop(columns=columns_to_drop, axis=1, inplace=True)

# Confirm the columns have been removed
print("Remaining Columns:\n", df.columns.tolist())


Remaining Columns:
 ['experience', 'qualifications', 'salary range', 'location', 'country', 'work type', 'company size', 'preference', 'job title', 'role', 'job description', 'benefits', 'skills', 'responsibilities', 'company', 'company profile']


## Encode Categorical Columns

## Why Use Label Encoding Instead of One-Hot Encoding?
The choice between Label Encoding and One-Hot Encoding depends on the nature of the data and how the model will use it. Let me explain why Label Encoding is more suitable here.

## Label Encoding (What We're Using)
Converts categories into integer labels (e.g., Full-time → 0, Part-time → 1).
Efficient for columns with many unique categories.
Keeps the dataset compact and avoids adding too many extra columns.

## One-Hot Encoding (Why We Avoid It Here)
Converts each category into a binary column (e.g., Full-time → [1, 0], Part-time → [0, 1]).
Can lead to high-dimensional data when categories have many unique values.
Slows down model training and increases memory usage.

In [12]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns to encode
categorical_columns = ['work type', 'location', 'country',
                       'preference', 'qualifications',
                       'role', 'company', 'job title']

# Initialize and apply Label Encoding
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # Save encoder for later use

# Confirm encoding
df.head()


Unnamed: 0,experience,qualifications,salary range,location,country,work type,company size,preference,job title,role,job description,benefits,skills,responsibilities,company,company profile
0,5 to 15 Years,6,$59K-$99K,59,92,2,26801,1,31,312,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",427,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,2 to 12 Years,4,$56K-$116K,11,198,2,100340,1,145,147,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",619,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,0 to 12 Years,9,$61K-$104K,102,114,4,84525,2,90,273,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,819,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,4 to 11 Years,9,$65K-$91K,152,20,1,129896,1,83,375,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",404,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,1 to 12 Years,7,$64K-$87K,172,39,2,53944,1,38,60,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,174,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."


##  Split experience into Min and Max

In [13]:
# Extract minimum and maximum experience as numerical values
df[['min_experience', 'max_experience']] = df['experience'].str.extract(r'(\d+)\s*to\s*(\d+)').astype(float)

# Drop the original 'experience' column
df.drop(columns=['experience'], inplace=True)

# Confirm the new columns
df[['min_experience', 'max_experience']].head()


Unnamed: 0,min_experience,max_experience
0,5.0,15.0
1,2.0,12.0
2,0.0,12.0
3,4.0,11.0
4,1.0,12.0


## Split salary range into Min and Max

In [14]:
# Clean and extract numeric values from 'salary range'
df[['min_salary', 'max_salary']] = df['salary range'].str.replace('K', '').str.replace('$', '').str.extract(r'(\d+)\s*-\s*(\d+)').astype(float)

# Drop the original 'salary range' column
df.drop(columns=['salary range'], inplace=True)

# Confirm the new columns
df[['min_salary', 'max_salary']].head()


Unnamed: 0,min_salary,max_salary
0,59.0,99.0
1,56.0,116.0
2,61.0,104.0
3,65.0,91.0
4,64.0,87.0


In [15]:
df.head()

Unnamed: 0,qualifications,location,country,work type,company size,preference,job title,role,job description,benefits,skills,responsibilities,company,company profile,min_experience,max_experience,min_salary,max_salary
0,6,59,92,2,26801,1,31,312,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",427,"{""Sector"":""Diversified"",""Industry"":""Diversifie...",5.0,15.0,59.0,99.0
1,4,11,198,2,100340,1,145,147,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",619,"{""Sector"":""Financial Services"",""Industry"":""Com...",2.0,12.0,56.0,116.0
2,9,102,114,4,84525,2,90,273,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,819,"{""Sector"":""Insurance"",""Industry"":""Insurance: P...",0.0,12.0,61.0,104.0
3,9,152,20,1,129896,1,83,375,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",404,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O...",4.0,11.0,65.0,91.0
4,7,172,39,2,53944,1,38,60,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,174,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ...",1.0,12.0,64.0,87.0


## Clean and Prepare Text Data

Text data in columns like job description, skills, responsibilities, and benefits often contains noise (punctuation, extra spaces, etc.).

Cleaning it ensures that the model focuses on meaningful information.

In [16]:
# Function to clean text data
def clean_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'\n', ' ', text)  # Remove newlines
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in brackets
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words with numbers
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text.strip()

# Apply cleaning to text columns
text_columns = ['job description', 'skills', 'responsibilities', 'benefits', 'company profile']
for col in text_columns:
    df[col] = df[col].apply(clean_text)

# Check the cleaned data
df[text_columns].head()


Unnamed: 0,job description,skills,responsibilities,benefits,company profile
0,social media managers oversee an organizations...,social media platforms eg facebook twitter ins...,manage and grow social media accounts create e...,flexible spending accounts fsas relocation ass...,sectordiversifiedindustrydiversified financial...
1,frontend web developers design and implement u...,html css javascript frontend frameworks eg rea...,design and code user interfaces for websites e...,health insurance retirement plans paid time of...,sectorfinancial servicesindustrycommercial s d...
2,quality control managers establish and enforce...,quality control processes and methodologies st...,establish and enforce quality control standard...,legal assistance bonuses and incentive program...,sectorinsuranceindustryinsurance property and ...
3,wireless network engineers design implement an...,wireless network design and architecture wifi ...,design configure and optimize wireless network...,transportation benefits professional developme...,sectorenergyindustrymining crudeoil production...
4,a conference manager coordinates and manages c...,event planning conference logistics budget man...,specialize in conference and convention planni...,flexible spending accounts fsas relocation ass...,sectorenergyindustryenergy oil gas exploration...


## Combine Text Columns
## Why?

Combining all relevant text fields creates a richer context for text-based recommendations.

## Why This Is Important:

job title is prioritized because it’s the most important feature.
Combining text fields helps capture complete job details for recommendations.

In [17]:
# Combine all relevant text columns into one feature
df['combined_text'] = (
    df['job title'].astype(str) + " " +  # Prioritize job title
    df['job description'] + " " +
    df['skills'] + " " +
    df['responsibilities'] + " " +
    df['benefits'] + " " +
    df['company profile']
)

# Confirm the combined text
df['combined_text'].head()


Unnamed: 0,combined_text
0,31 social media managers oversee an organizati...
1,145 frontend web developers design and impleme...
2,90 quality control managers establish and enfo...
3,83 wireless network engineers design implement...
4,38 a conference manager coordinates and manage...


## Vectorize Text Data Using TF-IDF

## Why?
TF-IDF (Term Frequency–Inverse Document Frequency) helps the model understand the importance of words in job descriptions. It highlights important words and reduces the influence of common ones.

## Why This Is Important:

Converts text to numbers that the model can use.
Focuses on the most relevant words across job postings.
Limits features to 1500 to balance model performance and complexity.

In [18]:
# Reduce TF-IDF features to 1500 for better memory management
tfidf = TfidfVectorizer(
    max_features=1500,
    stop_words='english',
    max_df=0.85,
    min_df=5,
    ngram_range=(1, 2)
)

# Apply TF-IDF without converting to array
text_features = tfidf.fit_transform(df['combined_text'])
print("TF-IDF shape:", text_features.shape)


TF-IDF shape: (1615940, 1500)


In [19]:
import zipfile

# Step 1: Save the processed DataFrame as a CSV file
df.to_csv('/content/drive/MyDrive/Real_Time_Project/processed_job_data.csv', index=False)

# Step 2: Create a ZIP file and add the CSV to it
#with zipfile.ZipFile('/content/drive/MyDrive/Real_Time_Project/processed_job_data.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
#    zipf.write('/content/drive/MyDrive/Real_Time_Project/processed_job_data.csv', arcname='processed_job_data.csv')

print("✅ Processed data saved as CSV successfully!")


✅ Processed data saved as ZIP successfully!


In [20]:
import joblib

# Save the TF-IDF vectorizer
joblib.dump(tfidf, '/content/drive/MyDrive/Real_Time_Project/tfidf_vectorizer.pkl')

print("✅ TF-IDF vectorizer saved successfully!")


✅ TF-IDF vectorizer saved successfully!


In [21]:
# Show the first 20 words in the TF-IDF vocabulary
print("TF-IDF Vocabulary Sample:\n", list(tfidf.vocabulary_.keys())[:20])


TF-IDF Vocabulary Sample:
 ['31', 'social', 'media', 'managers', 'oversee', 'organizations', 'presence', 'create', 'schedule', 'content', 'engage', 'analyze', 'metrics', 'drive', 'brand', 'awareness', 'engagement', 'platforms', 'facebook', 'creation']
