## **Text Processing**

In [1]:
import pandas as pd

combined_canonical = pd.read_csv("/content/combined_canonical_v1.csv")

  combined_canonical = pd.read_csv("/content/combined_canonical_v1.csv")


In [2]:
combined_canonical.info()
combined_canonical.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129334 entries, 0 to 129333
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   job_id            129334 non-null  int64  
 1   job_title         129334 non-null  object 
 2   company           129334 non-null  object 
 3   location          129334 non-null  object 
 4   experience_level  99932 non-null   object 
 5   job_description   129334 non-null  object 
 6   salary_final      11772 non-null   float64
 7   industry          5491 non-null    object 
 8   sub_industry      5491 non-null    object 
 9   source            129334 non-null  object 
dtypes: float64(1), int64(1), object(8)
memory usage: 9.9+ MB


Unnamed: 0,0
job_id,0
job_title,0
company,0
location,0
experience_level,29402
job_description,0
salary_final,117562
industry,123843
sub_industry,123843
source,0


### Text Cleaning

Transformed job_description into clean_description by:

- Converting text to lowercase

- Removing HTML tags

- Removing punctuation, numbers, and special characters

- Normalizing whitespace

- Safely handling missing values (NaN â†’ "")

In [3]:
import re

def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

combined_canonical["clean_description"] = combined_canonical["job_description"].apply(clean_text)


In [4]:
combined_canonical["clean_description"].head()
combined_canonical["clean_description"].str.len().describe()


Unnamed: 0,clean_description
count,129334.0
mean,3660.648484
std,2145.967257
min,2.0
25%,2095.0
50%,3326.0
75%,4836.75
max,31522.0


### Stopword Removal

Removed high-frequency, low-information words using NLTK which include:

- the, and, is, for, with, to, of

**Why this is necessary**

These words:

- Appear in almost every document

- Carry no skill, role, or domain information

- Inflate feature space later

Removing them:

- Improves signal-to-noise ratio

- Makes skills like python, sql, aws stand out

- Improves clustering and modeling quality later

In [5]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

def remove_stopwords(text):
    return " ".join(w for w in text.split() if w not in stop_words)

combined_canonical["clean_description"] = combined_canonical["clean_description"].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Verification

This includes:

- Before and after clean description check
- Length Reduction Check
- Stopword Effect Check
- Null Safety Check


In [6]:
combined_canonical[
    ["job_description", "clean_description"]
].sample(5, random_state=42)


Unnamed: 0,job_description,clean_description
35000,Description\n\nCompany: Oak Street Health\n\nT...,description company oak street health title me...
35370,"LOCATION: Watertown / New York (US-NY), United...",location watertown new york us ny united state...
328,The Terra Forza Golf team is growing! Territor...,terra forza golf team growing territory determ...
63701,The ideal candidate is a skilled professional ...,ideal candidate skilled professional passionat...
21765,Position Summary...\n\nAs a Bakery Department ...,position summary bakery department lead drive ...


In [7]:
"the" in combined_canonical["clean_description"].iloc[0].split()


False

In [8]:
combined_canonical["job_description"].str.len().describe()
combined_canonical["clean_description"].str.len().describe()


Unnamed: 0,clean_description
count,129334.0
mean,2980.924792
std,1731.072435
min,2.0
25%,1721.0
50%,2739.0
75%,3935.0
max,25010.0


In [9]:
combined_canonical["clean_description"].isnull().sum()


np.int64(0)

### Inspect Most Common Tokens

In [10]:
from collections import Counter

all_words = " ".join(combined_canonical["clean_description"]).split()
common_words = Counter(all_words).most_common(30)
common_words


[('experience', 398816),
 ('work', 371327),
 ('team', 260841),
 ('skills', 205574),
 ('job', 192258),
 ('including', 189339),
 ('ability', 180643),
 ('business', 179234),
 ('management', 176906),
 ('required', 175554),
 ('time', 172546),
 ('company', 164643),
 ('position', 160449),
 ('customer', 151481),
 ('support', 150492),
 ('benefits', 141826),
 ('service', 135044),
 ('years', 133483),
 ('care', 133077),
 ('information', 132343),
 ('health', 125345),
 ('sales', 124034),
 ('requirements', 121252),
 ('must', 118153),
 ('services', 117519),
 ('may', 117206),
 ('opportunity', 117012),
 ('data', 116101),
 ('development', 115662),
 ('status', 114136)]

### Noise Word Set

In [11]:
noise_words = {
    "experience", "years", "year", "ability", "skills", "knowledge",
    "role", "responsibilities", "responsible", "work", "working",
    "team", "teams", "looking", "candidate", "position", "job",
    "required", "preferred", "strong", "excellent", "good", "including",
    "time", "company", "benefits", "requirements", "must", "may", "opportunity"
}


### Noise Removal

In [12]:
def remove_noise_words(text):
    return " ".join(
        word for word in text.split()
        if word not in noise_words
    )

combined_canonical["clean_description"] = (
    combined_canonical["clean_description"]
    .apply(remove_noise_words)
)


### After Removal Check

In [13]:
from collections import Counter

Counter(
    " ".join(combined_canonical["clean_description"]).split()
).most_common(20)


[('business', 179234),
 ('management', 176906),
 ('customer', 151481),
 ('support', 150492),
 ('service', 135044),
 ('care', 133077),
 ('information', 132343),
 ('health', 125345),
 ('sales', 124034),
 ('services', 117519),
 ('data', 116101),
 ('development', 115662),
 ('status', 114136),
 ('new', 112181),
 ('environment', 110247),
 ('employment', 109014),
 ('provide', 107242),
 ('related', 103396),
 ('customers', 101993),
 ('medical', 98617)]

### Update the Dataset

In [14]:
combined_canonical.to_csv(
    "/content/combined_canonical_v2.csv",
    index=False
)


In [15]:
combined_canonical.columns


Index(['job_id', 'job_title', 'company', 'location', 'experience_level',
       'job_description', 'salary_final', 'industry', 'sub_industry', 'source',
       'clean_description'],
      dtype='object')

### Import Required Tools

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer


## Configure TF-IDF

In [17]:
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8
)


## Fit & transform

In [18]:
X_tfidf = tfidf.fit_transform(
    combined_canonical["clean_description"]
)


## Sanity checks

In [19]:
X_tfidf.shape


(129334, 5000)

### Vocabulary inspection

In [20]:
tfidf.get_feature_names_out()[:20]


array(['aa', 'abilities', 'abilities include', 'able', 'able communicate',
       'able lift', 'able perform', 'able read', 'abreast', 'absence',
       'abuse', 'academic', 'academy', 'accelerate', 'accept',
       'acceptable', 'acceptance', 'accepted', 'accepting', 'access'],
      dtype=object)

### Non-zero density check

In [21]:
X_tfidf.nnz / (X_tfidf.shape[0] * X_tfidf.shape[1])


0.0456680316080845

## Persist artifacts

In [30]:
import joblib

joblib.dump(tfidf, "tfidf_vectorizer_v1.pkl")
joblib.dump(X_tfidf, "X_tfidf_v1.pkl")


['X_tfidf_v1.pkl']

In [31]:
SKILL_VOCAB = {
    "python", "java", "sql", "excel", "power bi", "tableau",
    "machine learning", "deep learning", "nlp",
    "tensorflow", "pytorch",
    "aws", "azure", "gcp",
    "docker", "kubernetes",
    "react", "node", "fastapi", "django",
    "git", "linux"
}


In [32]:
def extract_skills(text, skill_vocab):
    found = set()
    text = text.lower()
    for skill in skill_vocab:
        if skill in text:
            found.add(skill)
    return list(found)

combined_canonical["extracted_skills"] = combined_canonical[
    "clean_description"
].apply(lambda x: extract_skills(x, SKILL_VOCAB))


In [36]:
combined_canonical[["job_title", "extracted_skills"]].tail(50)


Unnamed: 0,job_title,extracted_skills
129284,Experienced Travel Agent,[excel]
129285,Travel Agent,[]
129286,Reservation Specialist / Customer Service Agent,[]
129287,Reservation Specialist Vacation Rentals,[]
129288,Reservation Specialist,[]
129289,Reservation Specialist/Part Time,[excel]
129290,Reservation Specialist,[]
129291,Reservation Specialist,[]
129292,Travel Advisor - Corporate & Leisure,[]
129293,Luxury Travel Sales Agent,[]
