load and inspect data

In [20]:
import pandas as pd
import re

# Read the raw job postings export. Per the original author's note, the
# cleaned dataset is empty, so the raw CSV is used as the starting point.
raw_csv_path = "../data/jobs_raw.csv"
df = pd.read_csv(raw_csv_path)

# Quick sanity check: frame dimensions and the available column names.
print(f"✅ Loaded raw data: {df.shape}")
print("Columns:", list(df.columns))

def clean_text(text):
    """Normalise free text into lowercase, letters-only, single-spaced form.

    The input is lowercased, every character that is neither a letter nor
    whitespace (digits, punctuation, underscores, symbols) is replaced by a
    space, runs of whitespace are collapsed to one space, and the result is
    trimmed.

    Letters are matched Unicode-aware rather than as ASCII ``a-z`` only, so
    accented characters such as "é" or "ö" are kept intact instead of being
    turned into spaces that split the word into fragments.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing (``None``/NaN).
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    # `[^\w\s]` drops punctuation/symbols; `[\d_]` drops the digits and
    # underscores that `\w` would otherwise let through.
    letters_only = re.sub(r'[^\w\s]|[\d_]', ' ', lowered)
    # Collapse the gaps left by the removed characters.
    return re.sub(r'\s+', ' ', letters_only).strip()

# Add a normalised copy of each description; the raw column is left as-is.
df['clean_description'] = df['description'].map(clean_text)

# Report the frame's dimensions now that the extra column exists, then
# preview the first rows as the cell's output.
frame_shape = df.shape
print(f"\n📊 Data ready for training: {frame_shape}")
df.head()

✅ Loaded raw data: (78, 6)
Columns: ['title', 'company', 'location', 'description', 'created', 'category']

📊 Data ready for training: (78, 7)


Unnamed: 0,title,company,location,description,created,category,clean_description
0,Data Scientist,eTeam,"Veldhoven, Noord-Brabant",Role: Data Scientists Salary : 103k per annum ...,2025-07-31T22:44:49Z,Data Engineer,role data scientists salary k per annum holida...
1,Data Architectuur Specialist (Data Modelleur /...,CIMSOLUTIONS,"Veldhoven, Noord-Brabant",Als Data Architectuur Specialist ben je verant...,2025-07-09T07:11:50Z,Data Engineer,als data architectuur specialist ben je verant...
2,Lead Data Lifecycle Management,ASML,"Veldhoven, Noord-Brabant",Introduction to the job Data is a team sport. ...,2025-07-01T13:09:20Z,Data Engineer,introduction to the job data is a team sport j...
3,Lead Data Quality Management,ASML,"Veldhoven, Noord-Brabant",Introduction to the job Data is a team sport. ...,2025-07-01T13:09:20Z,Data Engineer,introduction to the job data is a team sport j...
4,Recruitment Consultant,Orange Quarter,"Veldhoven, Noord-Brabant",Orange Quarter is looking for a Recruitment Co...,2025-08-01T03:01:23Z,Data Engineer,orange quarter is looking for a recruitment co...


label the data

In [21]:
def label_category(text):
    """Assign a coarse job category based on keyword substrings.

    The text is lowercased and checked against an ordered list of keyword
    groups; the first group with any keyword occurring as a substring
    decides the label. Order matters: a text mentioning both "engineer" and
    "devops" is labelled "software" because that group is checked earlier.

    Args:
        text: String to classify (must support ``.lower()``).

    Returns:
        One of "data", "software", "devops", "security", "qa", or "other"
        when no keyword is found.
    """
    lowered = text.lower()
    # (label, keywords) pairs, listed in matching priority.
    keyword_rules = (
        ("data", ("data", "machine learning")),
        ("software", ("software", "developer", "engineer")),
        ("devops", ("devops", "cloud")),
        ("security", ("security",)),
        ("qa", ("testing", "qa")),
    )
    for label, keywords in keyword_rules:
        if any(keyword in lowered for keyword in keywords):
            return label
    return "other"

# Derive a keyword-based label for every cleaned description. Note that this
# overwrites the 'category' column that was loaded with the raw data.
keyword_labels = df["clean_description"].map(label_category)
df["category"] = keyword_labels

# Class balance of the derived labels (shown as the cell's output).
df["category"].value_counts()



category
software    39
data        24
other       15
Name: count, dtype: int64

train and test split

In [22]:
from sklearn.model_selection import train_test_split

# Features are the cleaned descriptions; targets are the keyword-derived labels.
X = df["clean_description"]
y = df["category"]

# Sanity-check the inputs before splitting. clean_text() maps missing
# descriptions to "" rather than NaN, so isna() alone can never flag them;
# blank strings are therefore counted as missing as well.
missing_descriptions = X.fillna("").astype(str).str.strip().eq("").sum()
print(f"Data shape: {df.shape}")
print(f"Missing clean_description: {missing_descriptions}")
print(f"Missing category: {y.isna().sum()}")
print(X.head())
print(y.head())

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label so both parts keep the same class proportions, and seeded so it is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

Data shape: (78, 7)
Missing clean_description: 0
Missing category: 0
0    role data scientists salary k per annum holida...
1    als data architectuur specialist ben je verant...
2    introduction to the job data is a team sport j...
3    introduction to the job data is a team sport j...
4    orange quarter is looking for a recruitment co...
Name: clean_description, dtype: object
0    data
1    data
2    data
3    data
4    data
Name: category, dtype: object


build and train the pipeline

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Two-stage text classifier: TF-IDF features (vocabulary capped at 1000
# terms) feeding a multinomial Naive Bayes model.
tfidf_step = ('tfidf', TfidfVectorizer(max_features=1000))
classifier_step = ('clf', MultinomialNB())
model = Pipeline([tfidf_step, classifier_step])

# Fit on the training split; the fitted pipeline is the cell's output.
model.fit(X_train, y_train)



0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


evaluate

In [24]:
from sklearn.metrics import classification_report

# Predict labels for the held-out split and summarise per-class metrics.
y_pred = model.predict(X_test)

# A class that is never predicted has an undefined precision. Passing
# zero_division=0 reports it as 0.0 explicitly instead of emitting an
# undefined-metric warning for each averaged metric.
print(classification_report(y_test, y_pred, zero_division=0))


              precision    recall  f1-score   support

        data       1.00      0.80      0.89         5
       other       0.00      0.00      0.00         3
    software       0.67      1.00      0.80         8

    accuracy                           0.75        16
   macro avg       0.56      0.60      0.56        16
weighted avg       0.65      0.75      0.68        16



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


save model

In [25]:
import joblib
# Persist the fitted pipeline (vectorizer + classifier) to disk with joblib.
model_path = "../data/skill_classifier.pkl"
joblib.dump(model, model_path)
print("✅ Model saved to ../data/skill_classifier.pkl")


✅ Model saved to ../data/skill_classifier.pkl
