In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import re

In [37]:
df = pd.read_csv('cleaned_jobs.csv')

In [38]:
# 1. Data Cleaning
# ======================
print(f"Number of jobs: {len(df)}")
print(f"Number of unique departments: {df['department'].nunique()}")

# Display most common departments
department_counts = df['department'].value_counts()
print(f"\nTop 10 most common departments:")
print(department_counts.head(10))

# We'll only keep departments with enough samples (at least 5)
min_samples = 5
common_departments = department_counts[department_counts >= min_samples].index.tolist()
df_filtered = df[df['department'].isin(common_departments)].copy()

print(f"\nAfter filtering: {len(df_filtered)} jobs")
print(f"Remaining departments: {df_filtered['department'].nunique()}")


Number of jobs: 8235
Number of unique departments: 904

Top 10 most common departments:
department
Sales/Retail                           1113
Engineering - Telecom/Technology        899
Engineering - Mechanical/Electrical     559
Operations/Management                   444
Media/Journalism/Publishing             327
Marketing/PR/Advertising                311
Customer Service/Support                226
IT/Software Development                 217
Project/Program Management              209
Human Resources                         167
Name: count, dtype: int64

After filtering: 7160 jobs
Remaining departments: 110


In [39]:
# 2. Feature Processing
# ======================
# A) Text features
def preprocess_text(text):
    if pd.isna(text):
        return ""
    # Convert to lowercase
    text = str(text).lower()
    # Remove special punctuation
    text = re.sub(r'[^\w\s]', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Process text columns
df_filtered['Title_cleaned'] = df_filtered['Title'].apply(preprocess_text)
df_filtered['categories_cleaned'] = df_filtered['categories'].apply(preprocess_text)
df_filtered['company_cleaned'] = df_filtered['company'].apply(lambda x: preprocess_text(x) if pd.notna(x) else "")

# B) Numerical features
# Process Experience_year
def clean_experience_year(value):
    if pd.isna(value) or str(value).lower() == 'unknown':
        return np.nan
    try:
        return float(value)
    except:
        return np.nan

df_filtered['Experience_year_cleaned'] = df_filtered['Experience_year'].apply(clean_experience_year)

# C) Categorical features
# Encode Experience_level
experience_level_mapping = {
    'Entry Level': 0,
    'Student': 0,
    'Internship': 0,
    'Experienced': 1,
    'Manager': 2,
    'Senior Management': 3,
    'Not specified': 1
}

df_filtered['Experience_level_encoded'] = df_filtered['Experience_level'].map(
    lambda x: experience_level_mapping.get(x, 1)
)

# Encode job_type
job_type_mapping = {
    'Full Time': 0,
    'Part Time': 1,
    'Internship': 2,
    'Shift Based': 3,
    'Freelance / Project': 4
}

df_filtered['job_type_encoded'] = df_filtered['job_type'].map(
    lambda x: job_type_mapping.get(x, 0)
)

In [40]:
# 3. Encode Target (Department)
# ======================
label_encoder = LabelEncoder()
df_filtered['department_encoded'] = label_encoder.fit_transform(df_filtered['department'])

print(f"\nEncoded {len(label_encoder.classes_)} departments")



Encoded 110 departments


In [41]:
# 4. Split the Data
# ======================
X = df_filtered[[
    'Title_cleaned', 'categories_cleaned', 'company_cleaned',
    'Experience_year_cleaned', 'Experience_level_encoded',
    'job_type_encoded', 'Number_of_skills'
]]

y = df_filtered['department_encoded']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nData split:")
print(f"Training: {len(X_train)} samples")
print(f"Testing: {len(X_test)} samples")



Data split:
Training: 5728 samples
Testing: 1432 samples


In [42]:
# 5. Build the Pipeline
# ======================
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Setup TF-IDF for text features
title_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
categories_vectorizer = TfidfVectorizer(max_features=50, stop_words='english')
company_vectorizer = TfidfVectorizer(max_features=30, stop_words='english')

# Prepare text features
X_train_title = title_vectorizer.fit_transform(X_train['Title_cleaned'])
X_train_categories = categories_vectorizer.fit_transform(X_train['categories_cleaned'])
X_train_company = company_vectorizer.fit_transform(X_train['company_cleaned'])

# Prepare numerical features
numeric_features = X_train[['Experience_year_cleaned', 'Experience_level_encoded', 
                           'job_type_encoded', 'Number_of_skills']].fillna(0).values

# Combine all features
X_train_combined = np.hstack([
    X_train_title.toarray(),
    X_train_categories.toarray(),
    X_train_company.toarray(),
    numeric_features
])


In [43]:
# 6. Train the Model
# ======================
# Use Random Forest for multi-class classification
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    class_weight='balanced'
)

model.fit(X_train_combined, y_train)
print("\nModel trained successfully!")



Model trained successfully!


In [44]:
# 7. Prepare Test Data
# ======================
X_test_title = title_vectorizer.transform(X_test['Title_cleaned'])
X_test_categories = categories_vectorizer.transform(X_test['categories_cleaned'])
X_test_company = company_vectorizer.transform(X_test['company_cleaned'])

numeric_features_test = X_test[['Experience_year_cleaned', 'Experience_level_encoded', 
                               'job_type_encoded', 'Number_of_skills']].fillna(0).values

X_test_combined = np.hstack([
    X_test_title.toarray(),
    X_test_categories.toarray(),
    X_test_company.toarray(),
    numeric_features_test
])

In [45]:
# 8. Evaluation
# ======================
y_pred = model.predict(X_test_combined)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2%}")

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))




Model Accuracy: 19.34%

Classification Report:
                                               precision    recall  f1-score   support

                           Account Management       0.25      0.50      0.33         2
                                   Accounting       0.06      0.20      0.09        10
                           Accounting/Finance       0.64      0.35      0.45        20
                               Administration       0.50      0.50      0.50        30
                              Adobe Photoshop       0.04      1.00      0.07         1
                             Analyst/Research       0.33      0.10      0.15        30
                                      Angular       0.00      0.00      0.00         1
                         Architectural Design       0.10      0.50      0.17         2
                    Architectural Engineering       0.00      0.00      0.00         1
                                 Architecture       0.00      0.00      0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [46]:
# 9. Prediction Function
# ======================
def predict_department(job_title, categories, company, experience_year, 
                      experience_level, job_type, num_skills):
    """
    Function to predict job department based on inputs
    
    Parameters:
    -----------
    job_title : str
        Job title
    categories : str
        Job category/classification
    company : str
        Company name
    experience_year : float
        Years of experience (can be np.nan)
    experience_level : str
        Experience level ('Entry Level', 'Experienced', etc.)
    job_type : str
        Job type ('Full Time', 'Part Time', etc.)
    num_skills : int
        Number of required skills
    
    Returns:
    --------
    dict : Prediction result with probabilities
    """
    # Clean text inputs
    title_cleaned = preprocess_text(job_title)
    categories_cleaned = preprocess_text(categories)
    company_cleaned = preprocess_text(company)
    
    # Encode categorical features
    exp_level_encoded = experience_level_mapping.get(experience_level, 1)
    job_type_encoded = job_type_mapping.get(job_type, 0)
    
    # Prepare text features
    title_vec = title_vectorizer.transform([title_cleaned])
    categories_vec = categories_vectorizer.transform([categories_cleaned])
    company_vec = company_vectorizer.transform([company_cleaned])
    
    # Process experience year
    exp_year_cleaned = clean_experience_year(experience_year)
    if pd.isna(exp_year_cleaned):
        exp_year_cleaned = 0
    
    # Numerical features
    numeric_features = np.array([[exp_year_cleaned, exp_level_encoded, 
                                  job_type_encoded, num_skills]])
    
    # Combine all features
    features_combined = np.hstack([
        title_vec.toarray(),
        categories_vec.toarray(),
        company_vec.toarray(),
        numeric_features
    ])
    
    # Make prediction
    prediction = model.predict(features_combined)[0]
    probabilities = model.predict_proba(features_combined)[0]
    
    # Get top 5 predictions
    top_n = 5
    top_indices = np.argsort(probabilities)[-top_n:][::-1]
    
    result = {
        'department': label_encoder.inverse_transform([prediction])[0],
        'top_predictions': []
    }
    
    for idx in top_indices:
        result['top_predictions'].append({
            'department': label_encoder.inverse_transform([idx])[0],
            'probability': float(probabilities[idx])
        })
    
    return result


In [47]:
# 10. Test Examples
# ======================
print("\n" + "="*50)
print("Prediction Examples:")
print("="*50)

# Example 1: Data Analyst job
example1 = predict_department(
    job_title="Senior Data Analyst",
    categories="IT/Software Development",
    company="Tech Solutions Inc.",
    experience_year=5.0,
    experience_level="Experienced",
    job_type="Full Time",
    num_skills=8
)

print(f"\nExample 1: Senior Data Analyst")
print(f"Predicted Department: {example1['department']}")
print("Top 5 predictions:")
for pred in example1['top_predictions']:
    print(f"  - {pred['department']}: {pred['probability']:.2%}")

# Example 2: Marketing job
example2 = predict_department(
    job_title="Marketing Specialist",
    categories="Marketing/PR/Advertising",
    company="Advertising Agency",
    experience_year=2.0,
    experience_level="Entry Level",
    job_type="Full Time",
    num_skills=6
)

print(f"\nExample 2: Marketing Specialist")
print(f"Predicted Department: {example2['department']}")
print("Top 5 predictions:")
for pred in example2['top_predictions']:
    print(f"  - {pred['department']}: {pred['probability']:.2%}")

# Example 3: Finance job
example3 = predict_department(
    job_title="Financial Analyst",
    categories="Accounting/Finance",
    company="Bank of Egypt",
    experience_year=3.0,
    experience_level="Experienced",
    job_type="Full Time",
    num_skills=7
)

print(f"\nExample 3: Financial Analyst")
print(f"Predicted Department: {example3['department']}")
print("Top 5 predictions:")
for pred in example3['top_predictions']:
    print(f"  - {pred['department']}: {pred['probability']:.2%}")



Prediction Examples:

Example 1: Senior Data Analyst
Predicted Department: Data Analysis
Top 5 predictions:
  - Data Analysis: 14.45%
  - Analyst/Research: 12.47%
  - React.js: 7.30%
  - Engineering - Telecom/Technology: 7.13%
  - Computer Science: 5.21%

Example 2: Marketing Specialist
Predicted Department: Marketing
Top 5 predictions:
  - Marketing: 14.17%
  - Content Creation: 12.37%
  - Media/Journalism/Publishing: 10.76%
  - Digital Marketing: 10.58%
  - Marketing Strategy: 8.61%

Example 3: Financial Analyst
Predicted Department: Finance
Top 5 predictions:
  - Finance: 12.28%
  - Banking: 7.34%
  - Financial Analysis: 7.26%
  - Analyst/Research: 6.75%
  - Financial Reporting: 4.67%


In [48]:
# 11. Feature Importance Analysis
# ======================
print("\n" + "="*50)
print("Top 10 Most Important Features:")
print("="*50)

# Get feature names
feature_names = []
feature_names.extend([f"Title_{i}" for i in range(100)])
feature_names.extend([f"Category_{i}" for i in range(50)])
feature_names.extend([f"Company_{i}" for i in range(30)])
feature_names.extend(['Experience_Year', 'Experience_Level', 'Job_Type', 'Num_Skills'])

feature_importances = model.feature_importances_
top_indices = np.argsort(feature_importances)[-10:][::-1]

for idx in top_indices:
    if idx < len(feature_names):
        print(f"{feature_names[idx]}: {feature_importances[idx]:.4f}")



Top 10 Most Important Features:
Num_Skills: 0.0627
Experience_Year: 0.0464
Experience_Level: 0.0271
Title_83: 0.0211
Title_34: 0.0192
Title_89: 0.0182
Title_59: 0.0182
Title_56: 0.0180
Category_12: 0.0168
Category_39: 0.0161


In [49]:
# 12. Save the Model for Future Use
# ======================
import joblib
import pickle

# Save model and components
model_data = {
    'model': model,
    'label_encoder': label_encoder,
    'title_vectorizer': title_vectorizer,
    'categories_vectorizer': categories_vectorizer,
    'company_vectorizer': company_vectorizer,
    'experience_level_mapping': experience_level_mapping,
    'job_type_mapping': job_type_mapping,
    'preprocess_text': preprocess_text,
    'clean_experience_year': clean_experience_year
}

# Save using pickle
with open('department_predictor_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print("\nModel saved to 'department_predictor_model.pkl'")

# Function to load and use the model
def load_and_predict():
    """Function to load the model and use it for predictions"""
    with open('department_predictor_model.pkl', 'rb') as f:
        model_data = pickle.load(f)
    
    # Return all model components
    return model_data

print("\nYou can now use the model to predict departments for new jobs!")


Model saved to 'department_predictor_model.pkl'

You can now use the model to predict departments for new jobs!


In [50]:
# To use the model after saving it:
loaded_model = load_and_predict()

# To predict for a new job:
result = predict_department(
    job_title="Data Scientist",
    categories="IT/Software Development",
    company="AI Startup",
    experience_year=3.0,
    experience_level="Experienced",
    job_type="Full Time",
    num_skills=10
)

print(f"Predicted Department: {result['department']}")

Predicted Department: Analyst/Research
