## Introduction

This notebook processes a CSV file containing job titles, generates numerical embeddings using OpenAI's API, and utilizes a pre-trained XGBoost classifier to predict labels. The results are saved and displayed in a formatted table for easy analysis and interpretation.


* Load a CSV file whose titles are under the JOB_TITLE column.

In [5]:
import glob
import json
import os
import pickle
import time

import joblib
import numpy as np
import openai
import pandas as pd
import xgboost as xgb
from rich import print
from rich.box import HEAVY_EDGE
from rich.console import Console
from rich.markup import escape
from rich.table import Table
from tqdm import tqdm



# --- Input location ------------------------------------------------------
# File name of the CSV to process, and the directory that contains it.
input_csv_file = 'data_job_title_industry_embeddings.csv'
input_csv_path = './'

# File name without its extension; later cells derive output file names from it.
base_name, _csv_extension = os.path.splitext(input_csv_file)

# Location the CSV is read from.
full_input_csv_path = os.path.join(input_csv_path, input_csv_file)



## Embedding Generation

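In this section, we load job titles from the specified CSV file, drop duplicate titles, and generate an embedding for each remaining title in batches using OpenAI's `text-embedding-ada-002` model. Titles that cannot be embedded (empty text, or a batch whose API request keeps failing) receive an all-zero vector. The embeddings are appended to the dataframe and saved as a new CSV for further processing.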



In [8]:

# Read the API key from the environment instead of hardcoding it in the
# notebook, so the secret never ends up in a saved or shared copy.
# Falls back to an empty key (the previous value) when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

batch_size = 100  # number of job titles sent per embedding request
EMBEDDING_DIM = 1536  # expected length of every embedding vector
ZERO_EMBEDDING = [0.0] * EMBEDDING_DIM  # placeholder for titles without a usable embedding

def create_embeddings_batch(texts, max_retries=3):
    """Embed a batch of texts, returning one vector per input position.

    Entries that are not strings, or that are empty / whitespace-only, are not
    sent to the API and receive ``ZERO_EMBEDDING``. The remaining texts are
    sent in a single ``text-embedding-ada-002`` request, attempted up to
    ``max_retries`` times. If every attempt fails, or the response does not
    contain exactly one embedding per text, the whole batch falls back to
    ``ZERO_EMBEDDING`` (best effort: no exception is raised).

    Args:
        texts: Sequence of candidate texts.
        max_retries: Maximum number of API attempts for the batch.

    Returns:
        A list with ``len(texts)`` entries aligned with ``texts``; each entry
        is either an embedding of length ``EMBEDDING_DIM`` or ``ZERO_EMBEDDING``.
    """
    full_embeddings = [ZERO_EMBEDDING] * len(texts)

    non_empty = [
        (idx, text) for idx, text in enumerate(texts)
        if isinstance(text, str) and text.strip()
    ]
    if not non_empty:
        # Nothing to embed: skip the API call and the retry loop entirely.
        return full_embeddings

    non_empty_texts = [text for _, text in non_empty]

    batch_embeddings = None
    for attempt in range(max_retries):
        try:
            response = openai.Embedding.create(
                input=non_empty_texts,
                model="text-embedding-ada-002"
            )
            # Each returned item carries the index of the input it belongs to;
            # sort on it so the result does not depend on the response order.
            items = sorted(response['data'], key=lambda item: item['index'])
            batch_embeddings = [item['embedding'] for item in items]
            break
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt + 1 < max_retries:
                time.sleep(2)  # brief pause before the next attempt

    if batch_embeddings is None or len(batch_embeddings) != len(non_empty_texts):
        # Make the fallback visible: these rows carry no real embedding.
        print(f"No usable embeddings for {len(non_empty_texts)} text(s); using zero vectors.")
        return full_embeddings

    for (idx, _), emb in zip(non_empty, batch_embeddings):
        # Vectors of an unexpected length are discarded (the slot stays zero).
        if len(emb) == EMBEDDING_DIM:
            full_embeddings[idx] = emb
    return full_embeddings

# Load the job titles and keep one row per distinct title.
df = pd.read_csv(full_input_csv_path)
if 'JOB_TITLE' not in df.columns:
    raise KeyError(f"Column 'JOB_TITLE' not found in '{full_input_csv_path}'")
df = df.drop_duplicates(subset=['JOB_TITLE'])

# Embed the titles batch by batch; `embeddings` ends up row-aligned with `df`.
embeddings = []
for i in tqdm(range(0, len(df), batch_size), desc="Generating Embeddings"):
    batch_titles = df['JOB_TITLE'].iloc[i:i + batch_size]
    # Missing titles (NaN) become "" so they are handled as blank text; a plain
    # astype(str) would turn them into the literal string "nan" and embed that.
    batch_texts = ["" if pd.isna(title) else str(title) for title in batch_titles]
    batch_embeddings = create_embeddings_batch(batch_texts)
    embeddings.extend(batch_embeddings)
    time.sleep(1)  # pause between requests

df['embeddings'] = embeddings

# Save the titles together with their embeddings next to the input file.
output_filename = f"{base_name}_with_embeddings.csv"
output_path = os.path.join(input_csv_path, output_filename)
df.to_csv(output_path, index=False)

print(f"File saved at '{output_path}'")



Generating Embeddings: 100%|██████████| 6/6 [00:11<00:00,  1.90s/it]


## Model Loading and Prediction

Here, we load the pre-trained XGBoost model and the associated label encoder from the `models` directory. Using these, we predict a label for each embedding and compute the prediction probabilities. The predicted labels are appended to the dataframe and saved to a new CSV; the labels and their probabilities are then displayed in a color-coded table for clarity.


In [9]:

models_dir = "models"
data_dir = input_csv_path

model_path = os.path.join(models_dir, "best_clf.pkl")
label_encoder_path = os.path.join(models_dir, "label_encoder.pkl")
console = Console()

# NOTE(security): pickle.load and joblib.load can execute arbitrary code stored
# in the file. Only load model / encoder files that come from a trusted source.
try:
    with open(model_path, 'rb') as file:
        loaded_clf = pickle.load(file)
    print("Model loaded successfully from 'best_clf.pkl'")
except FileNotFoundError:
    print(f"Model file '{model_path}' not found.")
    loaded_clf = None
except Exception as e:
    print(f"Error loading model: {e}")
    loaded_clf = None

try:
    label_encoder = joblib.load(label_encoder_path)
    print("Label encoder loaded successfully from 'label_encoder.pkl'")
except FileNotFoundError:
    print(f"Label encoder file '{label_encoder_path}' not found.")
    label_encoder = None
except Exception as e:
    print(f"Error loading label encoder: {e}")
    label_encoder = None

if loaded_clf is not None and label_encoder is not None:
    embeddings_csv_filename = f"{base_name}_with_embeddings.csv"
    embeddings_csv_path = os.path.join(data_dir, embeddings_csv_filename)

    print(f"Base Name: {base_name}")
    print(f"Embeddings CSV Filename: {embeddings_csv_filename}")
    print(f"Embeddings CSV Path: {embeddings_csv_path}")

    print(f"Embeddings CSV path set to '{embeddings_csv_path}'")

    try:
        df_embeddings = pd.read_csv(embeddings_csv_path)
        print(f"Embeddings CSV loaded from '{embeddings_csv_path}'")
    except FileNotFoundError:
        print(f"Embeddings CSV file '{embeddings_csv_path}' not found.")
        df_embeddings = None
    except Exception as e:
        print(f"Error loading embeddings CSV: {e}")
        df_embeddings = None

    if df_embeddings is not None and 'embeddings' in df_embeddings.columns:
        try:
            # Each CSV cell holds one embedding serialised as a JSON-style list.
            X_test = np.array(df_embeddings['embeddings'].apply(json.loads).tolist())

            y_pred = loaded_clf.predict(X_test)
            y_pred_probs = np.asarray(loaded_clf.predict_proba(X_test))
            # Highest class probability per row. For a classifier that predicts
            # the most likely class this is the probability of the predicted
            # label, without hard-coding label names or the number of classes.
            predicted_probs = y_pred_probs.max(axis=1)

            y_pred_labels = label_encoder.inverse_transform(y_pred)
            df_embeddings['predictions'] = y_pred_labels

            predictions_output_filename = f"{base_name}_predictions.csv"
            predictions_output_path = os.path.join(data_dir, predictions_output_filename)
            df_embeddings.to_csv(predictions_output_path, index=False)
            print(f"Predictions saved at '{predictions_output_path}'")

            table = Table(show_header=True, header_style="bold magenta", box=HEAVY_EDGE, show_lines=True)
            table.add_column("Job Title", style="dim", width=50, overflow="fold")
            table.add_column("Prediction", justify="center", width=10)
            table.add_column("Probability", justify="right", width=12)

            for job_title, label, prob in zip(df_embeddings['JOB_TITLE'], y_pred_labels, predicted_probs):
                # A missing title is read back from the CSV as NaN (a float),
                # which the table cannot render; show it as an empty cell.
                title_text = "" if pd.isna(job_title) else str(job_title)
                # Compare on the text form so boolean labels are coloured the
                # same way as the strings 'False' / 'True'.
                label_text = str(label)

                label_color = "yellow" if label_text == 'False' else "blue"

                prob_percentage = prob * 100
                prob_str = f"{prob_percentage:.2f}%"

                # Colour the probability by confidence band.
                if prob_percentage > 90:
                    prob_color = "green"
                elif 70 < prob_percentage <= 90:
                    prob_color = "#FFA500"
                elif 50 < prob_percentage <= 70:
                    prob_color = "red"
                else:
                    prob_color = "grey"

                # Escape data values so square brackets inside a title or label
                # are printed literally instead of being parsed as rich markup.
                table.add_row(
                    escape(title_text),
                    f"[{label_color}]{escape(label_text)}[/{label_color}]",
                    f"[{prob_color}]{prob_str}[/{prob_color}]"
                )

            console.print(table)

        except Exception as e:
            print(f"Error during prediction: {e}")
    else:
        if df_embeddings is not None:
            print("The 'embeddings' column was not found in the CSV.")
else:
    print("Model or label encoder could not be loaded. Please check the previous steps.")


## Conclusion

The pipeline successfully ingests job titles, generates their embeddings, and leverages a machine learning model to predict labels. The structured and color-coded output facilitates easy interpretation of the results.

