In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap

In [8]:
# Load the raw job postings export (path is relative to the working directory).
df = pd.read_csv('postings.csv')

print(f"Loaded {len(df)} job postings")
print(f"Columns: {list(df.columns)}")

# Basic cleaning: work on a copy so the raw frame `df` is never modified.
data = df.copy()

# Clean salary: prefer the pre-normalized column when the file provides it;
# otherwise fall back to the row-wise mean of the min/max/med columns.
# NOTE(review): the fallback does not look at `pay_period`, so it presumably
# mixes hourly and yearly figures -- confirm before relying on that branch.
if 'normalized_salary' in data.columns:
    data['salary'] = data['normalized_salary']
else:
    data['salary'] = data[['min_salary', 'max_salary', 'med_salary']].mean(axis=1)

# Remove records with missing critical data, then keep salaries inside the
# inclusive 20,000-500,000 band.
data = data.dropna(subset=['salary', 'formatted_experience_level'])
in_salary_band = (data['salary'] >= 20000) & (data['salary'] <= 500000)
# Explicit .copy(): the column assignments below must write to an independent
# frame, not to a boolean-mask slice (avoids SettingWithCopyWarning).
data = data[in_salary_band].copy()

# Clean categorical fields.
# Rows without an experience level were dropped above, so the fill on
# `experience_level` is purely defensive; `work_type` and `remote` can still
# contain gaps and do get filled here.
data['experience_level'] = data['formatted_experience_level'].fillna('Not Specified')
data['work_type'] = data['formatted_work_type'].fillna('Not Specified')
data['remote'] = data['remote_allowed'].fillna(0).astype(int)

# Calculate engagement rate as applies per view, in percent.
# The +1 in the denominator guards against division by zero views; a missing
# `applies` or `views` value propagates as NaN.
data['engagement_rate'] = (data['applies'] / (data['views'] + 1)) * 100
data['engagement_rate'] = data['engagement_rate'].clip(0, 100)

# Extract state: take the text after the last comma of `location`,
# trimmed and upper-cased.
data['state'] = data['location'].str.split(',').str[-1].str.strip().str.upper()
invalid_states = ['UNITED STATES', 'US', 'USA', 'REMOTE', '']
# Keep only two-character codes that are not on the deny list. Missing states
# compare False on the length test and are dropped as well.
has_state_code = ~data['state'].isin(invalid_states) & (data['state'].str.len() == 2)
# Copy again so later cells can add columns to `data` without warnings.
data = data[has_state_code].copy()

print(f"\nCleaned data: {len(data)} job postings")


Loaded 123849 job postings
Columns: ['job_id', 'company_name', 'title', 'description', 'max_salary', 'pay_period', 'location', 'company_id', 'views', 'med_salary', 'min_salary', 'formatted_work_type', 'applies', 'original_listed_time', 'remote_allowed', 'job_posting_url', 'application_url', 'application_type', 'expiry', 'closed_time', 'formatted_experience_level', 'skills_desc', 'listed_time', 'posting_domain', 'sponsored', 'work_type', 'currency', 'compensation_type', 'normalized_salary', 'zip_code', 'fips']

Cleaned data: 22785 job postings


In [12]:
print("\n" + "="*80)
print("CREATING EMBEDDINGS")
print("="*80)

# Each posting is described by several feature groups built below:
#   - numeric: log salary, engagement rate, log views, log applies
#   - one-hot: experience level, work type, region (derived from state)
#   - binary: remote flag
#   - text: substring flags for a fixed list of job-title keywords
# All groups share the index of `data`, so they are joined column-wise once
# at the end of the cell.

# --- NUMERIC FEATURES (4 dimensions) ---
# log1p compresses the heavy right tail of salary / views / applies;
# engagement rate is already bounded to 0-100.
numeric_block = pd.DataFrame({
    'job_id': data['job_id'],
    'salary_log': np.log1p(data['salary']),
    'engagement_rate': data['engagement_rate'],
    'views_log': np.log1p(data['views']),
    'applies_log': np.log1p(data['applies']),
})
print(f"\nNumeric features: 4 dimensions")

# --- CATEGORICAL FEATURES: Experience Level (one-hot encoding) ---
exp_dummies = pd.get_dummies(data['experience_level'], prefix='exp')
print(f"Experience level features: {len(exp_dummies.columns)} dimensions")

# --- CATEGORICAL FEATURES: Work Type (one-hot encoding) ---
work_dummies = pd.get_dummies(data['work_type'], prefix='work')
print(f"Work type features: {len(work_dummies.columns)} dimensions")

# --- BINARY FEATURE: Remote ---
remote_block = data[['remote']]
print(f"Remote feature: 1 dimension")

# --- CATEGORICAL FEATURES: State (grouped by region to reduce dimensionality) ---
# Region -> member state codes; inverted below into the state -> region lookup.
states_by_region = {
    'Northeast': ['ME', 'NH', 'VT', 'MA', 'RI', 'CT', 'NY', 'NJ', 'PA', 'DE', 'MD', 'DC'],
    'Southeast': ['WV', 'VA', 'KY', 'TN', 'NC', 'SC', 'GA', 'FL', 'AL', 'MS', 'LA', 'AR'],
    'Midwest': ['OH', 'MI', 'IN', 'IL', 'WI', 'MN', 'IA', 'MO', 'ND', 'SD', 'NE', 'KS'],
    'Southwest': ['TX', 'OK', 'NM', 'AZ'],
    'West': ['CO', 'WY', 'MT', 'ID', 'UT', 'NV', 'CA', 'OR', 'WA', 'AK', 'HI'],
}
region_map = {
    state: region
    for region, states in states_by_region.items()
    for state in states
}

# Codes missing from the lookup map to NaN and therefore get no region flag.
data['region'] = data['state'].map(region_map)
region_dummies = pd.get_dummies(data['region'], prefix='region')
print(f"Region features: {len(region_dummies.columns)} dimensions")

# --- TEXT FEATURES: Job Title Keywords (simple approach) ---
# One 0/1 flag per keyword, set when the keyword occurs anywhere in the
# lower-cased title; a missing title yields 0.
keywords = ['senior', 'junior', 'lead', 'manager', 'director', 'engineer',
            'developer', 'analyst', 'scientist', 'designer', 'data', 'software']

title_lower = data['title'].str.lower()
title_block = pd.DataFrame({
    f'title_{keyword}': title_lower.str.contains(keyword, na=False).astype(int)
    for keyword in keywords
})
print(f"Title keyword features: {len(keywords)} dimensions")

# Join every group column-wise; the order here fixes the feature column order.
embeddings_data = pd.concat(
    [numeric_block, exp_dummies, work_dummies, remote_block, region_dummies, title_block],
    axis=1,
)

# Total embedding dimensions
total_dims = embeddings_data.shape[1] - 1  # Exclude job_id
print(f"\nTotal embedding dimensions: {total_dims}")



CREATING EMBEDDINGS

Numeric features: 4 dimensions
Experience level features: 6 dimensions
Work type features: 7 dimensions
Remote feature: 1 dimension
Region features: 5 dimensions
Title keyword features: 12 dimensions

Total embedding dimensions: 35


In [13]:
print("\n" + "="*80)
print("HANDLING MISSING VALUES AND NORMALIZING")
print("="*80)

# Split the identifier from the model inputs.
job_ids = embeddings_data['job_id'].to_numpy()
feature_columns = embeddings_data.columns.drop('job_id').tolist()
feature_frame = embeddings_data[feature_columns]

# Report only the columns that actually contain gaps.
print(f"Missing values per column:")
missing_counts = feature_frame.isna().sum()
print(missing_counts[missing_counts > 0])

# Zero-impute every gap before scaling.
# NOTE(review): for the log-transformed counts a 0 reads as "no views /
# no applies" -- confirm that this is the intended meaning of a missing value.
features = feature_frame.fillna(0).to_numpy()

print(f"\nFeatures shape after filling NaN: {features.shape}")

# Standardize each feature column to zero mean and unit variance.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

print(f"Scaled features shape: {features_scaled.shape}")

# Assemble the full embedding table: job_id first, then the scaled features.
embeddings_df = pd.DataFrame(features_scaled, columns=feature_columns)
embeddings_df.insert(0, 'job_id', job_ids)

# Append unscaled attributes for reference (output column -> source column
# in `data`). Values are copied positionally, which relies on `data` and
# `embeddings_data` having the same row order.
reference_columns = {
    'salary': 'salary',
    'experience_level': 'experience_level',
    'work_type': 'work_type',
    'state': 'state',
    'region': 'region',
    'engagement_rate_orig': 'engagement_rate',
    'title': 'title',
}
for output_name, source_name in reference_columns.items():
    embeddings_df[output_name] = data[source_name].values

embeddings_df.to_csv('embeddings.csv', index=False)
print(f"\nSaved embeddings.csv ({len(embeddings_df)} records)")



HANDLING MISSING VALUES AND NORMALIZING
Missing values per column:
engagement_rate    18182
views_log            266
applies_log        18182
dtype: int64

Features shape after filling NaN: (22785, 35)
Scaled features shape: (22785, 35)

Saved embeddings.csv (22785 records)


In [16]:
print("\n" + "="*80)
print("DIMENSIONALITY REDUCTION: PCA")
print("="*80)

# Project the standardized features onto their first two principal components.
pca = PCA(n_components=2, random_state=42)
pca_coords = pca.fit_transform(features_scaled)

variance_ratio = pca.explained_variance_ratio_
print(f"PCA explained variance ratio: {variance_ratio}")
print(f"Total variance explained: {variance_ratio.sum():.2%}")

# Build the export table: id, the two coordinates, then the original
# (unscaled) attributes copied positionally from `data`.
pca_x, pca_y = pca_coords.T
reference_fields = ['salary', 'experience_level', 'work_type', 'state',
                    'region', 'engagement_rate', 'remote', 'title']
pca_df = pd.DataFrame({
    'job_id': job_ids,
    'pca_x': pca_x,
    'pca_y': pca_y,
    **{field: data[field].values for field in reference_fields},
})

pca_df.to_csv('embeddings_pca_2d.csv', index=False)
print(f"\nSaved embeddings_pca_2d.csv ({len(pca_df)} records)")



DIMENSIONALITY REDUCTION: PCA
PCA explained variance ratio: [0.0900452  0.06780134]
Total variance explained: 15.78%

Saved embeddings_pca_2d.csv (22785 records)


In [17]:

print("\n" + "="*80)
print("DIMENSIONALITY REDUCTION: t-SNE")
print("="*80)

# Apply t-SNE (on a sample if data is large).
# `indices`, `features_sample` and `sample_job_ids` are reused by the UMAP
# cell, so they are always defined, whether or not subsampling happens.
sample_size = min(10000, len(features_scaled))
if len(features_scaled) > sample_size:
    # Draw the subsample from a seeded generator: without a fixed seed the
    # selected rows (and both 2D exports built from them) change on every run,
    # even though the reducers themselves use random_state=42.
    rng = np.random.default_rng(42)
    indices = rng.choice(len(features_scaled), sample_size, replace=False)
    features_sample = features_scaled[indices]
    sample_job_ids = job_ids[indices]
else:
    features_sample = features_scaled
    indices = np.arange(len(features_scaled))
    sample_job_ids = job_ids

print(f"Using {len(features_sample)} samples for t-SNE")

tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
tsne_coords = tsne.fit_transform(features_sample)

# Create t-SNE dataframe.
# Select the sampled rows once (positionally, matching `features_sample`)
# instead of re-indexing `data` for every column.
sample_rows = data.iloc[indices]
tsne_df = pd.DataFrame({
    'job_id': sample_job_ids,
    'tsne_x': tsne_coords[:, 0],
    'tsne_y': tsne_coords[:, 1],
    'salary': sample_rows['salary'].values,
    'experience_level': sample_rows['experience_level'].values,
    'work_type': sample_rows['work_type'].values,
    'state': sample_rows['state'].values,
    'region': sample_rows['region'].values,
    'engagement_rate': sample_rows['engagement_rate'].values,
    'remote': sample_rows['remote'].values,
    'title': sample_rows['title'].values
})

tsne_df.to_csv('embeddings_tsne_2d.csv', index=False)
print(f"\nSaved embeddings_tsne_2d.csv ({len(tsne_df)} records)")



DIMENSIONALITY REDUCTION: t-SNE
Using 10000 samples for t-SNE

Saved embeddings_tsne_2d.csv (10000 records)


In [18]:
print("\n" + "="*80)
print("DIMENSIONALITY REDUCTION: UMAP")
print("="*80)

# Fit UMAP on the same rows that were used for t-SNE
# (`features_sample`, `indices` and `sample_job_ids` come from that cell).
reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=15, min_dist=0.1)
umap_coords = reducer.fit_transform(features_sample)

# Build the export table: id, the two coordinates, then the original
# (unscaled) attributes of the sampled rows, selected positionally.
sampled_rows = data.iloc[indices]
reference_fields = ['salary', 'experience_level', 'work_type', 'state',
                    'region', 'engagement_rate', 'remote', 'title']
umap_df = pd.DataFrame({
    'job_id': sample_job_ids,
    'umap_x': umap_coords[:, 0],
    'umap_y': umap_coords[:, 1],
    **{field: sampled_rows[field].values for field in reference_fields},
})

umap_df.to_csv('embeddings_umap_2d.csv', index=False)
print(f"\nSaved embeddings_umap_2d.csv ({len(umap_df)} records)")



DIMENSIONALITY REDUCTION: UMAP


  warn(



Saved embeddings_umap_2d.csv (10000 records)


In [19]:
# Final report: list the files written by the earlier cells and the size of
# each feature group, using the objects those cells left in the namespace.
generated_files = [
    "1. embeddings.csv - Full high-dimensional embeddings",
    "2. embeddings_pca_2d.csv - PCA 2D projection",
    "3. embeddings_tsne_2d.csv - t-SNE 2D projection",
    "4. embeddings_umap_2d.csv - UMAP 2D projection",
]
feature_breakdown = [
    f"  - Numeric features: 4 (salary_log, engagement, views_log, applies_log)",
    f"  - Experience level: {len(exp_dummies.columns)} dimensions",
    f"  - Work type: {len(work_dummies.columns)} dimensions",
    f"  - Remote: 1 dimension",
    f"  - Region: {len(region_dummies.columns)} dimensions",
    f"  - Title keywords: {len(keywords)} dimensions",
    f"  - TOTAL: {total_dims} dimensions",
]

print("\n" + "="*80)
print("EMBEDDING CREATION COMPLETE")
print("="*80)

print("\nGenerated files:")
for file_line in generated_files:
    print(file_line)

print("\nFeature breakdown:")
for breakdown_line in feature_breakdown:
    print(breakdown_line)
print("="*80)


EMBEDDING CREATION COMPLETE

Generated files:
1. embeddings.csv - Full high-dimensional embeddings
2. embeddings_pca_2d.csv - PCA 2D projection
3. embeddings_tsne_2d.csv - t-SNE 2D projection
4. embeddings_umap_2d.csv - UMAP 2D projection

Feature breakdown:
  - Numeric features: 4 (salary_log, engagement, views_log, applies_log)
  - Experience level: 6 dimensions
  - Work type: 7 dimensions
  - Remote: 1 dimension
  - Region: 5 dimensions
  - Title keywords: 12 dimensions
  - TOTAL: 35 dimensions
