In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [53]:
# Load all datasets
df = pd.read_csv('employee_training.csv')
bench_df = pd.read_csv('bench_copy.csv')
demands_df = pd.read_csv('demands.csv')

print(f"Employee Training: {df.shape[0]} records, {df.shape[1]} features")
print(f"Bench Employees: {bench_df.shape[0]} records, {bench_df.shape[1]} features")
print(f"Demands/Open Positions: {demands_df.shape[0]} records, {demands_df.shape[1]} features")

Employee Training: 402 records, 28 features
Bench Employees: 109 records, 19 features
Demands/Open Positions: 80 records, 12 features


In [54]:
# Analyze demands to extract in-demand technologies
in_demand_tech = demands_df['Technology'].value_counts()
print("=== IN-DEMAND TECHNOLOGIES ===")
print(in_demand_tech.head(20))

# Create a demand score dictionary (higher count = more demand)
demand_scores = in_demand_tech.to_dict()
max_demand = max(demand_scores.values()) if demand_scores else 1

# Normalize demand scores (0-1)
demand_scores_normalized = {tech: score/max_demand for tech, score in demand_scores.items()}

print(f"\nTotal unique technologies in demand: {len(demand_scores)}")
print(f"Top 5 in-demand: {list(demand_scores.keys())[:5]}")

=== IN-DEMAND TECHNOLOGIES ===
Technology
Python           8
ML               6
Java             5
AWS              5
DevOps           4
Spark            4
Selenium         3
Node.js          3
React            3
SQL              3
JavaScript       2
Cybersecurity    2
Kubernetes       2
Docker           2
Azure            2
CI/CD            2
Kafka            2
TensorFlow       2
PyTorch          2
Spring Boot      2
Name: count, dtype: int64

Total unique technologies in demand: 28
Top 5 in-demand: ['Python', 'ML', 'Java', 'AWS', 'DevOps']


In [55]:
# Define technology similarity mapping (technologies that are closely related)
tech_similarity = {
    # Programming Languages
    'Python': ['Django', 'FastAPI', 'Flask', 'Pandas', 'ML', 'TensorFlow', 'PyTorch', 'Data Science'],
    'Java': ['Spring Boot', 'Microservices', 'Kotlin', 'Android', 'Scala'],
    'JavaScript': ['React', 'Angular', 'Vue.js', 'Node.js', 'TypeScript', 'Next.js'],
    'TypeScript': ['Angular', 'React', 'Node.js', 'Next.js', 'JavaScript'],
    'C++': ['Rust', 'Systems', 'Embedded Systems', 'Qt'],
    'C#': ['.NET Core', 'Azure', '.NET', 'Blazor'],
    'Go': ['Kubernetes', 'Docker', 'Microservices', 'Cloud'],
    'Rust': ['WebAssembly', 'Systems', 'C++', 'Low Level'],
    'Ruby': ['Rails', 'Full Stack'],
    'PHP': ['Laravel', 'Web'],
    'Scala': ['Spark', 'Big Data', 'Java', 'Kafka'],
    
    # Frontend
    'React': ['JavaScript', 'TypeScript', 'Next.js', 'Redux', 'Frontend'],
    'Angular': ['TypeScript', 'JavaScript', 'Frontend', 'RxJS'],
    'Vue.js': ['JavaScript', 'Nuxt', 'Frontend'],
    
    # Backend & APIs
    'Node.js': ['Express', 'JavaScript', 'TypeScript', 'GraphQL', 'Backend'],
    'Django': ['Python', 'REST API', 'Backend'],
    'FastAPI': ['Python', 'API', 'Backend', 'Microservices'],
    'Spring Boot': ['Java', 'Microservices', 'Backend', 'REST API'],
    'GraphQL': ['Node.js', 'API', 'Apollo'],
    
    # Cloud & DevOps
    'AWS': ['Lambda', 'Cloud', 'Serverless', 'S3', 'EC2'],
    'Azure': ['Cloud', 'C#', '.NET', 'DevOps'],
    'Kubernetes': ['Docker', 'K8s', 'DevOps', 'Go', 'Cloud'],
    'Docker': ['Kubernetes', 'DevOps', 'Containers', 'CI/CD'],
    'Terraform': ['IaC', 'AWS', 'Azure', 'DevOps', 'Cloud'],
    'CI/CD': ['Jenkins', 'DevOps', 'Docker', 'Kubernetes'],
    
    # Data & ML
    'ML': ['Python', 'TensorFlow', 'PyTorch', 'Data Science', 'AI'],
    'TensorFlow': ['ML', 'Python', 'Deep Learning', 'AI', 'PyTorch'],
    'PyTorch': ['ML', 'Python', 'Deep Learning', 'AI', 'TensorFlow'],
    'Spark': ['Scala', 'Python', 'Big Data', 'Databricks', 'Data Engineering'],
    'SQL': ['PostgreSQL', 'MySQL', 'Oracle', 'Database', 'Data'],
    'Snowflake': ['Data Warehouse', 'SQL', 'Cloud', 'ETL'],
    'Kafka': ['Streaming', 'Data', 'Scala', 'Flink'],
    
    # Mobile
    'iOS': ['Swift', 'Mobile', 'Apple'],
    'Android': ['Kotlin', 'Java', 'Mobile'],
    'Flutter': ['Dart', 'Mobile', 'Cross-platform'],
    'React Native': ['React', 'JavaScript', 'Mobile'],
    
    # QA & Testing
    'Selenium': ['Cypress', 'Automation', 'Testing', 'QA'],
    'Cypress': ['Selenium', 'JavaScript', 'Testing', 'E2E'],
    'Appium': ['Mobile Testing', 'Automation', 'QA'],
    'JMeter': ['Performance', 'Load Testing', 'Gatling'],
    
    # Security & Others
    'Cybersecurity': ['OWASP', 'Security', 'Penetration Testing'],
    'Blockchain': ['Solidity', 'Ethereum', 'Smart Contracts', 'Web3'],
    'Solidity': ['Blockchain', 'Ethereum', 'Smart Contracts'],
}

# Create reverse mapping for quick lookup
def get_similar_techs(tech):
    """Get list of similar technologies for a given tech"""
    similar = set()
    tech_upper = tech.strip()
    
    # Direct matches
    if tech_upper in tech_similarity:
        similar.update(tech_similarity[tech_upper])
    
    # Reverse lookup - find techs that list this one as similar
    for key, values in tech_similarity.items():
        if tech_upper in values or tech_upper.lower() in [v.lower() for v in values]:
            similar.add(key)
            similar.update(values)
    
    similar.discard(tech_upper)
    return list(similar)

# Test similarity mapping
print("Similar to 'Python':", get_similar_techs('Python')[:5])
print("Similar to 'Java':", get_similar_techs('Java')[:5])
print("Similar to 'React':", get_similar_techs('React')[:5])

Similar to 'Python': ['AI', 'Django', 'Spark', 'PyTorch', 'Databricks']
Similar to 'Java': ['Spring Boot', 'Kafka', 'Android', 'Backend', 'Big Data']
Similar to 'React': ['Frontend', 'Node.js', 'Next.js', 'Vue.js', 'Mobile']


In [56]:
# Map bench employees with training data and analyze demand status
def check_demand_status(skill):
    """Check if a skill is in demand and return demand score"""
    skill_clean = str(skill).strip()
    
    # Direct match
    if skill_clean in demand_scores:
        return True, demand_scores[skill_clean], skill_clean
    
    # Case-insensitive match
    for tech in demand_scores.keys():
        if skill_clean.lower() == tech.lower():
            return True, demand_scores[tech], tech
    
    return False, 0, None

def find_closest_in_demand_tech(current_skill):
    """Find the closest technology that is in demand"""
    similar_techs = get_similar_techs(current_skill)
    
    # Score similar techs by demand
    in_demand_similar = []
    for tech in similar_techs:
        is_in_demand, score, matched_tech = check_demand_status(tech)
        if is_in_demand:
            in_demand_similar.append((matched_tech, score))
    
    # Sort by demand score (highest first)
    in_demand_similar.sort(key=lambda x: x[1], reverse=True)
    
    return in_demand_similar

# Merge bench data with training data
# First, let's standardize column names
bench_df_clean = bench_df.copy()
bench_df_clean = bench_df_clean.rename(columns={
    'Emp Id': 'Emp_Id',
    'Associate': 'Employee_Name',
    'Primary Skill': 'Primary_Skill',
    'Current_Skill': 'Secondary_Skill',
    'Work City': 'Location',
    'Line_Manager': 'Manager'
})

# Analyze bench employees' skills vs demand
print("=== BENCH EMPLOYEES SKILL-DEMAND ANALYSIS ===\n")

bench_analysis = []
for _, emp in bench_df_clean.iterrows():
    emp_id = emp['Emp_Id']
    name = emp['Employee_Name']
    primary_skill = emp['Primary_Skill']
    current_skill = emp.get('Secondary_Skill', primary_skill)
    
    # Check if primary skill is in demand
    is_in_demand, demand_score, matched = check_demand_status(primary_skill)
    
    if is_in_demand:
        status = "IN DEMAND"
        recommendation = f"Continue with {primary_skill} training"
        alt_techs = []
    else:
        status = "NOT IN DEMAND"
        alt_techs = find_closest_in_demand_tech(primary_skill)
        if alt_techs:
            recommendation = f"Consider upskilling to: {', '.join([t[0] for t in alt_techs[:3]])}"
        else:
            recommendation = "Explore trending technologies"
    
    bench_analysis.append({
        'Emp_Id': emp_id,
        'Name': name,
        'Primary_Skill': primary_skill,
        'Demand_Status': status,
        'Demand_Score': demand_score,
        'Recommendation': recommendation,
        'Alternative_Techs': alt_techs[:3] if not is_in_demand else []
    })

bench_analysis_df = pd.DataFrame(bench_analysis)
print(bench_analysis_df[['Name', 'Primary_Skill', 'Demand_Status', 'Recommendation']].head(15).to_string(index=False))

=== BENCH EMPLOYEES SKILL-DEMAND ANALYSIS ===

          Name   Primary_Skill Demand_Status                    Recommendation
   Daniel Park            Java     IN DEMAND       Continue with Java training
   George Wang      JavaScript     IN DEMAND Continue with JavaScript training
 Kevin O'Brien            Java     IN DEMAND       Continue with Java training
Oscar Martinez           Linux     IN DEMAND      Continue with Linux training
  Rachel Green              Go     IN DEMAND         Continue with Go training
   Xavier Ross         Windows NOT IN DEMAND     Explore trending technologies
 Brandon Scott          Appium NOT IN DEMAND     Explore trending technologies
   Henry Adams         Tableau NOT IN DEMAND     Explore trending technologies
 Maria Johnson        Cucumber NOT IN DEMAND     Explore trending technologies
   Rita Wilson Robot Framework NOT IN DEMAND     Explore trending technologies
  Xena Warrior         Haskell NOT IN DEMAND     Explore trending technologies
  Dia

In [57]:
# Create enhanced training dataset by merging employee_training with bench data
# Add demand-aware features

# Merge training data with bench data where possible
merged_df = df.copy()

# Add demand score for primary skill
merged_df['Primary_Skill_In_Demand'] = merged_df['Primary_Skill'].apply(
    lambda x: 1 if check_demand_status(x)[0] else 0
)
merged_df['Primary_Skill_Demand_Score'] = merged_df['Primary_Skill'].apply(
    lambda x: check_demand_status(x)[1]
)

# Add demand score for secondary skill
merged_df['Secondary_Skill_In_Demand'] = merged_df['Secondary_Skill'].apply(
    lambda x: 1 if check_demand_status(x)[0] else 0
)
merged_df['Secondary_Skill_Demand_Score'] = merged_df['Secondary_Skill'].apply(
    lambda x: check_demand_status(x)[1]
)

# Enhanced mapping of courses to target technologies AND related skills
course_to_tech_map = {
    'Python Programming Fundamentals': 'Python',
    'Machine Learning with Python': 'ML',
    'Data Analysis and Visualization': 'SQL',
    'Full Stack Web Development': 'JavaScript',
    'DevOps and CI/CD Pipeline': 'CI/CD',
    'AWS Cloud Practitioner': 'AWS',
    'Backend API Development': 'Node.js',
    'Automated Testing with Selenium': 'Selenium',
    'Data Pipeline Engineering': 'Spark',
    'Cybersecurity Essentials': 'Cybersecurity',
    'Software Architecture Design': 'Microservices',
    'Agile Project Management': 'Agile',
    'Linux System Administration': 'Linux',
}

# CRITICAL: Skill-to-Course relevance mapping (which courses are relevant for which skills)
skill_to_courses_map = {
    'Python': ['Machine Learning with Python', 'Data Analysis and Visualization', 'Data Pipeline Engineering', 'Python Programming Fundamentals'],
    'ML': ['Machine Learning with Python', 'Data Analysis and Visualization', 'Data Pipeline Engineering'],
    'Data Science': ['Machine Learning with Python', 'Data Analysis and Visualization', 'Data Pipeline Engineering'],
    'TensorFlow': ['Machine Learning with Python', 'Data Pipeline Engineering'],
    'PyTorch': ['Machine Learning with Python', 'Data Pipeline Engineering'],
    'Pandas': ['Data Analysis and Visualization', 'Machine Learning with Python'],
    'Java': ['Backend API Development', 'Software Architecture Design', 'Full Stack Web Development'],
    'Spring Boot': ['Backend API Development', 'Software Architecture Design'],
    'JavaScript': ['Full Stack Web Development', 'Backend API Development'],
    'React': ['Full Stack Web Development'],
    'Angular': ['Full Stack Web Development'],
    'Node.js': ['Backend API Development', 'Full Stack Web Development'],
    'TypeScript': ['Full Stack Web Development', 'Backend API Development'],
    'AWS': ['AWS Cloud Practitioner', 'DevOps and CI/CD Pipeline'],
    'Azure': ['AWS Cloud Practitioner', 'DevOps and CI/CD Pipeline'],
    'Docker': ['DevOps and CI/CD Pipeline', 'AWS Cloud Practitioner'],
    'Kubernetes': ['DevOps and CI/CD Pipeline', 'AWS Cloud Practitioner'],
    'CI/CD': ['DevOps and CI/CD Pipeline'],
    'DevOps': ['DevOps and CI/CD Pipeline', 'AWS Cloud Practitioner', 'Linux System Administration'],
    'SQL': ['Data Analysis and Visualization', 'Data Pipeline Engineering'],
    'Spark': ['Data Pipeline Engineering', 'Data Analysis and Visualization'],
    'Selenium': ['Automated Testing with Selenium'],
    'QA': ['Automated Testing with Selenium'],
    'Testing': ['Automated Testing with Selenium'],
    'Cybersecurity': ['Cybersecurity Essentials'],
    'Security': ['Cybersecurity Essentials'],
    'Linux': ['Linux System Administration', 'DevOps and CI/CD Pipeline'],
    'Agile': ['Agile Project Management', 'Software Architecture Design'],
    'Microservices': ['Software Architecture Design', 'Backend API Development'],
    '.NET': ['Backend API Development', 'Full Stack Web Development'],
    'C#': ['Backend API Development', 'Full Stack Web Development'],
}

# Function to get relevant courses for a skill
def get_relevant_courses_for_skill(skill):
    """Get list of courses relevant to a skill"""
    skill_clean = str(skill).strip()
    
    # Direct match
    if skill_clean in skill_to_courses_map:
        return skill_to_courses_map[skill_clean]
    
    # Case-insensitive match
    for s, courses in skill_to_courses_map.items():
        if skill_clean.lower() == s.lower():
            return courses
    
    # Check similar technologies
    similar_techs = get_similar_techs(skill_clean)
    for tech in similar_techs:
        if tech in skill_to_courses_map:
            return skill_to_courses_map[tech]
    
    return []

# Add course target technology demand
merged_df['Course_Tech'] = merged_df['Course_Name'].map(course_to_tech_map).fillna('Unknown')
merged_df['Course_Tech_In_Demand'] = merged_df['Course_Tech'].apply(
    lambda x: 1 if check_demand_status(x)[0] else 0
)

print("=== ENHANCED DATASET WITH DEMAND FEATURES ===")
print(f"Total records: {len(merged_df)}")
print(f"\nDemand feature distribution:")
print(f"  - Primary skills in demand: {merged_df['Primary_Skill_In_Demand'].sum()}")
print(f"  - Primary skills not in demand: {(merged_df['Primary_Skill_In_Demand'] == 0).sum()}")
print(f"\nSkill-Course Mapping Test:")
print(f"  - Python courses: {get_relevant_courses_for_skill('Python')}")
print(f"  - Java courses: {get_relevant_courses_for_skill('Java')}")
print(f"\nSample of enhanced data:")
print(merged_df[['Employee_Name', 'Primary_Skill', 'Primary_Skill_In_Demand', 'Course_Name']].head(10).to_string(index=False))

=== ENHANCED DATASET WITH DEMAND FEATURES ===
Total records: 402

Demand feature distribution:
  - Primary skills in demand: 279
  - Primary skills not in demand: 123

Skill-Course Mapping Test:
  - Python courses: ['Machine Learning with Python', 'Data Analysis and Visualization', 'Data Pipeline Engineering', 'Python Programming Fundamentals']
  - Java courses: ['Backend API Development', 'Software Architecture Design', 'Full Stack Web Development']

Sample of enhanced data:
 Employee_Name   Primary_Skill  Primary_Skill_In_Demand                     Course_Name
   Daniel Park            Java                        1         Backend API Development
   George Wang      JavaScript                        1      Full Stack Web Development
  Kevin OBrien            Java                        1    Software Architecture Design
Oscar Martinez           Linux                        1     Linux System Administration
  Rachel Green              Go                        1       DevOps and CI/CD 

In [58]:
# Use enhanced merged dataframe with demand features
data = merged_df.copy()

# Fill missing values for key training features
data['Grade'] = data['Grade'].fillna('G1')
data['Department'] = data['Department'].fillna('Unknown')
data['Primary_Skill'] = data['Primary_Skill'].fillna('Unknown')
data['Secondary_Skill'] = data['Secondary_Skill'].fillna('Unknown')
data['Course_Category'] = data['Course_Category'].fillna('Unknown')
data['Business_Priority'] = data['Business_Priority'].fillna('Medium')
data['Career_Goal'] = data['Career_Goal'].fillna('Unknown')
data['Course_Name'] = data['Course_Name'].fillna('Unknown Course')

# Derive numeric grade features
data['Grade_Num'] = data['Grade'].str.extract("(\d+)").astype(int)
experience_map = {1: 0, 2: 0.5, 3: 1.5, 4: 3, 5: 5, 6: 7, 7: 10, 8: 12, 9: 15, 10: 18}
data['Experience_Level'] = data['Grade_Num'].map(experience_map).fillna(0)

# Add Skill_Gap_Score and Performance_Rating if available
if 'Skill_Gap_Score' in data.columns:
    data['Skill_Gap_Score'] = data['Skill_Gap_Score'].fillna(data['Skill_Gap_Score'].median())
else:
    data['Skill_Gap_Score'] = 0.3

if 'Performance_Rating' in data.columns:
    data['Performance_Rating'] = data['Performance_Rating'].fillna(data['Performance_Rating'].median())
else:
    data['Performance_Rating'] = 4.0

# Normalize demand scores
max_demand_score = data['Primary_Skill_Demand_Score'].max() if data['Primary_Skill_Demand_Score'].max() > 0 else 1
data['Primary_Skill_Demand_Normalized'] = data['Primary_Skill_Demand_Score'] / max_demand_score
data['Secondary_Skill_Demand_Normalized'] = data['Secondary_Skill_Demand_Score'] / max_demand_score

# Create interaction features
data['Grade_Skill_Interaction'] = data['Grade_Num'] * data['Skill_Gap_Score']
data['Grade_Performance'] = data['Grade_Num'] * data['Performance_Rating']
data['Demand_Gap_Interaction'] = data['Primary_Skill_In_Demand'] * data['Skill_Gap_Score']

# Encode all categorical features for training
label_encoders = {}
for col in ['Department', 'Primary_Skill', 'Secondary_Skill', 'Course_Category', 'Business_Priority', 'Career_Goal']:
    values = pd.concat([data[col].astype(str), pd.Series(['Unknown'])], ignore_index=True)
    le = LabelEncoder()
    le.fit(values)
    data[f'{col}_Encoded'] = le.transform(data[col].astype(str))
    label_encoders[col] = le

# Encode the target course name
target_encoder = LabelEncoder()
data['Target'] = target_encoder.fit_transform(data['Course_Name'].astype(str))

# Store reference catalog
course_catalog = data[['Course_Name', 'Course_Category']].drop_duplicates()
print(data.groupby('Course_Category')['Course_Name'].nunique())

print(f"\nPrepared dataset with {data['Target'].nunique()} unique training modules")
print(f"New demand-aware features added: Primary_Skill_In_Demand, Demand_Gap_Interaction")

Course_Category
Analytics           1
Architecture        1
Backend             1
Cloud               1
Data Engineering    1
Data Science        1
DevOps              1
Development         1
Infrastructure      1
Management          1
Programming         1
Security            1
Testing             1
Unknown             1
Name: Course_Name, dtype: int64

Prepared dataset with 14 unique training modules
New demand-aware features added: Primary_Skill_In_Demand, Demand_Gap_Interaction


In [59]:
from sklearn.preprocessing import StandardScaler

# Feature columns with expanded attributes INCLUDING DEMAND FEATURES
feature_cols = [
    'Grade_Num',
    'Experience_Level',
    'Department_Encoded',
    'Primary_Skill_Encoded',
    'Secondary_Skill_Encoded',
    'Course_Category_Encoded',
    'Business_Priority_Encoded',
    'Career_Goal_Encoded',
    'Skill_Gap_Score',
    'Performance_Rating',
    'Grade_Skill_Interaction',
    'Grade_Performance',
    # New demand-aware features
    'Primary_Skill_In_Demand',
    'Primary_Skill_Demand_Normalized',
    'Secondary_Skill_In_Demand',
    'Demand_Gap_Interaction'
]

# Train-test split
X, y = data[feature_cols], data['Target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features for better performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Train set: {X_train.shape[0]} rows | Test set: {X_test.shape[0]} rows")
print(f"Unique courses in training: {y_train.nunique()}")
print(f"Total features: {len(feature_cols)} (including 4 new demand features)")

Train set: 321 rows | Test set: 81 rows
Unique courses in training: 14
Total features: 16 (including 4 new demand features)


In [60]:
import warnings
warnings.filterwarnings('ignore')

# Filter out courses with only 1 sample (can't stratify with single samples)
course_counts = data['Course_Name'].value_counts()
valid_courses = course_counts[course_counts >= 2].index
data_filtered = data[data['Course_Name'].isin(valid_courses)].copy()

print(f"Original courses: {data['Course_Name'].nunique()}")
print(f"Filtered courses (≥2 samples): {data_filtered['Course_Name'].nunique()}")
print(f"Removed {data['Course_Name'].nunique() - data_filtered['Course_Name'].nunique()} courses with single samples")

# Re-encode target with filtered data
target_encoder = LabelEncoder()
data_filtered['Target'] = target_encoder.fit_transform(data_filtered['Course_Name'].astype(str))

# Update course catalog with filtered data
course_catalog = data_filtered[['Course_Name', 'Course_Category']].drop_duplicates()

# Update X and y with filtered data
X_filtered = data_filtered[feature_cols]
y_filtered = data_filtered['Target']


X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42
)

# Scale features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nTrain set: {X_train.shape[0]} rows")
print(f"Test set: {X_test.shape[0]} rows")


# XGBoost optimized for better accuracy with sufficient data
model = XGBClassifier(
    n_estimators=800,          # More trees for better learning
    max_depth=6,               # Deeper trees now that we have more data
    learning_rate=0.05,        # Lower learning rate for better convergence
    subsample=0.85,            # Use 85% of data per tree
    colsample_bytree=0.85,     # Use 85% of features per tree
    min_child_weight=2,        # Require 2 samples per leaf
    gamma=0.2,                 # Moderate regularization
    reg_alpha=0.5,             # L1 regularization
    reg_lambda=1.5,            # L2 regularization
    scale_pos_weight=1,        
    random_state=42,
    n_jobs=-1,
    eval_metric='mlogloss',
    early_stopping_rounds=50
)

# Train with evaluation set for early stopping
eval_set = [(X_train_scaled, y_train), (X_test_scaled, y_test)]
model.fit(X_train_scaled, y_train, eval_set=eval_set, verbose=False)

test_predictions = model.predict(X_test_scaled)
train_predictions = model.predict(X_train_scaled)
test_acc = accuracy_score(y_test, test_predictions)
train_acc = accuracy_score(y_train, train_predictions)


print(f"Train Accuracy: {train_acc:.1%}")
print(f"Test Accuracy: {test_acc:.1%}")
print(f"Overfitting Gap: {(train_acc - test_acc):.1%}")


Original courses: 14
Filtered courses (≥2 samples): 14
Removed 0 courses with single samples

Train set: 321 rows
Test set: 81 rows
Train Accuracy: 99.4%
Test Accuracy: 96.3%
Overfitting Gap: 3.1%


In [61]:
# Enhanced DEMAND-AWARE recommendation function with skill-course relevance
def recommend_course_demand_aware(employee, top_n=3, consider_demand=True):
    """
    Recommend courses considering:
    1. Check if employee's skill has ACTIVE DEMAND in requirements
    2. If IN DEMAND - suggest courses relevant to that skill (ML for Python, etc.)
    3. If NOT IN DEMAND - suggest pivot to in-demand tech and relevant courses
    """
    grade_value = str(employee.get('Grade', 'G3'))
    digits = ''.join(ch for ch in grade_value if ch.isdigit())
    grade_num = int(digits) if digits else 3
    
    skill_gap = employee.get('Skill_Gap_Score', 0.3)
    performance = employee.get('Performance_Rating', 4.0)
    primary_skill = employee.get('Primary_Skill', 'Unknown')
    secondary_skill = employee.get('Secondary_Skill', primary_skill)
    
    # Check demand status for employee's primary skill
    is_in_demand, demand_score, matched_tech = check_demand_status(primary_skill)
    
    # Also check if related skills are in demand (e.g., ML demand for Python developer)
    related_demand = []
    similar_techs = get_similar_techs(primary_skill)
    for tech in similar_techs:
        tech_in_demand, tech_score, _ = check_demand_status(tech)
        if tech_in_demand:
            related_demand.append((tech, tech_score))
    related_demand.sort(key=lambda x: x[1], reverse=True)
    
    # Prepare demand analysis
    demand_analysis = {
        'Primary_Skill': primary_skill,
        'Is_In_Demand': is_in_demand,
        'Demand_Score': demand_score,
        'Related_In_Demand': related_demand[:3],
        'Alternative_Techs': []
    }
    
    # Determine recommendation strategy
    recommended_skill = primary_skill
    need_pivot = False
    
    if is_in_demand:
        # Skill is in demand - recommend courses for this skill
        demand_analysis['Strategy'] = 'SKILL_IN_DEMAND'
    elif related_demand:
        # Skill not directly in demand, but related skills are (e.g., Python -> ML)
        demand_analysis['Strategy'] = 'RELATED_DEMAND'
        demand_analysis['Related_Tech_Focus'] = related_demand[0][0]
    else:
        # No demand for skill or related - suggest pivot
        need_pivot = True
        alt_techs = find_closest_in_demand_tech(primary_skill)
        demand_analysis['Alternative_Techs'] = alt_techs[:3]
        demand_analysis['Strategy'] = 'NEED_PIVOT'
        if alt_techs:
            recommended_skill = alt_techs[0][0]
            demand_analysis['Recommended_Pivot'] = recommended_skill
    
    # Get relevant courses for the PRIMARY skill FIRST (most important)
    relevant_courses = get_relevant_courses_for_skill(primary_skill if not need_pivot else recommended_skill)
    
    # Calculate demand features for ML model
    rec_in_demand, rec_demand_score, _ = check_demand_status(recommended_skill)
    max_demand = max(demand_scores.values()) if demand_scores else 1
    
    profile = {
        'Grade_Num': grade_num,
        'Experience_Level': experience_map.get(grade_num, 0.0),
        'Skill_Gap_Score': skill_gap,
        'Performance_Rating': performance,
        'Grade_Skill_Interaction': grade_num * skill_gap,
        'Grade_Performance': grade_num * performance,
        'Primary_Skill_In_Demand': 1 if (is_in_demand or related_demand) else 0,
        'Primary_Skill_Demand_Normalized': rec_demand_score / max_demand if max_demand > 0 else 0,
        'Secondary_Skill_In_Demand': 1 if check_demand_status(secondary_skill)[0] else 0,
        'Demand_Gap_Interaction': (1 if (is_in_demand or related_demand) else 0) * skill_gap
    }

    for col in ['Department', 'Primary_Skill', 'Secondary_Skill', 'Course_Category', 'Business_Priority', 'Career_Goal']:
        encoder = label_encoders[col]
        if col == 'Primary_Skill' and need_pivot and consider_demand:
            value = recommended_skill
        else:
            value = str(employee.get(col, 'Unknown') or 'Unknown')
        
        if value not in encoder.classes_:
            value = 'Unknown'
        profile[f'{col}_Encoded'] = int(encoder.transform([value])[0])

    X_new = pd.DataFrame([profile])[feature_cols]
    X_new_scaled = scaler.transform(X_new)

    # Get ML model probabilities
    probabilities = model.predict_proba(X_new_scaled)[0]
    
    # SMART RECOMMENDATION: Prioritize SKILL-RELEVANT courses when skill is in demand
    recommendations = []
    
    if (is_in_demand or related_demand) and relevant_courses:
        # SKILL IS IN DEMAND: Return skill-relevant courses directly (ordered by relevance)
        for i, course in enumerate(relevant_courses[:top_n]):
            if course in target_encoder.classes_:
                idx = target_encoder.transform([course])[0]
                base_confidence = probabilities[idx]
                catalog_row = course_catalog[course_catalog['Course_Name'] == course]
                course_category = catalog_row['Course_Category'].iloc[0] if not catalog_row.empty else 'Unknown'
                course_tech = course_to_tech_map.get(course, 'General')
                course_in_demand = check_demand_status(course_tech)[0]
                
                # Strong boost for skill-relevant courses (descending priority)
                relevance_boost = 0.95 - (i * 0.15)  # First: 95%, Second: 80%, Third: 65%
                boosted_confidence = max(relevance_boost, base_confidence)
                
                recommendations.append({
                    'Course_Name': course,
                    'Course_Category': course_category,
                    'Confidence': boosted_confidence,
                    'Target_Technology': course_tech,
                    'Tech_In_Demand': course_in_demand,
                    'Relevance': 'HIGH'
                })
    else:
        # SKILL NOT IN DEMAND: Use ML model predictions but filter for relevant courses if available
        top_indices = np.argsort(probabilities)[::-1]
        
        # First try to add relevant courses for pivot skill
        if relevant_courses:
            for course in relevant_courses[:top_n]:
                if course in target_encoder.classes_:
                    idx = target_encoder.transform([course])[0]
                    confidence = probabilities[idx]
                    catalog_row = course_catalog[course_catalog['Course_Name'] == course]
                    course_category = catalog_row['Course_Category'].iloc[0] if not catalog_row.empty else 'Unknown'
                    course_tech = course_to_tech_map.get(course, 'General')
                    course_in_demand = check_demand_status(course_tech)[0]
                    
                    recommendations.append({
                        'Course_Name': course,
                        'Course_Category': course_category,
                        'Confidence': confidence,
                        'Target_Technology': course_tech,
                        'Tech_In_Demand': course_in_demand,
                        'Relevance': 'PIVOT'
                    })
        
        # Add from ML model if needed
        for idx in top_indices:
            if len(recommendations) >= top_n:
                break
            course_name = target_encoder.inverse_transform([idx])[0]
            if course_name not in [r['Course_Name'] for r in recommendations]:
                confidence = probabilities[idx]
                catalog_row = course_catalog[course_catalog['Course_Name'] == course_name]
                course_category = catalog_row['Course_Category'].iloc[0] if not catalog_row.empty else 'Unknown'
                course_tech = course_to_tech_map.get(course_name, 'General')
                course_in_demand = check_demand_status(course_tech)[0]
                
                recommendations.append({
                    'Course_Name': course_name,
                    'Course_Category': course_category,
                    'Confidence': confidence,
                    'Target_Technology': course_tech,
                    'Tech_In_Demand': course_in_demand,
                    'Relevance': 'ML_PREDICTED'
                })
    
    # Sort by confidence and take top_n
    recommendations.sort(key=lambda x: x['Confidence'], reverse=True)
    
    return recommendations[:top_n], demand_analysis

# Legacy wrapper for backward compatibility
def recommend_course(employee, top_n=3):
    recs, _ = recommend_course_demand_aware(employee, top_n, consider_demand=False)
    return recs

In [None]:
# Test with demand-aware recommendations
print("DEMAND-AWARE COURSE RECOMMENDATION SYSTEM")
print("")

# Test Case 1: Employee with IN-DEMAND skill (Python)
test_emp_in_demand = {
    'Grade': 'G5',
    'Department': 'Engineering',
    'Primary_Skill': 'Python',  # Python is in demand
    'Secondary_Skill': 'Django',
    'Course_Category': 'Development',
    'Business_Priority': 'Critical',
    'Career_Goal': 'Full Stack Lead'
}

print("TEST 1: Employee with IN-DEMAND skill (Python)")
print(f"Profile: {test_emp_in_demand['Primary_Skill']} developer, {test_emp_in_demand['Grade']}")

recs, analysis = recommend_course_demand_aware(test_emp_in_demand, top_n=3)

print(f"\nDemand Analysis:")
print(f"   Current Skill: {analysis['Primary_Skill']}")
print(f"   In Demand: {'YES' if analysis['Is_In_Demand'] else 'NO'}")
print(f"   Demand Score: {analysis['Demand_Score']}")
print(f"   Strategy: {analysis.get('Strategy', 'N/A')}")

if analysis.get('Related_In_Demand'):
    print(f"   Related Skills Also In Demand:")
    for tech, score in analysis['Related_In_Demand'][:3]:
        print(f"      {tech} (Demand: {score})")

print(f"\nRecommended Courses:")
for i, rec in enumerate(recs, 1):
    demand_badge = "[HIGH DEMAND]" if rec['Tech_In_Demand'] else ""
    relevance = f"[{rec.get('Relevance', 'N/A')}]"
    print(f"   {i}. {rec['Course_Name']} ({rec['Course_Category']}) - {rec['Confidence']:.1%} {demand_badge} {relevance}")

# Test Case 2: Employee with skill NOT in demand
test_emp_not_demand = {
    'Grade': 'G4',
    'Department': 'Engineering',
    'Primary_Skill': 'COBOL',  # Legacy, not in demand
    'Secondary_Skill': 'Mainframe',
    'Course_Category': 'Development',
    'Business_Priority': 'High',
    'Career_Goal': 'Backend Dev'
}

print("\n\nTEST 2: Employee with skill NOT IN DEMAND (COBOL)")
print(f"Profile: {test_emp_not_demand['Primary_Skill']} developer, {test_emp_not_demand['Grade']}")

recs, analysis = recommend_course_demand_aware(test_emp_not_demand, top_n=3)

print(f"\nDemand Analysis:")
print(f"   Current Skill: {analysis['Primary_Skill']}")
print(f"   In Demand: {'YES' if analysis['Is_In_Demand'] else 'NO'}")
print(f"   Strategy: {analysis.get('Strategy', 'N/A')}")

if analysis['Alternative_Techs']:
    print(f"   Suggested Pivot Technologies:")
    for tech, score in analysis['Alternative_Techs']:
        print(f"      {tech} (Demand Score: {score})")

if 'Recommended_Pivot' in analysis:
    print(f"   Recommended Pivot: {analysis['Recommended_Pivot']}")

print(f"\nRecommended Courses (for skill pivot):")
for i, rec in enumerate(recs, 1):
    demand_badge = "[HIGH DEMAND]" if rec['Tech_In_Demand'] else ""
    print(f"   {i}. {rec['Course_Name']} ({rec['Course_Category']}) - {rec['Confidence']:.1%} {demand_badge}")

# Test Case 3: Employee with Java (check related demand)
test_emp_java = {
    'Grade': 'G5',
    'Department': 'Engineering',
    'Primary_Skill': 'Java',
    'Secondary_Skill': 'Spring Boot',
    'Course_Category': 'Development',
    'Business_Priority': 'High',
    'Career_Goal': 'Backend Lead'
}

print("\n\nTEST 3: Employee with Java skill")
print(f"Profile: {test_emp_java['Primary_Skill']} developer, {test_emp_java['Grade']}")

recs, analysis = recommend_course_demand_aware(test_emp_java, top_n=3)

print(f"\nDemand Analysis:")
print(f"   Current Skill: {analysis['Primary_Skill']}")
print(f"   In Demand: {'YES' if analysis['Is_In_Demand'] else 'NO'}")
print(f"   Strategy: {analysis.get('Strategy', 'N/A')}")

print(f"\nRecommended Courses:")
for i, rec in enumerate(recs, 1):
    demand_badge = "[HIGH DEMAND]" if rec['Tech_In_Demand'] else ""
    print(f"   {i}. {rec['Course_Name']} ({rec['Course_Category']}) - {rec['Confidence']:.1%} {demand_badge}")

DEMAND-AWARE COURSE RECOMMENDATION SYSTEM

>>> TEST 1: Employee with IN-DEMAND skill (Python)
Profile: Python developer, G5

Demand Analysis:
   - Current Skill: Python
   - In Demand: YES
   - Demand Score: 8
   - Strategy: SKILL_IN_DEMAND
   - Related Skills Also In Demand:
      * ML (Demand: 6)
      * Spark (Demand: 4)
      * PyTorch (Demand: 2)

Recommended Courses:
   1. Machine Learning with Python (Data Science) - 95.0% [HIGH DEMAND] [HIGH]
   2. Data Analysis and Visualization (Analytics) - 80.0% [HIGH DEMAND] [HIGH]
   3. Data Pipeline Engineering (Data Engineering) - 65.0% [HIGH DEMAND] [HIGH]

>>> TEST 2: Employee with skill NOT IN DEMAND (COBOL)
Profile: COBOL developer, G4

Demand Analysis:
   - Current Skill: COBOL
   - In Demand: NO
   - Strategy: NEED_PIVOT

Recommended Courses (for skill pivot):
   1. Full Stack Web Development (Development) - 92.9% [HIGH DEMAND]
   2. Machine Learning with Python (Data Science) - 0.8% [HIGH DEMAND]
   3. Data Pipeline Engineering (

In [None]:
# Process BENCH employees with demand-aware recommendations
print("BENCH EMPLOYEES - DEMAND-AWARE TRAINING RECOMMENDATIONS")
print("")

# Sample bench employees for demonstration
sample_bench = bench_df_clean.head(15).copy()

results = []
for _, emp in sample_bench.iterrows():
    emp_profile = {
        'Emp_Id': emp['Emp_Id'],
        'Grade': emp['Grade'],
        'Department': emp['Department'],
        'Primary_Skill': emp['Primary_Skill'],
        'Secondary_Skill': emp.get('Secondary_Skill', emp['Primary_Skill']),
        'Location': emp['Location'],
        'Course_Category': 'Development',  # Default
        'Business_Priority': 'High',
        'Career_Goal': 'Senior Developer'
    }
    
    recs, analysis = recommend_course_demand_aware(emp_profile, top_n=1)
    
    results.append({
        'Emp_Id': emp['Emp_Id'],
        'Name': emp['Employee_Name'],
        'Current_Skill': emp['Primary_Skill'],
        'In_Demand': 'Yes' if analysis['Is_In_Demand'] else 'No',
        'Pivot_To': analysis.get('Recommended_Pivot', '-'),
        'Recommended_Course': recs[0]['Course_Name'] if recs else 'N/A',
        'Confidence': f"{recs[0]['Confidence']:.1%}" if recs else 'N/A'
    })

results_df = pd.DataFrame(results)
print("\n")
print(results_df.to_string(index=False))

# Summary statistics
in_demand_count = sum(1 for r in results if r['In_Demand'] == 'Yes')
pivot_count = sum(1 for r in results if r['Pivot_To'] != '-')

print(f"\nSUMMARY:")
print(f"   Total Bench Employees Analyzed: {len(results)}")
print(f"   Skills Already In Demand: {in_demand_count}")
print(f"   Recommended for Skill Pivot: {pivot_count}")

BENCH EMPLOYEES - DEMAND-AWARE TRAINING RECOMMENDATIONS


 Emp_Id           Name   Current_Skill In_Demand Pivot_To              Recommended_Course Confidence
    104    Daniel Park            Java       Yes        -         Backend API Development      95.0%
    107    George Wang      JavaScript       Yes        -      Full Stack Web Development      95.0%
    111  Kevin O'Brien            Java       Yes        -         Backend API Development      95.0%
    115 Oscar Martinez           Linux       Yes        -     Linux System Administration      95.0%
    118   Rachel Green              Go       Yes        -    Software Architecture Design      95.0%
    124    Xavier Ross         Windows        No        -      Full Stack Web Development      89.0%
    128  Brandon Scott          Appium        No        - Automated Testing with Selenium       0.8%
    134    Henry Adams         Tableau        No        -      Full Stack Web Development      86.3%
    139  Maria Johnson        Cuc

In [None]:
# Test with JSON file - DEMAND-AWARE validation
import json

with open('test_employees.json', 'r') as f:
    test_records = json.load(f)

print("DEMAND-AWARE RECOMMENDATION TEST RESULTS")
print("")

course_match_count = 0
demand_match_count = 0
pivot_match_count = 0
total = len(test_records)

results_summary = []

for emp in test_records:
    emp_id = emp.get('Emp_Id')
    name = emp.get('Employee_Name')
    expected_course = emp.get('Expected_Course')
    expected_in_demand = emp.get('Skill_In_Demand', True)
    expected_pivot = emp.get('Expected_Pivot')
    scenario = emp.get('Test_Scenario', '')
    
    # Get demand-aware recommendations
    recs, analysis = recommend_course_demand_aware(emp, top_n=1)
    
    predicted_course = recs[0]['Course_Name'] if recs else 'N/A'
    actual_in_demand = analysis['Is_In_Demand']
    actual_pivot = analysis.get('Recommended_Pivot')
    
    # Check matches
    course_matched = predicted_course == expected_course
    demand_matched = actual_in_demand == expected_in_demand
    pivot_matched = (expected_pivot is None and actual_pivot is None) or \
                   (expected_pivot and actual_pivot and expected_pivot.lower() in actual_pivot.lower())
    
    if course_matched:
        course_match_count += 1
    if demand_matched:
        demand_match_count += 1
    if pivot_matched:
        pivot_match_count += 1
    
    # Status indicators
    course_status = "MATCH" if course_matched else "MISS"
    demand_status = "MATCH" if demand_matched else "MISS"
    pivot_status = "MATCH" if pivot_matched else "WARN"
    
    results_summary.append({
        'Emp_Id': emp_id,
        'Name': name[:15],
        'Skill': emp.get('Primary_Skill')[:12],
        'In_Demand': 'Yes' if actual_in_demand else 'No',
        'Pivot': actual_pivot[:10] if actual_pivot else '-',
        'Course': predicted_course[:25],
        'Match': course_status
    })

# Print results table
print("\n")
results_df = pd.DataFrame(results_summary)
print(results_df.to_string(index=False))

# Print summary
print("\nSUMMARY")
print(f"Total Test Cases: {total}")
print(f"Course Predictions Matched: {course_match_count}/{total} ({course_match_count/total*100:.1f}%)")
print(f"Demand Status Correct: {demand_match_count}/{total} ({demand_match_count/total*100:.1f}%)")
print(f"Pivot Recommendations Correct: {pivot_match_count}/{total} ({pivot_match_count/total*100:.1f}%)")

# Breakdown by scenario type
in_demand_cases = [e for e in test_records if e.get('Skill_In_Demand', True)]
not_in_demand_cases = [e for e in test_records if not e.get('Skill_In_Demand', True)]
print(f"\nTest Case Breakdown:")
print(f"   IN_DEMAND scenarios: {len(in_demand_cases)}")
print(f"   NOT_IN_DEMAND scenarios (pivot needed): {len(not_in_demand_cases)}")

DEMAND-AWARE RECOMMENDATION TEST RESULTS


Emp_Id            Name        Skill In_Demand Pivot                    Course Match
  E901   Alex Thompson       Python       Yes     - Machine Learning with Pyt  MISS
  E902    Priya Sharma Machine Lear        No     - Machine Learning with Pyt MATCH
  E903  Marcus Johnson   JavaScript       Yes     - Full Stack Web Developmen MATCH
  E904   Lisa Anderson          AWS       Yes     -    AWS Cloud Practitioner MATCH
  E905     Rahul Verma          SQL       Yes     - Data Analysis and Visuali MATCH
  E906 Sophia Martinez         Java       Yes     -   Backend API Development MATCH
  E907   Carlos Rivera     Selenium       Yes     - Automated Testing with Se MATCH
  E908   Jennifer Wong      Node.js       Yes     -   Backend API Development  MISS
  E909    Ahmed Hassan        Spark       Yes     - Data Pipeline Engineering MATCH
  E910      Emily Chen           Go       Yes     - Software Architecture Des  MISS
  E911     Tom Bradley        COB

In [None]:
# Full function to get recommendation for any employee (bench or otherwise)
def get_training_recommendation(emp_id=None, emp_name=None, from_bench=True):
    """
    Get demand-aware training recommendations for an employee.
    Can lookup from bench data or accept custom profile.
    """
    if from_bench and (emp_id or emp_name):
        # Find employee in bench data
        if emp_id:
            emp_row = bench_df_clean[bench_df_clean['Emp_Id'] == emp_id]
        else:
            emp_row = bench_df_clean[bench_df_clean['Employee_Name'].str.contains(emp_name, case=False, na=False)]
        
        if emp_row.empty:
            return None, f"Employee not found in bench data"
        
        emp = emp_row.iloc[0]
        profile = {
            'Grade': emp['Grade'],
            'Department': emp['Department'],
            'Primary_Skill': emp['Primary_Skill'],
            'Secondary_Skill': emp.get('Secondary_Skill', emp['Primary_Skill']),
            'Course_Category': 'Development',
            'Business_Priority': 'High',
            'Career_Goal': 'Senior Developer'
        }
    else:
        return None, "Please provide emp_id or emp_name"
    
    return recommend_course_demand_aware(profile, top_n=3)

# Example usage
print("INDIVIDUAL EMPLOYEE LOOKUP")
print("")

# Lookup a specific bench employee
recs, analysis = get_training_recommendation(emp_id=104)
if recs:
    print(f"Employee ID: 104")
    print(f"   Current Skill: {analysis['Primary_Skill']}")
    print(f"   In Demand: {'YES' if analysis['Is_In_Demand'] else 'NO'}")
    if analysis.get('Recommended_Pivot'):
        print(f"   Pivot Recommendation: {analysis['Recommended_Pivot']}")
    print(f"\n   Training Recommendations:")
    for i, rec in enumerate(recs, 1):
        print(f"      {i}. {rec['Course_Name']} - {rec['Confidence']:.1%}")

INDIVIDUAL EMPLOYEE LOOKUP

Employee ID: 104
   Current Skill: Java
   In Demand: YES

   Training Recommendations:
      1. Backend API Development - 95.0%
      2. Full Stack Web Development - 93.3%
      3. Software Architecture Design - 80.0%
