In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv('employee_training.csv')
print(f"Dataset: {df.shape[0]} employees, {df.shape[1]} features")

Dataset: 402 employees, 28 features


In [3]:
data = df.copy()

# Fill missing values for key training features
data['Grade'] = data['Grade'].fillna('G1')
data['Department'] = data['Department'].fillna('Unknown')
data['Primary_Skill'] = data['Primary_Skill'].fillna('Unknown')
data['Secondary_Skill'] = data['Secondary_Skill'].fillna('Unknown')
data['Course_Category'] = data['Course_Category'].fillna('Unknown')
data['Business_Priority'] = data['Business_Priority'].fillna('Medium')
data['Career_Goal'] = data['Career_Goal'].fillna('Unknown')
data['Course_Name'] = data['Course_Name'].fillna('Unknown Course')

# Derive numeric grade features
data['Grade_Num'] = data['Grade'].str.extract("(\d+)").astype(int)
experience_map = {1: 0, 2: 0.5, 3: 1.5, 4: 3, 5: 5, 6: 7, 7: 10, 8: 12, 9: 15, 10: 18}
data['Experience_Level'] = data['Grade_Num'].map(experience_map).fillna(0)

# Add Skill_Gap_Score and Performance_Rating if available
if 'Skill_Gap_Score' in data.columns:
    data['Skill_Gap_Score'] = data['Skill_Gap_Score'].fillna(data['Skill_Gap_Score'].median())
else:
    data['Skill_Gap_Score'] = 0.3

if 'Performance_Rating' in data.columns:
    data['Performance_Rating'] = data['Performance_Rating'].fillna(data['Performance_Rating'].median())
else:
    data['Performance_Rating'] = 4.0

# Create interaction features
data['Grade_Skill_Interaction'] = data['Grade_Num'] * data['Skill_Gap_Score']
data['Grade_Performance'] = data['Grade_Num'] * data['Performance_Rating']

# Encode all categorical features for training
label_encoders = {}
for col in ['Department', 'Primary_Skill', 'Secondary_Skill', 'Course_Category', 'Business_Priority', 'Career_Goal']:
    values = pd.concat([data[col].astype(str), pd.Series(['Unknown'])], ignore_index=True)
    le = LabelEncoder()
    le.fit(values)
    data[f'{col}_Encoded'] = le.transform(data[col].astype(str))
    label_encoders[col] = le

# Encode the target course name
target_encoder = LabelEncoder()
data['Target'] = target_encoder.fit_transform(data['Course_Name'].astype(str))

# Store reference catalog
course_catalog = data[['Course_Name', 'Course_Category']].drop_duplicates()

print(f"Prepared dataset with {data['Target'].nunique()} unique training modules")

Prepared dataset with 14 unique training modules


  data['Grade_Num'] = data['Grade'].str.extract("(\d+)").astype(int)


In [4]:
from sklearn.preprocessing import StandardScaler

# Feature columns with expanded attributes
feature_cols = [
    'Grade_Num',
    'Experience_Level',
    'Department_Encoded',
    'Primary_Skill_Encoded',
    'Secondary_Skill_Encoded',
    'Course_Category_Encoded',
    'Business_Priority_Encoded',
    'Career_Goal_Encoded',
    'Skill_Gap_Score',
    'Performance_Rating',
    'Grade_Skill_Interaction',
    'Grade_Performance'
]

# Train-test split
X, y = data[feature_cols], data['Target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features for better performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Train set: {X_train.shape[0]} rows | Test set: {X_test.shape[0]} rows")
print(f"Unique courses in training: {y_train.nunique()}")
print(f"Total features: {len(feature_cols)}")

Train set: 321 rows | Test set: 81 rows
Unique courses in training: 14
Total features: 12


In [5]:
import warnings
warnings.filterwarnings('ignore')

# Filter out courses with only 1 sample (can't stratify with single samples)
course_counts = data['Course_Name'].value_counts()
valid_courses = course_counts[course_counts >= 2].index
data_filtered = data[data['Course_Name'].isin(valid_courses)].copy()

print(f"Original courses: {data['Course_Name'].nunique()}")
print(f"Filtered courses (≥2 samples): {data_filtered['Course_Name'].nunique()}")
print(f"Removed {data['Course_Name'].nunique() - data_filtered['Course_Name'].nunique()} courses with single samples")

# Re-encode target with filtered data
target_encoder = LabelEncoder()
data_filtered['Target'] = target_encoder.fit_transform(data_filtered['Course_Name'].astype(str))

# Update course catalog with filtered data
course_catalog = data_filtered[['Course_Name', 'Course_Category']].drop_duplicates()

# Update X and y with filtered data
X_filtered = data_filtered[feature_cols]
y_filtered = data_filtered['Target']


X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42
)

# Scale features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nTrain set: {X_train.shape[0]} rows")
print(f"Test set: {X_test.shape[0]} rows")


# XGBoost optimized for better accuracy with sufficient data
model = XGBClassifier(
    n_estimators=800,          # More trees for better learning
    max_depth=6,               # Deeper trees now that we have more data
    learning_rate=0.05,        # Lower learning rate for better convergence
    subsample=0.85,            # Use 85% of data per tree
    colsample_bytree=0.85,     # Use 85% of features per tree
    min_child_weight=2,        # Require 2 samples per leaf
    gamma=0.2,                 # Moderate regularization
    reg_alpha=0.5,             # L1 regularization
    reg_lambda=1.5,            # L2 regularization
    scale_pos_weight=1,        
    random_state=42,
    n_jobs=-1,
    eval_metric='mlogloss',
    early_stopping_rounds=50
)

# Train with evaluation set for early stopping
eval_set = [(X_train_scaled, y_train), (X_test_scaled, y_test)]
model.fit(X_train_scaled, y_train, eval_set=eval_set, verbose=False)

test_predictions = model.predict(X_test_scaled)
train_predictions = model.predict(X_train_scaled)
test_acc = accuracy_score(y_test, test_predictions)
train_acc = accuracy_score(y_train, train_predictions)


print(f"Train Accuracy: {train_acc:.1%}")
print(f"Test Accuracy: {test_acc:.1%}")
print(f"Overfitting Gap: {(train_acc - test_acc):.1%}")


Original courses: 14
Filtered courses (≥2 samples): 14
Removed 0 courses with single samples

Train set: 321 rows
Test set: 81 rows
Train Accuracy: 99.4%
Test Accuracy: 96.3%
Overfitting Gap: 3.1%


In [6]:
# Enhanced recommendation helper with all training features
def recommend_course(employee, top_n=3):
    grade_value = str(employee.get('Grade', 'G3'))
    digits = ''.join(ch for ch in grade_value if ch.isdigit())
    grade_num = int(digits) if digits else 3
    
    skill_gap = employee.get('Skill_Gap_Score', 0.3)
    performance = employee.get('Performance_Rating', 4.0)

    profile = {
        'Grade_Num': grade_num,
        'Experience_Level': experience_map.get(grade_num, 0.0),
        'Skill_Gap_Score': skill_gap,
        'Performance_Rating': performance,
        'Grade_Skill_Interaction': grade_num * skill_gap,
        'Grade_Performance': grade_num * performance
    }

    for col in ['Department', 'Primary_Skill', 'Secondary_Skill', 'Course_Category', 'Business_Priority', 'Career_Goal']:
        encoder = label_encoders[col]
        value = str(employee.get(col, 'Unknown') or 'Unknown')
        if value not in encoder.classes_:
            value = 'Unknown'
        profile[f'{col}_Encoded'] = int(encoder.transform([value])[0])

    X_new = pd.DataFrame([profile])[feature_cols]
    X_new_scaled = scaler.transform(X_new)

    probabilities = model.predict_proba(X_new_scaled)[0]
    top_indices = np.argsort(probabilities)[::-1][:top_n]

    recommendations = []
    for idx in top_indices:
        course_name = target_encoder.inverse_transform([idx])[0]
        confidence = probabilities[idx]
        catalog_row = course_catalog[course_catalog['Course_Name'] == course_name]
        course_category = catalog_row['Course_Category'].iloc[0] if not catalog_row.empty else 'Unknown'
        recommendations.append({
            'Course_Name': course_name,
            'Course_Category': course_category,
            'Confidence': confidence
        })

    return recommendations

In [7]:
test_emp = {
    'Grade': 'G5',
    'Department': 'Engineering',
    'Primary_Skill': 'Terraform',
    'Secondary_Skill': 'Ansible',
    'Course_Category': 'Cloud',
    'Business_Priority': 'Critical',
    'Career_Goal': 'Cloud Architect'
}

print(f"Requesting recommendations for: {test_emp}\n")

recs = recommend_course(test_emp, top_n=1)
print("Top 3 Recommended Courses:")
for i, rec in enumerate(recs, 1):
    print(f"{i}. {rec['Course_Name']}")
    print(f"   Category: {rec['Course_Category']}")
    print(f"   Confidence: {rec['Confidence']:.1%}\n")

Requesting recommendations for: {'Grade': 'G5', 'Department': 'Engineering', 'Primary_Skill': 'Terraform', 'Secondary_Skill': 'Ansible', 'Course_Category': 'Cloud', 'Business_Priority': 'Critical', 'Career_Goal': 'Cloud Architect'}

Top 3 Recommended Courses:
1. AWS Cloud Practitioner
   Category: Cloud
   Confidence: 88.3%



In [8]:
import json

# Load all user records from JSON and generate recommendations for each
with open('test_employees.json', 'r') as f:
    user_records = json.load(f)

print(f"Loaded {len(user_records)} user profiles\n")

matched_count = 0
total_count = 0

for user in user_records:
    display_name = user.get('Emp_Id') or user.get('Employee_Name', 'Unknown')
    expected_course = user.get('Expected_Course', 'N/A')
    
    recommendations = recommend_course(user, top_n=1)
    
    if recommendations:
        predicted_course = recommendations[0]['Course_Name']
        course_category = recommendations[0]['Course_Category']
        
        if predicted_course == expected_course:
            match_status = "Matched"
            matched_count += 1
        else:
            match_status = "Not Matched"
        
        total_count += 1
        
        print(f"{display_name}: {predicted_course} (Category: {course_category}) | {match_status}")

print(f"Accuracy: {matched_count}/{total_count} ({matched_count/total_count*100:.1f}% matched)")


Loaded 15 user profiles

E901: Backend API Development (Category: Backend) | Matched
E902: Machine Learning with Python (Category: Data Science) | Matched
E903: Automated Testing with Selenium (Category: Testing) | Not Matched
E904: AWS Cloud Practitioner (Category: Cloud) | Matched
E905: Data Analysis and Visualization (Category: Analytics) | Matched
E906: Backend API Development (Category: Backend) | Matched
E907: Automated Testing with Selenium (Category: Testing) | Matched
E908: DevOps and CI/CD Pipeline (Category: DevOps) | Matched
E909: Data Pipeline Engineering (Category: Data Engineering) | Matched
E910: Cybersecurity Essentials (Category: Security) | Matched
E911: Backend API Development (Category: Backend) | Matched
E912: Automated Testing with Selenium (Category: Testing) | Not Matched
E913: Automated Testing with Selenium (Category: Testing) | Matched
E914: Automated Testing with Selenium (Category: Testing) | Not Matched
E915: Linux System Administration (Category: Infrast