In [1]:
import pandas as pd

# Reads the employee training CSV file for model training.
df = pd.read_csv('Employee_training.csv')
df.head(200)

Unnamed: 0,Emp_Id,Grade,Department,Primary_Skill,Secondary_Skill,Course_Category,Duration_Hours,Delivery_Mode,Business_Priority,Skill_Gap_Score,Availability_Hours_Per_Week,Bench_Status,Performance_Rating,Learning_Style,Career_Goal,Completion_Percentage,Assessment_Score,Training_Success
0,E001,G5,Engineering,Java,Spring Boot,Backend,50.0,Hybrid,High,0.25,0.0,Active,4.2,Hands-on,Tech Lead,100.0,85.0,Pass
1,E002,G3,Engineering,JavaScript,React,Development,45.0,Online,High,0.30,0.0,Active,3.8,Visual,Senior Developer,100.0,78.0,Pass
2,E003,G6,Engineering,Java,Microservices,Architecture,70.0,Hybrid,Critical,0.40,10.0,Active,4.5,Reading,Architect,65.0,0.0,Fail
3,E004,G3,IT Support,Linux,Shell Scripting,Infrastructure,40.0,Hybrid,High,0.20,0.0,Active,3.6,Hands-on,DevOps Engineer,100.0,82.0,Pass
4,E005,G5,Engineering,Go,Kubernetes,DevOps,50.0,Hybrid,Critical,0.15,0.0,Active,4.3,Hands-on,SRE Lead,100.0,88.0,Pass
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,E095,G4,Engineering,Webpack,Vite,Development,30.0,Online,High,0.30,0.0,Active,3.9,Visual,Build Eng,100.0,78.0,Pass
95,E096,G6,QA,Chaos,Resilience,Security,60.0,Hybrid,Critical,0.20,0.0,Active,4.4,Hands-on,SRE Tester,100.0,87.0,Pass
96,E097,G4,Engineering,Electron,Tauri,Development,45.0,Online,High,0.50,20.0,Bench,3.6,Visual,Desktop Dev,0.0,0.0,Fail
97,E098,G4,QA,Performance,Metrics,Testing,20.0,Online,High,0.25,0.0,Active,3.8,Hands-on,Perf Analyst,100.0,80.0,Pass


In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Clean and derived features
df['Grade'] = df['Grade'].str.replace('G','')
df['Skill_Match'] = df.apply(lambda row: 1 if row['Primary_Skill'] == row['Secondary_Skill'] else 0, axis=1)
df['Bench_Status'] = df.apply(lambda row: 1 if row['Bench_Status'] == 'Active' else 0, axis=1)

features = [
    'Grade','Department','Primary_Skill','Secondary_Skill','Course_Category',
    'Delivery_Mode','Business_Priority','Skill_Gap_Score','Performance_Rating',
    'Skill_Match', 'Bench_Status'
]
target = 'Training_Success'

# Spliting Data into featires and target for training and testing purpose 
data = df.copy()
x = data[features]
y = data[target]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

print(f"============== check training features ================")
print(x_train.head(5))
print(f"============== check training target ================")
print(y_train.head(5))   

   Grade    Department Primary_Skill Secondary_Skill Course_Category  \
49     6   Engineering       Clojure   ClojureScript     Programming   
70     5    IT Support           AWS          Lambda           Cloud   
68     3            QA        TestNG           JUnit     Programming   
15     4  Data Science           SQL      PostgreSQL       Analytics   
39     4            QA      Selenium         Cypress     Programming   

   Delivery_Mode Business_Priority  Skill_Gap_Score  Performance_Rating  \
49        Hybrid              High             0.30                 4.2   
70        Hybrid          Critical             0.20                 4.2   
68        Online              High             0.35                 3.5   
15        Hybrid              High             0.25                 4.0   
39        Online              High             0.30                 3.9   

    Skill_Match  Bench_Status  
49            0             1  
70            0             1  
68            0     

In [3]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # One-Hot Encoding for categorical features
    categorical_features = [
    'Department', 'Primary_Skill', 'Secondary_Skill',
    'Course_Category', 'Delivery_Mode', 'Business_Priority'
    ]
    numerical_features = [
        'Grade', 'Skill_Gap_Score', 'Performance_Rating', 'Skill_Match', 'Bench_Status'
    ]

    preprocessor = ColumnTransformer([
        ('categorical', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ], remainder='passthrough')

    # Fit encoder and transform data
    preprocessor.fit(x_train)
    x_train_preprocessed = preprocessor.transform(x_train)
    x_test_preprocessed = preprocessor.transform(x_test)

In [4]:
# Train and evaluating the Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(x_train_preprocessed, y_train)
y_pred = model.predict(x_test_preprocessed)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")




Model Accuracy: 0.85


In [5]:
def predict_training_success(input_df):
    # Preprocess input using the same preprocessor as training
    input_preprocessed = preprocessor.transform(input_df)
    # Predict using the trained model
    return model.predict(input_preprocessed)


In [6]:
import warnings

def predict_from_params(
    grade, department, primary_skill, secondary_skill, course_category,
    delivery_mode, business_priority, skill_gap_score, performance_rating, bench_status
):
    # Remove 'G' from grade if present, as in your feature engineering
    grade_clean = str(grade).replace('G', '')
    # Skill_Match: 1 if primary_skill == secondary_skill else 0
    skill_match = 1 if primary_skill == secondary_skill else 0
    # Bench_Status: 1 if 'Active', 0 otherwise (as per your feature engineering)
    bench_status_num = 1 if bench_status == 'Active' else 0

    input_df = pd.DataFrame([{
        'Grade': grade_clean,
        'Department': department,
        'Primary_Skill': primary_skill,
        'Secondary_Skill': secondary_skill,
        'Course_Category': course_category,
        'Delivery_Mode': delivery_mode,
        'Business_Priority': business_priority,
        'Skill_Gap_Score': skill_gap_score,
        'Performance_Rating': performance_rating,
        'Skill_Match': skill_match,
        'Bench_Status': bench_status_num
    }])
    return predict_training_success(input_df)[0]



In [7]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    result = predict_from_params(
        'G6', 
        'Engineering', 
        'Observability', 
        'OpenTelemetry', 'DevOps', 'Hybrid', 'Critical', 0.15, 4.6, 'Active'
    )
    print("Prediction:", result)



Prediction: Pass


In [8]:
import json
import pandas as pd
import warnings

newdf = pd.read_json('new_employees.json')
countoriginal = 0
countmatched = 0

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print("Manual Test Results on new_employees.json")
    for _, record in newdf.iterrows():
        countoriginal += 1
        predi = predict_from_params(
            record['Grade'],
            record['Department'],
            record['Primary_Skill'],
            record['Secondary_Skill'],
            record['Course_Category'],
            record['Delivery_Mode'],
            record['Business_Priority'],
            record['Skill_Gap_Score'],
            record['Performance_Rating'],
            record['Bench_Status']
        )
        if predi == record['Training_Success']:
            print(f"Record {record['Emp_Id']}: Prediction matches actual outcome. [Prediction: {predi}]")
            countmatched += 1
        else:
            print(f"Record {record['Emp_Id']}: No Match. [Prediction: {predi}, Actual: {record['Training_Success']}]")
    print(f"Total Records Processed: {countoriginal}")
    print(f"Total Matches: {countmatched}")
    print(f"total percentage of matched output : {countmatched / countoriginal * 100:.2f}% ")


Manual Test Results on new_employees.json
Record E201: Prediction matches actual outcome. [Prediction: Fail]
Record E202: Prediction matches actual outcome. [Prediction: Pass]
Record E203: Prediction matches actual outcome. [Prediction: Fail]
Record E204: Prediction matches actual outcome. [Prediction: Pass]
Record E205: No Match. [Prediction: Fail, Actual: nan]
Record E206: Prediction matches actual outcome. [Prediction: Pass]
Record E207: Prediction matches actual outcome. [Prediction: Fail]
Record E208: Prediction matches actual outcome. [Prediction: Pass]
Record E209: Prediction matches actual outcome. [Prediction: Fail]
Record E210: Prediction matches actual outcome. [Prediction: Pass]
Record E211: No Match. [Prediction: Pass, Actual: Fail]
Record E212: Prediction matches actual outcome. [Prediction: Pass]
Record E213: Prediction matches actual outcome. [Prediction: Fail]
Record E214: Prediction matches actual outcome. [Prediction: Pass]
Record E215: Prediction matches actual outc

In [9]:
import json
import pandas as pd
import warnings

newdf = pd.read_json('new_employees.json')

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for _, record in newdf.iterrows():
        if record['Primary_Skill'] == record['Secondary_Skill']:
            print(f"Record {record['Emp_Id']}: Primary and Secondary skills match.")
        else:
            print(f"Record {record['Emp_Id']}: Primary and Secondary skills do not match.")

Record E201: Primary and Secondary skills do not match.
Record E202: Primary and Secondary skills do not match.
Record E203: Primary and Secondary skills do not match.
Record E204: Primary and Secondary skills do not match.
Record E205: Primary and Secondary skills do not match.
Record E206: Primary and Secondary skills do not match.
Record E207: Primary and Secondary skills do not match.
Record E208: Primary and Secondary skills do not match.
Record E209: Primary and Secondary skills do not match.
Record E210: Primary and Secondary skills do not match.
Record E211: Primary and Secondary skills do not match.
Record E212: Primary and Secondary skills do not match.
Record E213: Primary and Secondary skills do not match.
Record E214: Primary and Secondary skills do not match.
Record E215: Primary and Secondary skills do not match.
Record E216: Primary and Secondary skills do not match.
Record E217: Primary and Secondary skills do not match.
Record E218: Primary and Secondary skills do not