In [3]:
import pandas as pd
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


In [4]:
DATA_PATH = "Skill_Job_Matching_Dataset.csv"
df = pd.read_csv(DATA_PATH)

In [6]:
df.info

<bound method DataFrame.info of      Student_ID  Age  Gender Vocational_Program  Academic_Performance  \
0         S0001   24    Male             Welder                 51.38   
1         S0002   21    Male            Plumber                 67.83   
2         S0003   28  Female           Mechanic                 82.67   
3         S0004   25  Female        Electrician                 85.41   
4         S0005   22    Male      IT Technician                 53.47   
...         ...  ...     ...                ...                   ...   
2804      S2805   24    Male      IT Technician                 80.30   
2805      S2806   29    Male          Carpenter                 79.27   
2806      S2807   28    Male           Mechanic                 78.37   
2807      S2808   23    Male          Carpenter                 58.01   
2808      S2809   23    Male            Plumber                 58.44   

      Certifications_Count  Internship_Experience  Skill_1  Skill_2  Skill_3  \
0          

In [7]:
cols = [c.lower() for c in df.columns]
title_col = next((df.columns[i] for i, c in enumerate(cols) if c in ["job_title", "title", "jobtitle", "job title"]), None)
skills_col = next((df.columns[i] for i, c in enumerate(cols) if c in ["skills", "skill_set", "required_skills", "requirements", "key_skills"]), None)
desc_col = next((df.columns[i] for i, c in enumerate(cols) if "description" in c), None)
exp_col = next((df.columns[i] for i, c in enumerate(cols) if "experience" in c), None)
edu_col = next((df.columns[i] for i, c in enumerate(cols) if "education" in c), None)
if not title_col:
    title_col = df.columns[0]

In [8]:
feature_cols = []
if skills_col: feature_cols.append(skills_col)
if desc_col: feature_cols.append(desc_col)
if exp_col: feature_cols.append(exp_col)
if edu_col: feature_cols.append(edu_col)

if not feature_cols:
    text_cols = df.select_dtypes(include=['object']).columns.tolist()
    if title_col in text_cols:
        text_cols.remove(title_col)
    feature_cols = text_cols

df["skills_text"] = df[feature_cols].astype(str).agg(" ".join, axis=1)


In [9]:
def clean_text(s):
    s = str(s).lower()
    s = re.sub(r'[^a-z0-9,; ]+', ' ', s)
    return re.sub(r'\s+', ' ', s).strip()

df["skills_text"] = df["skills_text"].apply(clean_text)

In [10]:
X = df["skills_text"]
y = df[title_col].astype(str)


In [11]:
vectorizer = TfidfVectorizer(max_features=15000, ngram_range=(1, 2), stop_words='english')
X_vec = vectorizer.fit_transform(X)


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
model.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Model Accuracy: 0.15658362989323843

Classification Report:
                        precision    recall  f1-score   support

  Automobile Mechanic       0.00      0.00      0.00        88
Electrical Technician       0.00      0.00      0.00        83
IT Support Technician       0.00      0.00      0.00       100
   Plumbing Assistant       0.00      0.00      0.00       102
     Welding Operator       0.12      0.05      0.07        98
   Woodwork Assistant       0.16      0.91      0.27        91

             accuracy                           0.16       562
            macro avg       0.05      0.16      0.06       562
         weighted avg       0.05      0.16      0.06       562



In [14]:
model_bundle = {
    "vectorizer": vectorizer,
    "model": model,
    "title_col": title_col,
    "skills_col": skills_col,
    "description_col": desc_col,
    "experience_col": exp_col,
    "education_col": edu_col,
    "accuracy": accuracy
}

with open("skill_gap_ml_model.pkl", "wb") as f:
    pickle.dump(model_bundle, f)

print("\nModel saved as skill_gap_ml_model.pkl")


Model saved as skill_gap_ml_model.pkl


In [16]:
def predict_job_and_gap(candidate_skills):
    skills_clean = clean_text(candidate_skills)
    X_vec = vectorizer.transform([skills_clean])
    predicted_job = model.predict(X_vec)[0]
    missing_skills = []
    if skills_col:
        matched_jobs = df[df[title_col] == predicted_job]
        if not matched_jobs.empty:
            job_skills = " ".join(matched_jobs[skills_col].astype(str))
            job_skills_list = [s.strip().lower() for s in re.split(r'[;,]\s*', job_skills) if s.strip()]
            candidate_set = set([s.strip().lower() for s in re.split(r'[;,]\s*', candidate_skills)])
            missing_skills = list(set(job_skills_list) - candidate_set)

    return predicted_job, missing_skills


In [17]:
candidate = "python, machine learning, pandas, sql"
job, gap = predict_job_and_gap(candidate)
print("\nCandidate Skills:", candidate)
print("Predicted Suitable Job:", job)


Candidate Skills: python, machine learning, pandas, sql
Predicted Suitable Job: Woodwork Assistant
