In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import json
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import load_model

# -----------------------------
# Config / Paths
# -----------------------------
# Directory holding the trained artifacts and the prediction history logs.
# Set the SKILL_REC_OUTPUT_DIR environment variable to run on another machine;
# when it is unset or empty, the original hard-coded location is used.
OUTPUT_DIR = (
    os.environ.get("SKILL_REC_OUTPUT_DIR")
    or r"C:\Users\NXTWAVE\Downloads\Job Skill Recommendation"
)

# Trained artifacts read at start-up.
TFIDF_PATH = os.path.join(OUTPUT_DIR, "tfidf_vectorizer.pkl")
MLB_PATH = os.path.join(OUTPUT_DIR, "mlb.pkl")
MODEL_PATH = os.path.join(OUTPUT_DIR, "skill_model.keras")

# History files that every prediction is appended to.
CSV_LOG_PATH = os.path.join(OUTPUT_DIR, "skill_recommendation_history.csv")
JSON_LOG_PATH = os.path.join(OUTPUT_DIR, "skill_recommendation_history.json")

# Create output dir just in case
os.makedirs(OUTPUT_DIR, exist_ok=True)

# -----------------------------
# Load artifacts
# -----------------------------
# Check every artifact up front so the error can name exactly what is missing.
_missing_artifacts = [
    path
    for path in (TFIDF_PATH, MLB_PATH, MODEL_PATH)
    if not os.path.exists(path)
]
if _missing_artifacts:
    _missing_message = (
        "One or more model artifacts are missing. Make sure tfidf_vectorizer.pkl, mlb.pkl and skill_model.keras exist in:\n"
        f"{OUTPUT_DIR}\n"
        f"Missing: {', '.join(_missing_artifacts)}"
    )
    raise FileNotFoundError(_missing_message)

# NOTE: pickle can execute arbitrary code while loading. Only point these
# paths at artifacts produced by your own training run.
with open(TFIDF_PATH, "rb") as f:
    tfidf = pickle.load(f)

with open(MLB_PATH, "rb") as f:
    mlb = pickle.load(f)

model = load_model(MODEL_PATH)

# -----------------------------
# Preprocessing (same as training)
# -----------------------------
# Fetch the NLTK corpora used below for the stopword list and the WordNet
# lemmatizer; quiet=True is passed so the download call stays silent.
for _corpus in ("stopwords", "wordnet"):
    nltk.download(_corpus, quiet=True)

# Shared by preprocess_text(): lemmatizer instance and English stopword set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    """
    Normalize free text into a space-separated string of lemmatized tokens.

    Steps: treat None as empty, coerce to str, lowercase, replace every
    character that is not a-z, 0-9 or whitespace with a space, split on
    whitespace, drop tokens found in ``stop_words``, lemmatize the rest with
    ``lemmatizer`` and join them with single spaces.
    """
    raw = "" if text is None else str(text)
    cleaned = re.sub(r"[^a-z0-9\s]", " ", raw.lower())

    kept = []
    for word in cleaned.split():
        if word in stop_words:
            continue  # stopwords are dropped before lemmatization
        kept.append(lemmatizer.lemmatize(word))
    return " ".join(kept)

# -----------------------------
# Recommendation function
# -----------------------------
def recommend_skills(user_skills, job_title="", industry="", top_n=10, threshold=0.5):
    """
    Return up to ``top_n`` skills predicted by the trained multi-label model.

    The user's skills, target job title and industry are joined into one text,
    cleaned with ``preprocess_text``, vectorized with the fitted ``tfidf`` and
    scored by ``model``. Labels scoring at least ``threshold`` are returned
    most-probable first, so truncating to ``top_n`` keeps the strongest
    predictions rather than whichever labels come first in ``mlb.classes_``.

    Args:
        user_skills: A list/tuple of skills, or a single value (converted
            with ``str``).
        job_title: Optional target job title; skipped when falsy.
        industry: Optional target industry; skipped when falsy.
        top_n: Maximum number of skills to return.
        threshold: Minimum predicted score for a label to be recommended.

    Returns:
        A list of labels taken from ``mlb.classes_``, at most ``top_n`` long,
        ordered by descending score. May be empty when nothing reaches
        ``threshold``.
    """
    # Build the combined user text.
    if isinstance(user_skills, (list, tuple)):
        # str() each item so a stray non-string entry cannot break the join.
        skills_text = " ".join(str(skill) for skill in user_skills)
    else:
        skills_text = str(user_skills)

    parts = [skills_text]
    if job_title:
        parts.append(str(job_title))
    if industry:
        parts.append(str(industry))
    user_text = " ".join(parts)

    # Preprocess and vectorize, then score the single input row.
    X_user = tfidf.transform([preprocess_text(user_text)])
    # Assumes predict() returns shape (1, n_labels) with columns aligned to
    # mlb.classes_ (the same layout mlb.inverse_transform would require).
    scores = np.asarray(model.predict(X_user.toarray()))[0]

    # Stable sort on negated scores: highest first, ties keep label order.
    ranked = np.argsort(-scores, kind="stable")
    labels = mlb.classes_
    recommended = [labels[i] for i in ranked if scores[i] >= threshold]

    return recommended[:top_n]

# -----------------------------
# Append result to CSV
# -----------------------------
def append_to_csv(row_dict, csv_path=CSV_LOG_PATH):
    """
    Append a single-row dict to a CSV file, creating the file if needed.

    List, tuple and dict values are stored as JSON strings so each stays in
    one readable cell. The header row is written when the file is missing or
    empty, so a zero-byte log file still ends up with column names.

    Args:
        row_dict: Mapping of column name to value for the new row.
        csv_path: Destination CSV file; defaults to ``CSV_LOG_PATH``.
    """
    serialized = {
        key: (
            json.dumps(value, ensure_ascii=False)
            if isinstance(value, (list, tuple, dict))
            else value
        )
        for key, value in row_dict.items()
    }
    df_row = pd.DataFrame([serialized])

    try:
        needs_header = os.path.getsize(csv_path) == 0
    except OSError:
        # File does not exist (or cannot be stat'ed): treat it as new.
        needs_header = True

    # Append mode also creates the file when it is missing.
    df_row.to_csv(csv_path, mode="a", index=False, header=needs_header)

# -----------------------------
# Append result to JSON array file
# -----------------------------
def append_to_json(row_dict, json_path=JSON_LOG_PATH):
    """
    Append ``row_dict`` to the JSON array stored in ``json_path``.

    The whole file is read, the new entry is appended and the array is
    written back. If the file is missing, unreadable, not valid JSON or does
    not hold a list, a fresh array is started (best-effort logging).

    The new content is serialized before any file is opened for writing and
    is swapped in with ``os.replace``, so a serialization error or an
    interrupted write cannot truncate the existing history.

    Args:
        row_dict: JSON-serializable record to append.
        json_path: Destination JSON file; defaults to ``JSON_LOG_PATH``.

    Raises:
        TypeError: If ``row_dict`` (or existing data) is not JSON-serializable;
            the file on disk is left untouched in that case.
    """
    data = []
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except (OSError, ValueError):
        # Missing/unreadable file, bad encoding or invalid JSON -> start fresh.
        # (json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.)
        loaded = None
    if isinstance(loaded, list):
        data = loaded

    data.append(row_dict)

    # Serialize first: if this raises, nothing on disk has been modified yet.
    payload = json.dumps(data, indent=2, ensure_ascii=False)

    # Write next to the target, then swap it in with a single rename.
    tmp_path = f"{json_path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write(payload)
    os.replace(tmp_path, json_path)

# -----------------------------
# High-level function: predict + append
# -----------------------------
def predict_and_save(user_skills, job_title="", industry="", top_n=10, save_csv=True, save_json=True):
    """
    Run a recommendation and optionally append the result to the history logs.

    Calls ``recommend_skills`` with the given inputs, stamps the result with
    the current local time and, depending on ``save_csv`` / ``save_json``,
    appends the record to ``CSV_LOG_PATH`` and/or ``JSON_LOG_PATH``.

    Returns:
        The record dict with keys ``timestamp``, ``user_current_skills``,
        ``target_job_title``, ``target_industry`` and ``recommended_skills``.
    """
    skills = recommend_skills(user_skills, job_title, industry, top_n=top_n)

    record = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "user_current_skills": user_skills,
        "target_job_title": job_title,
        "target_industry": industry,
        "recommended_skills": skills,
    }

    # (enabled flag, writer, destination) for each history sink, CSV first.
    sinks = (
        (save_csv, append_to_csv, CSV_LOG_PATH),
        (save_json, append_to_json, JSON_LOG_PATH),
    )
    for enabled, writer, destination in sinks:
        if enabled:
            writer(record, destination)

    return record

# -----------------------------
# Example usage (HR role)
# -----------------------------
if __name__ == "__main__":
    # Demo request for an HR role; the result is printed and appended to both logs.
    demo_request = {
        "user_skills": ["communication", "recruitment", "employee relations"],
        "job_title": "HR Manager",
        "industry": "Human Resources",
    }

    outcome = predict_and_save(top_n=10, **demo_request)

    print("Prediction Result:")
    print(json.dumps(outcome, indent=2, ensure_ascii=False))
    print(f"\nSaved (appended) to:\n - CSV: {CSV_LOG_PATH}\n - JSON: {JSON_LOG_PATH}")





Prediction Result:
{
  "timestamp": "2025-09-21 22:12:34",
  "user_current_skills": [
    "communication",
    "recruitment",
    "employee relations"
  ],
  "target_job_title": "HR Manager",
  "target_industry": "Human Resources",
  "recommended_skills": [
    "'communication'",
    "'problem solving'"
  ]
}

Saved (appended) to:
 - CSV: C:\Users\NXTWAVE\Downloads\Job Skill Recommendation\skill_recommendation_history.csv
 - JSON: C:\Users\NXTWAVE\Downloads\Job Skill Recommendation\skill_recommendation_history.json
