In [2]:
# SkillLink - Job Skill Recommendation Full Pipeline (Updated)
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import MultiLabelBinarizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# -----------------------------
# Paths
# -----------------------------
data_path = r"C:\Users\NXTWAVE\Downloads\Job Skill Recommendation\archive\all_job_post.csv"
output_dir = r"C:\Users\NXTWAVE\Downloads\Job Skill Recommendation"

os.makedirs(output_dir, exist_ok=True)

# -----------------------------
# Load Data
# -----------------------------
df = pd.read_csv(data_path)

# Check columns
print(df.columns)
# ['job_id', 'category', 'job_title', 'job_description', 'job_skill_set']

# -----------------------------
# Preprocessing
# -----------------------------
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if pd.isna(text):
        text = ""
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]
    return " ".join(tokens)

# Combine job_description + job_skill_set
df['processed_text'] = df['job_description'].fillna('') + " " + df['job_skill_set'].fillna('')
df['processed_text'] = df['processed_text'].apply(preprocess_text)

# -----------------------------
# TF-IDF Embeddings
# -----------------------------
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df['processed_text'])

# Save TF-IDF Vectorizer
with open(os.path.join(output_dir, 'tfidf_vectorizer.pkl'), 'wb') as f:
    pickle.dump(tfidf, f)

# -----------------------------
# Content-Based Recommendation Function
# -----------------------------
def recommend_skills(user_skills, top_n=10):
    user_text = " ".join(user_skills)
    user_vec = tfidf.transform([preprocess_text(user_text)])
    sim_scores = cosine_similarity(user_vec, X_tfidf)
    top_indices = np.argsort(sim_scores[0])[::-1][:top_n]
    recommended_skills = []
    for idx in top_indices:
        skills = df.iloc[idx]['job_skill_set'].split(',')
        recommended_skills.extend([s.strip() for s in skills])
    # Remove duplicates and keep top N
    recommended_skills = list(dict.fromkeys(recommended_skills))
    return recommended_skills[:top_n]

# Example Usage
user_skills = ['python', 'machine learning']
print("Recommended Skills:", recommend_skills(user_skills))

# -----------------------------
# Multi-Label Skill Prediction
# -----------------------------
df['skills_list'] = df['job_skill_set'].fillna('').apply(lambda x: [s.strip() for s in x.split(',')])
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['skills_list'])

# Save MultiLabelBinarizer
with open(os.path.join(output_dir, 'mlb.pkl'), 'wb') as f:
    pickle.dump(mlb, f)

# -----------------------------
# Keras Neural Network Model
# -----------------------------
model = Sequential()
model.add(Dense(128, input_dim=X_tfidf.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(y.shape[1], activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model (Demo: 2 epochs)
model.fit(X_tfidf.toarray(), y, epochs=2, batch_size=32)

# -----------------------------
# Save Model in Multiple Formats
# -----------------------------
# 1. Native Keras format (.keras)
model.save(os.path.join(output_dir, 'skill_model.keras'))

# 2. H5 format (optional, legacy)
model.save(os.path.join(output_dir, 'skill_model.h5'))

# 3. Pickle (weights + architecture)
with open(os.path.join(output_dir, 'skill_model.pkl'), 'wb') as f:
    pickle.dump(model, f)

# 4. JSON
model_json = model.to_json()
with open(os.path.join(output_dir, 'skill_model.json'), 'w') as f:
    f.write(model_json)

print("All models and artifacts saved in:", output_dir)


Index(['job_id', 'category', 'job_title', 'job_description', 'job_skill_set'], dtype='object')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NXTWAVE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NXTWAVE\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Recommended Skills: ["['help desk support'", "'customer service'", "'networking principles'", "'teamwork'", "'adaptability'", "'organizational skills'", "'oral communication'", "'written communication'", "'multi-tasking'", "'confidentiality'"]
Epoch 1/2
Epoch 2/2
All models and artifacts saved in: C:\Users\NXTWAVE\Downloads\Job Skill Recommendation


  saving_api.save_model(
