In [1]:
# Install the notebook's runtime dependencies via a shell escape.
# NOTE(review): versions are unpinned, and `fuzzywuzzy` is not imported by any
# cell visible in this notebook (fuzzy matching uses difflib.SequenceMatcher) -
# confirm it is still needed before keeping it in the install list.
!pip install faiss-cpu fasttext flask fuzzywuzzy requests numpy



In [2]:
import numpy as np
import fasttext
import faiss
import json
import re
import requests
import os
from flask import Flask, request, jsonify
from difflib import SequenceMatcher

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Paths for saving models
faiss_dir = "/content/drive/MyDrive/Faiss_Index"
faiss_index_path = os.path.join(faiss_dir, "faiss.index")
embeddings_path = os.path.join(faiss_dir, "admin_embeddings.npy")
conceptnet_cache_path = os.path.join(faiss_dir, "conceptnet_cache.json")

# Ensure directory exists
if not os.path.exists(faiss_dir):
    os.makedirs(faiss_dir, exist_ok=True)
    print("Created directory:", faiss_dir)

# Load FastText model
model_path = "/content/drive/MyDrive/FastText_Models/cc.en.300.bin"
fasttext_model = fasttext.load_model(model_path)

# Vector size shared by the embeddings matrix and the FAISS index below.
EMBEDDING_DIM = 300

# **Admin-defined skill list (normalized)**
admin_skill_list = ["python", "relational database", "software engineering", "data science", "nlp", "natural language processing"]

# **Load FAISS embeddings**
# Row i of `admin_embeddings` (and id i in the FAISS index) is looked up as
# admin_skill_list[i], so a cache written for a different skill list must not
# be reused. Only the shape is checked: a list edited to the same length is
# NOT detected - delete the cached files on Drive in that case.
admin_embeddings = None
if os.path.exists(embeddings_path):
    print("Loading embeddings from Drive...")
    cached_embeddings = np.load(embeddings_path)
    if cached_embeddings.shape == (len(admin_skill_list), EMBEDDING_DIM):
        admin_embeddings = cached_embeddings
    else:
        print("Cached embeddings do not match the admin skill list, rebuilding...")
else:
    print("Embeddings not found, creating new ones...")

embeddings_rebuilt = admin_embeddings is None
if embeddings_rebuilt:
    # NOTE(review): each skill phrase is embedded with a single get_word_vector
    # call, whereas get_mean_embedding averages per-word vectors for queries;
    # confirm the two representations are meant to be compared.
    admin_embeddings = np.array([fasttext_model.get_word_vector(skill) for skill in admin_skill_list])
    np.save(embeddings_path, admin_embeddings)  # Save for future use

# FAISS expects a C-contiguous float32 matrix.
admin_embeddings = np.ascontiguousarray(admin_embeddings, dtype=np.float32)

# **Load FAISS index**
faiss_index = None
if not os.path.exists(faiss_index_path):
    print("FAISS index not found, creating new one...")
elif embeddings_rebuilt:
    # An index built from the previous embeddings would be out of sync.
    print("Embeddings were rebuilt, replacing the cached FAISS index...")
else:
    print("Loading FAISS index from Drive...")
    cached_index = faiss.read_index(faiss_index_path)
    if cached_index.ntotal == len(admin_skill_list) and cached_index.d == EMBEDDING_DIM:
        faiss_index = cached_index
    else:
        print("Cached FAISS index does not match the admin skill list, rebuilding...")

if faiss_index is None:
    faiss_index = faiss.IndexHNSWFlat(EMBEDDING_DIM, 32)
    faiss_index.add(admin_embeddings)
    faiss.write_index(faiss_index, faiss_index_path)

# **MiniDBpedia (Keys = Skills from Admin List)**
# Admin skill -> related terms that should resolve to that skill.
# Insertion order is significant: the lookup in match_skills stops at the
# first key whose term list contains the query.
miniDBpedia = dict((
    ("python", ["scripting", "software development"]),
    ("relational database", ["sql", "postgresql", "mysql"]),
    ("software engineering", ["agile development", "devops", "software development"]),
    ("data science", ["data analysis", "big data", "machine learning"]),
    ("nlp", [
        "natural language processing",
        "text analysis",
        "ner",
        "transformers",
        "text mining",
        "linguistics",
    ]),
    ("natural language processing", [
        "nlp",
        "text mining",
        "text analysis",
        "ner",
        "transformers",
        "linguistics",
    ]),
))

# **Abbreviations Dictionary**
# Short form -> canonical admin skill name.
abbreviations = dict(
    py="python",
    db="relational database",
    se="software engineering",
    ds="data science",
    nlp="natural language processing",
)

# **Normalize Function**
def normalize_text(skill):
    """Return a canonical form of a skill string for lookup.

    The text is lower-cased, every character other than ASCII letters, digits
    and whitespace is removed, and whitespace is then trimmed and collapsed to
    single spaces.

    Whitespace is cleaned up *after* punctuation removal: stripping first left
    a trailing space behind for inputs such as "python !", which then failed
    the exact-match lookup.

    Args:
        skill: Raw skill text (str).

    Returns:
        The normalized string; empty if nothing but punctuation/whitespace
        was supplied.
    """
    without_punctuation = re.sub(r'[^a-zA-Z0-9\s]', '', skill.lower())
    # split()/join trims both ends and collapses runs of any whitespace.
    return " ".join(without_punctuation.split())

# **Sequence Matcher for Similarity Check**
def similar_match(skill, skill_list, threshold=0.8):
    """Return the entry of ``skill_list`` most similar to ``skill``.

    Similarity is ``difflib.SequenceMatcher(None, skill, entry).ratio()``.
    Only entries whose ratio is strictly greater than ``threshold`` qualify.
    Among qualifying entries the highest ratio wins (earlier entries win
    ties), so the result no longer depends on which near-match happens to
    come first in the list.

    Args:
        skill: Normalized skill text to look up.
        skill_list: Iterable of candidate skill names.
        threshold: Exclusive lower bound on the similarity ratio.

    Returns:
        The best matching entry, or None when no entry exceeds the threshold.
    """
    best_match = None
    best_ratio = threshold
    for candidate in skill_list:
        ratio = SequenceMatcher(None, skill, candidate).ratio()
        # Strict comparison: must beat the threshold first, then the current best.
        if ratio > best_ratio:
            best_match, best_ratio = candidate, ratio
    return best_match

# **Function to Generate Word Embeddings**
def get_mean_embedding(skill):
    """Embed a skill phrase as the mean of its per-word FastText vectors.

    The phrase is split on whitespace and each word is embedded with
    ``fasttext_model.get_word_vector``; the element-wise mean is returned.

    Args:
        skill: Skill text (str); may contain several words.

    Returns:
        A 1-D float32 numpy array. For input without any words this is a
        300-element zero vector. The dtype is float32 in both cases so the
        result can be handed to FAISS without a dtype mismatch (the previous
        empty-input fallback was float64).
    """
    word_vectors = [fasttext_model.get_word_vector(word) for word in skill.split()]
    if not word_vectors:
        return np.zeros(300, dtype=np.float32)
    return np.mean(word_vectors, axis=0).astype(np.float32, copy=False)

# **ConceptNet API Query**
def query_conceptnet(skill):
    """Look up ``skill`` on ConceptNet and return admin skills it links to.

    An edge counts as a match when its relation label (lower-cased) is one of
    synonym / relatedto / isa / partof and its ``end`` label (lower-cased) is
    in ``admin_skill_list``. Each (skill, relation) pair is reported once.

    This is a best-effort lookup: network failures, timeouts, non-JSON bodies
    and unexpectedly shaped payloads all yield an empty list instead of
    raising, so the calling request still completes.

    Args:
        skill: Normalized skill text; may contain several words.

    Returns:
        List of ``{"matched_skill": ..., "matching_method": ...}`` dicts,
        possibly empty.
    """
    # ConceptNet term URIs join the words of a phrase with underscores.
    term = "_".join(skill.lower().split())
    if not term:
        return []

    api_url = f"http://api.conceptnet.io/c/en/{term}"
    accepted_relations = ("synonym", "relatedto", "isa", "partof")
    related_skills = []
    seen = set()
    try:
        # Without a timeout a stalled API call would block the Flask worker forever.
        response = requests.get(api_url, timeout=5).json()
        for edge in response.get("edges", []):
            relation = edge.get("rel", {}).get("label", "").lower()
            target = edge.get("end", {}).get("label", "").lower()
            if relation not in accepted_relations or target not in admin_skill_list:
                continue
            if (target, relation) in seen:
                continue
            seen.add((target, relation))
            related_skills.append({"matched_skill": target, "matching_method": relation})
    except (requests.RequestException, ValueError, AttributeError, TypeError):
        # Request/JSON errors or a malformed payload: keep the lookup best-effort,
        # but let KeyboardInterrupt/SystemExit propagate (a bare except hid those).
        return []
    return related_skills

# **Semantic Similarity Matching with FAISS**
def match_with_admin_skills(user_skill):
    """Find the admin skill nearest to ``user_skill`` in embedding space.

    The query is embedded with ``get_mean_embedding`` and the single nearest
    neighbour is fetched from ``faiss_index``. The neighbour is accepted only
    if its distance ``d`` satisfies both ``d < 0.8`` and ``1 / (1 + d) >= 0.7``.

    Args:
        user_skill: Normalized skill text.

    Returns:
        A list with at most one
        ``{"matched_skill": ..., "matching_method": "semantic_similarity"}``
        dict; empty when the query has no words, no neighbour is returned, or
        the neighbour is too far away.
    """
    # A query without words would be searched as an all-zero vector; skip it.
    if not user_skill.split():
        return []

    # FAISS requires a C-contiguous float32 matrix of shape (n_queries, dim).
    user_vec = np.ascontiguousarray(get_mean_embedding(user_skill), dtype=np.float32).reshape(1, -1)
    distances, indices = faiss_index.search(user_vec, k=1)  # Get best match

    threshold = 0.8
    # With these values this is the binding limit: 1/(1+d) >= 0.7 <=> d <= ~0.43.
    min_similarity = 0.7

    matches = []
    for idx, dist in zip(indices[0], distances[0]):
        idx = int(idx)
        # FAISS reports -1 when it has no neighbour for a slot; as a Python index
        # that would silently select the last admin skill. Ids past the end of the
        # list (index out of sync with the skill list) are skipped as well.
        if idx < 0 or idx >= len(admin_skill_list):
            continue
        similarity_score = 1 / (1 + dist)
        if dist < threshold and similarity_score >= min_similarity:
            matches.append({"matched_skill": admin_skill_list[idx], "matching_method": "semantic_similarity"})

    return matches

# **Flask API**
app = Flask(__name__)


def _match_single_skill(normalized_skill):
    """Run the matching cascade for one normalized skill; first stage that hits wins.

    Stages, in order: exact match, abbreviation, misspelling correction,
    MiniDBpedia mapping, FastText + FAISS semantic similarity, ConceptNet lookup.

    Returns:
        List of ``{"matched_skill": ..., "matching_method": ...}`` dicts,
        empty when no stage produced a match.
    """
    # **1. Exact Match or Abbreviation**
    if normalized_skill in admin_skill_list:
        return [{"matched_skill": normalized_skill, "matching_method": "exact_match"}]
    if normalized_skill in abbreviations:
        return [{"matched_skill": abbreviations[normalized_skill], "matching_method": "abbreviation"}]

    # **2. Misspelling Correction** (computed once and reused)
    corrected = similar_match(normalized_skill, admin_skill_list)
    if corrected:
        return [{"matched_skill": corrected, "matching_method": "misspelling"}]

    # **3. MiniDBpedia Mapping** (first key listing the term wins)
    for key, values in miniDBpedia.items():
        if normalized_skill in values:
            return [{"matched_skill": key, "matching_method": "knowledge_graph"}]

    # **4. Semantic Similarity Matching (FastText + FAISS)**
    semantic_matches = match_with_admin_skills(normalized_skill)
    if semantic_matches:
        return semantic_matches

    # **5. ConceptNet Knowledge Graph Lookup**
    return query_conceptnet(normalized_skill)


@app.route('/match_skill', methods=['POST'])
def match_skills():
    """POST /match_skill - map user-supplied skills onto the admin skill list.

    Expects a JSON object ``{"skills": [<str>, ...]}``.

    Returns:
        * 400 with ``{"error": ...}`` when the body is not a JSON object or
          ``skills`` is missing, empty, or not a list.
        * 200 with ``{"query_results": [{"user_skill": ..., "matches": [...]}]}``
          otherwise. Skills for which nothing matched are omitted; entries that
          are not strings, or that are empty after normalization, are skipped.
    """
    # silent=True: a missing/invalid JSON body yields None, which is answered
    # with the 400 below rather than failing on `.get`.
    data = request.get_json(silent=True)
    user_queries = data.get("skills", []) if isinstance(data, dict) else None
    if not isinstance(user_queries, list) or not user_queries:
        return jsonify({"error": "No skills provided or incorrect format"}), 400

    final_results = []
    for user_query in user_queries:
        # normalize_text relies on str methods; ignore non-string entries.
        if not isinstance(user_query, str):
            continue
        normalized_skill = normalize_text(user_query)
        # Nothing left to match (e.g. punctuation only): skip every stage,
        # including the outbound ConceptNet call.
        if not normalized_skill:
            continue

        match_results = _match_single_skill(normalized_skill)

        # **Only Append if Matches Exist**
        if match_results:
            final_results.append({"user_skill": user_query, "matches": match_results})

    return jsonify({"query_results": final_results}), 200


Mounted at /content/drive
Loading embeddings from Drive...
Loading FAISS index from Drive...


In [3]:
from threading import Thread

def run_flask():
    """Serve the Flask app on all interfaces, port 5000 (blocks while serving)."""
    app.run(host="0.0.0.0", port=5000)

# Re-running this cell must not start a second server: port 5000 would already
# be taken by the thread started on the previous run.
_running_thread = globals().get("flask_thread")
if _running_thread is not None and _running_thread.is_alive():
    print("Flask server thread is already running; not starting another one.")
else:
    # daemon=True so the background server never keeps the interpreter alive on shutdown.
    flask_thread = Thread(target=run_flask, daemon=True)
    flask_thread.start()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [6]:
import requests

# Smoke test against the locally running Flask server.
url = "http://127.0.0.1:5000/match_skill"
data = {"skills": ["data science", "machine learning", "mysql","random skill"]}

# Always bound the wait: without a timeout this cell hangs indefinitely if the
# server thread is not up or a lookup inside the server stalls.
response = requests.post(url, json=data, timeout=60)
# Skills with no match are omitted from "query_results" by the endpoint.
print(response.json())

INFO:werkzeug:127.0.0.1 - - [14/Feb/2025 02:01:50] "POST /match_skill HTTP/1.1" 200 -


{'query_results': [{'matches': [{'matched_skill': 'data science', 'matching_method': 'exact_match'}], 'user_skill': 'data science'}, {'matches': [{'matched_skill': 'data science', 'matching_method': 'knowledge_graph'}], 'user_skill': 'machine learning'}, {'matches': [{'matched_skill': 'relational database', 'matching_method': 'knowledge_graph'}], 'user_skill': 'mysql'}]}
