In [1]:
# Mount Google Drive at /content/drive (Colab-only helper module); the resume
# folder read later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import os

In [3]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 5.2 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [4]:

from PyPDF2 import PdfReader
import spacy

nlp = spacy.load("en_core_web_sm")

class PyPDFDirectoryLoader:
    """Extract the text of every file in a directory using ``PdfReader``.

    Args:
        directory: Path of the folder whose files should be parsed.
    """

    def __init__(self, directory):
        self.directory = directory

    def load(self):
        """Read each regular file in ``self.directory`` and extract its text.

        Files are visited in sorted filename order so repeated runs produce
        the same document order. Sub-directories are skipped. A file that
        cannot be parsed is reported on stdout and left out of the result
        (best-effort loading).

        Returns:
            list[tuple[str, str]]: ``(filename, text)`` pairs. The text of a
            document is the text of its pages joined with newlines; pages
            that yield no text are omitted.
        """
        docs = []
        # os.listdir gives no ordering guarantee; sort for reproducibility.
        for filename in sorted(os.listdir(self.directory)):
            filepath = os.path.join(self.directory, filename)
            if not os.path.isfile(filepath):
                continue
            try:
                with open(filepath, 'rb') as f:
                    pdf_reader = PdfReader(f)
                    page_texts = []
                    for page in pdf_reader.pages:
                        page_text = page.extract_text()
                        if page_text:  # extract_text may return None / ""
                            page_texts.append(page_text)
                # Join with "\n" so the last line of one page is not fused
                # with the first line of the next (line-based section
                # detection downstream depends on intact lines).
                docs.append((filename, "\n".join(page_texts)))
            except Exception as e:
                print(f"Error processing {filepath}: {e}")
        return docs

def extract_resume_sections(text):
    """Split resume text into its recognised sections.

    A line is treated as a section heading when, after trimming whitespace,
    lower-casing and dropping any trailing colon, it equals one of
    "education", "experience", "summary", "skills" or "projects". Every
    following line is appended (followed by a single space) to that section
    until the next heading. Lines before the first heading are ignored.

    If the same heading occurs more than once, the content of all
    occurrences is accumulated instead of the later one erasing the earlier.

    Args:
        text: Raw resume text with lines separated by "\\n".

    Returns:
        dict[str, str]: Lower-case section name -> collected section text.
        Empty when no heading is found.
    """
    known_headers = ("education", "experience", "summary", "skills", "projects")
    collected = {}
    current_section = None
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        # Tolerate "Skills:" / "Skills :" style headings.
        heading = line.lower().rstrip(":").rstrip()
        if heading in known_headers:
            current_section = heading
            # setdefault keeps text gathered from an earlier occurrence.
            collected.setdefault(current_section, [])
        elif current_section:
            collected[current_section].append(line)
    return {
        name: "".join(part + " " for part in parts)
        for name, parts in collected.items()
    }

def preprocess_text(text):
    """Return the lemmas of all non-stop-word tokens, joined by spaces.

    The text is run through the module-level ``nlp`` pipeline; tokens flagged
    as stop words are dropped and the remaining tokens are replaced by their
    lemma.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)


# Folder on the mounted Drive that holds the resume files.
RESUME_DIR = "/content/drive/MyDrive/resume_dataset"

loader = PyPDFDirectoryLoader(RESUME_DIR)
docs = loader.load()
print(f"Loaded {len(docs)} documents.")

# Build the searchable corpus: (filename, lemmatised text) per resume.
preprocessed_resumes = []
for filename, text in docs:
    sections = extract_resume_sections(text)
    # A resume with no recognised heading would otherwise be indexed as an
    # empty string and could never match any query; fall back to its full
    # text in that case.
    combined_text = " ".join(sections.values()) if sections else text
    preprocessed_text = preprocess_text(combined_text)
    preprocessed_resumes.append((filename, preprocessed_text))

# Parallel views of the corpus: texts and filenames in the same order.
resumes_text = [text for filename, text in preprocessed_resumes]
resumes_filenames = [filename for filename, text in preprocessed_resumes]




Error processing /content/drive/MyDrive/resume_dataset/sandhyalakshmi676@gmail.com_resume.pdf: Cannot read an empty file




Loaded 999 documents.


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def filter_resumes(resumes, query, threshold=0.1):
    """Return the filenames of resumes whose text is similar to ``query``.

    A TF-IDF model (English stop words removed) is fitted on the resume
    texts, the query is projected into the same space, and every resume whose
    cosine similarity to the query is strictly greater than ``threshold`` is
    kept.

    Args:
        resumes: Sequence of ``(filename, text)`` pairs.
        query: Free-text search string.
        threshold: Minimum cosine similarity (exclusive) for a resume to be
            considered relevant. Defaults to 0.1.

    Returns:
        list: Matching filenames, most similar first; ties keep their
        original corpus order. Empty when there are no resumes, the query is
        blank, or the corpus has no usable vocabulary.
    """
    if not resumes or not query or not query.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words='english')
    resume_texts = [text for _, text in resumes]
    try:
        tfidf_matrix = vectorizer.fit_transform(resume_texts)
    except ValueError:
        # Raised by scikit-learn when the vocabulary is empty (e.g. every
        # document is blank or contains only stop words): nothing can match.
        return []

    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

    scored = [
        (score, idx)
        for idx, score in enumerate(similarities)
        if score > threshold
    ]
    # Stable sort: best match first, equal scores stay in corpus order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [resumes[idx][0] for _, idx in scored]


In [11]:
!pip install PyPDF2 spacy flask pyngrok

Collecting pyngrok
  Downloading pyngrok-7.1.6-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.6


In [15]:
from flask import Flask, request, render_template_string
from pyngrok import ngrok
# Flask application object; the route decorators in this cell register their
# handlers on it.
app = Flask(__name__)

# Jinja2 page template used by both routes. It always renders the search form
# (POSTing to /search); the results area is rendered only when `query` is
# truthy, showing either the `resumes` list or a "no results" message.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Resume Semantic Search</title>
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css">
</head>
<body>
    <div class="container">
        <h1 class="mt-5">SkillCatalyst</h1>
        <form method="POST" action="/search">
            <div class="form-group">
                <label for="query">Enter Your Search Criteria:</label>
                <input type="text" class="form-control" id="query" name="query" required>
            </div>
            <button type="submit" class="btn btn-primary">Search</button>
        </form>
        {% if query %}
            <h2 class="mt-5">List of Resumes:</h2>
            {% if resumes %}
                <ul class="list-group">
                    {% for resume in resumes %}
                        <li class="list-group-item">{{ resume }}</li>
                    {% endfor %}
                </ul>
            {% else %}
                <p>No relevant resumes found.</p>
            {% endif %}
        {% endif %}
    </div>
</body>
</html>
"""

@app.route("/", methods=["GET"])
def home():
    """Serve the landing page: the search form without a results section."""
    # No `query` is passed, so the template skips its results block.
    landing_page = render_template_string(html_template)
    return landing_page

@app.route("/search", methods=["POST"])
def search():
    """Handle a search form submission and render the matching resumes.

    The submitted query is passed through ``preprocess_text`` before matching
    so it is lemmatised and stop-word filtered the same way the resume corpus
    was; otherwise an inflected query word (e.g. a plural) cannot match the
    lemma stored in the corpus. A whitespace-only query is treated as no
    query and just re-renders the form.

    A request without a ``query`` form field still fails with Flask's
    standard 400 response, as before.
    """
    query = request.form["query"].strip()
    if not query:
        return render_template_string(html_template)

    normalized_query = preprocess_text(query)
    relevant_resumes = filter_resumes(preprocessed_resumes, normalized_query)
    # The raw query (not the lemmatised form) drives the template's
    # `{% if query %}` switch.
    return render_template_string(html_template, query=query, resumes=relevant_resumes)


def start_ngrok(port=5000):
    """Open an ngrok HTTPS tunnel to a local port and return it.

    The auth token is read from the ``NGROK_AUTHTOKEN`` environment variable
    instead of being written into the notebook. When the variable is unset or
    empty, no token is installed here and ``ngrok.connect`` is called as-is
    (presumably relying on a token already saved in ngrok's own
    configuration — verify for your environment).

    Args:
        port: Local port to expose. Defaults to 5000, the port the Flask app
            in this notebook is started on.

    Returns:
        The object returned by ``ngrok.connect`` describing the tunnel.
    """
    import os  # local import so this cell does not depend on an earlier one

    authtoken = os.environ.get("NGROK_AUTHTOKEN")
    if authtoken:
        ngrok.set_auth_token(authtoken)
    public_url = ngrok.connect(port, bind_tls=True)
    print(f" * ngrok tunnel available at {public_url}")
    return public_url

# Entry point: open the public tunnel first, then start the Flask server on
# the same local port (5000) that the tunnel forwards to.
if __name__ == "__main__":
    url = start_ngrok()
    app.run(port=5000)

 * ngrok tunnel available at NgrokTunnel: "https://fc53-34-90-16-22.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:Press CTRL+C to quit
INFO:werkzeug:127.0.0.1 - - [04/Jul/2024 03:11:31] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [04/Jul/2024 03:11:32] "GET /favicon.ico HTTP/1.1" 404 -
INFO:werkzeug:127.0.0.1 - - [04/Jul/2024 03:12:12] "POST /search HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [04/Jul/2024 03:13:37] "POST /search HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [04/Jul/2024 03:15:02] "POST /search HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [04/Jul/2024 03:16:12] "POST /search HTTP/1.1" 200 -
