In [None]:
from google.colab import drive
# Flush and unmount Drive first, then mount it at /content/drive.
# The order matters: the unmount has to happen before the mount call below.
drive.flush_and_unmount()  # This unmounts the drive cleanly
drive.mount('/content/drive')

In [None]:
pip install PyPDF2

In [None]:
import os
import PyPDF2
import nltk
import re
import numpy as np
# Fetch the NLTK tokenizer resources this notebook downloads up front.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
from google.colab import drive

# Define a custom stopword list
def custom_stopwords():
    """Return a fresh set of lowercase stop words to drop before stemming."""
    return {
        # articles, conjunctions and prepositions
        "the", "and", "a", "an", "to", "of", "in", "for", "on", "by", "at",
        "from", "or", "with", "as",
        # pronouns and determiners
        "it", "this", "that", "his", "they", "he", "she", "their", "you",
        "we",
        # auxiliary verbs and negation
        "is", "was", "be", "not", "are", "has", "have", "had",
    }

# Initialize stopword list and stemmer
# Both are module-level objects shared by every preprocess_text() call.
# stop_words holds unstemmed lowercase words; preprocess_text() checks tokens
# against it before stemming them.
stop_words = custom_stopwords()
stemmer = PorterStemmer()

# Tokenization, stopword removal, and stemming
def preprocess_text(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    The text is tokenized with NLTK's ``word_tokenize``. Tokens that are not
    purely alphabetic are discarded, the rest are lowercased, filtered against
    the module-level ``stop_words`` set, and stemmed with the module-level
    ``stemmer``.

    Args:
        text: Raw text to normalise.

    Returns:
        The surviving stems joined by single spaces ("" if none survive).
    """
    stems = []
    for token in word_tokenize(text):
        # Numbers, punctuation and mixed tokens are dropped outright.
        if not token.isalpha():
            continue
        lowered = token.lower()
        # Stop words are matched on the lowercased, unstemmed form.
        if lowered in stop_words:
            continue
        stems.append(stemmer.stem(lowered))
    return " ".join(stems)

# Load documents from Google Drive folder.
def load_documents(directory):
    """Read every PDF in ``directory`` and return its preprocessed text.

    Files are visited in sorted filename order and only names ending in
    ``.pdf`` (case-insensitive) are read. Each page's extracted text is joined
    with a newline so that the last word of one page and the first word of the
    next are not fused into a single bogus token.

    Args:
        directory: Path of the folder that contains the PDF files.

    Returns:
        A dict mapping each PDF filename to the output of
        ``preprocess_text`` for that file's full text.
    """
    documents = {}
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith('.pdf'):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Extract while the file is still open; a falsy extraction result
            # for a page is treated as empty text.
            page_texts = [page.extract_text() or '' for page in reader.pages]
        documents[filename] = preprocess_text('\n'.join(page_texts))
    return documents

# Folder inside Google Drive that holds the candidate PDFs.
directory = '/content/drive/My Drive/Candidate_list'
documents = load_documents(directory)

# Row i of the TF-IDF matrix corresponds to doc_names[i]; both follow the
# dict's iteration order.
doc_names = list(documents)

# Fit TF-IDF on the already preprocessed (stemmed) document texts.
vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
vectorized_docs = vectorizer.fit_transform(documents.values())

# Queries to compare.
# Sentences and the "/" are deliberately separated by whitespace:
# preprocess_text() keeps only purely alphabetic tokens, so words glued
# together by punctuation (e.g. "visualization.SQL/Python") can reach that
# filter as a single token and be dropped from the query.
queries = {
    "Query A - Data Management Skills": (
        "Understanding of Data Management Principles: Basic knowledge of "
        "data governance, data quality, and data lifecycle management. "
        "Excel & Google Sheets: Proficient in data manipulation, analysis, "
        "and visualization. "
        "SQL / Python: (Ideally) experience in querying databases and "
        "performing data analysis tasks"
    ),
    "Query B - Personal Attributes": (
        "Strong focus on data accuracy and precision. "
        "Communication Skills: Ability to convey complex information clearly "
        "and effectively. "
        "Organizational Skills: Capable of managing multiple tasks and "
        "priorities efficiently. "
        "Logistics & Supply Chain: A keen interest in the logistics and "
        "supply chain sectors. "
        "Business Analytics: Passion for analyzing data to drive business "
        "decisions."
    ),
}

# Score each query against every document and keep the 8 best matches.
results = {}
for query_name, query in queries.items():
    # Run the query through the same preprocessing as the documents so that
    # its stems line up with the fitted vocabulary.
    query = preprocess_text(query)
    query_vector = vectorizer.transform([query])

    # One dot product per document row, collapsed to a flat score array.
    scores = (vectorized_docs @ query_vector.T).toarray().ravel()

    # argsort is ascending, so reverse it to get the highest score first.
    ranked_indices = np.argsort(scores)[::-1]
    ranked_docs = [(doc_names[i], scores[i]) for i in ranked_indices[:8]]
    results[query_name] = ranked_docs

# Print one report per query: a header line, one line per ranked document,
# then a blank separator line.
for query_name, ranked_docs in results.items():
    report_lines = [f"Results for {query_name}:"]
    report_lines.extend(
        f"  Document: {doc}, Score: {score:.4f}" for doc, score in ranked_docs
    )
    print("\n".join(report_lines))
    print()
