# Extracting the text from the documents

In [1]:
from docx import Document
import os

In [2]:
def extract_text_from_docx(file_path):
    """Return the text of a .docx file as a single newline-separated string.

    The file is opened with ``Document`` and the ``text`` of every entry in
    ``doc.paragraphs`` is joined with ``'\\n'``, in document order.

    NOTE(review): only ``doc.paragraphs`` is read; content that python-docx
    exposes elsewhere (e.g. tables) is presumably not included - confirm if
    the CVs rely on tables.

    Args:
        file_path: Path to the .docx file to read.

    Returns:
        str: The paragraph texts joined by newlines.
    """
    paragraphs = Document(file_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

def extract_texts_from_folder(folder_path):
    """Extract the text of every .docx file found directly in ``folder_path``.

    Entries are visited in sorted filename order. ``os.listdir`` returns names
    in arbitrary order, and downstream code relies on the order of the
    returned mapping, so sorting keeps repeated runs reproducible.

    Word's temporary lock files (names starting with ``~$``) are skipped
    because they are not real documents, and the ``.docx`` extension is
    matched case-insensitively.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict: Maps each matching filename (not the full path) to the text
        returned by ``extract_text_from_docx`` for that file.
    """
    extracted = {}
    for filename in sorted(os.listdir(folder_path)):
        # "~$name.docx" is a lock file Word creates while a document is open.
        if filename.startswith("~$"):
            continue
        if not filename.lower().endswith(".docx"):
            continue
        file_path = os.path.join(folder_path, filename)
        extracted[filename] = extract_text_from_docx(file_path)
    return extracted

In [3]:
# Folder holding the CV .docx files (relative to the notebook's working directory).
folder_path = "./DataSet/cv"
# Maps each .docx filename in that folder to its extracted text.
all_docx = extract_texts_from_folder(folder_path)

#### Now we have the text

In [4]:
# Peek at the first extracted CV to sanity-check the extraction step.
first_filename = list(all_docx)[0]
ex1 = all_docx[first_filename]
print(ex1)

Marius Petru Vlasceanu
Technical Skills
- Figma, Adobe XD
- Sketch, InVision
- HTML, CSS, JavaScript
- ReactJS, Bootstrap
- VueJS, TypeScript
Foreign Languages
- English: C1
- Spanish: B2
Education
- University Name: University of Bucharest
- Program Duration: 4 years
- Master Degree Name: University of Bucharest
- Program Duration: 2 years
Certifications
- Adobe Certified Expert (ACE) in XD
- Certified JavaScript Developer (CIW)
- Microsoft Certified: Azure Fundamentals
Project Experience
1. Interactive Portfolio Website
   Developed an interactive portfolio website using HTML, CSS, and JavaScript to showcase design and development skills. Utilized ReactJS and Bootstrap to create a responsive and visually appealing user interface. Integrated Figma and Adobe XD for designing prototypes and wireframes, ensuring a seamless user experience. The project was part of an academic course, focusing on modern web development practices and responsive design.

2. Mobile App Prototype Design
   Des

# Generating embeddings for the CV text

In [5]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the embedding model (shared by get_embedding below).
# NOTE(review): trust_remote_code=True allows custom model code shipped with
# the model repository to be executed locally - confirm the repository is
# trusted (and ideally pin a revision) before running this.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
# Define a function to generate embeddings
def get_embedding(data, precision="float32", task_prefix=None):
    """Encode ``data`` with the module-level ``model`` and return plain lists.

    Args:
        data: A single string, or a list of strings, to embed.
        precision: Forwarded unchanged to ``model.encode``.
        task_prefix: Optional string prepended to every input text before
            encoding. The nomic-embed-text-v1 model card describes task
            instruction prefixes such as ``"search_document: "`` for stored
            texts and ``"search_query: "`` for queries. The default ``None``
            encodes the text exactly as given, which is the original
            behaviour. Stored documents and queries must be embedded
            consistently, so existing vectors need re-embedding if a prefix
            is introduced.

    Returns:
        The result of ``model.encode(...)`` converted with ``.tolist()``
        (a list of floats for a single string).
    """
    if task_prefix:
        if isinstance(data, str):
            data = f"{task_prefix}{data}"
        else:
            data = [f"{task_prefix}{item}" for item in data]
    return model.encode(data, precision=precision).tolist()

!!!!!!!!!!!!megablocks not available, using torch.matmul instead
<All keys matched successfully>


In [7]:
# Embed every CV text. Iteration follows all_docx's insertion order, so
# embeddings[i] corresponds to the i-th key of all_docx.
embeddings = [get_embedding(cv_text) for cv_text in all_docx.values()]

In [8]:
# Inspect the embedding of the first CV (get_embedding returns it as a plain
# Python list, so the whole vector is printed).
print(embeddings[0])

[-0.006587756797671318, 0.007058674935251474, -0.0004201043338980526, -0.0349012091755867, 0.03421035408973694, 0.004854682367295027, 0.06140625476837158, -0.021277043968439102, -0.028232937678694725, 0.011516370810568333, -0.051762133836746216, 0.049724023789167404, 0.06496068835258484, -0.013552832417190075, -0.005131447687745094, 0.0010818943846970797, 0.07473091781139374, -0.06584971398115158, -0.07672295719385147, -0.008836505003273487, -0.03006737492978573, -0.06905496120452881, -0.0024879167322069407, 0.02293391525745392, 0.08013307303190231, 0.022568052634596825, 0.017462503165006638, 0.04709474369883537, -0.05485549941658974, 0.018557853996753693, 0.04111076891422272, 0.006198098883032799, -0.04277167469263077, 0.0024860799312591553, 0.03677287697792053, -0.018522901460528374, 0.0034059633035212755, -0.008957801386713982, -0.026482541114091873, -0.001956221414729953, -0.0012583074858412147, -0.033566609025001526, -0.013043999671936035, -0.01853296346962452, 0.05677834898233414

# Loading the vectors into MongoDB Atlas

In [9]:
def create_docs_with_embeddings(embeddings, texts, names):
    """Build one insert-ready document per (embedding, text, name) triple.

    The three inputs are parallel sequences: position ``i`` of each describes
    the same item. They must have the same length; previously a mismatch was
    silently truncated by ``zip``, which hid upstream bugs.

    Args:
        embeddings: Iterable of embedding vectors.
        texts: Iterable of the source texts.
        names: Iterable of item names.

    Returns:
        list[dict]: One dict per item with the keys ``"_id"`` (the 0-based
        position), ``"name"``, ``"text"`` and ``"embedding"``.

    Raises:
        ValueError: If the three inputs do not have the same length.
    """
    # Materialise first so generators/iterators are accepted and measurable.
    embeddings, texts, names = list(embeddings), list(texts), list(names)
    if not (len(embeddings) == len(texts) == len(names)):
        raise ValueError(
            "embeddings, texts and names must have the same length "
            f"(got {len(embeddings)}, {len(texts)}, {len(names)})"
        )
    return [
        {"_id": i, "name": name, "text": text, "embedding": embedding}
        for i, (embedding, text, name) in enumerate(zip(embeddings, texts, names))
    ]

In [10]:
# Split the filename -> text mapping into two parallel lists. Keys and values
# of a dict iterate in the same order, so names[i] belongs to texts[i].
names = list(all_docx.keys())
texts = list(all_docx.values())

In [11]:
# Combine the parallel lists into the documents that will be inserted.
docs = create_docs_with_embeddings(
    embeddings=embeddings,
    texts=texts,
    names=names,
)

#### Connecting to the db and inserting the docs

In [12]:
import pymongo

# Connect to your Atlas cluster.
# The connection string embeds the database username and password, so it is
# read from the MONGODB_URI environment variable instead of being hardcoded
# in the notebook (a missing variable raises KeyError).
# NOTE(review): the password that was previously committed in this cell
# should be rotated.
mongo_client = pymongo.MongoClient(os.environ["MONGODB_URI"])
db = mongo_client["syncv_mdb"]
collection = db["cvs"]

# Ingest data into Atlas.
# NOTE(review): the documents carry fixed integer _id values, so re-running
# this cell against a collection that already holds them is expected to fail
# with a duplicate-key error - confirm, and clear the collection first if a
# re-run is needed.
collection.insert_many(docs)

InsertManyResult([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 21

#### Create the Atlas Vector Search index

In [13]:
from pymongo.operations import SearchIndexModel

# Describe the single vector field to index: the "embedding" field of each
# document, compared by dot product.
# NOTE(review): numDimensions presumably has to equal the length of the
# vectors returned by get_embedding - confirm against the model's output size.
embedding_field = {
    "type": "vector",
    "path": "embedding",
    "similarity": "dotProduct",
    "numDimensions": 768,
}

# Wrap the field definition in an index model named "vector_index".
search_index_model = SearchIndexModel(
    definition={"fields": [embedding_field]},
    name="vector_index",
    type="vectorSearch",
)

# Create the search index on the collection; the call returns the index name.
collection.create_search_index(model=search_index_model)

'vector_index'

# Querying to find matches for a job description

Generating the embedding for the JD

In [16]:
# Load one job description with the same extractor used for the CVs and
# print it for a quick visual check.
jd_path = './DataSet/job_descriptions/job_description_35_Backend Developer.docx'
jd_example = extract_text_from_docx(jd_path)
print(jd_example)

Job Title:
Tech Lead Backend Developer
Company Overview:
At Innovative Tech Solutions, we are committed to delivering cutting-edge technology solutions that empower businesses to achieve their goals. Our team is passionate about innovation, collaboration, and excellence, and we strive to create a dynamic work environment that fosters growth and creativity. Join us in our mission to drive technological advancement and make a meaningful impact in the industry.
Key Responsibilities:
- Lead the design, development, and deployment of scalable backend services and applications.
- Collaborate with cross-functional teams to define, design, and ship new features.
- Mentor and guide junior developers, fostering a culture of continuous learning and improvement.
- Ensure the performance, quality, and responsiveness of applications.
- Identify and resolve bottlenecks and bugs, and devise solutions to mitigate and address these issues.
- Conduct code reviews to maintain code quality and best practic

Querying using the pipeline

In [18]:
# Embed the job description so it can be compared with the stored CV vectors.
query_embedding = get_embedding(jd_example)

# Stage 1: vector search over the "embedding" field using "vector_index",
# keeping the 5 best matches. "exact": True requests an exact (rather than
# approximate) nearest-neighbour search.
vector_search_stage = {
    "$vectorSearch": {
        "index": "vector_index",
        "queryVector": query_embedding,
        "path": "embedding",
        "exact": True,
        "limit": 5,
    }
}

# Stage 2: return only the CV text and the similarity score of each match.
projection_stage = {
    "$project": {
        "_id": 0,
        "text": 1,
        "score": {"$meta": "vectorSearchScore"},
    }
}

pipeline = [vector_search_stage, projection_stage]

# Run the search against the CV collection.
results = collection.aggregate(pipeline)

# Print each match with its 1-based rank and its score to four decimals.
for rank, match in enumerate(results, start=1):
    header = f"\n=== Result {rank} (Score: {match['score']:.4f}) ===\n"
    print(header)
    print(match["text"])


=== Result 1 (Score: 0.8300) ===

Teodor Mărgineanu
Technical Skills
- JavaScript, ReactJS, TypeScript
- Java, Spring Boot, REST APIs
- AWS, Docker, Kubernetes
- SQL, PostgreSQL, Node.js
- Figma, InVision
Foreign Languages
- English: C1
- Spanish: B2
Education
- University Name: University Politehnica of Bucharest
- Program Duration: 4 years
- Master Degree Name: University Politehnica of Bucharest
- Program Duration: 2 years
Certifications
- AWS Certified Solutions Architect – Professional
- Certified Kubernetes Administrator (CKA)
- Oracle Certified Professional, Java SE 11 Developer
Project Experience
1. **Enterprise Web Application Development**
   Led the development of a scalable enterprise web application using ReactJS and TypeScript for the frontend, and Spring Boot for the backend. Implemented RESTful APIs to facilitate seamless data exchange between the client and server, enhancing performance and user experience. Utilized Docker and Kubernetes for containerization and orche

In [36]:
# NOTE(review): the execution counts show this cell (In [36]) was run after
# the single-CV insert cell that appears below it (In [35]); on a fresh
# top-to-bottom run the results may differ - confirm the intended cell order.

# Embed the Project Manager job description straight from its .docx file.
jd_text = extract_text_from_docx('./DataSet/job_descriptions/job_description_86_Project_Manager.docx')
query_embedding = get_embedding(jd_text)

# Same two-stage search as for the previous job description, but keeping
# only the top 3 matches.
search_stage = {
    "$vectorSearch": {
        "index": "vector_index",
        "queryVector": query_embedding,
        "path": "embedding",
        "exact": True,
        "limit": 3,
    }
}
project_stage = {
    "$project": {
        "_id": 0,
        "text": 1,
        "score": {"$meta": "vectorSearchScore"},
    }
}
pipeline = [search_stage, project_stage]

results = collection.aggregate(pipeline)

# Print each match with its 1-based rank and its score to four decimals.
for position, hit in enumerate(results, start=1):
    print(f"\n=== Result {position} (Score: {hit['score']:.4f}) ===\n")
    print(hit["text"])


=== Result 1 (Score: 0.8726) ===


Project Manager Resume
First Last Name
123 Main Street., Anytown, USA 10000 * email@address * 123-123-4567 * linkedin.com/username

Write a summary of one’s professional experience and career goals.

Summary 
Enthusiastic and detail-oriented Project Manager with a solid foundation in project management principles and a passion for delivering results. Recent graduate with a bachelor’s degree in business administration and hands-on experience in agile project management practices. Skilled in coordinating teams, managing project timelines, and ensuring seamless communication between stakeholders. Eager to contribute organizational, communication, and problem-solving skills to help teams meet project goals on time and within budget.

	

Skills & Qualifications
Project Management Tools: Familiar with ProjectManager and Microsoft Project
Methodologies: Agile (Scrum & Kanban), Waterfall
Project Planning & Scheduling: Creating timelines, defining milestones 

In [35]:
# Embed and insert one additional CV.
# NOTE(review): the execution counts show this cell (In [35]) was run before
# the Project Manager query cell that appears above it (In [36]); on a fresh
# top-to-bottom run that query would execute before this insert - confirm the
# intended cell order.
cv_path = './DataSet/cv/Project_Manager_Resume_Template_ProjectManager_Word_WLNK.docx'
txt = extract_text_from_docx(cv_path)
cv_one_embedding = get_embedding(txt)
cv_one = {
    # Use the filename as the name, consistent with the bulk-loaded documents
    # (whose names are the .docx filenames), instead of a placeholder value.
    "name": os.path.basename(cv_path),
    "text": txt,
    "embedding": cv_one_embedding,
}
# No "_id" is supplied here, so the server-side document gets a generated
# ObjectId rather than an integer id like the bulk-loaded documents.
collection.insert_one(cv_one)


InsertOneResult(ObjectId('67fbcd47686c21ecc0d624cb'), acknowledged=True)