## Importing the necessary libraries to handle the pdf 

In [2]:
import fitz
from langchain_community.document_loaders import PyPDFLoader
import pdfplumber
import pytesseract
from PIL import Image
import io

# Step 1: Loading the pdf files  

In [3]:
pdf_files=["budget.pdf","farmerbook.pdf", "llama.pdf"]

### Reading the number of pages in each pdf

In [4]:
total_pages = 0
for pdf in pdf_files:
    loader=PyPDFLoader(pdf)
    no_page = len(loader.load())
    print(f"In '{pdf}' no of pages : {no_page}")
    total_pages += no_page
print(f"Total number of pages in all PDFs: {total_pages}")


In 'budget.pdf' no of pages : 35
In 'farmerbook.pdf' no of pages : 154
In 'llama.pdf' no of pages : 77
Total number of pages in all PDFs: 266


### Read the content of pdf files

In [5]:
# Create a function to read the text from pdf using fitz (pymupdf) 
def extract_text_with_fitz(pdf_path):
    # open the pdf and create a doc object that will allow further operations like extraction
    doc = fitz.open(pdf_path)
    # create a empty string to store the text
    text=""
    # iterate through each page in the pdf
    for page in doc:
        # extract the plain text from the page 
        text += page.get_text("text") + "\n"
    return text


# Create a function to read the tables value from the pdf using pdfplumber
def extract_tables_with_pdfplumber(pdf_path):
    # open the pdf using pdfplumber
    with pdfplumber.open(pdf_path) as pdf:
        tables = []
        # iterate through each page in the pdf
        for page in pdf.pages:
            # extract the tables from the page
            tables_on_page = page.extract_tables()
            # append the tables to the list
            tables.extend(tables_on_page)
    return tables


# Use empty dictionary to store the extracted data from each PDF
pdf_data={}
# Iterate through each PDF file and extract text, tables, and images 
for pdf in pdf_files:
    # Extract text, tables, and images from the PDF
    pdf_data[pdf]={
        "text": extract_text_with_fitz(pdf),
        "tables": extract_tables_with_pdfplumber(pdf)
    }


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

In [6]:
print(pdf_data["budget.pdf"]["text"][:500])  # Print first 500 characters of text from budget.pdf

India Union 
Budget 2024-25
Tax Flash News
#Budget2024    I    #KPMGBudgetLive 
KPMG. Make the Difference.
July 2024
kpmg.com/in

India Union Budget 2024-25
© 2024 KPMG Assurance and Consulting Services LLP, an Indian Limited Liability Partnership and a member firm of the KPMG global organization of 
independent member firms affiliated with KPMG International Limited, a private English company limited by guarantee. All rights reserved.
2
Foreword
The Hon'ble Finance Minister Nirmala 
Sitharaman 


In [7]:
print(pdf_data["farmerbook.pdf"]["tables"][:500]) 

[[['', None, None, None], [None, '', None, None], ['', 'Farmer’s Handbook on Bas', 'ic Agriculture', ''], ['', '', None, None], ['', '', '', ''], ['', '', None, None]], [['', '']], [['Crop', 'Water\nsaving (%)', 'Yield\nincrease (%)'], ['Bajra', '56', '19'], ['Barley', '56', '16'], ['Bhendi', '28', '23'], ['Cabbage', '40', '3'], ['Cauliflower', '35', '12'], ['Chillies', '33', '24'], ['Cotton', '36', '50'], ['Cowpea', '19', '3'], ['Fenugreek', '29', '25'], ['Garlic', '28', '6'], ['Gram', '69', '57'], ['Groundnut', '20', '40'], ['Jowar', '55', '34'], ['Lucerne', '16', '27'], ['Maize', '41', '36'], ['Onion', '33', '23'], ['Potato', '46', '4'], ['Sunflower', '33', '20'], ['Wheat', '35', '24']], [['Crop Type', 'Crop Example'], ['Cereals', 'Maize, Sorghum, Wheat, Jowar'], ['Flowers', 'Carnation, Jasmine, Marigold'], ['Oilseeds', 'Groundnut, Mustard, Sunflower'], ['Vegetables', 'Onion, Potato, Radish, Carrot'], ['Fodders', 'Asparagus, Pastures'], ['Pulses', 'Gram, Pigeon pea, Beans'], ['Plant

In [8]:
print(pdf_data["llama.pdf"]["text"][:500]) 

Llama 2: Open Foundation and Fine-Tuned Chat Models
Hugo Touvron∗
Louis Martin†
Kevin Stone†
Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra
Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen
Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller
Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou
Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev


# Step 2: Semantic Chunking

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200
)
#  Load a pretrained Sentence Transformer model
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

chunked_data = {}
semantic_embeddings = {}
# Iterate through each PDF file and split the text into chunks
for pdf in pdf_data:
    #Split the text into chunks
    text_chunks=text_splitter.split_text(pdf_data[pdf]["text"])
    # print("Printing the text  data *************************************")
    # print(text_chunks)
    print()
    chunked_data[pdf]=text_chunks
    # print(f"Printing the chunked  data {pdf} *************************************")
    # print(chunked_data[pdf])
    # print(len(chunked_data[pdf]))
    print()
    # Create semantic embeddings for each chunk
    semantic_embeddings[pdf]= embed_model.encode(text_chunks)
    print(f"Printing the embedding  data {pdf} *************************************")
    print(semantic_embeddings[pdf][:2])  # Print first 2 embeddings for brevity
    print(len(semantic_embeddings[pdf]))
    print()

  from .autonotebook import tqdm as notebook_tqdm




Printing the embedding  data budget.pdf *************************************
[[-2.22761575e-02  2.05192179e-03  3.57893296e-02 -7.39485072e-03
   4.71094884e-02  7.06946179e-02  2.08877027e-02 -2.36461852e-02
  -5.80396987e-02  2.66137738e-02 -7.60733411e-02 -5.89257292e-02
  -2.81909630e-02 -3.58236432e-02 -1.75641011e-02  1.44040538e-02
   1.46754738e-02 -2.43764799e-02 -9.91383684e-04 -5.83276153e-02
   5.84420189e-02  2.76828688e-02 -5.30117052e-03 -1.20240180e-02
   3.16191427e-02  2.11418495e-02 -3.97702828e-02 -7.49481246e-02
   5.87254809e-03 -1.29330056e-02  2.59372406e-02  8.93196389e-02
  -1.11224145e-01 -2.11711172e-02  2.26059649e-02 -6.96019903e-02
  -4.00937647e-02  4.94101271e-02  1.19887471e-01 -2.35635992e-02
   4.62767184e-02 -6.19027130e-02 -4.23513092e-02 -4.74723522e-03
   2.63148397e-02  2.84015853e-02 -2.88658533e-02 -5.41925849e-03
  -8.15967023e-02  5.42326197e-02  3.35257463e-02 -3.50057706e-02
  -5.96492458e-03 -4.84189987e-02  3.62104364e-02 -7.49084279e

In [10]:
print(semantic_embeddings["llama.pdf"][:2])  # Print first 2 embeddings for budget.pdf

[[ 4.39159423e-02  1.62548367e-02 -3.55705433e-02  1.25930691e-02
  -4.54727113e-02  4.94389534e-02 -6.55602142e-02  2.35310029e-02
  -4.12150361e-02  2.45272759e-02 -3.71516123e-02 -3.91522013e-02
   5.65604456e-02 -6.22282885e-02  1.30237844e-02  3.36664878e-02
   5.75224869e-02 -3.96889187e-02 -2.20016460e-04 -1.72956679e-02
   1.35655107e-03 -6.21606633e-02 -4.18856740e-03  1.23841426e-04
  -3.80389877e-02  2.71947421e-02 -6.92169145e-02  2.37007812e-02
  -4.93363030e-02 -6.64035305e-02 -4.88103516e-02  1.44769102e-01
   7.90138245e-02  8.51079356e-03 -5.62781058e-02  3.02345678e-02
   5.64189628e-02  5.47259971e-02 -5.82731366e-02 -1.51890833e-02
   7.67392525e-03 -8.09152722e-02 -4.05215845e-02 -3.40524167e-02
   9.35274512e-02 -1.07753865e-01  1.15959048e-02 -2.97959466e-02
   2.46927422e-02 -3.74809979e-03 -3.99571955e-02 -8.79745930e-02
   6.70397803e-02 -5.95054030e-03  1.56720988e-02 -7.19009563e-02
  -1.09704897e-01 -7.02711120e-02 -9.66429710e-03 -2.14969795e-02
   7.36249

In [11]:
print(chunked_data)



In [12]:
print(chunked_data["budget.pdf"][:2])  # Print first 2 chunks of text from budget.pdf

['India Union \nBudget 2024-25\nTax Flash News\n#Budget2024    I    #KPMGBudgetLive \nKPMG. Make the Difference.\nJuly 2024\nkpmg.com/in', "India Union Budget 2024-25\n© 2024 KPMG Assurance and Consulting Services LLP, an Indian Limited Liability Partnership and a member firm of the KPMG global organization of \nindependent member firms affiliated with KPMG International Limited, a private English company limited by guarantee. All rights reserved.\n2\nForeword\nThe Hon'ble Finance Minister Nirmala \nSitharaman presented her seventh \nconsecutive Union Budget for the fiscal \n2024-25 and the first of the Modi 3.0 \nGovernment signaling that the Indian \neconomy is on a strong wicket and a \nstable footing as well as demonstrating \nresilience in the face of geopolitical \nchallenges.\nMaintaining a strong GDP growth rate \nof 8.2 per cent in FY24, India aspires for \nan ambitious 6.5–7 per cent real GDP \ngrowth in FY25. For pursuit of ‘Viksit \nBharat’, this budget envisages sustained 

In [13]:

print(semantic_embeddings )

{'budget.pdf': array([[-0.02227616,  0.00205192,  0.03578933, ..., -0.06585979,
        -0.00954357,  0.06061332],
       [-0.02242715, -0.02857282, -0.00437082, ..., -0.1034427 ,
        -0.01807587,  0.00666775],
       [ 0.01623793, -0.00682482, -0.05466833, ..., -0.03655886,
         0.02304079,  0.01783749],
       ...,
       [-0.03412153, -0.00576887, -0.01575049, ..., -0.01890368,
         0.0053317 ,  0.00686479],
       [-0.06845634,  0.00796991,  0.02653745, ..., -0.10781474,
        -0.02059077, -0.01717774],
       [-0.04021035, -0.02996797,  0.02117627, ..., -0.09713436,
        -0.02516126, -0.00218592]], shape=(105, 384), dtype=float32), 'farmerbook.pdf': array([[ 3.49025067e-05,  3.21891345e-02, -2.67987587e-02, ...,
        -2.23541744e-02,  8.18692334e-03, -7.77861942e-03],
       [-3.47742736e-02, -3.66200320e-02, -6.37048036e-02, ...,
        -5.15435487e-02, -7.04142905e-04, -1.66143123e-02],
       [-7.66716599e-02, -4.85326658e-04, -7.25178570e-02, ...,
        

# Step 3: Store in Vector Database

In [14]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Zilliz Cloud connection credentials
CLUSTER_ENDPOINT = "https://in03-4c1ebc679337871.serverless.gcp-us-west1.cloud.zilliz.com"
TOKEN = "c7460a658be47de55d8d4e7246180aee4a8fa6063a4b011d05bffd830a6a48f06279fea8310d4ea18c62601de09306de68a43e13"

# Connect to Zilliz Cloud
connections.connect("default", uri=CLUSTER_ENDPOINT, token=TOKEN)

In [15]:
from pymilvus import Collection, utility

for pdf in pdf_data:
    safe_collection_name = pdf.replace(".", "_") + "_chunks"

    # Check if collection already exists
    if utility.has_collection(safe_collection_name):
        print(f"Collection '{safe_collection_name}' already exists in Zilliz Cloud. Skipping upload.")
        continue  # Skip if data is already stored

    # Define schema
    schema = CollectionSchema([
        FieldSchema(name="chunk_id", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=len(semantic_embeddings[pdf][0]))
    ])

    # Create collection
    collection = Collection(name=safe_collection_name, schema=schema)

    # Ensure correct data format
    insert_data = [
        {
            "chunk_id": int(i),  # Explicitly convert chunk ID to integer
            "embedding": semantic_embeddings[pdf][i].tolist()
        }
        for i in range(len(chunked_data[pdf]))
    ]

    # Debugging before inserting
    print(f"Inserting {len(insert_data)} rows into {safe_collection_name}...")
    print(f"Schema Fields: {[field.name for field in collection.schema.fields]}")
    print(f"Sample Insert Data Row: {insert_data[0]}")
    print(f"Row Length: {len(insert_data[0])}")

    collection.insert(insert_data)
    collection.flush()

print("Data successfully stored in Zilliz Cloud.")

Collection 'budget_pdf_chunks' already exists in Zilliz Cloud. Skipping upload.
Collection 'farmerbook_pdf_chunks' already exists in Zilliz Cloud. Skipping upload.
Collection 'llama_pdf_chunks' already exists in Zilliz Cloud. Skipping upload.
Data successfully stored in Zilliz Cloud.


In [16]:
# for pdf in pdf_data:
#     safe_collection_name = pdf.replace(".", "_") + "_chunks"

#     schema = CollectionSchema([
#         FieldSchema(name="chunk_id", dtype=DataType.INT64, is_primary=True),
#         FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=len(semantic_embeddings[pdf][0]))
#     ])

#     collection = Collection(name=safe_collection_name, schema=schema)

#     # Ensure correct data format
#     insert_data = [
#         {
#             "chunk_id": int(i),  # Explicitly convert chunk ID to integer
#             "embedding": semantic_embeddings[pdf][i].tolist()
#         }
#         for i in range(len(chunked_data[pdf]))
#     ]

#     # Debugging before inserting
#     print(f"Inserting {len(insert_data)} rows into {safe_collection_name}...")
#     print(f"Schema Fields: {[field.name for field in collection.schema.fields]}")
#     print(f"Sample Insert Data Row: {insert_data[0]}")
#     print(f"Row Length: {len(insert_data[0])}")

#     collection.insert(insert_data)
#     collection.flush()

# print("Data successfully stored in Zilliz Cloud.")

In [17]:
from pymilvus import utility

index_list = utility.list_indexes("llama_pdf_chunks")
print(f"Indexes for collection: {index_list}")

Indexes for collection: ['embedding']


In [18]:
from pymilvus import Index, Collection

# List of your chunk collections
collections = ["budget_pdf_chunks", "farmerbook_pdf_chunks", "llama_pdf_chunks"]

# Index parameters (Using HNSW for optimized nearest neighbor search)
index_params = {
    "index_type": "HNSW",      # Changed from IVF_FLAT to HNSW
    "metric_type": "COSINE",   # Alternatives: "L2" (Euclidean), "COSINE" for similarity
    "params": {"M": 16, "efConstruction": 200}  # Fine-tuning HNSW parameters
}

# Create index for each collection
for collection_name in collections:
    collection = Collection(collection_name)
    
    # Check if index already exists
    if collection.has_index():
        print(f"Index already exists for {collection_name}. Skipping...")
    else:
        print(f"Creating HNSW index for {collection_name}...")
        index = Index(Collection(collection_name), "embedding", index_params)
        print(f"HNSW index successfully created for {collection_name}.")

print("All collections indexed successfully with HNSW.")

Index already exists for budget_pdf_chunks. Skipping...
Index already exists for farmerbook_pdf_chunks. Skipping...
Index already exists for llama_pdf_chunks. Skipping...
All collections indexed successfully with HNSW.


In [19]:
collection.load()

In [20]:
# Retrieval Pipeline
def retrieve(query, pdf_name):
    query_embedding = embed_model.encode([query])

    # Ensure correct collection name formatting
    safe_collection_name = pdf_name.replace(".", "_") + "_chunks"

    # Load the collection before searching
    collection = Collection(safe_collection_name)
    collection.load()  # Ensure collection is ready for querying

    # Adjust search parameters for HNSW
    search_params = {
        "metric_type": "COSINE",  # Matching the index type used
        "params": {"ef": 50}       # Higher `ef` improves accuracy in HNSW searches
    }

    results = collection.search(
        data=query_embedding,
        anns_field="embedding",
        param=search_params,
        limit=5,
        output_fields=["chunk_id"]  # Ensure Milvus returns this field
    )

    # Extract chunk IDs properly
    retrieved_chunk_ids = [r.fields["chunk_id"] for r in results[0]]

    return [chunked_data[pdf_name][chunk_id] for chunk_id in retrieved_chunk_ids]

query = "Summarize key points"
retrieved_chunks = {pdf: retrieve(query, pdf) for pdf in pdf_files}

In [21]:
print(retrieved_chunks)

{'budget.pdf': ["goods reaches the market.\nThe interim budget, aligned with the \n'Viksit Bharat' roadmap had prioritised \nsupport for four key population \nsegments: the poor, women, youth, and \nfarmers. Notable measures encompass \nthe expansion of the Pradhan Mantri \nGarib Kalyan Anna Yojana, benefitting \nover 80 crore individuals, and the \nannouncement of increased Minimum \nSupport Prices (MSP) for major crops.\nBudget Theme and Priorities\nReflecting on the budget's central theme, \nthere is a notable emphasis on \nemployment, skill development, support \nfor MSMEs, and the middle class. The \nPrime Minister's package comprising \nfive schemes and initiatives is aimed at \nfacilitating employment, skill \nenhancement, and other opportunities \nfor 4.1 crore youth over a five-year \nperiod, with a central outlay of INR2 lakh \ncrore. This year, INR1.48 lakh crore has \nbeen earmarked for education, \nemployment, and skill development. \nFurthermore, the budget outlines nine 

In [22]:
# Evaluated Retrieval time
def retrieve(query, pdf_name):
    query_embedding = embed_model.encode([query])

    # Ensure correct collection name formatting
    safe_collection_name = pdf_name.replace(".", "_") + "_chunks"

    # Load the collection before searching
    collection = Collection(safe_collection_name)
    collection.load()  # Ensure collection is ready for querying

    # Adjust search parameters for HNSW
    search_params = {
        "metric_type": "COSINE",  # Matching the index type used
        "params": {"ef": 50}       # Higher `ef` improves accuracy in HNSW searches
    }

    results = collection.search(
        data=query_embedding,
        anns_field="embedding",
        param=search_params,
        limit=5,
        output_fields=["chunk_id"]  # Ensure Milvus returns this field
    )

    # Corrected retrieval approach
    retrieved_chunk_ids = [r.fields["chunk_id"] for r in results[0]]

    return [chunked_data[pdf_name][chunk_id] for chunk_id in retrieved_chunk_ids]

query = "Summarize key points"
retrieved_chunks = {pdf: retrieve(query, pdf) for pdf in pdf_files}

In [23]:
print(retrieved_chunks)

{'budget.pdf': ["goods reaches the market.\nThe interim budget, aligned with the \n'Viksit Bharat' roadmap had prioritised \nsupport for four key population \nsegments: the poor, women, youth, and \nfarmers. Notable measures encompass \nthe expansion of the Pradhan Mantri \nGarib Kalyan Anna Yojana, benefitting \nover 80 crore individuals, and the \nannouncement of increased Minimum \nSupport Prices (MSP) for major crops.\nBudget Theme and Priorities\nReflecting on the budget's central theme, \nthere is a notable emphasis on \nemployment, skill development, support \nfor MSMEs, and the middle class. The \nPrime Minister's package comprising \nfive schemes and initiatives is aimed at \nfacilitating employment, skill \nenhancement, and other opportunities \nfor 4.1 crore youth over a five-year \nperiod, with a central outlay of INR2 lakh \ncrore. This year, INR1.48 lakh crore has \nbeen earmarked for education, \nemployment, and skill development. \nFurthermore, the budget outlines nine 

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

accuracy_scores = {}

# Precompute query embedding to avoid redundant calculations
query_embedding = embed_model.encode([query])[0]

for pdf in pdf_files:
    accuracy_scores[pdf] = [
        cosine_similarity([query_embedding], [embed_model.encode([chunk])[0]])[0][0]
        for chunk in retrieved_chunks[pdf]
    ]

# Calculate average accuracy score per PDF
final_scores = {pdf: sum(scores) / len(scores) if scores else 0 for pdf, scores in accuracy_scores.items()}
print(final_scores)

{'budget.pdf': np.float32(0.09689766), 'farmerbook.pdf': np.float32(0.16040792), 'llama.pdf': np.float32(0.2797297)}


In [25]:
from fastbm25 import fastbm25 as BM25

bm25_scores = {}

# Ensure the query is tokenized properly
query_tokens = query.lower().split()

for pdf in pdf_files:
    if not chunked_data.get(pdf):  # Handle missing chunk data
        print(f"Warning: No chunked data found for {pdf}. Skipping...")
        continue

    # Tokenize the document chunks for BM25
    tokenized_chunks = [chunk.lower().split() for chunk in chunked_data[pdf]]

    bm25 = BM25(tokenized_chunks)  # Initialize BM25 model with tokenized chunks

    # Use `top_k_sentence()` instead of `get_scores()`
    ranked_chunks = bm25.top_k_sentence(query_tokens, k=5)  # Retrieve top 5 relevant chunks

    # Extract only the text from the ranked results
    bm25_scores[pdf] = [chunked_data[pdf][idx] for _, idx, _ in ranked_chunks]

print("BM25 reranking completed.")

BM25 reranking completed.


In [26]:
import os 
from dotenv import load_dotenv

In [27]:
import os
from langchain_google_genai import GoogleGenerativeAI

# Load Google API key
google_api = os.getenv("GOOGLE_API_KEY")

# Initialize Gemini LLM (Use Gemini 2.5 Pro for best performance)
llm = GoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=google_api)

# Define prompt template
prompt_template = """Summarize the extracted PDF chunks:
Context: {context}
User Query: {query}
Response:"""

# Generate responses using Gemini
responses = {
    pdf: llm.invoke(prompt_template.format(context="\n".join(bm25_scores[pdf][:5]), query=query))
    for pdf in pdf_files
}

print(responses)



In [29]:
from docx import Document

# Create a new Word document
doc = Document()
doc.add_heading("RAG Pipeline Summary", 0)

# Iterate through responses, ensuring proper formatting
for pdf, response in responses.items():
    if not response:  # Handle cases where response is empty or missing
        response = "No relevant information found in this document."

    doc.add_heading(pdf, level=1)
    doc.add_paragraph(response)

    # Add a separator for better readability
    doc.add_paragraph("\n" + "-" * 50 + "\n")

# Save the document
doc.save("rag_output.docx")

print("RAG output saved successfully as 'rag_output.docx'.")

RAG output saved successfully as 'rag_output.docx'.
