In [1]:
import comtypes.client

def convert_doc_to_docx(doc_path, docx_path):
    """Convert a legacy ``.doc`` file to ``.docx`` by automating Microsoft Word over COM.

    Args:
        doc_path: Path of the existing ``.doc`` file to open.
        docx_path: Path the converted document is saved to.

    Both paths are converted to absolute paths before being handed to Word,
    since Word runs as a separate process and is not guaranteed to share
    Python's working directory.

    The document and the Word application are always closed, even when
    opening or saving fails, so no hidden Word process is left behind.
    """
    import os

    source_path = os.path.abspath(doc_path)
    target_path = os.path.abspath(docx_path)

    word = comtypes.client.CreateObject("Word.Application")
    try:
        doc = word.Documents.Open(source_path)
        try:
            doc.SaveAs(target_path, FileFormat=16)  # 16 corresponds to .docx format
        finally:
            doc.Close()
    finally:
        # Quit in all cases; otherwise a failed Open/SaveAs leaks a Word process.
        word.Quit()

# Example Usage
# Raw strings keep the Windows backslashes literal; sequences such as "\P" or
# "\D" in a normal string are invalid escapes and trigger a SyntaxWarning on
# recent Python versions.
convert_doc_to_docx(r"D:\ProvenTech\Document_1_Test.doc", r"D:\ProvenTech\converted_doc1.docx")

In [2]:
from docx import Document
import pandas as pd

def extract_table_to_dataframe(file_path, table_index=0):
    """Read one table from a .docx file into a DataFrame, using its first row as header.

    Args:
        file_path: Path of the Word (.docx) document to load.
        table_index: Position of the table within the document (0 = first
            table). Negative values count from the end, as with list indexing.

    Returns:
        A DataFrame of cell texts, with newlines replaced by spaces and
        surrounding whitespace stripped. Column labels come from the table's
        first row; the remaining rows keep their positional labels, so the
        index starts at 1. A table without any rows yields an empty DataFrame.

    Raises:
        IndexError: If ``table_index`` does not refer to a table in the document.
    """
    tables = Document(file_path).tables

    # Validate both ends so every bad index gets the same descriptive message.
    if not -len(tables) <= table_index < len(tables):
        raise IndexError(f"Table index {table_index} is out of range. The document has {len(tables)} tables.")

    table = tables[table_index]

    # One list of cleaned cell strings per table row.
    data = [
        [cell.text.replace('\n', ' ').strip() for cell in row.cells]
        for row in table.rows
    ]

    df = pd.DataFrame(data)
    if df.empty:
        # Nothing to promote to a header; avoid an opaque iloc[0] failure.
        return df

    # Use plain labels so the columns axis does not inherit the row label `0`
    # as its name.
    header = df.iloc[0].tolist()
    # Copy so later in-place edits by callers act on an independent frame.
    df = df.iloc[1:].copy()
    df.columns = header

    return df

# Example Usage
# Which table to pull (0 is the first table in the document) ...
table_index = 0
# ... and the Word document to read it from (replace with your own path).
file_path = r"D:\ProvenTech\converted_doc1.docx"

df = extract_table_to_dataframe(file_path, table_index)


In [3]:
# Display the table extracted from the Word document.
df

Unnamed: 0,S. No.,Test,Specification Limit,Reference
1,1.0,Description,White to off white capsules,In-house
2,2.0,Blend Homogeneity,Stage – 1: Individual values should be between...,In-house
3,3.0,Water by KF,Not more than 8.0 % w/w,In-house


In [5]:
# Show the full text of the third column in the second data row; the frame
# display truncates this long value.
df.iloc[1,2]

'Stage – 1: Individual values should be between 90.0% to 110.0%, average of all individuals should be within 95.0% - 105.0% of labelled amount, RSD NMT 5.0%  Stage – 2: Individual values should be between 85.0% to 115.0%, Average of all individuals should be within 95.0%  - 105.0% of labelled amount, RSD NMT 5.0%'

In [11]:
# Replace the header labels taken from the document with shorter names.
df.columns = ["Sr.", "Test", "Specification", "Reference"]

In [12]:
# Display the table again to confirm the new column names.
df

Unnamed: 0,Sr.,Test,Specification,Reference
1,1.0,Description,White to off white capsules,In-house
2,2.0,Blend Homogeneity,Stage – 1: Individual values should be between...,In-house
3,3.0,Water by KF,Not more than 8.0 % w/w,In-house


In [13]:
# Take an independent snapshot: a plain assignment would only alias `df`, so
# any later change to `df` would silently change SPEC_TABLE as well.
SPEC_TABLE = df.copy()

In [14]:
# Display the specification table.
SPEC_TABLE

Unnamed: 0,Sr.,Test,Specification,Reference
1,1.0,Description,White to off white capsules,In-house
2,2.0,Blend Homogeneity,Stage – 1: Individual values should be between...,In-house
3,3.0,Water by KF,Not more than 8.0 % w/w,In-house


In [15]:
# Write the specification table to CSV without the index column.
# Raw string: "\P" and "\S" in a normal string are invalid escape sequences.
SPEC_TABLE.to_csv(r"D:\ProvenTech\SPEC_TABLE_1.csv", index=False)

In [17]:
# First data row as a plain Python list; this is the value passed to the
# chain as its `lst` input.
lst = df.iloc[0].tolist()
lst

['1.0', 'Description', 'White to off white capsules', 'In-house']

In [None]:
import os

from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_groq import ChatGroq

# The Groq credential is read from the GROQ_API_KEY environment variable so
# that no secret is stored in the notebook; a missing variable fails fast
# with a KeyError naming it.
llm = ChatGroq(
        model_name="llama-3.1-70b-versatile",
        temperature=0,
        groq_api_key=os.environ["GROQ_API_KEY"]
    )

# Prompt that turns one SPEC_TABLE row (given as a list) into one or more
# LIMITS rows. `{lst}` is the only template variable.
prompt_template = PromptTemplate.from_template(
    """You are an expert in deriving a list from another list based on given conditions.
    you are given a row of a dataframe in the form of list as input. the columns of this dataframe is [Sr., Test,Specification,Reference].
    you have to return a list so that I can append it to a new dataframe. the new dataframe columns are [TEST_NAME,SUB_TEST,TEXT_LIMIT,NLT(Not Less Than),NMT(Not More Than)]
    so keep in mind to return list of values in such a way that I can append to this dataframe.
    here are the conditions of convertion:
    TEST_NAME: This will be directly taken from the Test column in SPEC_TABLE.
    SUB_TEST: If a sub-test is present, populate this column with its name; otherwise, use the value from the Test column.
    TEXT_LIMIT: Extract this directly from the Specification column of SPEC_TABLE.
    NLT (Not Less Than): Extract this value from the Specification text where it states "not less than" and store it as a string.
    NMT (Not More Than): Similarly, extract this value where it states "not more than" and store it as a string.
    Handling "Between" Specifications:
    If the Specification mentions "between," you will encounter two values. Populate these values accordingly:
    The first value should be assigned to NLT,
    The second value should be assigned to NMT.
    If Specification value has more than one value then return those number of lists with seperated sub topics and mention those names in subtests.
    example for splitting: input:[2.0,"Blend Homogeneity","Stage - 1: here is stage1 content. Stage - 2: here is stage2 content","In-house"]
    output:[["Blend Homogeneity","Stage - 1","here is stage1 content.","NLT extracted from specification","NMT extracted from specification"],
    ["Blend Homogeneity","Stage - 2","here is stage2 content.","NLT extracted from specification","NMT extracted from specification"]]
    if NLT and NMT are not present just put None.
    input :{lst}
    output: only return list and nothing else.
    NO PREAMBLE
    return only list not code or any explanation

    """
)

# NOTE(review): LLMChain and chain.run appear to be deprecated in newer
# LangChain releases; they are kept because other cells call chain.run(...).
chain = LLMChain(llm=llm, prompt=prompt_template)

# The chain returns text (a string that looks like a Python list), not a list.
response = chain.run({"lst":lst})

print(response)

['Description', 'Description', 'White to off white capsules', None, None]


In [32]:
# Second data row: send it through the chain and show the raw text reply.
lst = df.iloc[1].tolist()
response = chain.run({"lst": lst})
print(response)

[['Blend Homogeneity', 'Stage - 1', 'Individual values should be between 90.0% to 110.0%, average of all individuals should be within 95.0% - 105.0% of labelled amount, RSD NMT 5.0%', '90.0%', '110.0%'], 
 ['Blend Homogeneity', 'Stage - 2', 'Individual values should be between 85.0% to 115.0%, Average of all individuals should be within 95.0%  - 105.0% of labelled amount, RSD NMT 5.0%', '85.0%', '115.0%']]


In [33]:
# Third data row: send it through the chain and show the raw text reply.
lst = df.iloc[2].tolist()
response = chain.run({"lst": lst})
print(response)

['Water by KF', 'Water by KF', 'Not more than 8.0 % w/w', None, '8.0 % w/w']


In [35]:
# Column layout of the LIMITS table, in output order.
columns = [
    "TEST_NAME",
    "SUB_TEST",
    "TEXT_LIMIT",
    "NLT (Not Less Than)",
    "NMT (Not More Than)",
]

# Start from a frame that has those columns and no rows yet.
LIMITS = pd.DataFrame(columns=columns)

# Function to append a row or multiple rows to the DataFrame
def append_to_dataframe(LIMITS, data):
    """Return a new DataFrame consisting of ``LIMITS`` plus the row(s) in ``data``.

    Args:
        LIMITS: Existing DataFrame; its columns define the layout of new rows.
            It is not modified.
        data: Either a single row (flat list/tuple of values) or several rows
            (list/tuple of lists/tuples). Empty or ``None`` adds nothing.

    Returns:
        A DataFrame with a fresh 0..n-1 index containing the old rows followed
        by the new ones.
    """
    # Nothing to add: `data[0]` below would raise IndexError on empty input.
    if data is None or len(data) == 0:
        return LIMITS.copy()

    # Normalise to a list of rows; tuples count as rows too.
    if isinstance(data[0], (list, tuple)):
        new_data = [list(row) for row in data]
    else:
        new_data = [list(data)]

    new_df = pd.DataFrame(new_data, columns=LIMITS.columns)

    # Concatenating onto an empty frame raises a FutureWarning in recent
    # pandas; the new rows alone are already the correct result.
    if LIMITS.empty:
        return new_df
    return pd.concat([LIMITS, new_df], ignore_index=True)



In [36]:
# Inspect the raw value most recently returned by chain.run.
response

"['Water by KF', 'Water by KF', 'Not more than 8.0 % w/w', None, '8.0 % w/w']"

In [37]:
# Check the type of the chain output: it is text, so it has to be parsed
# before it can be used as a list.
type(response)

str

In [39]:
import ast

# The chain output is the textual form of a Python list.
list_string = response

# literal_eval only accepts Python literals, so the text is parsed rather
# than executed.
actual_list = ast.literal_eval(list_string)

# Show the parsed value and then its type, one per line.
print(actual_list, type(actual_list), sep="\n")


['Description', 'Description', 'White to off white capsules', None, None]
<class 'list'>


In [40]:

# Add the parsed row(s) to LIMITS; append_to_dataframe returns a new frame,
# so the result is assigned back to the name.
LIMITS = append_to_dataframe(LIMITS, actual_list)

# Plain-text dump of the table built so far.
print(LIMITS)

     TEST_NAME     SUB_TEST                   TEXT_LIMIT NLT (Not Less Than)  \
0  Description  Description  White to off white capsules                None   

  NMT (Not More Than)  
0                None  


In [41]:
# Display LIMITS after the first append.
LIMITS

Unnamed: 0,TEST_NAME,SUB_TEST,TEXT_LIMIT,NLT (Not Less Than),NMT (Not More Than)
0,Description,Description,White to off white capsules,,


In [None]:
# Multiple rows
rows = [
    ["Test2", "SubTest2", "20", "10", "30"],
    ["Test3", "SubTest3", "30", "15", "45"]
]
# `dtf` was never created before this cell, so the call below raised a
# NameError; start the demo from an empty frame with the LIMITS columns.
dtf = pd.DataFrame(columns=LIMITS.columns)
dtf = append_to_dataframe(dtf, rows)

# Print the resulting DataFrame
print(dtf)

In [42]:
# Second data row: run the chain, parse its text reply into Python lists,
# and add the resulting row(s) to LIMITS.
lst = df.iloc[1].tolist()
response = chain.run({"lst": lst})
actual_list = ast.literal_eval(response)
LIMITS = append_to_dataframe(LIMITS, actual_list)

In [43]:
# Display LIMITS after appending the rows derived from the second data row.
LIMITS

Unnamed: 0,TEST_NAME,SUB_TEST,TEXT_LIMIT,NLT (Not Less Than),NMT (Not More Than)
0,Description,Description,White to off white capsules,,
1,Blend Homogeneity,Stage - 1,Individual values should be between 90.0% to 1...,90.0%,110.0%
2,Blend Homogeneity,Stage - 2,Individual values should be between 85.0% to 1...,85.0%,115.0%


In [44]:
# Third data row: run the chain, parse its text reply into a Python list,
# and add the resulting row(s) to LIMITS.
lst = df.iloc[2].tolist()
response = chain.run({"lst": lst})
actual_list = ast.literal_eval(response)
LIMITS = append_to_dataframe(LIMITS, actual_list)

In [45]:
# Display LIMITS after appending the rows derived from the third data row.
LIMITS

Unnamed: 0,TEST_NAME,SUB_TEST,TEXT_LIMIT,NLT (Not Less Than),NMT (Not More Than)
0,Description,Description,White to off white capsules,,
1,Blend Homogeneity,Stage - 1,Individual values should be between 90.0% to 1...,90.0%,110.0%
2,Blend Homogeneity,Stage - 2,Individual values should be between 85.0% to 1...,85.0%,115.0%
3,Water by KF,Water by KF,Not more than 8.0 % w/w,,8.0 % w/w


In [46]:
# Write the limits table to CSV without the index column.
# Raw string: "\P" and "\L" in a normal string are invalid escape sequences.
LIMITS.to_csv(r"D:\ProvenTech\LIMITS_1.csv", index=False)

In [47]:
# Display the source specification table for comparison with LIMITS.
SPEC_TABLE

Unnamed: 0,Sr.,Test,Specification,Reference
1,1.0,Description,White to off white capsules,In-house
2,2.0,Blend Homogeneity,Stage – 1: Individual values should be between...,In-house
3,3.0,Water by KF,Not more than 8.0 % w/w,In-house


In [48]:
# Display the final LIMITS table derived from the specification rows.
LIMITS

Unnamed: 0,TEST_NAME,SUB_TEST,TEXT_LIMIT,NLT (Not Less Than),NMT (Not More Than)
0,Description,Description,White to off white capsules,,
1,Blend Homogeneity,Stage - 1,Individual values should be between 90.0% to 1...,90.0%,110.0%
2,Blend Homogeneity,Stage - 2,Individual values should be between 85.0% to 1...,85.0%,115.0%
3,Water by KF,Water by KF,Not more than 8.0 % w/w,,8.0 % w/w
