In [7]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils

# Load the eligibility criteria text file
filename = "concatenated_text.txt"

import chardet

# Detect the encoding of the file
with open(filename, 'rb') as f:
    result = chardet.detect(f.read())
    
# Load the file with the detected encoding
raw_text = open(filename, encoding=result['encoding']).read()


# with open(filename, 'r') as f:
#     raw_text = f.read()

# Create a mapping of unique characters to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

# Prepare the data for training
seq_length = 50
dataX = []
dataY = []
for i in range(0, len(raw_text) - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)

# Reshape the input data
X = np.reshape(dataX, (n_patterns, seq_length, 1))

# Normalize the input data
X = X / float(len(chars))

# One-hot encode the output data
y = np_utils.to_categorical(dataY)

# Define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model
model.fit(X, y, epochs=5, batch_size=128)

# Use the trained model to predict the eligibility criteria
def predict_eligibility_criteria(model, tender_text):
    # Prepare the input data
    x_input = [char_to_int[char] for char in tender_text]
    x_input = np.array(x_input)
    x_input = np.reshape(x_input, (1, seq_length, 1))
    x_input = x_input / float(len(chars))
    # Generate the predicted output
    y_output = model.predict(x_input, verbose=0)
    # Convert the predicted output to text
    index = np.argmax(y_output)
    result = ""
    for i in range(seq_length):
        result += chars[x_input[0][i][0]]
    for i in range(1000):
        x_input[0][:-1] = x_input[0][1:]
        x_input[0][-1][0] = index
        y_output = model.predict(x_input, verbose=0)
        index = np.argmax(y_output)
        result += chars[index]
    return result



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [8]:
model.save("eligibility_criteria_model.h5")


In [9]:
from keras.models import load_model
model = load_model("eligibility_criteria_model.h5")


In [10]:
def predict_eligibility_criteria(model, tender_text):
    # Prepare the input data
    x_input = []
    for char in tender_text:
        if char in char_to_int:
            x_input.append(char_to_int[char])
    x_input = np.array(x_input)
    x_input = np.reshape(x_input, (1, seq_length, 1))
    x_input = x_input / float(len(chars))
    # Generate the predicted output
    y_output = model.predict(x_input, verbose=0)
    # Convert the predicted output to text
    index = np.argmax(y_output)
    result = chars[index]
    return result


In [11]:
from tensorflow.keras.models import load_model
model = load_model("eligibility_criteria_model.h5")


In [12]:
import numpy as np

# Define the maximum length of the input sequence
seq_length = 50

# Define the character set used in the input text
#char_set = sorted(list(set(open("concatenated_text.txt").read())))
char_set = sorted(list(set(open("concatenated_text.txt", encoding="ISO-8859-1").read())))


# Define dictionaries to convert characters to integers and vice versa
char_to_int = dict((c, i) for i, c in enumerate(char_set))
int_to_char = dict((i, c) for i, c in enumerate(char_set))

# Load the trained model
model = load_model("eligibility_criteria_model.h5")

def predict_eligibility_criteria(model, tender_text):
    # Remove any characters that are not in the character set
    tender_text = "".join(c for c in tender_text if c in char_set)

    # If the input text is longer than the maximum sequence length,
    # split it into chunks of length seq_length
    if len(tender_text) > seq_length:
        chunks = [tender_text[i:i+seq_length] for i in range(0, len(tender_text), seq_length)]
        x_input = []
        for chunk in chunks:
            x_input_chunk = [char_to_int[char] for char in chunk]
            x_input_chunk = np.array(x_input_chunk)
            x_input_chunk = np.reshape(x_input_chunk, (1, seq_length, 1))
            x_input.append(x_input_chunk)
        x_input = np.concatenate(x_input, axis=0)
    else:
        x_input = [char_to_int[char] for char in tender_text]
        x_input = np.array(x_input)
        x_input = np.reshape(x_input, (1, seq_length, 1))

    # Predict the output sequence using the trained model
    y_output = model.predict(x_input, verbose=0)

    # Convert the output sequence to text
    qualification_criteria = ""
    for y in y_output:
        char_index = np.argmax(y)
        char = int_to_char[char_index]
        qualification_criteria += char

    return qualification_criteria

# Test the function with a sample input
new_tender = open("new_tender.txt").read()
qualification_criteria = predict_eligibility_criteria(model, new_tender)
print(qualification_criteria)


ValueError: ignored

In [None]:
# Split the text into chunks of length 100
chunks = [new_tender[i:i+seq_length] for i in range(0, len(new_tender), seq_length)]

# Predict the eligibility criteria for each chunk
qualification_criteria = ""
for chunk in chunks:
    result = predict_eligibility_criteria(model, chunk)
    qualification_criteria += result

# Print the predicted eligibility criteria
print(qualification_criteria)


In [16]:
import numpy as np
from transformers import AutoTokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils

# Load the eligibility criteria text file
filename = "concatenated_text.txt"

import chardet

# Detect the encoding of the file
with open(filename, 'rb') as f:
    result = chardet.detect(f.read())

# Load the file with the detected encoding
raw_text = open(filename, encoding=result['encoding']).read()

# Create a mapping of unique characters to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

# Prepare the data for training
seq_length = 50
dataX = []
dataY = []
for i in range(0, len(raw_text) - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)

# Reshape the input data
X = np.reshape(dataX, (n_patterns, seq_length, 1))

# Normalize the input data
X = X / float(len(chars))

# One-hot encode the output data
y = np_utils.to_categorical(dataY)

# Define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model
model.fit(X, y, epochs=5, batch_size=128)

# Use the trained model to predict the eligibility criteria
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def predict_eligibility_criteria(model, tender_text):
    # Encode the input text
    encoded_text = tokenizer.encode_plus(tender_text, max_length=50, padding='max_length', truncation=True, return_tensors='pt')
    x_input = encoded_text['input_ids'].numpy().squeeze()
    x_input = np.reshape(x_input, (1, seq_length, 1))
    x_input = x_input / float(len(chars))
    # Generate the predicted output
    y_output = model.predict(x_input, verbose=0)
    # Convert the predicted output to text
    index = np.argmax(y_output)
    result = chars[index]
    return result

model.save("eligibility_criteria_model.h5")
from keras.models import load_model
model = load_model("eligibility_criteria_model.h5")

def predict_eligibility_criteria(model, tender_text):
    # Encode the input text
    encoded_text = tokenizer.encode_plus(tender_text, max_length=50, padding='max_length', truncation=True, return_tensors='pt')
    x_input = encoded_text['input_ids'].numpy().squeeze()
    x_input = np.reshape(x_input, (1, seq_length, 1))
    x_input = x_input / float(len(chars))
    # Generate the predicted output
    y_output = model.predict(x_input, verbose=0)
    # Convert the predicted output to text
    index = np.argmax(y_output)
    result = chars[index]
    return result


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
new_tender = open("new_tender.txt").read()
# Detect the encoding of the file
with open(filename, 'rb') as f1:
    result = chardet.detect(f1.read())
qualification_criteria = predict_eligibility_criteria(model, new_tender)
print(qualification_criteria)


m


In [15]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m102.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [2]:
def classify_text(model, text, keywords):
    # Encode the input text
    encoded_text = tokenizer.encode_plus(text, max_length=50, padding='max_length', truncation=True, return_tensors='pt')
    x_input = encoded_text['input_ids'].numpy().squeeze()
    x_input = np.reshape(x_input, (1, seq_length, 1))
    x_input = x_input / float(len(chars))
    # Generate the predicted output
    y_output = model.predict(x_input, verbose=0)
    # Get the top predicted class
    index = np.argmax(y_output)
    predicted_class = chars[index]
    # Check if the predicted class is one of the keywords
    if predicted_class in keywords:
        return predicted_class
    else:
        return "Unknown"


In [3]:
keywords = {'pre-qualifi cation', 'eligibility criteria', 'pre – qualification', 
             'pre-qualification', 'pq criteria', 
           'prequalification', 'pre -qualification', 'pre -qualification detail', 
           'conditions of eligibility of bidder', 
           'pre-qualification performa', 'evaluation and qualification criteria',
            'qualifying criteria', 'eligible bidders', 
           'qualification criteria','minimum eligibility criteria','experience criteria',
           'financial criteria','enhancement factor','litigation history'}
# classification = classify_text(model, new_tender, keywords)
# print(classification)


NameError: ignored

In [4]:
import numpy as np
from transformers import AutoTokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils

# Load the eligibility criteria text file
filename = "concatenated_text.txt"

import chardet

# Detect the encoding of the file
with open(filename, 'rb') as f:
    result = chardet.detect(f.read())

# Load the file with the detected encoding
raw_text = open(filename, encoding=result['encoding']).read()

# Create a mapping of unique characters to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

# Prepare the data for training
seq_length = 50
dataX = []
dataY = []
for i in range(0, len(raw_text) - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)

# Reshape the input data
X = np.reshape(dataX, (n_patterns, seq_length, 1))

# Normalize the input data
X = X / float(len(chars))

# One-hot encode the output data
y = np_utils.to_categorical(dataY)

# Define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train the model
model.fit(X, y, epochs=5, batch_size=128)

# Use the trained model to predict the eligibility criteria
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


def predict_eligibility_criteria(model, tender_text):
    # Check if any keywords are in the input text
    if any(keyword in tender_text.lower() for keyword in keywords):
        # Encode the input text
        encoded_text = tokenizer.encode_plus(tender_text, max_length=50, padding='max_length', truncation=True, return_tensors='pt')
        x_input = encoded_text['input_ids'].numpy().squeeze()
        x_input = np.reshape(x_input, (1, seq_length, 1))
        x_input = x_input / float(len(chars))
        # Generate the predicted output
        y_output = model.predict(x_input, verbose=0)
        # Convert the predicted output to text
        index = np.argmax(y_output)
        result = chars[index]
        return result
    else:
        return "No eligibility criteria found."

model.save("eligibility_criteria_model.h5")
from keras.models import load_model
model = load_model("eligibility_criteria_model.h5")

new_tender = open("new_tender.txt").read()
# Detect the encoding of the file
with open(filename, 'rb') as f1:
    result = chardet.detect(f1.read())
qualification_criteria = predict_eligibility_criteria(model, new_tender)
print(qualification_criteria)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
d


In [5]:
def extract_sentences_with_keywords(text, keywords):
    # Split the text into sentences
    sentences = text.split(". ")
    # Initialize a list to hold the sentences containing the keywords
    matching_sentences = []
    # Iterate over each sentence
    for sentence in sentences:
        # Check if the sentence contains any of the keywords
        if any(keyword in sentence.lower() for keyword in keywords):
            # Add the sentence to the list of matching sentences
            matching_sentences.append(sentence)
    # Return the list of matching sentences
    return matching_sentences


In [6]:

# Call the function and print the matching sentences
new_tender = open("new_tender.txt").read()
matching_sentences = extract_sentences_with_keywords(new_tender, [qualification_criteria])
print("Number of matching sentences: ", len(matching_sentences))
for sentence in matching_sentences:
    print(sentence)

Number of matching sentences:  22
 Name of Work: Construction of Auditorium Building at Godhra Nagarpalika

Godhra Nagarpalika, Godhra 
 
Page 47 of 64 
Condition of Contract 
Name of work: Construction of Auditorium Building at Godhra Nagarpalika including Acoustic 
Work, Electrification, Air Conditioning, Fire Safety etc., Godhra Nagar Palika, Dist.: Panchmahal

(Fifth Attempt) 
 
qualification :
The interested bidders should meet the following minimum qualifying criteria: 
1
Experience of having successfully completed similar works during the last 5 years ending previous 
day of last date of submission of tenders

a) Three similar works each costing not less than 40% of the estimated cost of the project

b) Two similar works each costing not less than 60% of the estimated cost of the project

c) One similar work costing not less than 80% of the estimated cost of the project
Bridge and Frame Structure Buildings and Renovation 
Works for Government or Semi Government only

The requisi

In [13]:
def extract_sentences_with_keywords(text, keywords):
    # Split the text into sentences
    sentences = text.split(". ")
    # Initialize a list to hold the sentences containing the keywords
    matching_sentences = []
    # Iterate over each sentence
    for sentence in sentences:
        # Check if any of the keywords appear in the sentence
        if any(keyword.lower() in sentence.lower() for keyword in keywords):
            # Add the sentence to the list of matching sentences
            matching_sentences.append(sentence)
    # Return the list of matching sentences
    return matching_sentences


new_tender = open("new_tender.txt").read()
matching_sentences = extract_sentences_with_keywords(new_tender, keywords)
for sentence in matching_sentences:
    print(sentence)
    print(len(sentence))

For similar work experience the details of only those works 
mentioned in Annexure IX may be given in Performa no
113
Self-attested copy of Certificates in support of meeting the criterion of Similar Work Experience in 
accordance with Para 2(a) of Annexure I.
7
144
Self-attested copy of Certificates in support of meeting the criterion of Construction Experience in 
key activities/specified components in accordance with Para 2(b) of Annexure I.
9
183
WORK EXPERIENCE
a) Similar Works Experience
The bidder should have satisfactorily completed in his own name or proportionate share as a 
Page 30
member of a joint venture of minimum value during the last 7 (seven) years prior to the last 
stipulated date for submission of the bid
280
The 
conversion rate shall be decided by RITES based on the rates of currency on the date of 
completion of work (the bidder to also submit the currency conversion rate as on completion date 
of the Credential Certificate relied upon by the bidder for the purp

In [11]:
def predict_eligibility_criteria(model, tender_text):
    # Encode the input text
    encoded_text = tokenizer.encode_plus(tender_text, max_length=50, padding='max_length', truncation=True, return_tensors='pt')
    x_input = encoded_text['input_ids'].numpy().squeeze()
    x_input = np.reshape(x_input, (1, seq_length, 1))
    x_input = x_input / float(len(chars))
    # Generate the predicted output
    y_output = model.predict(x_input, verbose=0)
    # Convert the predicted output to text
    index = np.argmax(y_output, axis=1)
    result = ''.join(chars[i] for i in index)
    print(result)
    return result


In [12]:
new_tender = open("new_tender.txt").read()
# Detect the encoding of the file
with open(filename, 'rb') as f1:
    result = chardet.detect(f1.read())
qualification_criteria = predict_eligibility_criteria(model, new_tender)
print(qualification_criteria)


d
d


In [14]:
def extract_paragraph_with_keywords(text, keywords):
    # Split the text into paragraphs
    paragraphs = text.split("\n\n")
    # Initialize a list to hold the paragraphs containing the keywords
    matching_paragraphs = []
    # Iterate over each paragraph
    for paragraph in paragraphs:
        # Check if any of the keywords appear in the paragraph
        if any(keyword.lower() in paragraph.lower() for keyword in keywords):
            # Add the paragraph to the list of matching paragraphs
            matching_paragraphs.append(paragraph)
    # Return the list of matching paragraphs
    return matching_paragraphs


new_tender = open("new_tender.txt").read()
matching_paragraphs = extract_paragraph_with_keywords(new_tender, keywords)
for paragraph in matching_paragraphs:
    print(paragraph)
    print(len(paragraph))


Page 1
RITES Limited
(A Govt. of India Enterprise)
E-Open Tender Document
For
Overhauling of
Cast Steel Bogies, Qty 27 Nos.
of ALCO WDS6 Locomotives.
TENDER No. RITES/RES/2022/BSP/R3Y-R6Y/BOGIE/F-2
SEPTEMBER 2022
 RAILWAY EQUIPMENT SERVICES DIVISION 
 
 RITES BHAWAN,
2
nd Floor, RIGHT WING, 
PLOT NO. - 1, SECTOR - 29, 
GURGAON -122 001 (HARYANA)
Deadline for Submission of Bids: 18.00 Hrs on 17.09.2022
Cost of Tender Document Rs 2500/-
Page 2
SECTION 1
NOTICE INVITING TENDER AND 
INSTRUCTIONS TO TENDERERS
Page 3
SECTION 1
NOTICE INVITING TENDER AND INSTRUCTIONS TO TENDERERS
1.0 GENERAL
1.1 Tender Notice
Tenders are invited through E-Tendering system by RITES Ltd., a Public Sector Enterprise under 
the Ministry of Railways, acting for and on behalf of SAIL/BSP/Bhilai (Employer) as an 
Agent/Power of Attorney Holder, from eligible Indian firms/agencies satisfying the set 
eligibility/qualifying criteria for the work of “Overhauling of Cast Steel Bogies, Qty 27Nos. of
ALCO WDS6 Locomotives

In [17]:
import docx2txt

def extract_paragraph_with_keywords(doc_path, keywords):
    # Read the document file
    text = docx2txt.process(doc_path)
    # Split the text into paragraphs
    paragraphs = text.split("\n\n")
    # Initialize a list to hold the paragraphs containing the keywords
    matching_paragraphs = []
    # Iterate over each paragraph
    for paragraph in paragraphs:
        # Check if any of the keywords appear in the paragraph
        if any(keyword.lower() in paragraph.lower() for keyword in keywords):
            # Add the paragraph to the list of matching paragraphs
            matching_paragraphs.append(paragraph)
    # Return the list of matching paragraphs
    return matching_paragraphs

# Example usage
doc_path = "16.docx"

matching_paragraphs = extract_paragraph_with_keywords(doc_path, keywords)
for paragraph in matching_paragraphs:
    print(paragraph)
    print(len(paragraph))


The bidder shall fill the pre-qualification Performa at Annexure IX. The bid will be evaluated only considering those details and corresponding documents as mentioned in Annexure IX and no other details/certificate/document will be taken in to consideration while evaluating the bid to decide whether the bidder is qualified or not. For similar work experience the details of only those works mentioned in Annexure IX may be given in Performa no. 1 attached to Annexure-I.
472
	Self-attested copy of Certificates in support of meeting the criterion of Similar Work Experience in accordance with Para 2(a) of Annexure I.
142
	Self-attested copy of Certificates in support of meeting the criterion of Construction Experience in key activities/specified components in accordance with Para 2(b) of Annexure I.
181
	WORK EXPERIENCE
16
	Similar Works Experience
25
In case the Bidder (Indian Company) wishes to rely on a work completed abroad, the value of such completed work in foreign convertible curren

In [16]:
!pip install docx2txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3977 sha256=1dd50e952fae19d6759617e31bca9876e79230c40ebf7d8fb1f5c7a944056741
  Stored in directory: /root/.cache/pip/wheels/22/58/cf/093d0a6c3ecfdfc5f6ddd5524043b88e59a9a199cb02352966
Successfully built docx2txt
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8
