In [1]:
!pip install pandas beautifulsoup4 scikit-learn transformers torch torchvision -q

[0m

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
import re
from bs4 import BeautifulSoup

In [3]:

# Set the device to GPU if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the CSV file
csv_file = '/kaggle/input/tensorflow-classification-multilevel/tenders_08062023.csv'
df = pd.read_csv(csv_file)
df['ProductName'] = df['ProductName'].apply(lambda x: ', '.join(set([item.strip() for item in x.split(',')])))

df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

def cleanText(text):
    text = BeautifulSoup(text, "lxml").text  # Remove HTML tags
    text = re.sub(r'\|\|\|', r' ', text)  # Replace ||| with a single space
    text = re.sub(r'http\S+', r'<URL>', text)  # Replace URLs starting with http or https with <URL>
    text = text.lower()
    text = text.replace('x', '')  # Remove occurrences of the letter 'x'
    return text

def preprocess_text(document):
    document = re.sub(r'\W', ' ', str(document))  # Remove special characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)  # Remove single characters
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)  # Remove single characters from the start
    document = re.sub(r'\s+', ' ', document, flags=re.I)  # Replace multiple spaces with single space
    document = re.sub(r'^b\s+', '', document)  # Remove prefixed 'b'
    document = document.lower()  # Convert to lowercase
    return document



In [4]:
# Clean and preprocess the text data
df['ProductDetails'] = df['ProductDetails'].apply(cleanText)
df['ProductDetails'] = df['ProductDetails'].apply(preprocess_text)


  text = BeautifulSoup(text, "lxml").text  # Remove HTML tags


In [5]:
df.dropna(inplace=True)

In [6]:
# Shuffle the dataframe
df = shuffle(df, random_state=42)
df=df.head(10000)
# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [7]:
df

Unnamed: 0,TCNo,sr_no,ProductDetails,ProductName
7299,54497115,56740122,gas used at the kobe district court office con...,Security Equipment
344294,47690401,49939216,rehabilitation of the albu akash water station...,"Pump House, Pipeline Project"
57325,49168272,51416132,providing installing 1 no 250 mm dia straight ...,"Drill Machine, Water Supply System, Tube Well"
355514,54892385,57136406,providing false ceiling for the lab at csb 120...,False Ceiling
448544,48312600,50561137,supply of various items listed in bhel scanner...,Detector
...,...,...,...,...
447493,59859514,62123393,sale of unusable bit woods on si months rate c...,Drill Machine
310402,59638074,61900865,renovation of cpwd guest house in cgo towers a...,"Interior Works, Civil Work"
407325,47226338,49475402,fabrication dismantling and erection of pipe l...,"Pipeline Project, Dismantaling Work"
145654,52518540,54764581,2424 lac ads 2020 21 chalakudy la construction...,"Auditorium, Building"


In [8]:
df = df[['ProductDetails', 'ProductName']]

In [9]:
X = df['ProductDetails'].values
Y = df['ProductName'].values

In [42]:
# import torch
# from transformers import BertTokenizer, BertForSequenceClassification

import torch
from transformers import BertTokenizer, BertModel

# Load pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [43]:
custom_labels = [
    "Lime", "Chlorinator", "Barrage", "Chimney", "Boring Machine", "Bullet Proof Jacket",
    "Auditorium", "Fountain", "Jetty", "Helmet", "Runway", "Dredging Work", "Land Levelling",
    "Earth Filling", "Stadium", "Bus Stand", "Chlorination Plant", "Drilling Work", "Tunnel Work",
    "Sump", "Temple", "Channel Work", "Ballast", "Trenching Work", "Statue", "Manhole Chamber",
    "Foundation", "Reverse Osmosis Plant", "Barrack", "Interior Works", "False Ceiling",
    "Pump House", "Land Development", "Effluent Treatment Plant", "Swimming Pool",
    "Sewage Treatment Plant", "Dam Gate", "Dismantling Work", "Lining Work", "Demolition",
    "Shelter", "Drill Machine", "Platform", "Earth Work", "Parking Work", "Dam Work",
    "Arms/Ammunation Equipment", "Detector", "Seal", "Lake Development", "Culvert Work",
    "Excavation Work", "Desilting", "Bore Well", "Lift Irrigation", "Cable Laying",
    "Fire Detection System", "Well Work", "Protection Kit", "Fire Alarm System",
    "Soil Investigation", "Landscape", "Tube Well", "Sports Ground", "Environmental Work",
    "C C T V System", "Reservoir", "R C C Work", "Sewerage Line", "Toilet", "Fencing Work",
    "Water Purification System", "Tank", "Water Treatment Plant", "Canal Work", "Painting Work",
    "Hardware And Accessories", "Yard Work", "Roof Work", "Bridge", "Fire Fighting System",
    "Water Supply System", "Plumbing And Sanitary Work", "Security Equipment",
    "Surveillance System", "Shed Construction", "Building Material", "Wall", "Laying Pipe",
    "Flooring", "Water Supply", "Drainage", "Pipeline Project", "Building", "Road", "Civil Work"
]


In [44]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [45]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification

In [46]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [47]:
# Load the pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [48]:
train_df.shape

(8000, 4)

In [49]:
test_df.shape

(2000, 4)

In [50]:
# Tokenize the input data
train_encodings = tokenizer.batch_encode_plus(
    train_df['ProductDetails'].tolist(),
    truncation=True,
    padding=True,
    return_tensors='pt'
)



In [51]:
test_encodings = tokenizer.batch_encode_plus(
    test_df['ProductDetails'].tolist(),
    truncation=True,
    padding=True,
    return_tensors='pt'
)

In [52]:
# Prepare the input tensors
train_inputs = train_encodings['input_ids'].to(device)
train_masks = train_encodings['attention_mask'].to(device)
train_labels = train_df['ProductName']




In [53]:
test_inputs = test_encodings['input_ids'].to(device)
test_masks = test_encodings['attention_mask'].to(device)
test_labels = test_df['ProductName']

In [54]:

# Convert the labels to binary format
label_encoder = MultiLabelBinarizer()
train_labels_encoded = label_encoder.fit_transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)

# Create PyTorch datasets
train_dataset = TensorDataset(train_inputs, train_masks, torch.tensor(train_labels_encoded).to(device))
test_dataset = TensorDataset(test_inputs, test_masks, torch.tensor(test_labels_encoded).to(device))

# Define model configuration
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
model.to(device)

# Create data loaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define optimizer and loss function
learning_rate = 1e-5
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.BCEWithLogitsLoss()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [95]:
train_labels_encoded

array([[1, 1, 0, ..., 0, 1, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0]])

In [94]:
num_labels

49

In [96]:
import numpy as np

# Save train_labels_encoded to a CSV file
np.savetxt('train_labels.csv', train_labels_encoded, delimiter=',', fmt='%d')


In [56]:
num_epochs = 1
print_interval = 100  

model.train()

for epoch in range(num_epochs):
    total_loss = 0
    
    for batch_idx, batch in enumerate(train_loader):
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.float().to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        loss = loss_fn(logits, labels)
        loss.backward()
        
        optimizer.step()
        
        total_loss += loss.item()
        
        # Print batch loss
        if (batch_idx + 1) % print_interval == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}")
    
    # Print epoch loss
    epoch_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

print("Training finished.")


Epoch [1/1], Batch [100/500], Loss: 0.4999
Epoch [1/1], Batch [200/500], Loss: 0.4242
Epoch [1/1], Batch [300/500], Loss: 0.4183
Epoch [1/1], Batch [400/500], Loss: 0.4151
Epoch [1/1], Batch [500/500], Loss: 0.3969
Epoch [1/1], Loss: 0.4704
Training finished.


In [60]:
# save_path = "/kaggle/working/model"

# # Save the model
# model.save_pretrained(save_path)
        
# # Save the tokenizer as well
# tokenizer.save_pretrained(save_path)

# print("Model saved successfully.")


Model saved successfully.


In [57]:
eval_dataset=test_df

In [58]:
eval_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)


In [59]:
from sklearn.metrics import classification_report

model.eval()

predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.float().to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        # Apply sigmoid activation and convert logits to probabilities
        probabilities = torch.sigmoid(logits)
        
        # Round probabilities to get binary predictions (0 or 1)
        predicted_labels = torch.round(probabilities).squeeze().cpu().tolist()
        
        # Collect predictions and true labels for computing metrics
        predictions.extend(predicted_labels)
        true_labels.extend(labels.cpu().tolist())

# Calculate classification metrics
classification_metrics = classification_report(true_labels, predictions)

# Print classification metrics
print(classification_metrics)


              precision    recall  f1-score   support

           0       0.90      1.00      0.95      1798
           1       0.79      0.71      0.75      1282
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00       121
           4       0.00      0.00      0.00       408
           5       0.90      0.04      0.08       657
           6       0.00      0.00      0.00       627
           7       0.00      0.00      0.00       227
           8       0.00      0.00      0.00       279
           9       0.00      0.00      0.00       148
          10       0.00      0.00      0.00        78
          11       0.00      0.00      0.00       123
          12       0.00      0.00      0.00        17
          13       0.00      0.00      0.00        40
          14       0.00      0.00      0.00       343
          15       0.00      0.00      0.00       291
          16       0.00      0.00      0.00        70
          17       0.91    

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# model = torch.load("/kaggle/working/model")


In [78]:

decoded_labels = label_encoder.inverse_transform(train_labels_encoded)

# Print the decoded labels
# print(decoded_labels)


In [83]:
import numpy as np

# Load the label encoder used during training
label_encoder = MultiLabelBinarizer()
label_encoder.fit(train_labels)

# Convert the predictions to a NumPy array
predictions = np.asarray(predictions)

# Reshape the predictions to a 2-dimensional array
predictions = predictions.reshape(1, -1)

# Inverse transform the predictions to obtain the original labels
predicted_labels = label_encoder.inverse_transform(predictions)

print(predicted_labels)


[(' ', 'e', 'i', 'n', 'o', 'r')]


In [84]:
# Convert the predictions to a NumPy array
predictions = np.asarray(predictions)

print(predictions)


[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0
  0 1 1 0 0 1 0 0 0 0 0 0 0]]


In [86]:
# Convert the binary predictions to boolean values
predicted_labels_binary = predictions.astype(bool)

# Inverse transform the binary labels to obtain the original labels
predicted_labels = label_encoder.inverse_transform(predicted_labels_binary)

# Print the predicted labels
print(predicted_labels)


[(' ', 'e', 'i', 'n', 'o', 'r')]


In [87]:
import torch
import numpy as np

text = input("Enter your sentence: ")


encoding = tokenizer(text, return_tensors="pt")
input_ids = encoding["input_ids"].to(device)
attention_mask = encoding["attention_mask"].to(device)

with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits

# Apply sigmoid activation
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())

# Apply threshold for binary classification
threshold = 0.7
predictions = (probs >= threshold).int()

# Convert predictions to a list of binary labels
predicted_labels = predictions.squeeze().tolist()

print(predicted_labels)

Enter your sentence:  CONSTRUCTION OF ROAD


[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]


In [68]:
a=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
len(a)

49

In [69]:
# Apply sigmoid activation and convert logits to probabilities
probabilities = torch.sigmoid(logits)

# Round probabilities to get binary predictions (0 or 1)
predicted_labels = torch.round(probabilities).squeeze().cpu().tolist()

# Get the predicted classes based on the indices
predicted_classes = [label_encoder.classes_[i] for i, label in enumerate(predicted_labels) if label == 1]

# Print the predicted labels
print(f"Predicted Labels: {predicted_classes}")


Predicted Labels: [' ', ',', 'W', 'a', 'e', 'g', 'i', 'l', 'n', 'o', 'r', 't']


In [151]:
import torch
import numpy as np

# Load your trained model
model = torch.load("/kaggle/working/model.pth")


# Define the preprocess_sentence function
def preprocess_sentence(sentence):
    max_length = 128  # Define the maximum length for your input sequences
    
    # Tokenize the sentence
    tokens = tokenizer.tokenize(sentence)
    
    # Add special tokens and convert tokens to input tensors
    input_ids = tokenizer.encode(tokens, add_special_tokens=True)
    
    # Pad or truncate the input to the defined maximum length
    input_ids = input_ids[:max_length] + [tokenizer.pad_token_id] * (max_length - len(input_ids))
    
    # Create attention mask
    attention_mask = [1] * len(input_ids)
    
    # Convert input to tensors and move to device
    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)
    attention_mask = torch.tensor(attention_mask).unsqueeze(0).to(device)
    
    return input_ids, attention_mask

# Example sentence
sentence = input("Enter your sentence: ")

# Preprocess the sentence
input_ids, attention_mask = preprocess_sentence(sentence)

# Pass the preprocessed input through the model
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    
    # Apply sigmoid activation and convert logits to probabilities
    probabilities = torch.sigmoid(logits)
    
    # Round probabilities to get binary predictions (0 or 1)
    predicted_labels = torch.round(probabilities).squeeze().cpu().tolist()

# Get the predicted classes based on the indices
predicted_classes = [label_encoder.classes_[i] for i, label in enumerate(predicted_labels) if label == 1]

# Print the predicted labels
print(f"Predicted Labels: {predicted_classes}")


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/model.pth'

In [147]:
import torch
import numpy as np

# Define the preprocess_sentence function
def preprocess_sentence(sentence):
    max_length = 128  # Define the maximum length for your input sequences
    
    # Tokenize the sentence
    tokens = tokenizer.tokenize(sentence)
    
    # Add special tokens and convert tokens to input tensors
    input_ids = tokenizer.encode(tokens, add_special_tokens=True)
    
    # Pad or truncate the input to the defined maximum length
    input_ids = input_ids[:max_length] + [tokenizer.pad_token_id] * (max_length - len(input_ids))
    
    # Create attention mask
    attention_mask = [1] * len(input_ids)
    
    # Convert input to tensors and move to device
    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)
    attention_mask = torch.tensor(attention_mask).unsqueeze(0).to(device)
    
    return input_ids, attention_mask

# Example sentence
sentence = input("Enter your sentence: ")

# Preprocess the sentence
input_ids, attention_mask = preprocess_sentence(sentence)

# Pass the preprocessed input through the model
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    
    # Apply sigmoid activation and convert logits to probabilities
    probabilities = torch.sigmoid(logits)
    
    # Round probabilities to get binary predictions (0 or 1)
    predicted_labels = torch.round(probabilities).squeeze().cpu().tolist()

# Get the predicted classes based on the indices
predicted_classes = [label_encoder.classes_[i] for i, label in enumerate(predicted_labels) if label == 1]

# Print the predicted labels
print(f"Predicted Labels: {predicted_classes}")


Enter your sentence:  road


Predicted Labels: ['Chlorinator', 'Barrage', 'Auditorium', 'Helmet', 'Land Levelling', 'Bus Stand', 'Chlorination Plant', 'Tunnel Work', 'Temple', 'Channel Work', 'Ballast', 'Statue', 'Reverse Osmosis Plant', 'Barrack', 'Interior Works', 'False Ceiling', 'Sewage Treatment Plant', 'Dismantling Work', 'Lining Work', 'Drill Machine', 'Earth Work', 'Parking Work', 'Lake Development', 'Bore Well', 'Well Work', 'Protection Kit', 'Landscape', 'Tube Well', 'Environmental Work', 'C C T V System', 'R C C Work', 'Water Purification System', 'Water Treatment Plant', 'Canal Work', 'Painting Work', 'Hardware And Accessories', 'Roof Work', 'Bridge', 'Security Equipment', 'Shed Construction', 'Building Material', 'Wall', 'Flooring', 'Building', 'Road']


In [143]:
df.head(10)


Unnamed: 0,ProductDetails,ProductName
7299,gas used at the kobe district court office con...,Security Equipment
344294,rehabilitation of the albu akash water station...,"Pump House, Pipeline Project"
57325,providing installing 1 no 250 mm dia straight ...,"Drill Machine, Water Supply System, Tube Well"
355514,providing false ceiling for the lab at csb 120...,False Ceiling
448544,supply of various items listed in bhel scanner...,Detector
311404,reno and upgrading works ofqtrno a3including c...,"Civil Work, False Ceiling, Flooring, Pipeline ..."
433491,lpr fhis 55 2022 construction hydraulic concre...,"Lining Work, R C C Work"
285017,providing and installation of 1000lph ro unit ...,"Water Purification System, Shed Construction, ..."
254967,the acquisition of collection composed of orig...,Statue
362378,interior furnishing electrical data work for b...,Interior Works


In [None]:
# import torch
# from transformers import BertTokenizer, BertForSequenceClassification
# from sklearn.preprocessing import MultiLabelBinarizer

# # Load pre-trained tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Set the device to GPU if available, otherwise use CPU
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# # Define label encoder
# label_encoder = MultiLabelBinarizer()

In [102]:
# # Define label encoder
# label_encoder = MultiLabelBinarizer()
# label_encoder.classes_ = [
#     "Lime", "Chlorinator", "Barrage", "Chimney", "Boring Machine", "Bullet Proof Jacket",
#     "Auditorium", "Fountain", "Jetty", "Helmet", "Runway", "Dredging Work", "Land Levelling",
#     "Earth Filling", "Stadium", "Bus Stand", "Chlorination Plant", "Drilling Work", "Tunnel Work",
#     "Sump", "Temple", "Channel Work", "Ballast", "Trenching Work", "Statue", "Manhole Chamber",
#     "Foundation", "Reverse Osmosis Plant", "Barrack", "Interior Works", "False Ceiling",
#     "Pump House", "Land Development", "Effluent Treatment Plant", "Swimming Pool",
#     "Sewage Treatment Plant", "Dam Gate", "Dismantling Work", "Lining Work", "Demolition",
#     "Shelter", "Drill Machine", "Platform", "Earth Work", "Parking Work", "Dam Work",
#     "Arms/Ammunation Equipment", "Detector", "Seal", "Lake Development", "Culvert Work",
#     "Excavation Work", "Desilting", "Bore Well", "Lift Irrigation", "Cable Laying",
#     "Fire Detection System", "Well Work", "Protection Kit", "Fire Alarm System",
#     "Soil Investigation", "Landscape", "Tube Well", "Sports Ground", "Environmental Work",
#     "C C T V System", "Reservoir", "R C C Work", "Sewerage Line", "Toilet", "Fencing Work",
#     "Water Purification System", "Tank", "Water Treatment Plant", "Canal Work", "Painting Work",
#     "Hardware And Accessories", "Yard Work", "Roof Work", "Bridge", "Fire Fighting System",
#     "Water Supply System", "Plumbing And Sanitary Work", "Security Equipment",
#     "Surveillance System", "Shed Construction", "Building Material", "Wall", "Laying Pipe",
#     "Flooring", "Water Supply", "Drainage", "Pipeline Project", "Building", "Road", "Civil Work"
# ]

In [114]:
# import torch
# import numpy as np
# from sklearn.preprocessing import MultiLabelBinarizer

In [148]:
# Example sentence
sentence = input("Enter your sentence: ")

# Preprocess the sentence
cleaned_sentence = cleanText(sentence)
preprocessed_sentence = preprocess_text(cleaned_sentence)

# Tokenize the preprocessed sentence
encoding = tokenizer.encode_plus(
    preprocessed_sentence,
    truncation=True,
    padding=True,
    return_tensors='pt'
)

# Extract input tensors
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

# Move tensors to device
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)

# Define the threshold value
threshold = 0.6

# Perform inference
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    probabilities = torch.sigmoid(logits)
    predicted_labels = (probabilities > threshold).squeeze().cpu().tolist()

# Convert predicted labels to integers
predicted_labels = [int(label) for label in predicted_labels]

# Filter out labels below the threshold
relevant_labels = [label_encoder.classes_[idx] for idx, label in enumerate(predicted_labels) if label == 1]

# Print the predicted labels
print(relevant_labels)

Enter your sentence:  National Highway Authority Of India for Independent Engineer Services for Supervision of (i) Construction of 4/6 Lane Northern Ayodhya Bypass total length of 35.40 kms of Part-1 north of NH-27, from km 0+000 to km 30+400 (Starting near existing km 112+540, ending at km 139+928 of NH-27) and Part- 2 south of NH-27 from km 0+000 to km 5+000 (ii) Construction of 4/6 Lane Southern Ayodhya Bypass from km 5+000 to km 37+172 (Starting near km 112+540, ending at km 153+281 of NH-27) of total length of 32.172 kms on HAM basis under NHDP Phase-VII in the State of Uttar Pradesh. at Not Classified,Uttar Pradesh,India


['Chlorinator', 'Landscape', 'Tube Well', 'Environmental Work', 'Hardware And Accessories', 'Roof Work', 'Bridge', 'Security Equipment']


In [138]:
# Preprocess the input sentence
input_sentence = input("Enter your sentence: ")
cleaned_sentence = cleanText(input_sentence)
preprocessed_sentence = preprocess_text(cleaned_sentence)

# Tokenize the preprocessed sentence
input_encoding = tokenizer.encode_plus(
    preprocessed_sentence,
    truncation=True,
    padding=True,
    return_tensors='pt'
)

# Prepare the input tensors
input_ids = input_encoding['input_ids'].to(device)
attention_mask = input_encoding['attention_mask'].to(device)

# Switch model to evaluation mode
model.eval()

# Pass the input tensors through the model to obtain logits
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits

# Apply threshold-based predictions
threshold = 0.5
probabilities = torch.sigmoid(logits)
predictions = (probabilities > threshold).long()

# Convert predictions back to labels

predicted_labels = [label_encoder.inverse_transform(pred) for pred in predictions.cpu().numpy()]

# Print the predicted labels
print("Predicted Labels:", predicted_labels)


Enter your sentence:  ROAD


IndexError: tuple index out of range

In [137]:
# print("Predicted labels:", predicted_labels)
# print("Classes:", label_encoder.classes_)


In [77]:
# import torch

# # User input sentence
# user_sentence = input("Enter your sentence: ")

# # Tokenize the user input
# user_encoding = tokenizer.encode_plus(
#     user_sentence,
#     truncation=True,
#     padding=True,
#     return_tensors='pt'
# )

# user_input_ids = user_encoding['input_ids'].to(device)
# user_attention_mask = user_encoding['attention_mask'].to(device)

# # Make predictions for the user input
# model.eval()
# with torch.no_grad():
#     user_outputs = model(user_input_ids, attention_mask=user_attention_mask)
#     user_logits = user_outputs.logits
#     user_predictions = (user_logits.sigmoid() > 0.5).cpu().numpy().tolist()

# # Convert predictions to custom labels for the user input
# user_custom_labels = [custom_labels[i] for i, pred in enumerate(user_predictions[0]) if pred]

# # Print the custom labels for the user input
# print(user_custom_labels)


Enter your sentence:  National Highway Authority Of India for RFP for Construction of Four Lane Elevated Corridor and at-grade improvements from Design Ch:0+000 to Design Ch: 19+870 of Danapur – Bihta Section with providing connectivity to the existing RoB near Danapur station (0.231 km), 1.35 Km ramps & at-grade improvements to Four lane section on Danapur side and Upgradation of existing Two lane carriageway to Four Lane carriageway from Design Ch:19+870 to Design Ch:23+500 of Bihta - Koilwar section (Total Length 25.081 Kms) in the state of Bihar on EPC Mode at Not Classified,Bihar,India


['Lime', 'Chlorinator', 'Bullet Proof Jacket', 'Ballast', 'Statue', 'Reverse Osmosis Plant', 'Barrack', 'False Ceiling', 'Land Development', 'Swimming Pool', 'Sewage Treatment Plant', 'Dismantling Work', 'Lining Work', 'Drill Machine', 'Earth Work', 'Parking Work', 'Dam Work']


In [128]:
# import torch

# # User input sentence
# user_sentence = input("Enter your sentence: ")

# # Tokenize the user input
# user_encoding = tokenizer.encode_plus(
#     user_sentence,
#     truncation=True,
#     padding=True,
#     return_tensors='pt'
# )

# user_input_ids = user_encoding['input_ids'].to(device)
# user_attention_mask = user_encoding['attention_mask'].to(device)

# # Make predictions for the user input
# model.eval()
# with torch.no_grad():
#     user_outputs = model(user_input_ids, attention_mask=user_attention_mask)
#     user_probabilities = torch.sigmoid(user_outputs.logits)
#     user_predictions = (user_probabilities > 0.5).cpu().numpy().tolist()

# # Convert predictions to custom labels for the user input
# user_custom_labels = [custom_labels[i] for i, pred in enumerate(user_predictions[0]) if pred]

# # Print the custom labels for the user input
# print(user_custom_labels)


In [27]:
# import torch
# from sklearn.metrics.pairwise import cosine_similarity

# num_epochs = 5
# print_interval = 100
# train_embeddings = []

# model.train()

# for epoch in range(num_epochs):
#     total_loss = 0
    
#     for batch_idx, batch in enumerate(train_loader):
#         input_ids, attention_mask, labels = batch
#         input_ids = input_ids.to(device)
#         attention_mask = attention_mask.to(device)
#         labels = labels.float().to(device)
        
#         optimizer.zero_grad()
        
#         # Pass the attention mask to individual layers of the model
#         outputs = model.bert(input_ids, attention_mask=attention_mask)
#         pooled_output = outputs.pooler_output
        
#         # Compute logits from pooled output
#         logits = model.classifier(pooled_output)
        
#         loss = loss_fn(logits, labels)
#         loss.backward()
        
#         optimizer.step()
        
#         total_loss += loss.item()
        
#         # Save the embeddings for each batch
#         with torch.no_grad():
#             batch_embeddings = model.bert.embeddings(input_ids, attention_mask)
#             train_embeddings.extend(batch_embeddings.tolist())
        
#         # Print batch loss
#         if (batch_idx + 1) % print_interval == 0:
#             print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}")
    
#     # Print epoch loss
#     epoch_loss = total_loss / len(train_loader)
#     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

# print("Training finished.")


OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB (GPU 0; 15.90 GiB total capacity; 14.96 GiB already allocated; 35.75 MiB free; 14.99 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [79]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

# User input sentence
user_sentence = input("Enter your sentence: ")

# Tokenize the user input
user_encoding = tokenizer.encode_plus(
    user_sentence,
    truncation=True,
    padding=True,
    return_tensors='pt'
)

user_input_ids = user_encoding['input_ids'].to(device)
user_attention_mask = user_encoding['attention_mask'].to(device)

# Get the embeddings for the user input
with torch.no_grad():
    user_embeddings = model.bert.embeddings(user_input_ids, user_attention_mask)

# Calculate cosine similarity with each training data embedding
similarities = torch.cosine_similarity(user_embeddings, train_embeddings)

# Find the indices of the most similar training data
most_similar_indices = similarities.argmax(dim=1).tolist()

# Get the corresponding custom labels
user_custom_labels = [train_df['ProductDetails'].iloc[idx] for idx in most_similar_indices]

# Print the custom labels for the user input
print(user_custom_labels)


Enter your sentence:  construction of road


NameError: name 'train_embeddings' is not defined

In [83]:
!pip install sentence_transformers


Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=5e335d4786b686023c516e2280a74f36bca43c9d269848984716441effee4ed8
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2
[0m

In [84]:
from sentence_transformers import SentenceTransformer, util

# Load pre-trained model
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# Example sentences
sentence1 = "construction of road."
sentence2 = "road"

# Encode sentences into embeddings
embeddings1 = model.encode([sentence1], convert_to_tensor=True)
embeddings2 = model.encode([sentence2], convert_to_tensor=True)

# Calculate cosine similarity
similarity = util.pytorch_cos_sim(embeddings1, embeddings2)[0][0].item()

print(f"Semantic textual similarity: {similarity:.4f}")


Downloading (…)7e0d5/.gitattributes:   0%|          | 0.00/345 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)0e5ca7e0d5/README.md: 0.00B [00:00, ?B/s]

Downloading (…)5ca7e0d5/config.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)7e0d5/tokenizer.json: 0.00B [00:00, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Downloading (…)0e5ca7e0d5/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)ca7e0d5/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Semantic textual similarity: 0.7728


In [None]:
from sentence_transformers import SentenceTransformer, util

# Load pre-trained model
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# # Load custom_labels dataset and extract the text column
# custom_labels = [...]  # Your custom_labels dataset

# Encode the text column to obtain label embeddings
label_embeddings = model.encode(custom_labels, convert_to_tensor=True)

# User input sentence
user_sentence = input("Enter your sentence: ")

# Encode the user input sentence
user_embedding = model.encode([user_sentence], convert_to_tensor=True)

# Calculate cosine similarity between user embedding and label embeddings
similarities = util.pytorch_cos_sim(user_embedding, label_embeddings)

# Sort labels based on similarity scores
sorted_indices = similarities.argsort(descending=True).squeeze().tolist()
sorted_labels = [custom_labels[i] for i in sorted_indices]
similarity_scores = similarities.squeeze().tolist()

# Print labels with higher similarity and their corresponding similarity scores
for label, score in zip(sorted_labels, similarity_scores):
    print(f"Label: {label}\nSimilarity Score: {score:.4f}\n")


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [1]:
from sentence_transformers import SentenceTransformer, util

# Load pre-trained model
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# # Load custom_labels dataset and extract the text column
# custom_labels = [...]  # Your custom_labels dataset

# Encode the text column to obtain label embeddings
label_embeddings = model.encode(custom_labels, convert_to_tensor=True)

# User input sentence
user_sentence = input("Enter your sentence: ")

# Encode the user input sentence
user_embedding = model.encode([user_sentence], convert_to_tensor=True)

# Calculate cosine similarity between user embedding and label embeddings
similarities = util.pytorch_cos_sim(user_embedding, label_embeddings)

# Define threshold for similarity scores
threshold = 0.8

# Sort labels based on similarity scores
sorted_indices = similarities.argsort(descending=True).squeeze().tolist()
sorted_labels = [custom_labels[i] for i in sorted_indices]
similarity_scores = similarities.squeeze().tolist()

# Print labels with similarity scores above the threshold
for label, score in zip(sorted_labels, similarity_scores):
    if score > threshold:
        print(f"Label: {label}\nSimilarity Score: {score:.4f}\n")


ModuleNotFoundError: No module named 'sentence_transformers'