In [33]:
!pip install -qU pypdf langchain_community

In [1]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the report PDF, relative to the notebook's working directory.
file_path = "../documents/Kiel/geomar/ifm-geomar_rep33.pdf"

# Read the PDF into a list of documents and print how many were produced.
loader = PyPDFLoader(file_path)
docs = loader.load()
document_count = len(docs)
print(document_count)

177


In [2]:
# Peek at the first loaded document: its first 100 characters, then its metadata.
first_doc = docs[0]
print(first_doc.page_content[:100])
print(first_doc.metadata)

IFM-GEOMAR REPORT
Berichte aus dem Leibniz-Institut 
für Meereswissenschaften an der 
Christian-Albr
{'source': '../documents/Kiel/geomar/ifm-geomar_rep33.pdf', 'page': 0}


In [36]:
!pip install -qU langchain-openai

In [4]:
import getpass
import os

# Ask for the key only when it is not already set, so re-running this cell
# (or exporting OPENAI_API_KEY before starting Jupyter) does not prompt again.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

from langchain_openai import ChatOpenAI

# Chat model that generates the answers in the retrieval chain built below.
llm = ChatOpenAI(model="gpt-4o")

In [38]:
!pip install langchain_chroma langchain_openai



In [39]:
!pip install pysqlite3-binary



In [6]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut the loaded documents into chunks of size 1000 with an overlap of 200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed every chunk with OpenAI embeddings and index the chunks in a Chroma store.
# NOTE(review): no collection name or persist directory is given; re-running this
# cell in the same kernel may index the same chunks a second time - confirm.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)

# Retriever configured to return 5 chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

In [8]:
# A bare expression is only rendered when it is the last line of a cell, so the
# first name was silently discarded; display() shows both objects.
display(vectorstore, retriever)

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x7f6cce631b40>, search_kwargs={'k': 5})

In [41]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the model; the retrieved chunks are substituted for {context}.
# (An earlier variant of the formatting instruction was "limit your answer to 1 sentence".)
system_prompt = "".join(
    [
        "You are an assistant for question-answering tasks. ",
        "Use the following pieces of retrieved context to answer ",
        "the question. If you don't know the answer, say that you ",
        "don't know. list your answer in between two asterisks that each response in bullet list",
        "\n\n",
        "{context}",
    ]
)

# Two-message chat prompt: the system instructions followed by the user's question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Chain that stuffs the retrieved documents into the prompt, fronted by the retriever.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Other questions that were tried against this report:
#   "name auv missions happened in this report step by step in detail?"
#   "explain about auv mission of anton dive?"
question = "What is name of  vehicle?/ vessel"
results = rag_chain.invoke({"input": question})
results

{'input': 'What is name of  vehicle?/ vessel',
 'context': [Document(metadata={'page': 16, 'source': '../documents/Kiel/geomar/ifm-geomar_rep33.pdf'}, page_content='14\t\r \xa0 The vehicle consists of a tapered forward section, a cylindrical midsection and \na tapered tail section. An internal titanium strongback, which extends through much \nof the vehicle length, provides the structural integrity and acts as a mounting platform for syntactic foam, equipment housings, sensors and release mechanisms. The \nmaximum vehicle diameter is 0.66 meters and the overall length is 4 meters. Vehicle \nweight is approximately 880 kilograms, but is depending on the payload \nconfiguration.  A rectangular compartment in the midsection of the vehicle contains \nthree pressure housings and an oil -filled junc tion box. Two pressure housings each \ncontain one 5.6 kWh 29-Volt lithium -ion battery pack. The third pressure housing \ncontains the vehicle and sidescan sonar electronics. The vehicleʼ s iner

In [42]:
# Text of the first retrieved chunk that was passed to the model.
first_chunk = results["context"][0]
print(first_chunk.page_content)

   The vehicle consists of a tapered forward section, a cylindrical midsection and 
a tapered tail section. An internal titanium strongback, which extends through much 
of the vehicle length, provides the structural integrity and acts as a mounting platform for syntactic foam, equipment housings, sensors and release mechanisms. The 
maximum vehicle diameter is 0.66 meters and the overall length is 4 meters. Vehicle 
weight is approximately 880 kilograms, but is depending on the payload 
configuration.  A rectangular compartment in the midsection of the vehicle contains 
three pressure housings and an oil -filled junc tion box. Two pressure housings each 
contain one 5.6 kWh 29-Volt lithium -ion battery pack. The third pressure housing 
contains the vehicle and sidescan sonar electronics. The vehicleʼ s inertial 
measurement unit and acoustic Doppler current profiler are hou sed in two other 
independent housings that are mounted forward of the 3 main pressure housings.


In [43]:
# The model's generated answer for the last question.
answer_text = results["answer"]
print(answer_text)

**I don't know.**


In [44]:
import re

# The system prompt asks the model to wrap its answer items in asterisks.
# Split the answer on runs of asterisks and keep every segment that contains
# visible text. The previous `len(item) > 3` filter dropped legitimate short
# answers (e.g. "No") and let whitespace-only segments through.
match = [segment.strip() for segment in re.split(r'\*+', results["answer"])]

for item in match:
    if item:
        print(item)

I don't know.


In [45]:
# Metadata of the first retrieved chunk.
first_chunk_metadata = results["context"][0].metadata
print(first_chunk_metadata)

{'page': 16, 'source': '../documents/Kiel/geomar/ifm-geomar_rep33.pdf'}


In [46]:
# Ask about one specific mission and print only the generated answer.
question = "explain auv mission anton 89 step by step in detail?"
results = rag_chain.invoke({"input": question})
print(results["answer"])

Based on the provided context, there is no specific information available about a mission named "anton 89". Therefore, I cannot provide a detailed step-by-step explanation of this mission.

**However, here are some related points from the context that can provide a general understanding of AUV missions:**

* **AUV Mapping Techniques and Results:**
  - Four AUV dives collected bathymetric data.
  - One additional dive collected sidescan sonar data.
  - Low frequency (200 kHz) bathymetry data and sidescan sonar data processed using CARAIBES package from IFREMER.
  - High-frequency (400 kHz) bathymetric data processed using PDS2000 by RESON.

* **Dive 012 Example:**
  - The target was a small volcano near the ridge axis of segment 1B.
  - Previous data suggested a turbidity anomaly in the water column.
  - Due to highly variable topography, the mission was designed in two parts.

* **AUV Technical Description:**
  - The AUV ABYSS, built by HYDROID from IFM-GEOMAR, operates in depths up to

In [47]:
# Ask for the route of the same mission and print only the generated answer.
question = "the path of anton 89 with specific locations step by step in detail?"
results = rag_chain.invoke({"input": question})
print(results["answer"])

**I don't know.**


In [48]:
import re

# The system prompt asks the model to wrap its answer items in asterisks.
# Split the answer on runs of asterisks and keep every segment that contains
# visible text. The previous `len(item) > 3` filter dropped legitimate short
# answers (e.g. "No") and let whitespace-only segments through.
match = [segment.strip() for segment in re.split(r'\*+', results["answer"])]

for item in match:
    if item:
        print(item)

I don't know.


In [9]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.2 kB)
Downloading gensim-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.3.3


In [10]:
from gensim import downloader as api

# Load the pretrained Google News word2vec vectors through gensim's downloader.
# NOTE: the name `model` is rebound to the CodeBERT model further down the
# notebook, so the Word Mover's Distance cells must run before that cell.
model = api.load('word2vec-google-news-300')

In [25]:
import logging

# Show INFO-level log records with a timestamp and level prefix.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Three example sentences used in the distance comparisons below.
sentence_obama = 'Obama speaks to the media in Illinois'
sentence_president = 'The president greets the press in Chicago'
sentence_orange = 'Oranges are my favorite fruit'

sentences = [
    sentence_obama,
    sentence_president,
    sentence_orange,
]

In [26]:
# Make sure the NLTK stopword corpus is available, then load the English list.
from nltk.corpus import stopwords
from nltk import download
download('stopwords')
stop_words = stopwords.words('english')


def preprocess(sentence):
    """Lower-case ``sentence``, split it on whitespace and drop English stopwords.

    Returns the remaining tokens as a list, in their original order.
    """
    tokens = sentence.lower().split()
    return [token for token in tokens if token not in stop_words]


# Token lists for the first three example sentences, in the same order as ``sentences``.
a, b, c = (preprocess(sentences[idx]) for idx in range(3))
sentence_cleaned = [a, b, c]


[nltk_data] Downloading package stopwords to /home/mahya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Presumably needs the POT package; uncomment to install it if it is missing:
# %pip install pot

# Word Mover's Distance between the first two pre-processed sentences.
first_tokens = sentence_cleaned[0]
second_tokens = sentence_cleaned[1]
distance = model.wmdistance(first_tokens, second_tokens)
print(f'distance = {distance:.4f}')

2024-10-16 13:53:49,506 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-10-16 13:53:49,507 : INFO : built Dictionary<8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...> from 2 documents (total 8 corpus positions)
2024-10-16 13:53:49,508 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...> from 2 documents (total 8 corpus positions)", 'datetime': '2024-10-16T13:53:49.508221', 'gensim': '4.3.3', 'python': '3.10.14 (main, Mar 21 2024, 16:24:04) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.31', 'event': 'created'}


distance = 1.0175


In [28]:
# Print the Word Mover's Distance for every ordered pair of sentences
# (i indexes the first sentence, j the second), one value per line.
sentence_count = len(sentences)
for i in range(sentence_count):
    for j in range(sentence_count):
        wmd_score = model.wmdistance(sentence_cleaned[i], sentence_cleaned[j])
        print(wmd_score)


      

2024-10-16 13:53:53,126 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-10-16 13:53:53,127 : INFO : built Dictionary<4 unique tokens: ['illinois', 'media', 'obama', 'speaks']> from 2 documents (total 8 corpus positions)
2024-10-16 13:53:53,127 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<4 unique tokens: ['illinois', 'media', 'obama', 'speaks']> from 2 documents (total 8 corpus positions)", 'datetime': '2024-10-16T13:53:53.127952', 'gensim': '4.3.3', 'python': '3.10.14 (main, Mar 21 2024, 16:24:04) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.31', 'event': 'created'}
2024-10-16 13:53:53,129 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-10-16 13:53:53,129 : INFO : built Dictionary<8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...> from 2 documents (total 8 corpus positions)
2024-10-16 13:53:53,130 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<8 unique tokens: ['illi

0.0
1.0174646858929572
1.3663502993722163
1.0174646858929575
0.0
1.3388266063724354
1.3663502993722163
1.3388266063724352
0.0


In [1]:
!pip install transformers torch



In [2]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the CodeBERT tokenizer and encoder from the same pretrained checkpoint.
CODEBERT_CHECKPOINT = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(CODEBERT_CHECKPOINT)
model = AutoModel.from_pretrained(CODEBERT_CHECKPOINT)


In [3]:
# Replace this with your PDDL code
pddl_code1 = """
(define (domain rv_poseidon)

  (:requirements :strips :typing :fluents :durative-actions)

  (:types 
    sensor task)

  (:predicates 
    (motion_data_collected ?sensor - sensor)
    (gps_position_acquired ?sensor - sensor)
    (ctd_measurement_taken ?sensor - sensor)
    (data_analyzed ?task - task)
  )

  (:functions 
    (data_quality ?task - task) ; Represents the quality of the data collected from sensors
  )

  ;; Durative action to collect motion data using the Motion Sensor IXSEA OCTANS 1000
  (:durative-action collect_motion_data
    :parameters (?sensor - sensor)
    :duration (= ?duration 15)
    :condition (and (at start (not (motion_data_collected ?sensor))))
    :effect (and (at end (motion_data_collected ?sensor))
                 (at end (increase (data_quality motion_task) 5)))
  )

  ;; Durative action to acquire GPS position using the GPS-Receiver GARMIN 152
  (:durative-action acquire_gps_position
    :parameters (?sensor - sensor)
    :duration (= ?duration 10)
    :condition (and (at start (not (gps_position_acquired ?sensor))))
    :effect (and (at end (gps_position_acquired ?sensor))
                 (at end (increase (data_quality gps_task) 3)))
  )

  ;; Durative action to take CTD measurements using the CTD48M Sound Velocity Probe
  (:durative-action take_ctd_measurement
    :parameters (?sensor - sensor)
    :duration (= ?duration 20)
    :condition (and (at start (not (ctd_measurement_taken ?sensor))))
    :effect (and (at end (ctd_measurement_taken ?sensor))
                 (at end (increase (data_quality ctd_task) 7)))
  )

  ;; Durative action to analyze collected data after gathering motion, GPS, and CTD data
  (:durative-action analyze_data
    :parameters (?task - task)
    :duration (= ?duration 30)
    :condition (and (at start (motion_data_collected ixsea_octans_1000))
                    (at start (gps_position_acquired garmin_152))
                    (at start (ctd_measurement_taken ctd48m)))
    :effect (at end (data_analyzed ?task))
  )
)
"""
# Encode the PDDL text as PyTorch tensors; anything beyond 512 tokens is cut off.
inputs = tokenizer(
    pddl_code1,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

# Forward pass through CodeBERT with gradient tracking switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Average the last-layer hidden states over dim 1 to get one vector for the whole input.
embeddings1 = torch.mean(outputs.last_hidden_state, dim=1)


In [4]:
# Replace this with your PDDL code
pddl_code2 = """
(define (domain sopran_project)

  (:requirements :strips :typing :fluents :durative-actions)

  (:types 
    station sample profile)

  (:predicates 
    (ctd_profile_conducted ?station - station)
    (water_sample_collected ?station - station ?depth - number)
    (data_analyzed ?profile - profile)
    (halogen_compound_variation_studied ?profile - profile)
    (hydrographic_conditions_understood ?profile - profile)
  )

  (:functions 
    (depth ?station - station) ; Represents the depth of the water sample collection
  )

  ;; Durative action to conduct a CTD profile
  (:durative-action conduct_ctd_profile
    :parameters (?station - station)
    :duration (= ?duration 15)
    :condition (and (at start (not (ctd_profile_conducted ?station))))
    :effect (and (at start (ctd_profile_conducted ?station)))
  )

  ;; Durative action to collect water samples at different depths using Niskin bottles
  (:durative-action collect_water_sample
    :parameters (?station - station ?depth - number)
    :duration (= ?duration 10)
    :condition (and (at start (ctd_profile_conducted ?station))
                    (at start (not (water_sample_collected ?station ?depth))))
    :effect (and (at end (water_sample_collected ?station ?depth)))
  )

  ;; Durative action to analyze the collected data for temperature, salinity, and oxygen levels
  (:durative-action analyze_data
    :parameters (?profile - profile)
    :duration (= ?duration 20)
    :condition (at start (not (data_analyzed ?profile)))
    :effect (at end (data_analyzed ?profile))
  )

  ;; Durative action to study halogenated compound variations
  (:durative-action study_halogen_compounds
    :parameters (?profile - profile)
    :duration (= ?duration 30)
    :condition (at start (data_analyzed ?profile))
    :effect (at end (halogen_compound_variation_studied ?profile))
  )

  ;; Durative action to understand hydrographic conditions
  (:durative-action understand_hydrographic_conditions
    :parameters (?profile - profile)
    :duration (= ?duration 25)
    :condition (at start (data_analyzed ?profile))
    :effect (at end (hydrographic_conditions_understood ?profile))
  )
)
"""
# Encode the PDDL text as PyTorch tensors; anything beyond 512 tokens is cut off.
inputs = tokenizer(
    pddl_code2,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

# Forward pass through CodeBERT with gradient tracking switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Average the last-layer hidden states over dim 1 to get one vector for the whole input.
embeddings2 = torch.mean(outputs.last_hidden_state, dim=1)


In [5]:
# Replace this with your PDDL code
pddl_code3 = """
(define (domain NAMV_operations)
  (:requirements :strips :durative-actions)
  
  ;; Types of tasks
  (:types task observatory instrument ship CTD OBS)

  ;; Predicates
  (:predicates
    (data_downloaded ?obs)
    (CTD_performed ?ctd)
    (microstructure_tested ?instrument)
    (methane_plume_identified ?method)
    (OBS_recovered ?obs)
    (equipment_tested ?instrument)
    (temperature_observatory_recovered ?obs)
    (task_completed ?task)
    (task_failed ?task))

  ;; Action for downloading data
  (:durative-action download_data
    :parameters (?obs - observatory)
    :duration (= ?duration 2)
    :condition (and (at start (not (data_downloaded ?obs))))
    :effect (and (at end (data_downloaded ?obs) (task_completed download_data)))
  )

  ;; Action for performing CTD cast
  (:durative-action perform_CTD_cast
    :parameters (?ctd - CTD)
    :duration (= ?duration 2)
    :condition (and (at start (not (CTD_performed ?ctd))))
    :effect (and (at end (CTD_performed ?ctd) (task_completed perform_CTD_cast)))
  )

  ;; Action for testing temperature microstructure
  (:durative-action test_microstructure
    :parameters (?instrument - instrument)
    :duration (= ?duration 1)
    :condition (and (at start (not (microstructure_tested ?instrument))))
    :effect (at end (task_failed test_microstructure))
  )

  ;; Action for identifying methane plume
  (:durative-action identify_methane_plume
    :parameters (?method - ship)
    :duration (= ?duration 4)
    :condition (and (at start (not (methane_plume_identified ?method))))
    :effect (and (at end (methane_plume_identified ?method) (task_completed identify_methane_plume)))
  )

  ;; Action for recovering OBS stations
  (:durative-action recover_OBS
    :parameters (?obs - OBS)
    :duration (= ?duration 4)
    :condition (and (at start (not (OBS_recovered ?obs))))
    :effect (and (at end (OBS_recovered ?obs) (task_completed recover_OBS)))
  )

  ;; Action for ROV deployment and equipment test
  (:durative-action deploy_ROV
    :parameters (?instrument - instrument)
    :duration (= ?duration 3.5)
    :condition (and (at start (not (equipment_tested ?instrument))))
    :effect (and (at end (equipment_tested ?instrument) (task_completed deploy_ROV)))
  )

  ;; Action for recovering temperature observatory
  (:durative-action recover_temperature_observatory
    :parameters (?obs - observatory)
    :duration (= ?duration 8)
    :condition (and (at start (not (temperature_observatory_recovered ?obs))))
    :effect (and (at end (temperature_observatory_recovered ?obs) (task_completed recover_temperature_observatory)))
  )
)
"""
# Encode the PDDL text as PyTorch tensors; anything beyond 512 tokens is cut off.
inputs = tokenizer(
    pddl_code3,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

# Forward pass through CodeBERT with gradient tracking switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Average the last-layer hidden states over dim 1 to get one vector for the whole input.
embeddings3 = torch.mean(outputs.last_hidden_state, dim=1)

In [6]:
# Replace this with your PDDL code
pddl_code4 = """
(define (domain auv_anton_mission)

  (:requirements :strips :typing :fluents :durative-actions)

  (:types 
    auv setting)

  (:predicates 
    (mission_started ?auv - auv)
    (mission_completed ?auv - auv)
    (acoustic_command_tested ?auv - auv)
    (drift_tested ?auv - auv)
    (no_response ?auv - auv)
    (usbl_received ?auv - auv)
    (settings_changed ?auv - auv)
  )

  (:functions 
    (depth ?auv - auv) ; Represents the current depth of the AUV
    (duration ?auv - auv) ; Represents the duration of the mission in minutes
  )

  ;; Action to start a mission with an acoustic GoTo command and handle drift
  (:durative-action acoustic_goto_command
    :parameters (?auv - auv)
    :duration (= ?duration 10)
    :condition (and (at start (not (mission_started ?auv)))
                    (at start (= (depth ?auv) 200)))
    :effect (and (at start (mission_started ?auv))
                 (at end (drift_tested ?auv))
                 (at end (mission_completed ?auv)))
  )

  ;; Action to test acoustic commands
  (:durative-action test_acoustic_command
    :parameters (?auv - auv)
    :duration (= ?duration 14)
    :condition (and (at start (not (mission_started ?auv)))
                    (at start (= (depth ?auv) 20)))
    :effect (and (at start (mission_started ?auv))
                 (at end (acoustic_command_tested ?auv))
                 (at end (mission_completed ?auv)))
  )

  ;; Action for drift testing
  (:durative-action perform_drift_test
    :parameters (?auv - auv)
    :duration (= ?duration 29)
    :condition (and (at start (not (mission_started ?auv)))
                    (at start (= (depth ?auv) 50)))
    :effect (and (at start (mission_started ?auv))
                 (at end (drift_tested ?auv))
                 (at end (mission_completed ?auv)))
  )

  ;; Action to test acoustic abort command with no response, then change settings
  (:durative-action test_acoustic_abort_command
    :parameters (?auv - auv)
    :duration (= ?duration 6)
    :condition (and (at start (not (mission_started ?auv)))
                    (at start (= (depth ?auv) 20)))
    :effect (and (at start (mission_started ?auv))
                 (at end (no_response ?auv))
                 (at end (settings_changed ?auv))
                 (at end (mission_completed ?auv)))
  )

  ;; Action to test acoustic commands with new settings, resulting in no response
  (:durative-action test_acoustic_command_new_settings
    :parameters (?auv - auv)
    :duration (= ?duration 8)
    :condition (and (at start (not (mission_started ?auv)))
                    (at start (= (depth ?auv) 20))
                    (at start (settings_changed ?auv)))
    :effect (and (at start (mission_started ?auv))
                 (at end (no_response ?auv))
                 (at end (mission_completed ?auv)))
  )

  ;; Action to test acoustic commands again with new settings, resulting in no response
  (:durative-action test_acoustic_command_new_settings_again
    :parameters (?auv - auv)
    :duration (= ?duration 7)
    :condition (and (at start (not (mission_started ?auv)))
                    (at start (= (depth ?auv) 20))
                    (at start (settings_changed ?auv)))
    :effect (and (at start (mission_started ?auv))
                 (at end (no_response ?auv))
                 (at end (mission_completed ?auv)))
  )

  ;; Action for successful communication with USBL
  (:durative-action receive_usbl_signal
    :parameters (?auv - auv)
    :duration (= ?duration 4)
    :condition (and (at start (not (mission_started ?auv)))
                    (at start (= (depth ?auv) 20))
                    (at start (settings_changed ?auv)))
    :effect (and (at start (mission_started ?auv))
                 (at end (usbl_received ?auv))
                 (at end (mission_completed ?auv)))
  )
)
"""
# Encode the PDDL text as PyTorch tensors; anything beyond 512 tokens is cut off.
# NOTE(review): this domain definition is long - confirm it fits in 512 tokens,
# otherwise its tail does not contribute to the embedding.
inputs = tokenizer(
    pddl_code4,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

# Forward pass through CodeBERT with gradient tracking switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Average the last-layer hidden states over dim 1 to get one vector for the whole input.
embeddings4 = torch.mean(outputs.last_hidden_state, dim=1)


In [7]:
# Replace this with your PDDL code
pddl_code5 = """
(define (domain mission-report)
  (:requirements :strips :typing)
  
  (:types 
    task - object
    duration - number
    outcome - object
  )
  
  (:predicates
    (task-started ?t - task)
    (task-completed ?t - task)
    (task-failed ?t - task)
    (task-duration ?t - task ?d - duration)
    (task-outcome ?t - task ?o - outcome)
  )

  (:action start-seismic-profiling
    :parameters ()
    :precondition (not (task-started seismic-profiling))
    :effect (and
      (task-started seismic-profiling)
      (task-duration seismic-profiling duration-24h)
    )
  )

  (:action complete-seismic-profiling
    :parameters ()
    :precondition (task-started seismic-profiling)
    :effect (and
      (task-completed seismic-profiling)
      (task-outcome seismic-profiling outcome-successful-recovery)
    )
  )

  (:action start-heat-flow-measurements
    :parameters ()
    :precondition (not (task-started heat-flow-measurements))
    :effect (and
      (task-started heat-flow-measurements)
      (task-duration heat-flow-measurements duration-multiple)
    )
  )

  (:action complete-heat-flow-measurements
    :parameters ()
    :precondition (task-started heat-flow-measurements)
    :effect (and
      (task-completed heat-flow-measurements)
      (task-outcome heat-flow-measurements outcome-partial-completion)
    )
  )

  (:action start-magnetometer-profiles
    :parameters ()
    :precondition (not (task-started magnetometer-profiles))
    :effect (and
      (task-started magnetometer-profiles)
      (task-duration magnetometer-profiles duration-transit)
    )
  )

  (:action complete-magnetometer-profiles
    :parameters ()
    :precondition (task-started magnetometer-profiles)
    :effect (and
      (task-completed magnetometer-profiles)
      (task-outcome magnetometer-profiles outcome-successful-data-collection)
    )
  )

  (:action start-seismic-profile-P03
    :parameters ()
    :precondition (not (task-started seismic-profile-P03))
    :effect (and
      (task-started seismic-profile-P03)
      (task-duration seismic-profile-P03 duration-shortened)
    )
  )

  (:action complete-seismic-profile-P03
    :parameters ()
    :precondition (task-started seismic-profile-P03)
    :effect (and
      (task-completed seismic-profile-P03)
      (task-outcome seismic-profile-P03 outcome-partial-completion)
    )
  )
)
"""
# Encode the PDDL text as PyTorch tensors; anything beyond 512 tokens is cut off.
inputs = tokenizer(
    pddl_code5,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

# Forward pass through CodeBERT with gradient tracking switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Average the last-layer hidden states over dim 1 to get one vector for the whole input.
embeddings5 = torch.mean(outputs.last_hidden_state, dim=1)


In [8]:
# Replace this with your PDDL code
pddl_code6 = """
(define (domain celtic_explorer_mission)
  (:requirements :strips :typing)
  (:types
    vessel component task location
  )

  (:predicates
    (at ?v - vessel ?l - location)               ; Vessel's current location
    (has_component ?v - vessel ?c - component)   ; Vessel has a specific component
    (task_completed ?t - task)                    ; Task has been completed
    (task_in_progress ?t - task)                  ; Task is currently in progress
    (is_successful ?t - task)                     ; Task was successful
    (is_installed ?c - component)                 ; Component is installed
    (is_rigged ?c - component)                    ; Component is rigged
    (is_pumped ?c - component)                    ; Component has been pumped
    (is_tested ?c - component)                    ; Component has been tested
  )

  (:action unpack_and_install
    :parameters (?v - vessel ?c - component)
    :precondition (and (at ?v shore) (has_component ?v ?c) (not (is_installed ?c)))
    :effect (and (is_installed ?c) (task_completed unpacking_installation))
  )

  (:action rig_up_vibro_corer
    :parameters (?v - vessel ?c - component)
    :precondition (and (at ?v shore) (has_component ?v ?c) (not (is_rigged ?c)))
    :effect (and (is_rigged ?c) (task_completed rigging_vibro_corer))
  )

  (:action conduct_meeting
    :parameters (?v - vessel)
    :precondition (at ?v shore)
    :effect (task_completed meeting_principal_investigators)
  )

  (:action pump_hydraulic_oil
    :parameters (?v - vessel ?c - component)
    :precondition (and (at ?v offshore) (has_component ?v ?c) (not (is_pumped ?c)))
    :effect (and (is_pumped ?c) (task_completed pumping_oil))
  )

  (:action perform_harbor_test
    :parameters (?v - vessel ?c - component)
    :precondition (and (at ?v offshore) (has_component ?v ?c) (not (is_tested ?c)))
    :effect (and (is_tested ?c) (is_successful perform_harbor_test))
  )
)
"""
# Encode the PDDL text as PyTorch tensors; anything beyond 512 tokens is cut off.
inputs = tokenizer(
    pddl_code6,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

# Forward pass through CodeBERT with gradient tracking switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Average the last-layer hidden states over dim 1 to get one vector for the whole input.
embeddings6 = torch.mean(outputs.last_hidden_state, dim=1)


In [9]:
# Replace this with your PDDL code
pddl_code7 = """
(define (domain cruise_so190_leg2)
  (:requirements :strips :typing)
  (:types
    vessel component task location
  )

  (:predicates
    (at ?v - vessel ?l - location)               ; Vessel's current location
    (has_component ?v - vessel ?c - component)   ; Vessel has a specific component
    (task_completed ?t - task)                    ; Task has been completed
    (task_in_progress ?t - task)                  ; Task is currently in progress
    (is_successful ?t - task)                     ; Task was successful
    (is_deployed ?c - component)                  ; Component is deployed
    (is_recovered ?c - component)                 ; Component has been recovered
    (is_mapped ?t - task)                         ; Task of mapping is completed
    (is_transiting ?v - vessel)                   ; Vessel is transiting
  )

  (:action conduct_bathymetric_survey
    :parameters (?v - vessel)
    :precondition (at ?v pacific_ocean)
    :effect (and (task_completed bathymetric_survey) (is_successful bathymetric_survey))
  )

  (:action deploy_instruments
    :parameters (?v - vessel ?c - component)
    :precondition (and (at ?v pacific_ocean) (has_component ?v ?c) (not (is_deployed ?c)))
    :effect (and (is_deployed ?c) (task_completed deploying_instruments))
  )

  (:action recover_instruments
    :parameters (?v - vessel ?c - component)
    :precondition (and (at ?v pacific_ocean) (has_component ?v ?c) (is_deployed ?c))
    :effect (and (is_recovered ?c) (task_completed recovering_instruments))
  )

  (:action map_trench_and_slope
    :parameters (?v - vessel)
    :precondition (at ?v pacific_ocean)
    :effect (and (is_mapped trench_and_slope_mapping) (task_completed mapping_trench_slope))
  )

  (:action initiate_transit
    :parameters (?v - vessel)
    :precondition (and (at ?v pacific_ocean) (not (is_transiting ?v)))
    :effect (and (is_transiting ?v) (task_completed initiating_transit))
  )
)
"""
# Encode the PDDL text as PyTorch tensors; anything beyond 512 tokens is cut off.
inputs = tokenizer(
    pddl_code7,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

# Forward pass through CodeBERT with gradient tracking switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Average the last-layer hidden states over dim 1 to get one vector for the whole input.
embeddings7 = torch.mean(outputs.last_hidden_state, dim=1)


In [163]:
# Convert every pooled embedding tensor into a nested Python list
# (element 0 of each list is the embedding vector itself).
embedding_list1 = embeddings1.detach().cpu().numpy().tolist()
embedding_list2 = embeddings2.detach().cpu().numpy().tolist()
embedding_list3 = embeddings3.detach().cpu().numpy().tolist()
embedding_list4 = embeddings4.detach().cpu().numpy().tolist()
embedding_list5 = embeddings5.detach().cpu().numpy().tolist()
embedding_list6 = embeddings6.detach().cpu().numpy().tolist()
embedding_list7 = embeddings7.detach().cpu().numpy().tolist()

# Only the last expression of a cell is rendered: show the seventh vector.
embedding_list7[0]

[-0.34456998109817505,
 0.221358060836792,
 0.2283305823802948,
 0.06683887541294098,
 -0.19324393570423126,
 -0.3169556260108948,
 -0.016985444352030754,
 0.1470722109079361,
 0.3732360899448395,
 0.3972192704677582,
 -0.311428964138031,
 1.0108121633529663,
 -0.18656593561172485,
 -0.17097653448581696,
 0.8891522884368896,
 0.05173138529062271,
 0.12107185274362564,
 0.19791340827941895,
 0.0072701843455433846,
 -0.1153978705406189,
 -0.33129066228866577,
 -0.21077004075050354,
 0.5366114377975464,
 -0.2525824308395386,
 0.4000083804130554,
 0.3553965091705322,
 -0.02509145997464657,
 0.766335129737854,
 -0.6201920509338379,
 0.7524734735488892,
 -0.16984489560127258,
 0.08243720233440399,
 1.533645510673523,
 0.16211435198783875,
 0.26835519075393677,
 -0.40060973167419434,
 -0.3200230598449707,
 0.30036699771881104,
 0.2730899751186371,
 -0.3699343800544739,
 0.17340299487113953,
 0.731128454208374,
 -1.0676223039627075,
 0.021572129800915718,
 0.5095532536506653,
 0.36768656969070

In [60]:
import numpy as np
# Take the first row of the nested list built from embeddings1 (presumably the
# only row, i.e. one pooled vector per snippet — TODO confirm) and display how
# many values it holds.
embed1 =embedding_list1[0]

np.size(embed1)


768

In [61]:
# First row of the nested list built from embeddings2 (presumably the only
# row — TODO confirm), followed by its element count.
embed2 =embedding_list2[0]
np.size(embed2)

768

In [62]:
# First row of the nested list built from embeddings3 (presumably the only
# row — TODO confirm), followed by its element count.
embed3 =embedding_list3[0]
np.size(embed3)

768

In [63]:
# First row of the nested list built from embeddings4 (presumably the only
# row — TODO confirm), followed by its element count.
embed4 =embedding_list4[0]
np.size(embed4)

768

In [113]:
# First row of the nested list built from embeddings5 (presumably the only
# row — TODO confirm), followed by its element count.
embed5 =embedding_list5[0]
np.size(embed5)

768

In [146]:
# First row of the nested list built from embeddings6 (presumably the only
# row — TODO confirm), followed by its element count.
embed6 =embedding_list6[0]
np.size(embed6)

768

In [164]:
# First row of the nested list built from embeddings7 (the mean-pooled vector
# for pddl_code7), followed by its element count.
embed7 =embedding_list7[0]
np.size(embed7)

768

In [190]:
from scipy.stats import wasserstein_distance
# 1-D Wasserstein (earth mover's) distance between the value distributions of
# the two vectors.
# NOTE(review): each vector's components are treated as an unordered sample,
# so the position of a value within the embedding is ignored — confirm this is
# the comparison that is actually wanted.
distance = wasserstein_distance(u_values=embed6, v_values=embed7)
print("Wasserstein Distance:", distance)

Wasserstein Distance: 0.006221371358151361


In [177]:
# Euclidean (L2) distance: norm of the element-wise difference. np.subtract
# accepts the plain Python lists directly and returns an array.
l2_distance = np.linalg.norm(np.subtract(embed1, embed7))
print("L2 Distance:", l2_distance)

L2 Distance: 4.090807418955


In [184]:
# Cosine similarity: dot product of the two vectors scaled by the product of
# their Euclidean norms. Each list is converted to an array once and reused.
vec6 = np.array(embed6)
vec7 = np.array(embed7)
cosine_similarity = np.dot(vec6, vec7) / (np.linalg.norm(vec6) * np.linalg.norm(vec7))
print("Cosine Similarity:", cosine_similarity)

Cosine Similarity: 0.9993634577307621


In [58]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.metrics import pairwise_distances

# Load the CodeBERT tokenizer and encoder. Both come from the same checkpoint,
# so the identifier is kept in one place.
CODEBERT_CHECKPOINT = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(CODEBERT_CHECKPOINT)
model = AutoModel.from_pretrained(CODEBERT_CHECKPOINT)

def get_token_embeddings(code_snippet):
    """Return the per-token hidden states the model produces for a snippet.

    Args:
        code_snippet: Source text to encode with the module-level tokenizer.

    Returns:
        The model's last hidden state with the leading batch axis squeezed
        out, i.e. (seq_len, hidden_size) for a single snippet.
    """
    encoded = tokenizer(
        code_snippet,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # squeeze(0) only removes the batch axis when it has size 1.
    return hidden_states.squeeze(0)

def cosine_similarity_matrix(embedding1, embedding2):
    """Return all pairwise cosine similarities between two sets of row vectors.

    Args:
        embedding1: Tensor whose rows are the token vectors of the first sequence.
        embedding2: Tensor whose rows are the token vectors of the second sequence.

    Returns:
        Array in which entry [i, j] is the cosine similarity between row i of
        embedding1 and row j of embedding2.
    """
    # pairwise_distances works on host-side NumPy arrays, not torch tensors.
    left, right = (tensor.cpu().numpy() for tensor in (embedding1, embedding2))
    # The "cosine" metric is a distance (1 - similarity), so flip it back.
    cosine_distance = pairwise_distances(left, right, metric="cosine")
    return 1 - cosine_distance

def bert_score(code1, code2):
    """Compute a greedy-matching BERTScore between two code snippets.

    Every token of one snippet is paired with its most similar token in the
    other snippet (cosine similarity of the token embeddings), and the best
    similarities are averaged.

    Args:
        code1: First snippet; its tokens drive the precision term.
        code2: Second snippet; its tokens drive the recall term.

    Returns:
        dict with keys "precision", "recall" and "f1". "f1" is 0.0 when
        precision + recall is not positive, so the harmonic mean never
        divides by zero.
    """
    # Per-token embeddings for each snippet.
    embeddings1 = get_token_embeddings(code1)
    embeddings2 = get_token_embeddings(code2)

    # sim_matrix[i, j]: similarity of token i in code1 and token j in code2.
    sim_matrix = cosine_similarity_matrix(embeddings1, embeddings2)

    # Precision: best match in code2 for every token of code1.
    precision = np.mean(np.max(sim_matrix, axis=1))

    # Recall: best match in code1 for every token of code2.
    recall = np.mean(np.max(sim_matrix, axis=0))

    # Harmonic mean, guarded the same way as meteor_score_codebert so that a
    # zero denominator yields 0.0 instead of nan and a runtime warning.
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    return {"precision": precision, "recall": recall, "f1": f1_score}

def meteor_score_codebert(code1, code2, threshold=0.8):
    """Compute a METEOR-style score from thresholded embedding similarities.

    A token counts as matched when its cosine similarity with at least one
    token of the other snippet reaches `threshold`.

    Args:
        code1: First snippet; the share of its matched tokens is the precision.
        code2: Second snippet; the share of its matched tokens is the recall.
        threshold: Minimum cosine similarity for two tokens to count as a
            match. Defaults to 0.8, the value previously hard-coded here and
            the default used by rouge_score_codebert.

    Returns:
        dict with keys "precision", "recall", "f1_score" and "meteor_score".
    """
    # Per-token embeddings for each snippet.
    embeddings1 = get_token_embeddings(code1)
    embeddings2 = get_token_embeddings(code2)

    # sim_matrix[i, j]: similarity of token i in code1 and token j in code2.
    sim_matrix = cosine_similarity_matrix(embeddings1, embeddings2)

    # Boolean mask of token pairs that are similar enough to count as a match.
    hits = sim_matrix >= threshold
    matched1 = hits.any(axis=1).sum()  # tokens of code1 with a match in code2
    matched2 = hits.any(axis=0).sum()  # tokens of code2 with a match in code1

    # Precision and recall as matched-token ratios.
    precision = matched1 / len(embeddings1)
    recall = matched2 / len(embeddings2)

    # Harmonic mean, defined as 0.0 when nothing matched at all.
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    # Penalty: mean share of unmatched tokens across both snippets.
    # NOTE: this is a coverage-based penalty, not the chunk/fragmentation
    # penalty of the original METEOR definition.
    penalty = 0.5 * ((1 - precision) + (1 - recall))

    meteor = f1_score * (1 - penalty)
    return {"precision": precision, "recall": recall, "f1_score": f1_score, "meteor_score": meteor}


def rouge_score_codebert(code1, code2, threshold=0.8):
    """Compute embedding-based ROUGE-1 and ROUGE-2 scores for two snippets.

    Two tokens match when their cosine similarity reaches `threshold`. Two
    bigrams match when their first tokens match AND their second tokens match,
    so token order is respected.

    Args:
        code1: First snippet; supplies the precision side of each score.
        code2: Second snippet; supplies the recall side of each score.
        threshold: Minimum cosine similarity for two tokens to count as a match.

    Returns:
        dict with keys "ROUGE-1" and "ROUGE-2", each mapping to a dict with
        "precision", "recall" and "f1". ROUGE-2 precision/recall are 0 when the
        corresponding snippet is too short to contain a bigram.
    """
    # Per-token embeddings for each snippet.
    embeddings1 = get_token_embeddings(code1)
    embeddings2 = get_token_embeddings(code2)

    # sim_matrix[i, j]: similarity of token i in code1 and token j in code2.
    sim_matrix = cosine_similarity_matrix(embeddings1, embeddings2)
    hits = sim_matrix >= threshold

    # ROUGE-1 (unigrams): share of tokens on each side with at least one match.
    matched1 = hits.any(axis=1).sum()  # tokens of code1 matched in code2
    matched2 = hits.any(axis=0).sum()  # tokens of code2 matched in code1
    rouge1_precision = matched1 / len(embeddings1)
    rouge1_recall = matched2 / len(embeddings2)
    # The small epsilon keeps the F1 denominator non-zero.
    rouge1_f1 = 2 * (rouge1_precision * rouge1_recall) / (rouge1_precision + rouge1_recall + 1e-10)

    # ROUGE-2 (bigrams): bigram (i, i+1) of code1 matches bigram (j, j+1) of
    # code2 when hits[i, j] and hits[i+1, j+1] both hold. Shifting the hit mask
    # by one along both axes evaluates every bigram pair at once from the
    # similarity matrix that is already available.
    bigram_hits = hits[:-1, :-1] & hits[1:, 1:]
    n_bigrams1, n_bigrams2 = bigram_hits.shape
    # Precision and recall are counted separately from each snippet's own
    # point of view, mirroring the ROUGE-1 computation above.
    bigram_matched1 = bigram_hits.any(axis=1).sum()
    bigram_matched2 = bigram_hits.any(axis=0).sum()
    rouge2_precision = bigram_matched1 / n_bigrams1 if n_bigrams1 else 0
    rouge2_recall = bigram_matched2 / n_bigrams2 if n_bigrams2 else 0
    rouge2_f1 = 2 * (rouge2_precision * rouge2_recall) / (rouge2_precision + rouge2_recall + 1e-10)

    # NOTE: ROUGE-L is not reported. A faithful version needs a longest common
    # subsequence over the thresholded matrix, not a count of matched rows.
    return {
        "ROUGE-1": {"precision": rouge1_precision, "recall": rouge1_recall, "f1": rouge1_f1},
        "ROUGE-2": {"precision": rouge2_precision, "recall": rouge2_recall, "f1": rouge2_f1},
    }



# Score two of the PDDL snippets against each other with the embedding-based
# BERTScore defined above and print the precision / recall / F1 dict.
score = bert_score(pddl_code2, pddl_code3)
print("BERTScore:", score)

# The METEOR and ROUGE variants are currently disabled.
# NOTE(review): the calls below reference `code1` / `code2`, which are not
# defined in the visible part of this notebook — substitute real snippet
# variables before re-enabling them.
# score = meteor_score_codebert(code1, code2)
# print("METEORScore:", score)

# score = rouge_score_codebert(code1, code2)
# print("ROUGE Score:", score)

BERTScore: {'precision': 0.94524723, 'recall': 0.94949126, 'f1': 0.9473645070175833}
