In [None]:
import os

# Local directory created up front (no error if it already exists).
# NOTE(review): `offload_dir` is not referenced by any later visible cell —
# presumably intended as the weight-offload folder for model loading; confirm.
offload_dir = "./offload"
os.makedirs(offload_dir, exist_ok=True)


In [None]:
# Interactive Hugging Face Hub login (prompts for an access token).
!huggingface-cli login

In [None]:
# Core stack: transformers, torch and accelerate are pinned; trl is left unpinned.
!pip install transformers==4.40.1 torch==2.0.1 accelerate==0.29.3 trl

In [None]:
# Extra dependencies imported below: peft, neo4j, node2vec (all unpinned).
!pip install peft neo4j node2vec

from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer
from neo4j import GraphDatabase
import networkx as nx
from node2vec import Node2Vec
import pandas as pd
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HF model
peft_model_id = "solanaO/llama3-8b-sft-qlora-re"

# Load Model with PEFT adapter.
# `device_map="auto"` already places every module (GPU / CPU / disk), so the
# model must NOT be moved again with `.to(device)`: that either does nothing
# or fails once some weights have been offloaded.
# `offload_folder` points at the directory created in the first cell so that
# weights that do not fit in memory have somewhere to go.
model = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map="auto",
  torch_dtype=torch.float16,
  offload_folder=offload_dir,
  offload_buffers=True
)

tokenizer = AutoTokenizer.from_pretrained(peft_model_id)

# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id =  tokenizer.eos_token_id

In [None]:
# Text-generation pipeline built from the `model` and `tokenizer` objects.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
# System prompt sent with every request. It is runtime text and is kept
# verbatim (including its spelling).
# NOTE(review): presumably this mirrors the prompt the adapter was fine-tuned
# with — confirm before rewording it.
system_message = """You are an experienced annontator.
Extract all entities and the relations between them from the following text.
Write the answer as a triple entity1|relationship|entitity2. Give exactly one set of triples and not more than that. This is very important.
Do not add anything else.
Example Text: Alice is from France.
Answer: Alice|is from|France.
"""

def create_input_prompt(sample):
    """Wrap ``sample`` in a two-turn chat structure.

    Args:
        sample: The text to place in the user turn.

    Returns:
        A dict with a single ``"messages"`` key holding the system turn
        (``system_message``) followed by the user turn.
    """
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": sample}
    return {"messages": [system_turn, user_turn]}


In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import pandas as pd

# Read the CSV file
# NOTE: despite its name, `csv_file_path` holds the object returned by
# pd.read_csv (the loaded table), not a path string.
csv_file_path = pd.read_csv('Your_file.csv') #Include the data file


In [None]:
# Alias the table loaded by pd.read_csv under the name `df`.
df = csv_file_path

In [None]:
# Store the text of the 'self_text' column in a list of strings.
# Missing values are dropped first: pandas represents them as float NaN,
# which would crash the string preprocessing applied to this list later.
sentence_list = df['self_text'].dropna().astype(str).tolist()


# Show the size and a small sample to verify (printing the whole list floods the output)
print(f"{len(sentence_list)} sentences loaded")
print(sentence_list[:5])

In [None]:
import nltk
# Fetch the WordNet corpus.
# NOTE(review): presumably required by the textblob lemmatizer used in the
# preprocessing cell — confirm.
nltk.download('wordnet')

In [None]:
import pandas as pd
import re
import string
from textblob import Word


# Define the contractions dictionary and the expand function.
# Keys are lowercase contractions; values are their expansions. Ambiguous
# contractions map to every reading separated by " / ".
contractions = {
    "ain't": "am not / are not / is not / has not / have not",
    "aren't": "are not / am not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'd've": "he would have",
    "he'll": "he shall / he will",
    "he'll've": "he shall have / he will have",
    "he's": "he has / he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how has / how is / how does",
    "i'd": "i had / i would",
    "i'd've": "i would have",
    "i'll": "i shall / I will",
    "i'll've": "i shall have / i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it had / it would",
    "it'd've": "it would have",
    "it'll": "it shall / it will",
    "it'll've": "it shall have / it will have",
    "it's": "it has / it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had / she would",
    "she'd've": "she would have",
    "she'll": "she shall / she will",
    "she'll've": "she shall have / she will have",
    "she's": "she has / she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as / so is",
    "that'd": "that would / that had",
    "that'd've": "that would have",
    "that's": "that has / that is",
    "there'd": "there had / there would",
    "there'd've": "there would have",
    "there's": "there has / there is",
    "they'd": "they had / they would",
    "they'd've": "they would have",
    "they'll": "they shall / they will",
    "they'll've": "they shall have / they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall / what will",
    "what'll've": "what shall have / what will have",
    "what're": "what are",
    "what's": "what has / what is",
    "what've": "what have",
    "when's": "when has / when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has / where is",
    "where've": "where have",
    "who'll": "who shall / who will",
    "who'll've": "who shall have / who will have",
    "who's": "who has / who is",
    "who've": "who have",
    "why's": "why has / why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had / you would",
    "you'd've": "you would have",
    "you'll": "you shall / you will",
    "you'll've": "you shall have / you will have",
    "you're": "you are",
    "you've": "you have"
}


def _compile_contraction_re(mapping):
    """Build one regex that matches any key of ``mapping`` as a whole word.

    Keys are tried longest-first because regex alternation takes the first
    alternative that matches: with dictionary order, "can't" would win over
    "can't've" and leave a dangling "'ve". Keys are escaped so they are
    always treated literally. Lookarounds are used instead of ``\\b`` because
    some keys start with an apostrophe ("'cause").
    """
    longest_first = sorted(mapping, key=len, reverse=True)
    alternation = '|'.join(re.escape(key) for key in longest_first)
    return re.compile(r'(?<!\w)(%s)(?!\w)' % alternation)


cont_re = _compile_contraction_re(contractions)

def expand(s, contractions=contractions):
    """Replace every known contraction in ``s`` with its expansion.

    Args:
        s: Input text. Matching is case-sensitive and all keys are lowercase,
            so the text should be lowercased first.
        contractions: Mapping used to look up the replacement of each match.
            The pattern itself (``cont_re``) is always built from the
            module-level dictionary.

    Returns:
        The text with each matched contraction substituted.
    """
    def replace(match):
        return contractions[match.group(0)]
    return cont_re.sub(replace, s)

# Function to remove emojis
# Compiled once at definition time instead of on every call.
# Only emoji/symbol blocks are listed. A single catch-all range from U+24C2
# to U+1F251 would also delete CJK, Hangul and many other scripts, while
# still missing the newer emoji that live above U+1F900.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F2FF"  # mahjong, cards, enclosed alphanumerics/ideographs
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002700-\U000027BF"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE00-\U0000FE0F"  # variation selectors
    "\U000024C2"             # circled M
    "]+",
    flags=re.UNICODE,
)

def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Characters outside the listed emoji blocks (including non-Latin scripts)
    are left untouched.
    """
    return _EMOJI_PATTERN.sub(r'', text)

# Function to apply all preprocessing steps
def preprocess_text(text):
    """Normalize one raw text for triple extraction.

    Steps, in order: lowercase, strip HTML tags, expand contractions, remove
    URLs, remove ASCII punctuation, turn newlines into spaces, remove emojis,
    lemmatize each whitespace-separated word.

    Args:
        text: Raw text. Non-string values (for example the float NaN pandas
            uses for missing cells) are treated as empty text.

    Returns:
        The cleaned text as a single line of space-separated words.
    """
    # Missing or non-text cells would otherwise crash on `.lower()`.
    if not isinstance(text, str):
        return ""
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<[^<]+?>', '', text)
    # Expand contractions (must run before punctuation removal strips the apostrophes)
    text = expand(text)
    # Remove URLs (must run before punctuation removal breaks them apart)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace new line characters with a space; deleting them outright would
    # glue the last word of one line to the first word of the next
    text = text.replace('\n', ' ')
    # Remove emojis
    text = remove_emojis(text)
    # Lemmatize words (split/join also collapses repeated whitespace)
    text = " ".join([Word(word).lemmatize() for word in text.split()])
    return text

# Apply preprocessing to each sentence
preprocessed_sentences = [preprocess_text(sentence) for sentence in sentence_list]

# Verify the results on a small sample (printing the whole corpus floods the output)
print(f"{len(preprocessed_sentences)} sentences preprocessed")
print(preprocessed_sentences[:5])


In [None]:
# Progress-bar dependency for the generation loop below.
!pip install tqdm

from tqdm import tqdm

# Ensure that the device is set to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# One raw pipeline output string per preprocessed sentence, in input order.
total_responses2 = []

for sentence in tqdm(preprocessed_sentences, desc="Processing sentences"):
  # Transform to conversational format
  test_dataset = create_input_prompt(sentence)

  # Render the system + user turns with the tokenizer's chat template and
  # append the generation prompt so the model answers as the assistant.
  prompt = pipe.tokenizer.apply_chat_template(test_dataset["messages"][:2],
                                              tokenize=False,
                                              add_generation_prompt=True)
  # Generate output using pipeline.
  # Decoding is greedy (do_sample=False). temperature / top_k / top_p only
  # apply when sampling, so passing them here was contradictory and had no
  # effect on the result; they are omitted.
  outputs = pipe(prompt,
                max_new_tokens=128,
                do_sample=False,
                )
  total_responses2.append(outputs[0]['generated_text'])


# Define the path to the file in Google Drive
file_path = '/content/drive/MyDrive/outputresponse.txt'

# Persist the list as its Python repr; a later cell parses this file back
# into a list.
with open(file_path, 'w') as file:
    file.write(repr(total_responses2))

In [None]:
def parse_response(response):
    """
    Turn one '|'-separated line into a list holding at most one triple.

    The line is stripped and split on '|'. When exactly three fields result,
    each field is stripped and every '.' is removed from the third one.

    Example: 'streets|were wet|this morning'
    gives:   [('streets', 'were wet', 'this morning')]

    Returns an empty list when the line does not have exactly three fields.
    """
    fields = [field.strip() for field in response.strip().split('|')]
    if len(fields) != 3:
        return []
    subject, relation, obj = fields
    return [(subject, relation, obj.replace('.', ''))]

def extract_assistant_responses(text):
    """
    Return the first non-empty line that follows the assistant marker.

    The marker is preferably a role-header line, i.e. a line that reads
    exactly 'assistant' once '<|...|>' special tokens are removed (e.g.
    '<|im_start|>assistant' or
    '<|start_header_id|>assistant<|end_header_id|>'). This keeps a user
    sentence that merely mentions the word "assistant" from being mistaken
    for the marker. If no such header exists, the first line containing
    'assistant' is used instead.

    Args:
    text (str): Multi-line string containing the rendered chat and the
        generated answer.

    Returns:
    str: The first non-blank line after the marker, or "" when there is no
        marker or nothing follows it.
    """
    import re  # local import keeps this cell runnable on its own

    lines = text.split('\n')

    def is_role_header(line):
        bare = re.sub(r'<\|[^|>]*\|>', '', line).strip().rstrip(':')
        return bare == 'assistant'

    marker_index = next(
        (i for i, line in enumerate(lines) if is_role_header(line)), None
    )
    if marker_index is None:
        # Fallback: any line that mentions the role name.
        marker_index = next(
            (i for i, line in enumerate(lines) if 'assistant' in line), None
        )
    if marker_index is None:
        return ""

    # Some chat templates put a blank line between the header and the answer.
    for line in lines[marker_index + 1:]:
        if line.strip():
            return line
    return ""


In [None]:
# Mount Google Drive again so this part can be run from a fresh session (Colab only).
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import ast

file_path = '/content/drive/MyDrive/outputresponse.txt'

# The file holds the repr() of a list of strings. ast.literal_eval parses
# Python literals only, so nothing in the file can be executed; plain eval()
# would run whatever code the file contains and must not be used here.
with open(file_path, 'r') as file:
    total_responses = ast.literal_eval(file.read())



In [None]:
# First answer line of every raw model output.
responses = [extract_assistant_responses(raw_output) for raw_output in total_responses]

# Keep only the first three '|'-separated fields of each line (the first
# triple) and parse them; each entry is a list with zero or one triple.
responses2 = [
    parse_response('|'.join(answer_line.split('|')[:3]))
    for answer_line in responses
]

# Flatten the per-response lists into one list of triples.
combined_list = [item for sublist in responses2 for item in sublist]

len(combined_list)

Import Libraries and Define Helper Functions

In [None]:
pip install node2vec

In [None]:
pip install openai

In [None]:
import spacy
import networkx as nx
from node2vec import Node2Vec
from transformers import GPT2Tokenizer, GPT2LMHeadModel, BertTokenizer, BertModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch
import openai

# Load pre-trained BERT tokenizer and model ('bert-base-uncased' checkpoint).
# Both objects are read as globals by get_bert_embedding.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embedding(text):
    """Embed ``text`` with BERT and return the first-position hidden vector.

    The text is tokenized (truncated to at most 512 tokens), run through
    ``bert_model`` without gradient tracking, and the last hidden state at
    sequence position 0 is returned as a NumPy array.

    NOTE(review): for a single string the result is presumably of shape
    (1, hidden_size) — confirm.
    """
    encoded = bert_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        last_hidden = bert_model(**encoded)['last_hidden_state']
    first_position = last_hidden[:, 0, :]
    return first_position.numpy()

def cosine_similarity_bert(s1, s2):
    """Return the cosine similarity between the BERT embeddings of two texts.

    Both texts are embedded with ``get_bert_embedding``; the score is the
    [0][0] entry of the pairwise cosine-similarity matrix.
    """
    first_embedding, second_embedding = get_bert_embedding(s1), get_bert_embedding(s2)
    return cosine_similarity(first_embedding, second_embedding)[0][0]

# Load Spacy model for NLP (small English pipeline).
# `nlp` is read as a global by map_nodes_to_sentences to tag entity tokens.
nlp = spacy.load("en_core_web_sm")


OpenAI API Setup

In [None]:
from openai import OpenAI

import getpass
import os

# Never paste the key into the notebook: it would be saved and shared with
# the file. Read it from the OPENAI_API_KEY environment variable, or ask for
# it interactively (the input is not echoed) when the variable is not set.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
)


Define Functions for Knowledge Graph and Embeddings

In [None]:
# Create a knowledge graph from the triples
def create_kg_from_triples(total_responses):
    """Build an undirected graph from the triples found in raw model outputs.

    For every output, the text after the first 'assistant\\n' marker is read
    line by line; each line that parses as ``entity1|relationship|entity2``
    adds an edge entity1 -- entity2 carrying ``relation=relationship``.

    Lines are parsed with ``parse_response`` so node names are cleaned the
    same way as in the rest of the notebook (whitespace stripped, periods
    removed from the object). Without this the trailing period the model
    tends to emit would create a separate node ("France." vs "France").

    Args:
        total_responses: Iterable of raw generated strings.

    Returns:
        networkx.Graph: The knowledge graph. Outputs without the marker and
        triples with an empty entity are skipped.
    """
    kg = nx.Graph()
    for response in total_responses:
        # partition never raises; a missing marker yields an empty `marker`.
        _, marker, generated = response.partition('assistant\n')
        if not marker:
            continue
        for line in generated.split('\n'):
            for entity1, relationship, entity2 in parse_response(line):
                # An empty name would become a meaningless hub node.
                if entity1 and entity2:
                    kg.add_edge(entity1, entity2, relation=relationship)
    return kg

# Generate Node2Vec embeddings for the graph
def generate_node2vec_embeddings(kg):
    """Fit Node2Vec on ``kg`` and collect one embedding per node.

    Uses 64 dimensions, walks of length 30, 200 walks and 2 workers, then
    fits with window=10, min_count=1 and batch_words=4.

    Returns:
        tuple: (dict mapping each node of ``kg`` to its vector looked up in
        the fitted model's ``wv``, the fitted model).
    """
    walker = Node2Vec(kg, dimensions=64, walk_length=30, num_walks=200, workers=2)
    fitted = walker.fit(window=10, min_count=1, batch_words=4)
    node_embeddings = {}
    for node in kg.nodes:
        node_embeddings[node] = fitted.wv[node]
    return node_embeddings, fitted


Define Functions for Query Processing and Answer Generation

In [None]:
# Modified function to get similar nodes with BERT embedding similarity
def get_similar_nodes_with_neighbors(query, node_embeddings, model, kg, top_n=10, threshold=0.35):
    """Select graph nodes relevant to ``query`` and expand them with neighbours.

    Stage 1: every node name is embedded with BERT and compared to the BERT
    embedding of the query; nodes scoring above ``threshold`` are ranked and
    the best ``top_n`` are kept as seeds.

    Stage 2: for each seed, the other nodes are ranked by cosine similarity
    of their Node2Vec vectors (``node_embeddings``); up to ``top_n`` not yet
    selected nodes scoring above ``threshold`` are added.

    Args:
        query: Query text.
        node_embeddings: Dict mapping node name to its Node2Vec vector.
        model: Unused; kept for interface compatibility.
        kg: Unused; kept for interface compatibility.
        top_n: Maximum number of seeds, and of neighbours added per seed.
        threshold: Minimum cosine similarity in both stages.

    Returns:
        list: Selected node names, seeds first (best match first), followed
        by neighbours in the order they were added.
    """
    query_embedding = get_bert_embedding(query)

    # Stage 1: text similarity between the query and each node name.
    similarities = {}
    for node in node_embeddings:
        node_embedding = get_bert_embedding(node)
        similarity = cosine_similarity(query_embedding, node_embedding)[0][0]
        if similarity > threshold:
            similarities[node] = similarity

    sorted_similarities = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    similar_nodes = [node for node, sim in sorted_similarities[:top_n]]

    # Stage 2: structural similarity in Node2Vec space.
    ordered_nodes = list(similar_nodes)   # deterministic result order
    expanded_nodes = set(similar_nodes)   # fast membership checks
    for node in similar_nodes:
        node_embedding = node_embeddings[node].reshape(1, -1)
        neighbor_similarities = {}
        for other_node, other_emb in node_embeddings.items():
            if other_node in expanded_nodes:
                continue
            # Plain cosine similarity as a scalar. `1 - cosine_similarity`
            # is a distance: thresholding and ranking it from high to low
            # would select the LEAST related nodes.
            similarity = cosine_similarity(node_embedding, other_emb.reshape(1, -1))[0][0]
            if similarity > threshold:
                neighbor_similarities[other_node] = similarity
        sorted_neighbor_similarities = sorted(neighbor_similarities.items(), key=lambda item: item[1], reverse=True)
        for neighbor, sim in sorted_neighbor_similarities[:top_n]:
            expanded_nodes.add(neighbor)
            ordered_nodes.append(neighbor)

    return ordered_nodes

def prompt_(prompt):
    """Send ``prompt`` as the user turn to gpt-3.5-turbo and return the reply.

    The request goes through the module-level ``client`` with a fixed system
    message; the content of the first choice is returned with surrounding
    whitespace stripped.
    """
    chat_messages = [
        {"role": "system", "content": "You are an experienced causal reasoner."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=chat_messages,
    )
    reply = completion.choices[0].message.content
    return reply.strip()


Map Nodes to Sentences

In [None]:
# Map nodes to sentences
def map_nodes_to_sentences(nodes, preprocessed_sentences):
    """Map each selected node to the sentences that mention it as an entity.

    Each sentence is processed with the spaCy pipeline ``nlp``. Candidate
    mentions are the individual entity-tagged tokens and the full entity
    spans (``doc.ents``), so multi-word nodes can match too. A mention
    counts when its text equals a node name.

    Args:
        nodes: Iterable of node names to look for.
        preprocessed_sentences: Iterable of sentence strings.

    Returns:
        dict: node name -> list of sentences mentioning it. A sentence is
        listed at most once per node for each time it occurs in the input.
    """
    node_lookup = set(nodes)  # constant-time membership instead of a list scan
    node_sentence_map = {}
    for sentence in preprocessed_sentences:
        doc = nlp(sentence)
        mentions = [token.text for token in doc if token.ent_type_]
        mentions.extend(ent.text for ent in doc.ents)
        # dict.fromkeys removes repeated mentions while keeping their order,
        # so a sentence naming the same entity twice is not added twice.
        for entity in dict.fromkeys(mentions):
            if entity in node_lookup:
                node_sentence_map.setdefault(entity, []).append(sentence)
    return node_sentence_map


Generate Answers with KG RAG

In [None]:
# Generate answers using KG RAG with updated get_similar_nodes_with_neighbors
def generate_answers_with_kg_rag(queries, kg, node_embeddings, node2vec_model, preprocessed_sentences, tokenizer, model):
    """Answer each query using sentences retrieved through the knowledge graph.

    For every query: select relevant nodes, collect the sentences that
    mention them, join those sentences into a context and ask the chat model
    (via ``prompt_``) for the most probable cause.

    Args:
        queries: Iterable of query strings.
        kg: Knowledge graph, forwarded to the node selection step.
        node_embeddings: Dict mapping node name to its Node2Vec vector.
        node2vec_model: Fitted Node2Vec model, forwarded to node selection.
        preprocessed_sentences: Sentences available as context.
        tokenizer: Unused; kept for interface compatibility.
        model: Unused; kept for interface compatibility.

    Returns:
        list: One answer string per query, in input order.
    """
    kg_rag_answers = []
    for query in queries:
        similar_nodes = get_similar_nodes_with_neighbors(query, node_embeddings, node2vec_model, kg)
        node_sentence_map = map_nodes_to_sentences(similar_nodes, preprocessed_sentences)
        # A sentence mentioning several selected nodes appears once per node
        # in the map; keep the first occurrence only so the prompt is not
        # padded with repeated text.
        kg_context_sentences = list(dict.fromkeys(
            sentence for sentences in node_sentence_map.values() for sentence in sentences
        ))
        kg_context = " ".join(kg_context_sentences)

        prompt = f"You are an experienced causal reasoner. Look at the query and infer cause for the query based on the provided context. Don't consider anything else. And don't say anything else, just provide the most probable cause. Query: {query} Context: {kg_context}"

        answer = prompt_(prompt)
        kg_rag_answers.append(answer)
        print(f"KG Context: '{kg_context}'")
        print(f"KG RAG Answer for query '{query}': {answer}")
    return kg_rag_answers


Main Function to Run the Process

In [None]:
# Main function to run the entire process
def main(total_responses, queries, preprocessed_sentences):
    """Run the pipeline: build the graph, embed it, answer the queries.

    Args:
        total_responses: Raw model outputs containing the extracted triples.
        queries: Iterable of query strings.
        preprocessed_sentences: Sentences used as retrieval context.

    Returns:
        list: The answers, one per query (also printed).
    """
    # Create KG from triples
    kg = create_kg_from_triples(total_responses)

    # Generate Node2Vec embeddings
    node_embeddings, node2vec_model = generate_node2vec_embeddings(kg)

    # Generate answers using KG RAG. `tokenizer` and `model` are the
    # notebook-level globals; the callee accepts them but does not use them.
    kg_rag_answers = generate_answers_with_kg_rag(queries, kg, node_embeddings, node2vec_model, preprocessed_sentences, tokenizer, model)

    print(kg_rag_answers)
    # Returned as well, so callers can use the answers programmatically.
    return kg_rag_answers


# Define the queries
# The list is empty as shipped: fill in the query strings before running.
queries = [

    # Add other queries here...
]

# Run the main function on the raw model outputs, the queries above and the
# preprocessed sentences produced earlier in the notebook.
main(total_responses, queries, preprocessed_sentences)
