In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer

from datasets import Dataset, DatasetDict
import pandas as pd
import json 
import re
import numpy as np

import os
from getpass import getpass

# Never store access tokens in the notebook: anything committed here is leaked
# (a token that has ever been saved in a notebook should be revoked).
# Use the HUGGINGFACE_TOKEN already present in the environment, and only
# prompt interactively (without echoing the input) when it is missing.
if not os.environ.get("HUGGINGFACE_TOKEN"):
    os.environ["HUGGINGFACE_TOKEN"] = getpass("Hugging Face access token: ")

In [2]:
# Log in to the Hugging Face Hub via the CLI, passing the token held in the
# HUGGINGFACE_TOKEN environment variable (expanded by the shell).
!huggingface-cli login --token $HUGGINGFACE_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [6]:
# Load the email data set. The encoding is pinned to UTF-8 (the encoding JSON
# files are written in) so that non-ASCII characters are decoded the same way
# on every platform instead of depending on the locale default.
with open('email_data_set.json', 'r', encoding='utf-8') as file:
    email_data = json.load(file)

# Load the scraped data with the same explicit encoding.
with open('scraped_data.json', 'r', encoding='utf-8') as file:
    scraped_data = json.load(file)

In [7]:
def get_email_thread(email_id):
    """Flatten one email conversation into a single text plus its labels.

    Looks up ``email_data[email_id]['conversation']`` (module-level data) and
    walks its messages in order.

    Args:
        email_id: Key of the conversation inside ``email_data``.

    Returns:
        A ``(input_text, labels)`` tuple. ``input_text`` holds one line per
        message whose direction is 'incoming', 'outgoing' or 'forwarded',
        each prefixed with the matching sender label; messages with any
        other direction contribute no text. ``labels`` holds the 'category'
        of every message, including those that contributed no text.
    """
    sender_prefix = {
        'incoming': "Customer Email: ",
        'outgoing': "Customer Service Email: ",
        'forwarded': "Third Party Email: ",
    }

    lines = []
    labels = []
    for message in email_data[email_id]['conversation']:
        labels.append(message['category'])
        prefix = sender_prefix.get(message['direction'])
        if prefix is not None:
            lines.append(prefix + message['body'] + "\n")

    return "".join(lines), labels

In [8]:
def clean_markdown(text):
    """Strip markdown images/links from ``text`` and collapse its whitespace.

    Args:
        text: Markdown content; anything that is not a ``str`` is accepted
            and treated as "no content".

    Returns:
        The text with ``![alt](url)`` images and ``[label](url)`` links
        removed entirely (label included) and every run of whitespace
        replaced by a single space, or ``""`` for non-string input.
    """
    if isinstance(text, str):
        # Images must go first: their pattern contains the link pattern.
        without_images = re.sub(r'!\[.*?\]\(.*?\)', '', text)
        without_links = re.sub(r'\[.*?\]\(.*?\)', '', without_images)
        # split() with no argument drops newlines, tabs and repeated spaces.
        return ' '.join(without_links.split())
    return ""

# Store a cleaned copy of every entry's markdown under "cleaned_markdown".
# Entries whose "markdown" value is missing or not a string get "".
for entry in scraped_data:
    markdown_content = entry.get("markdown", "")
    entry["cleaned_markdown"] = (
        clean_markdown(markdown_content) if isinstance(markdown_content, str) else ""
    )


In [28]:
def extract_contact_form_info(query):
    """Pull the labelled contact-form fields out of a raw query string.

    Each field is located by its German form label. Most fields capture only
    the first whitespace-delimited token after the label; the address runs up
    to the following "E-Mail:" label and the message takes everything after
    "Mitteilung:" (including newlines).

    Args:
        query: Raw text of the contact-form submission.

    Returns:
        dict mapping "event", "first_name", "last_name", "birth_date",
        "address", "email", "phone" and "message" to the stripped captured
        text, or to ``None`` when the label was not found.
    """
    # (field name, regex with one capture group, regex flags)
    field_patterns = (
        ("event", r'Veranstaltung:\s*([^\s]+)', 0),
        ("first_name", r'Vorname:\s*([^\s]+)', 0),
        ("last_name", r'Nachname:\s*([^\s]+)', 0),
        ("birth_date", r'Geburtsdatum:\s*([^\s]+)', 0),
        ("address", r'Adresse:\s*(.+?)\s*E-Mail:', 0),
        ("email", r'E-Mail:\s*([^\s]+)', 0),
        ("phone", r'Telefon:\s*([^\s]+)', 0),
        ("message", r'Mitteilung:\s*(.*)', re.DOTALL),
    )

    extracted_info = {}
    for field, pattern, flags in field_patterns:
        found = re.search(pattern, query, flags)
        extracted_info[field] = found.group(1).strip() if found else None
    return extracted_info

In [14]:
import faiss
from sentence_transformers import SentenceTransformer

# Load a pre-trained Sentence Transformer model for embedding generation
sentence_model = SentenceTransformer('all-mpnet-base-v2')  # You can choose other models as well

# Fail before the expensive encoding step if there is nothing to index
assert len(scraped_data) > 0, "No embeddings found. Ensure embeddings are properly generated."

# Encode all cleaned documents in a single batched call (much faster than one
# encode() call per document) and force float32, the dtype FAISS indexes expect.
embeddings = np.asarray(
    sentence_model.encode([entry['cleaned_markdown'] for entry in scraped_data]),
    dtype=np.float32,
)

# Keep each document's vector on its entry, as before
for entry, embedding in zip(scraped_data, embeddings):
    entry['embedding'] = embedding

# Embedding dimension = number of columns of the (n_docs, dim) matrix
dimension = embeddings.shape[1]

# Create FAISS index for L2 (Euclidean) similarity and add all vectors;
# row i of the index corresponds to scraped_data[i]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print("FAISS index created with the embeddings.")

FAISS index created with the embeddings.


In [171]:
# Pick the compute device first: GPU when CUDA is available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint used for the local tokenizer and causal language model
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model weights and send the model to the selected device
generator = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [176]:
import requests
import time
# Hosted inference endpoint used for response generation
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"

# The bearer token is read from the HUGGINGFACE_TOKEN environment variable so
# that no secret is stored in the notebook. Without a token no Authorization
# header is sent and the API's own error response is surfaced to the caller.
_hf_token = os.environ.get("HUGGINGFACE_TOKEN", "")
headers = {"Authorization": f"Bearer {_hf_token}"} if _hf_token else {}

def send_to_huggingface_api(payload, timeout=120):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON body of the response. HTTP error statuses are not
        raised, because the API reports problems (e.g. a model that is still
        loading) as a JSON body the caller inspects. Transport failures and
        non-JSON bodies are reported in the same ``{"error": ...}`` shape
        instead of raising.
    """
    try:
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        return {"error": f"Request to Hugging Face API failed: {exc}"}

    try:
        return response.json()
    except ValueError:
        # requests' JSON decode error is a ValueError subclass
        return {"error": f"Non-JSON response from Hugging Face API (HTTP {response.status_code})."}

def send_with_retry(payload, max_retries=10, wait_time=30):
    """Send ``payload`` to the API, retrying while the model is still loading.

    Args:
        payload: Request body forwarded to ``send_to_huggingface_api``.
        max_retries: Maximum number of attempts.
        wait_time: Fixed number of seconds to sleep between attempts.

    Returns:
        The first response that is not a "model is loading" error (this may
        itself be a different error dict), or
        ``{"error": "Model failed to load after multiple retries."}`` once
        ``max_retries`` attempts were all answered with a loading error.
    """
    for _ in range(max_retries):
        response = send_to_huggingface_api(payload)

        # Successful generations are not dicts with an 'error' key, and the
        # error value is stringified so an unexpected type cannot raise here.
        is_loading = (
            isinstance(response, dict)
            and 'loading' in str(response.get('error', ''))
        )
        if not is_loading:
            return response

        # 'estimated_time' is optional in the error body; do not assume it.
        estimated = response.get('estimated_time', 'unknown')
        print(f"Model is loading, retrying in {wait_time} seconds... ({estimated} seconds estimated)")
        time.sleep(wait_time)

    return {"error": "Model failed to load after multiple retries."}

In [209]:
# The RAG pipeline function
def rag_pipeline(query, sentence_model, index, generator, tokenizer, scraped_data):
    """Answer a customer query with retrieval-augmented generation.

    The message is extracted from the contact-form text, the closest scraped
    documents are retrieved from the FAISS index, and a prompt built from both
    is sent to the hosted model through ``send_with_retry``.

    Args:
        query: Raw query text, optionally in contact-form layout with a
            "Mitteilung:" field.
        sentence_model: Encoder whose ``encode`` turns text into a vector.
        index: FAISS index whose row ``i`` corresponds to ``scraped_data[i]``.
        generator: Unused here (generation goes through the hosted API);
            kept so existing calls keep working.
        tokenizer: Unused here; kept for the same reason.
        scraped_data: Sequence of dicts providing a 'cleaned_markdown' text.

    Returns:
        The generated email text, or a string starting with ``"Error: "``
        when the API reports an error or returns an unexpected response.
    """
    # Step 1: Parse the query and extract contact form information
    contact_info = extract_contact_form_info(query)

    # Queries that are not in contact-form layout have no "Mitteilung:" field;
    # fall back to the whole query instead of encoding None.
    cleaned_query = contact_info.get("message") or query

    # Step 2: Encode the cleaned query and retrieve relevant documents
    query_embedding = sentence_model.encode(cleaned_query)

    # Retrieve at most 3 documents, never asking for more than the index holds
    k = min(3, getattr(index, "ntotal", 3))
    retrieved_docs = []
    if k > 0:
        _, indices = index.search(np.array([query_embedding]), k)
        # FAISS pads missing neighbours with -1, which would otherwise
        # silently select the last document.
        retrieved_docs = [scraped_data[i] for i in indices[0] if i >= 0]

    # Step 3: Combine retrieved documents into a single context
    # (each document is truncated to keep the prompt size bounded)
    context = " ".join([doc['cleaned_markdown'][:1500] for doc in retrieved_docs])

    # Step 4: Create a prompt with an email structure
    prompt = f"""
    The user has the following query: "{cleaned_query}"
    Use the context provided below to craft a professional response as an email.
    
    Context:
    {context}
    
    Email format:
    - Start with a greeting (e.g., "Dear [name],")
    - Include a clear response addressing the user's query.
    - End with a professional sign-off (e.g., "Best regards, \nYour DataSport Team" or "Sincerely, \nYour DataSport Team").
    - No more text after this.

    Answer:
    """

    # Send the prompt to Hugging Face API with retry logic
    payload = {
       "inputs": prompt,
       "parameters": {
           "max_new_tokens": 500,  # Specify the number of new tokens to generate
           "stop": ["Best regards, \nYour DataSport Team"],
           "temperature": 0.9,
       }
    }
    api_response = send_with_retry(payload)

    # Errors are reported as a dict with an 'error' key
    if isinstance(api_response, dict) and 'error' in api_response:
       return f"Error: {api_response['error']}"

    # Extract the generated text from the API response
    try:
        generated_text = api_response[0]['generated_text']
    except (KeyError, IndexError, TypeError):
        return f"Error: Unexpected API response: {api_response!r}"

    # The API may echo the prompt before the completion; strip it only when
    # it is really there so a bare completion is not truncated.
    if generated_text.startswith(prompt):
        generated_text = generated_text[len(prompt):]

    return generated_text

In [210]:
# Example usage of the RAG pipeline
# Note: the slashes are written as plain "/" — the JSON-style "\/" escape is not
# a valid Python escape sequence and would leave stray backslashes in the query.
query = "Mitteilung: Hello there is a problem with my data sport account. I believe you still have an old email address linked to it (<EMAIL_1>). This email address is no longer valid. I have not received confirmation of my number/ entry for Saturday's race (Davos X trails) nor have I received information/ code for the Swiss runners ticket. Please can you contact me urgently to resolve this. Thank you. Best regards, <FRIST_NAME_2> IP: <IP_ADDRESS_1>"

# Generate the response
generated_answer = rag_pipeline(query, sentence_model, index, generator, tokenizer, scraped_data)
print(generated_answer)


 Dear <FRIST_NAME_2>,
     We apologize for the inconvenience you're experiencing with your DataSport account. We've found that the email address linked to your account is no longer valid, which may have caused you to miss important updates, such as confirmation of your entry and information about your Swiss runners ticket. We're on it! Our team is working to resolve this issue as soon as possible. We will contact you shortly to confirm the updated information. If you have any further questions or concerns, feel free to reach out. We appreciate your patience and cooperation in this matter.
     Best regards,
     Your DataSport Team. IP: <IP_ADDRESS_1> ""


    Note: The user's first name and IP address will be filled in automatically. Also, the email address should be removed from the response as it's no longer valid.


In [1]:
from rag_pipeline import get_email


  from tqdm.autonotebook import tqdm, trange


In [3]:
# Example input that is not in contact-form layout (it has no "Mitteilung:"
# field): an anonymised job-recommendation newsletter.
query = "<FIRST_NAME_1>, here are our recommendations based on what you viewed - \t?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? ?? Not rendering correctly or images not showing? View in a webpage <URL_1> Jobs you might be interested in <URL_2> And if you need a little help putting together the perfect application, check out our various tips and templates ? (German French only) This email was intended for <NAME_1> <EMAIL_1>. You are receiving our recommendations by email based on what you have been browsing on JobScout24 and on your communication preferences. You can change or stop receiving these emails by opting out in your application profile. Unsubscribe from this e-mail | Help | Privacy Policy Copyright \u00a9 2023 | JobCloud AG, JobScout24.ch, <ADDRESS_1>"

# get_email comes from the local rag_pipeline module (not shown here);
# presumably it runs the full pipeline and prints the answer — verify there.
get_email(query)

Generated Answer:
  Dear [FIRST_NAME_1],

     Thank you for your interest in our services. We understand that you are looking for the perfect application tips and templates to help you with your job search. As a valued user, we are happy to provide you with some recommendations based on your browsing history on JobScout24.

     Please find attached the link to view our recommendations in a webpage format: [URL_1]. Additionally, you may be interested in exploring the jobs we have curated specifically for you at [URL_2].

     If you need any assistance with creating a standout application, our various tips and templates for German and French applications are available for you to review.

     This email was sent to you based on your browsing activity on JobScout24 and your communication preferences. If you would like to change or stop receiving these emails, please visit your application profile to opt out.

     We hope you find these recommendations helpful in your job search. If yo