In [1]:
%pip install numpy
%pip install transformers torch
%pip install sentence_transformers
%pip install accelerate




Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import numpy as np
from typing import List, Dict, Any
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from sentence_transformers import SentenceTransformer
import json
from datetime import datetime

class Document:
    """A piece of text content paired with a metadata dictionary."""

    def __init__(self, content: str, metadata: Dict[str, Any] = None):
        """Store the content and its metadata.

        Args:
            content: Text of the document.
            metadata: Mapping of extra information about the document. A
                missing or empty value is replaced by a new empty dict.
        """
        # Any falsy value (None, {}) ends up as a brand-new empty dict, so
        # instances never share a default metadata object.
        if metadata:
            self.metadata = metadata
        else:
            self.metadata = {}
        self.content = content
from sentence_transformers import SentenceTransformer

class EmbeddingModel:
    """Thin wrapper around a SentenceTransformer used to embed text."""

    def __init__(self, model_name: str = "jinaai/jina-embeddings-v2-base-en"):
        """Load the sentence-transformers model identified by ``model_name``."""
        # NOTE(review): the Jina v2 model card appears to load this checkpoint
        # with trust_remote_code=True - confirm whether that is needed here.
        encoder = SentenceTransformer(model_name)
        self.model = encoder

    def encode(self, texts: List[str]) -> np.ndarray:
        """Embed ``texts`` with the wrapped model.

        Args:
            texts: Strings to embed.

        Returns:
            Whatever the wrapped model's ``encode`` returns when called with
            ``normalize_embeddings=True``.
        """
        embeddings = self.model.encode(texts, normalize_embeddings=True)
        return embeddings

class DocumentRetriever:
    """Ranks a fixed set of documents against a query by embedding similarity.

    Every document's content is embedded once at construction time; each call
    to ``retrieve`` embeds only the query.
    """

    def __init__(self, documents: List["Document"], embedding_model: "EmbeddingModel"):
        """Embed every document's content up front.

        Args:
            documents: Objects exposing a ``content`` string.
            embedding_model: Object whose ``encode(list_of_str)`` returns a
                2-D array with one row per input string.
        """
        self.documents = documents
        self.embedding_model = embedding_model
        self.document_embeddings = self.embedding_model.encode([doc.content for doc in documents])

    def retrieve(self, query: str, k: int = 3) -> List["Document"]:
        """Return up to ``k`` documents, most similar to ``query`` first.

        Similarity is the dot product between the query embedding and each
        document embedding.

        Args:
            query: Text to search for.
            k: Maximum number of documents to return. Values larger than the
                number of stored documents return all of them.

        Returns:
            A list of documents ordered from highest to lowest similarity;
            empty when ``k`` is not positive or no documents are stored.
        """
        # Without this guard the ``[-k:]`` slice below stops limiting the
        # result: k == 0 would return every document, and a negative k would
        # return an unrelated slice. An empty store has nothing to rank.
        if k <= 0 or not self.documents:
            return []

        query_embedding = self.embedding_model.encode([query])
        # One query row against the document matrix -> one score per document.
        similarities = (query_embedding @ self.document_embeddings.T)[0]
        # argsort is ascending: keep the last k positions and flip them so the
        # best match comes first.
        top_k_indices = similarities.argsort()[-k:][::-1]
        return [self.documents[i] for i in top_k_indices]

class PromptConstructor:
    """Builds the final prompt text from a system prompt, a template and retrieved documents."""

    def __init__(self, system_prompt: str, template: str):
        """Remember the system prompt and the template.

        Args:
            system_prompt: Text placed at the very start of every prompt.
            template: ``str.format`` template that is filled with the keyword
                fields ``context`` and ``query``.
        """
        self.template = template
        self.system_prompt = system_prompt

    def construct(self, query: str, retrieved_docs: List[Document]) -> str:
        """Assemble the prompt for ``query``.

        The contents of ``retrieved_docs`` are joined with blank lines to form
        the context, the template is filled in, and the result is appended to
        the system prompt after a blank line.
        """
        doc_texts = [doc.content for doc in retrieved_docs]
        filled_template = self.template.format(
            query=query,
            context="\n\n".join(doc_texts),
        )
        return "{}\n\n{}".format(self.system_prompt, filled_template)

class LanguageModel:
    """Causal language model plus its tokenizer, used to turn a prompt into text."""

    def __init__(self, model_name: str = "meta-llama/Meta-Llama-3-8B"):
        """Load the tokenizer and the model for ``model_name``.

        The model is loaded with float16 weights and ``device_map="auto"``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

    def generate(self, prompt: str, max_new_tokens: int = 100, temperature: float = 0.7,
                 return_full_text: bool = False) -> str:
        """Sample a completion for ``prompt``.

        Args:
            prompt: Text to complete.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature; sampling is always enabled, so
                this must be a positive number.
            return_full_text: When True, the decoded text also contains the
                prompt; by default only the newly generated part is returned.

        Returns:
            The decoded text with special tokens removed.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature,
                # Passing the pad token explicitly avoids the per-call
                # "Setting `pad_token_id` to `eos_token_id`" warning.
                pad_token_id=self.tokenizer.eos_token_id,
            )

        generated_ids = outputs[0]
        if not return_full_text:
            # The returned sequence starts with the prompt tokens; drop them so
            # callers do not get the whole prompt echoed back in the answer.
            prompt_length = inputs["input_ids"].shape[-1]
            generated_ids = generated_ids[prompt_length:]
        return self.tokenizer.decode(generated_ids, skip_special_tokens=True)

class SDGScorer:
    """Keeps a running per-user, per-SDG total of action scores."""

    def __init__(self):
        # Layout: {user_id: {sdg_number: accumulated_score}}
        self.scores = {}

    def score_action(self, user_id: str, action: str, sdg: int) -> float:
        """Score one action and add it to the user's total for ``sdg``.

        The score is the length of ``action`` divided by 100.

        Returns:
            The score of this single action (not the running total).
        """
        points = len(action) / 100  # simple length-based scoring
        per_sdg = self.scores.setdefault(user_id, {})
        per_sdg[sdg] = per_sdg.get(sdg, 0) + points
        return points

    def get_user_score(self, user_id: str) -> Dict[int, float]:
        """Return the user's ``{sdg: total}`` mapping, or an empty dict if the user is unknown."""
        try:
            return self.scores[user_id]
        except KeyError:
            return {}

class RAG:
    """Retrieval-augmented generation pipeline with simple SDG action scoring.

    Documents are added with ``add_documents``; ``query`` then retrieves the
    closest documents, builds a prompt from them and asks the language model.
    """

    def __init__(self, embedding_model: "EmbeddingModel", language_model: "LanguageModel",
                 system_prompt: str, prompt_template: str):
        """Wire the pipeline together.

        Args:
            embedding_model: Used to embed documents and queries.
            language_model: Object whose ``generate(prompt)`` returns text.
            system_prompt: Text placed before every constructed prompt.
            prompt_template: Template handed to ``PromptConstructor``.
        """
        self.embedding_model = embedding_model
        self.language_model = language_model
        self.documents = []
        # Stays None until the first non-empty add_documents() call.
        self.document_retriever = None
        self.prompt_constructor = PromptConstructor(system_prompt, prompt_template)
        self.sdg_scorer = SDGScorer()

    def add_documents(self, documents: List["Document"]):
        """Append ``documents`` and rebuild the retriever over the full collection.

        An empty ``documents`` argument is a no-op, so the (re-embedding)
        retriever rebuild is skipped when there is nothing new to index.
        """
        new_documents = list(documents)
        if not new_documents:
            return
        self.documents.extend(new_documents)
        # The retriever is rebuilt from scratch over every stored document.
        self.document_retriever = DocumentRetriever(self.documents, self.embedding_model)

    def query(self, query: str, k: int = 3) -> str:
        """Answer ``query`` using the ``k`` most similar stored documents.

        Raises:
            RuntimeError: If no documents have been added yet.
        """
        if self.document_retriever is None:
            # Previously this surfaced as an AttributeError on None.
            raise RuntimeError("No documents have been added; call add_documents() before query().")
        retrieved_docs = self.document_retriever.retrieve(query, k)
        prompt = self.prompt_constructor.construct(query, retrieved_docs)
        return self.language_model.generate(prompt)

    def score_sdg_action(self, user_id: str, action: str, sdg: int) -> float:
        """Score ``action`` for ``user_id`` under goal ``sdg`` and return that score."""
        return self.sdg_scorer.score_action(user_id, action, sdg)

    def get_user_sdg_score(self, user_id: str) -> Dict[int, float]:
        """Return the accumulated ``{sdg: score}`` mapping for ``user_id``."""
        return self.sdg_scorer.get_user_score(user_id)

# Load and process JSON files
def load_json_documents(file_paths):
    """Build ``Document`` objects from the ``docs`` entries of JSON files.

    Each file is parsed as UTF-8 JSON whose first top-level element holds a
    ``docs`` list. Every entry must have a ``content`` field; its ``metadata``
    object and the ``title`` / ``description`` fields inside it are optional
    and default to empty strings.

    Args:
        file_paths: Iterable of paths to JSON files.

    Returns:
        A list of ``Document`` objects in file order, each with ``source``
        (the file path), ``title`` and ``description`` metadata.
    """
    documents = []
    for file_path in file_paths:
        # Parse inside the context manager, then release the file handle
        # before building documents.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # NOTE(review): only the first top-level element is read - confirm the
        # exported files never carry more than one.
        for doc in data[0]['docs']:
            # A missing or null "metadata" object is treated like an empty
            # one, consistent with title/description already being optional.
            raw_metadata = doc.get('metadata') or {}
            metadata = {
                'source': file_path,
                'title': raw_metadata.get('title', ''),
                'description': raw_metadata.get('description', ''),
            }
            documents.append(Document(doc['content'], metadata))
    return documents

# Usage example

# Prompt pieces: the system prompt is placed before every constructed prompt and
# the template is filled with the retrieved context and the user's query.
system_prompt = """
You are an AI assistant for BARBAH Games, an organization focused on promoting esports in East Africa and integrating Sustainable Development Goals (SDGs) into gaming. You have expertise in esports, SDGs, and BARBAH Games' initiatives. Provide informative and engaging responses to user queries, always emphasizing the connection between gaming, education, and sustainable development.
"""

prompt_template = """
Context:
{context}

User Query: {query}

Please provide a response based on the context, your expertise in esports and SDGs, and BARBAH Games' initiatives. If relevant, suggest ways the user can contribute to SDGs through gaming or esports activities.
"""

# Models: the embedding model uses its default checkpoint, the language model is named explicitly.
embedding_model = EmbeddingModel()
language_model = LanguageModel(model_name="meta-llama/Meta-Llama-3-8B")

rag_system = RAG(
    embedding_model=embedding_model,
    language_model=language_model,
    system_prompt=system_prompt,
    prompt_template=prompt_template,
)

# Add existing documents to the RAG system
# Directory that holds the source text files; kept in one place so every
# "source" entry below is built the same way.
TXT_SOURCE_DIR = "C:/Users/22837352/OneDrive - The University of Northampton/Work/BARBAH Games/Business/MVP/Chatbot/RAG-Architecture/data/Txt"

# (content, source file name) pairs.
# NOTE(review): every content string ends in "..." - these look like truncated
# excerpts rather than the full file text; confirm that is intended.
_existing_document_specs = [
    ("BARBAH GAMES TOKENIZING GAMING FOR REAL-WORLD CAREERS IN WEB3...", "BARBAHGAMES Deck018 2.txt"),
    ("SUSTAINABLE DEVELOPMENT GOALS Progress Chart 2023...", "SUSTAINABLE DEVELOPMENT GOALS Progress-Chart-2023.txt"),
    ("BARBAH GAMES ESPORTS : A CATALYST FOR GLOBAL ESG INITIATIVES...", "VISION DECK.txt"),
    ("FOR THE NEXT GENERATION FY22 NIKE, Inc. Impact Report...", "FY22-NIKE,-Inc.-Impact-Report.txt"),
    ("Driving Engagement by gamifying SDG Quests At BARBAH Games, we are committed to more than just gaming...", "SDG QUESTS.txt"),
    # The original source path for this entry was missing the "/" after "Txt".
    ("Skills for a Sustainable Future: How Green and Digital Skills Intersect...", "Skills for a Sustainable Future How Green and Digital Skills Intersect.txt"),
    ("Corporate Sponsorship Model Overview...", "Corporate Sponsorship Tiers.txt"),
    ("BARBAH Games Essentials Copy Elevate Your Game with BARBAH Games Essentials!...", "BARBAH GAMERS ESSENTIALS.txt"),
    ("Promote Esports in East Africa July 12, 2024 www.barbahgames.com...", "BARBAH_GAMES_GPT_Conversation.txt"),
    ("Gaming Mentor EA Supreme Instructions Overview...", "BARBAH_GAMES_GPT_Instructions.txt"),
]

existing_documents = [
    Document(content, {"source": f"{TXT_SOURCE_DIR}/{file_name}"})
    for content, file_name in _existing_document_specs
]

rag_system.add_documents(existing_documents)

# Add new documents from JSON files
# Directory that holds the scraped website exports.
WEBSITE_JSON_DIR = r"C:\Users\22837352\OneDrive - The University of Northampton\Work\BARBAH Games\Business\MVP\Chatbot\RAG-Architecture\data\Websites Info\22-02-2024"

# One entry per file. Every element needs its trailing comma: adjacent string
# literals without one are silently concatenated into a single (invalid) path,
# which is what happened to the last five files before.
WEBSITE_JSON_NAMES = [
    "barbah games.com.json",
    "barbah games.gg.json",
    "epic games.json",
    "forwardfaster global compacts.json",
    "prophix esg report.json",
    "un global compact.json",
    "un sdg learn.json",
]

json_file_paths = [WEBSITE_JSON_DIR + "\\" + file_name for file_name in WEBSITE_JSON_NAMES]

new_documents = load_json_documents(json_file_paths)
rag_system.add_documents(new_documents)

# Main loop
def _read_sdg_number() -> int:
    """Prompt until the user enters a whole number from 1 to 17 and return it."""
    while True:
        raw_value = input("Enter the SDG number (1-17): ")
        try:
            sdg = int(raw_value)
        except ValueError:
            sdg = None
        if sdg is not None and 1 <= sdg <= 17:
            return sdg
        print("Please enter a whole number between 1 and 17.")


def main():
    """Interactive loop: answer a query, then record and report one SDG action.

    Reads queries from standard input until the user types ``quit`` (case and
    surrounding whitespace are ignored). After each answer the user is asked
    for an SDG action and goal number, which are scored through the
    module-level ``rag_system`` and printed together with the running totals.
    """
    user_id = "example_user"
    while True:
        query = input("Enter your query (or 'quit' to exit): ")
        if query.strip().lower() == 'quit':
            break

        response = rag_system.query(query)
        print(response)

        # Example of scoring an SDG action
        action = input("Enter an SDG action you've taken: ")
        # Re-prompts on bad input instead of letting int() abort the session.
        sdg = _read_sdg_number()
        score = rag_system.score_sdg_action(user_id, action, sdg)
        print(f"SDG Action Score: {score}")

        # Get user's overall SDG score
        user_score = rag_system.get_user_sdg_score(user_id)
        print(f"User's SDG Scores: {user_score}")

# Entry point: start the interactive loop when this code is executed as the main module.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.79s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
