In [1]:
import pandas as pd

import pandas as pd
from src.data.ingestion import load_tickets

In [2]:
from src.data.graph_rag_ingest import build_support_graph
from src.data.graph_rag_query import top_solutions_for_issue,graph_rag_candidates

In [3]:
# Load the raw support-ticket export; the relative path is resolved against the
# notebook's working directory.
json_data = load_tickets("../data/raw/support_tickets.json")
# Element 0 of the loader's result is the ticket collection (the cell output shows
# a list of Ticket objects). NOTE(review): the other elements are not used here.
Data_tickets = json_data[0]
# Preview the first five tickets.
Data_tickets[:5]

[Ticket(ticket_id='TK-2024-000001', created_at=datetime.datetime(2023, 11, 2, 12, 30, 10, tzinfo=datetime.timezone.utc), updated_at=datetime.datetime(2023, 11, 2, 15, 30, 46, tzinfo=datetime.timezone.utc), customer_id='CUST-02387', customer_tier='starter', organization_id='ORG-234', product='CloudBackup Enterprise', product_version='4.5.10', product_module='encryption_layer', category='Feature Request', subcategory='Documentation', priority='critical', severity='P2', channel='portal', subject='Request: Add bulk operation support to CloudBackup Enterprise', description='We would like to request a feature for CloudBackup Enterprise that allows bulk operations. Currently, we have to process items one by one, which is time-consuming. Having bulk support would greatly improve our workflow efficiency.', error_logs='', stack_trace='', customer_sentiment='frustrated', previous_tickets=9, resolution='Issue resolved by updating configuration settings. Changed timeout values from 30s to 120s in c

In [4]:
# Build the support graph from a slice copy of every loaded ticket.
support_graph = build_support_graph(Data_tickets[:])

# Report the size of the resulting graph.
print("nodes:", support_graph.number_of_nodes())
print("edges:", support_graph.number_of_edges())

# Print the top 5 solutions for "Account Management" / "Upgrade"
# (presumably category / subcategory — confirm against top_solutions_for_issue).
# NOTE(review): each printed entry is a (solution node, float, int) tuple; the
# meaning of the float and the int is not visible from this notebook.
print(top_solutions_for_issue(support_graph, "Account Management", "Upgrade", top_k=5))

nodes: 110078
edges: 517739
[('solution:DATA_REPAIR', 0.6649484536082474, 388), ('solution:ENVIRONMENT_ISSUE', 0.6240208877284595, 383), ('solution:DUPLICATE', 0.6222826086956522, 368), ('solution:WORKAROUND', 0.6208955223880597, 335), ('solution:CONFIG_CHANGE', 0.6186666666666667, 375)]


In [5]:
from src.data.graph_rag_query import GraphQuery
# Use the ticket at index 3 as the example query.
query_ticket = Data_tickets[3]

# The free-text part of the query is subject, description and error logs joined
# with single spaces; the remaining fields are copied from the ticket unchanged.
query = GraphQuery(
    text=f"{query_ticket.subject} {query_ticket.description} {query_ticket.error_logs}",
    category=query_ticket.category,
    subcategory=query_ticket.subcategory,
    product=query_ticket.product,
    product_module=query_ticket.product_module,
)
# Display the constructed query.
query

GraphQuery(text='License upgrade needed for CloudBackup Enterprise We need to upgrade our license for CloudBackup Enterprise. Our team has grown and we need additional seats. Please provide information on pricing and the upgrade process. 2024-11-27T18:17:26 ERROR ERROR_SERVER_500: Connection timeout after 30s\n2024-11-27T18:17:27 RETRY_FAILED: Max retries exceeded', category='Account Management', subcategory='Upgrade', product='CloudBackup Enterprise', product_module='backup_service')

In [6]:
from src.data.graph_rag_query import graph_rag_candidates

# Retrieve graph-based candidates for `query` from `support_graph`.
# NOTE(review): top_solutions / top_error_solutions / top_tickets presumably cap
# how many issue-based solutions, error-based solutions and tickets are returned —
# confirm against graph_rag_candidates.
graph_candidates = graph_rag_candidates(
    graph=support_graph,
    query=query,
    top_solutions=5,
    top_error_solutions=5,
    top_tickets=10,
)


In [7]:
# Display the candidates: solution nodes, ticket nodes and per-solution priors.
graph_candidates

GraphCandidates(solution_nodes=['solution:DUPLICATE', 'solution:DATA_REPAIR', 'solution:ENVIRONMENT_ISSUE', 'solution:CONFIG_CHANGE', 'solution:WORKAROUND', 'solution:PATCH_APPLIED', 'solution:ESCALATED', 'solution:USER_EDUCATION', 'solution:WONT_FIX'], ticket_nodes=['ticket:TK-2024-000002', 'ticket:TK-2024-000004', 'ticket:TK-2024-000031', 'ticket:TK-2024-000092', 'ticket:TK-2024-000172', 'ticket:TK-2024-000186', 'ticket:TK-2024-000205', 'ticket:TK-2024-000271', 'ticket:TK-2024-000304', 'ticket:TK-2024-000334'], solution_priors={'solution:DATA_REPAIR': 0.8232786781428799, 'solution:ENVIRONMENT_ISSUE': 0.7938531745651853, 'solution:DUPLICATE': 1.0, 'solution:WORKAROUND': 0.783653535269434, 'solution:CONFIG_CHANGE': 0.7888420152700604, 'solution:PATCH_APPLIED': 0.4, 'solution:ESCALATED': 0.4, 'solution:USER_EDUCATION': 0.4, 'solution:WONT_FIX': 0.4})

In [None]:
# Issue prior → What usually works for this kind of problem?
# - comes from top_solutions_for_issue()

# Error boost → What works for THIS specific failure?
# - comes from solutions_from_error_codes()


# solution_priors = {
#   'solution:DATA_REPAIR':        0.8233,
#   'solution:ENVIRONMENT_ISSUE':  0.7939,
#   'solution:DUPLICATE':          1.0,
#   'solution:WORKAROUND':         0.7837,
#   'solution:CONFIG_CHANGE':      0.7888,
#   'solution:PATCH_APPLIED':      0.4,
#   'solution:ESCALATED':          0.4,
#   'solution:USER_EDUCATION':     0.4,
#   'solution:WONT_FIX':           0.4
# }

# Scale meaning (approximate bands)
#
# 0.0 – 0.5  → weak prior (mostly error-based signal)
# 0.6 – 0.85 → strong issue-based prior
# 1.0        → very strong (issue prior + error boost, capped)

In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

class HybridTicketIndex:
    def __init__(self, embed_model_name: str = "sentence-transformers/paraphrase-MiniLM-L3-v2"):
        self.embedder = SentenceTransformer(embed_model_name)
        self.tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_features=200_000)

        self.df = None
        self.emb = None           # normalized embeddings (N, d)
        self.tfidf_mat = None     # sparse (N, vocab)

    @staticmethod
    def make_text(df: pd.DataFrame) -> pd.Series:
        return (
            df["subject"].fillna("").astype(str) + "\n" +
            df["description"].fillna("").astype(str) + "\n" +
            df.get("error_logs", "").fillna("").astype(str)
        )

    def fit(self, tickets_df: pd.DataFrame):
        df = tickets_df.copy()
        df["doc_text"] = self.make_text(df)
        self.df = df.reset_index(drop=True)

        self.emb = self.embedder.encode(
            self.df["doc_text"].tolist(),
            batch_size=256,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=True,
        )

        self.tfidf_mat = self.tfidf.fit_transform(self.df["doc_text"])
        return self
