In [2]:
import pandas as pd
df = pd.read_csv("../../Tweets_data.csv")
df.isnull().sum()

tweet_id                    0
author_id                   0
inbound                     0
created_at                  0
text                        0
response_tweet_id          28
in_response_to_tweet_id    25
dtype: int64

In [3]:
df = df[df['response_tweet_id'].notna()] # remove any NaN values as it blows up serialization
df = df[df['in_response_to_tweet_id'].notna()]
df.isnull().sum()

tweet_id                   0
author_id                  0
inbound                    0
created_at                 0
text                       0
response_tweet_id          0
in_response_to_tweet_id    0
dtype: int64

In [4]:
data = df.to_dict('records') # Get only 700 records. More records will make it slower to index
len(data)

40

In [5]:
print(data)

[{'tweet_id': 119240, 'author_id': 'VirginTrains', 'inbound': False, 'created_at': 'Tue Oct 10 15:16:08 +0000 2017', 'text': '@105836 LiveChat is online at the moment - https://t.co/SY94VtU8Kq or contact 03331 031 031 option 1, 4, 3 (Leave a message) to request a call back', 'response_tweet_id': '119241', 'in_response_to_tweet_id': 119242.0}, {'tweet_id': 119241, 'author_id': '105836', 'inbound': True, 'created_at': 'Tue Oct 10 15:17:21 +0000 2017', 'text': "@VirginTrains see attached error message. I've tried leaving a voicemail several times in the past week https://t.co/NxVZjlYx1k", 'response_tweet_id': '119243', 'in_response_to_tweet_id': 119240.0}, {'tweet_id': 119243, 'author_id': 'VirginTrains', 'inbound': False, 'created_at': 'Tue Oct 10 15:25:14 +0000 2017', 'text': '@105836 Have you tried from another device, Miriam ^MM', 'response_tweet_id': '119244', 'in_response_to_tweet_id': 119241.0}, {'tweet_id': 119244, 'author_id': '105836', 'inbound': True, 'created_at': 'Tue Oct 10 

In [6]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

In [7]:
encoder = SentenceTransformer('all-MiniLM-L6-v2') # Model to create embeddings

In [8]:
# create the vector database client
qdrant = QdrantClient(":memory:") # Create in-memory Qdrant instance

In [9]:
vector_size = encoder.get_sentence_embedding_dimension()
if vector_size is None:
    raise ValueError("Encoder's embedding dimension could not be determined. Ensure the model is loaded correctly.")

collection_name = "tweets"
if not qdrant.collection_exists(collection_name=collection_name):
    qdrant.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=vector_size,
            distance=models.Distance.COSINE
        )
    )
else:
    print(f"Collection '{collection_name}' already exists.")

In [None]:
from typing import Any, Dict

points_to_upload = []
for idx, doc in enumerate(data):
    notes_text = doc.get("text", "")

    qdrant_payload: Dict[str, Any] = {}
    for key, value in doc.items():
        if isinstance(key, str):
            # Basic check for common Qdrant-compatible value types
            if isinstance(value, (str, int, float, bool, list, dict, type(None))):
                qdrant_payload[key] = value
            # else: # You might want to handle non-compatible values, e.g., convert to string or skip
            #     qdrant_payload[key] = str(value) # Example: Convert to string
        # else: # Handle non-string keys if they exist in `doc`
        #     qdrant_payload[str(key)] = value # Example: Convert non-string keys to string

    points_to_upload.append(
        models.PointStruct(
            id=idx,
            vector=encoder.encode(notes_text).tolist(),
            payload=qdrant_payload
        )
    )

# Upload the points in batches if 'data' is very large
# qdrant.upload_points is suitable for smaller lists
# For very large datasets, consider qdrant.fast_upsert or batching
qdrant.upload_points(
    collection_name=collection_name,
    points=points_to_upload,
    batch_size=100 # Optional: Process in batches
)

[PointStruct(id=0, vector=[-0.09216129034757614, -0.07929351925849915, -0.01612984947860241, 0.011713593266904354, 0.03204577416181564, -0.014678417704999447, -0.014979972504079342, -0.0032887733541429043, 0.00727169681340456, 0.05252618342638016, -0.0019939716439694166, -0.057714998722076416, -0.07037801295518875, -0.020965218544006348, 0.046457305550575256, -0.047522395849227905, -0.02438722550868988, -0.049096040427684784, -0.042355939745903015, 0.03379315882921219, -0.02885427325963974, -0.03339571878314018, -0.05121265724301338, -0.021394772455096245, 0.04210672527551651, -0.11056920886039734, -0.030610088258981705, 0.03628930822014809, -0.028406614437699318, 0.036081742495298386, -0.002443808363750577, 0.08784627169370651, -0.03898192197084427, -0.033146996051073074, 0.024067653343081474, 0.12241867184638977, 0.044213660061359406, -0.010108614340424538, 0.005471268203109503, 0.05069823935627937, -0.03069678321480751, 0.011208083480596542, -0.042003780603408813, 0.0308747328817844

In [12]:
user_prompt = "Suggest the most positive tweet response about supporting"

In [None]:
# Search time for awesome wines!

hits = qdrant.query_points(
    collection_name="tweets",
    query_vector=encoder.encode(user_prompt).tolist(),
    limit=3
)
for hit in hits:
  print(hit.payload, "score:", hit.score)

{'tweet_id': 119260, 'author_id': '105840', 'inbound': True, 'created_at': 'Wed Oct 11 14:22:05 +0000 2017', 'text': '@SpotifyCares Brilliant thanks 😊', 'response_tweet_id': '119261', 'in_response_to_tweet_id': 119259.0} score: 0.2164447968079223
{'tweet_id': 119259, 'author_id': 'SpotifyCares', 'inbound': False, 'created_at': 'Wed Oct 11 14:20:00 +0000 2017', 'text': "@105840 That's great to hear. If anything comes up, just let us know. We'll carry on helping out 🙂 /AY", 'response_tweet_id': '119260', 'in_response_to_tweet_id': 119258.0} score: 0.2112715313224591
{'tweet_id': 119271, 'author_id': 'AppleSupport', 'inbound': False, 'created_at': 'Wed Oct 11 03:26:00 +0000 2017', 'text': "@105844 We'd be happy to help. Send us a DM and we can start there. https://t.co/GDrqU22YpT", 'response_tweet_id': '119270', 'in_response_to_tweet_id': 119272.0} score: 0.19020688337917013


  hits = qdrant.search(


In [14]:
# define a variable to hold the search results
search_results = [hit.payload for hit in hits]

In [23]:
import os
import re
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage

# Replace with your actual Groq API key
load_dotenv()
key = os.getenv("GROQ_API_KEY")
if key is None:
    raise ValueError("GROQ_API_KEY environment variable not set")
os.environ['GROQ_API_KEY'] = key

# Create the chat model instance
llm = ChatGroq(
    model="qwen-qwq-32b",  # or "mixtral-8x7b-32768" or "gemma-7b-it"
)

# Compose the conversation (same as your previous messages)
messages = [
    SystemMessage(content="You are chatbot, a tweet specialist. Your top priority is to provide support to user with their requests with a clear and concise response from the available list."),
    HumanMessage(content="provide the tweet id with the best tweet repsonse."),
    AIMessage(content=str(search_results))  # If you want to feed in search results
]

def remove_think_tag(response_text: str) -> str:
    cleaned_response = re.sub(r'<think>.*?</think>\n?', '', response_text, flags=re.DOTALL)
    return cleaned_response.strip()

# Get completion
response_obj = llm.invoke(messages)

# Extract string response
if hasattr(response_obj, "content"):
    result = str(response_obj.content)
else:
    result = str(response_obj)
    
response = remove_think_tag(result)
print(response)


The best response tweet ID is **119259**. 

This tweet from **@SpotifyCares** effectively acknowledges the user’s message, offers ongoing support, and results in a positive customer reply (ID 119260). It demonstrates clear, empathetic resolution. 

The other tweet (ID 119271) directs the user to a DM without addressing their issue publicly, which is less optimal for immediate resolution visibility.
