In [None]:
#Shiqi Zhang, Tyler Stevenson, Zefeng Pei
import pandas as pd
import json
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from xgboost import XGBClassifier
from collections import Counter

# DevGPT snapshot files to ingest (paths are relative to the working directory)
json_files = [
    "./DevGPT/snapshot_20231012/20231012_232232_hn_sharings.json",
    "./DevGPT/snapshot_20230803/20230803_105332_hn_sharings.json",
]

# Load JSON data
def load_conversations(json_files):
    """Flatten snapshot JSON files into one DataFrame of conversations.

    Each file is expected to contain a JSON object shaped like
    ``{"Sources": [{"ChatgptSharing": [{"Conversations": [...]}]}]}``.
    Every entry under ``Conversations`` becomes one row.

    Args:
        json_files: Iterable of paths to the JSON files to read.

    Returns:
        A DataFrame with the columns ``Prompt``, ``Answer``,
        ``DateOfConversation``, ``Title`` and ``Source_File``. The columns
        are present even when no row could be loaded, so column lookups on
        the result never fail. Missing or ``null`` text fields are stored
        as empty strings.

    Files that cannot be opened or parsed are reported on stdout and
    skipped; the remaining files are still loaded.
    """
    text_fields = ("Prompt", "Answer", "DateOfConversation", "Title")
    rows = []
    for file_path in json_files:
        # Keep the try body to the I/O + parse step so that only genuine
        # load failures are reported as such.
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading {file_path}: {e}")
            continue

        if not isinstance(data, dict):
            print(f"Error loading {file_path}: top-level JSON value is not an object")
            continue

        # A key that is absent, null or of the wrong type yields no items
        # instead of aborting the rest of the file.
        for source in _dict_items(data.get("Sources")):
            for chat in _dict_items(source.get("ChatgptSharing")):
                for conversation in _dict_items(chat.get("Conversations")):
                    row = {}
                    for field in text_fields:
                        value = conversation.get(field)
                        row[field] = "" if value is None else value
                    row["Source_File"] = file_path
                    rows.append(row)

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=[*text_fields, "Source_File"])


def _dict_items(value):
    """Return the dict elements of *value* when it is a list, else ``[]``."""
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, dict)]

# Read every snapshot file into a single DataFrame of conversations
df_conversations = load_conversations(json_files)

# Category names used for labelling; list order matters to any code that
# enumerates this list to derive numeric labels
categories = [
    "Bug Report",
    "Feature Request",
    "Theoretical Question",
    "Code Help",
    "General Inquiry",
    "Other",
]

# Preprocessing function
def preprocess_text(text):
    """Normalise a prompt for embedding.

    Lower-cases the text and collapses every run of non-word characters
    (anything other than letters, digits and underscore) into a single
    space.

    Args:
        text: The raw prompt string, or ``None``.

    Returns:
        The cleaned string with no leading or trailing whitespace; an empty
        string when *text* is ``None``.
    """
    if text is None:
        return ""
    text = re.sub(r'\W+', ' ', text.lower())  # Remove special characters
    # Strip last: the substitution above turns leading/trailing punctuation
    # into spaces, which stripping beforehand would leave in place.
    return text.strip()

# Store the cleaned version of every prompt alongside the raw text
df_conversations["Processed_Prompt"] = df_conversations["Prompt"].map(preprocess_text)

# Auto-labeling with keyword-based matching
def categorize_prompt(prompt):
    """Assign a category name to a prompt using ordered keyword rules.

    The prompt is lower-cased and checked against each rule in turn; the
    first rule with any keyword occurring as a substring of the prompt
    decides the category. Matching is by substring, not by whole word, so
    "how" also hits "show".

    Args:
        prompt: The raw prompt string.

    Returns:
        One of "Bug Report", "Feature Request", "Theoretical Question",
        "Code Help", "General Inquiry", or "Other" when no rule matches.
    """
    # Rule order is significant: earlier categories take precedence.
    keyword_rules = (
        ("Bug Report",
         ("error", "bug", "issue", "exception", "traceback", "crash", "fail", "stuck", "debug")),
        ("Feature Request",
         ("feature", "add", "support", "implement", "new functionality", "request", "enhancement", "improve")),
        ("Theoretical Question",
         ("why", "how", "explain", "difference", "theory", "concept", "definition", "principle")),
        ("Code Help",
         ("code", "function", "class", "method", "best practice", "optimize", "refactor", "debug", "syntax")),
        ("General Inquiry",
         ("what", "when", "can", "possible", "is it", "should", "which", "does")),
    )

    lowered = prompt.lower()
    for label, keywords in keyword_rules:
        for keyword in keywords:
            if keyword in lowered:
                return label
    return "Other"

# Keyword-derived category for every raw prompt
df_conversations["Category"] = df_conversations["Prompt"].map(categorize_prompt)

# Two-way lookup between category names and their integer labels
# (a category's label is its position in `categories`)
category_to_label = dict(zip(categories, range(len(categories))))
label_to_category = {label: name for name, label in category_to_label.items()}

# Numeric label column matching the Category column
df_conversations["Label"] = df_conversations["Category"].map(category_to_label)

# Train-test split: hold out 20% of the prompts, with a fixed seed so the
# split is the same on every run
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_conversations["Processed_Prompt"].tolist(),
    df_conversations["Label"].tolist(),
    test_size=0.2,
    random_state=42
)

# Load sentence transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Convert text to embeddings
print("Converting text to embeddings...")
train_embeddings = model.encode(train_texts, convert_to_numpy=True)
test_embeddings = model.encode(test_texts, convert_to_numpy=True)

# Train XGBoost classifier on the keyword-derived labels
print("Training XGBoost model...")
xgb_model = XGBClassifier(objective="multi:softmax", num_class=len(categories), eval_metric="mlogloss")
xgb_model.fit(train_embeddings, train_labels)

# Predict categories for the held-out prompts
predicted_labels = xgb_model.predict(test_embeddings)

# Convert predictions back to category names
predicted_categories = [label_to_category[label] for label in predicted_labels]

# Report how often the classifier reproduces the keyword-derived labels on
# the held-out split; without this the predictions are never inspected.
test_accuracy = float(np.mean(np.asarray(predicted_labels) == np.asarray(test_labels)))
print(f"Held-out accuracy against keyword labels: {test_accuracy:.3f}")

# Reclassify "Other" samples using nearest category
print("Reclassifying 'Other' samples with nearest category...")

# Get embeddings for "Other" samples
df_other = df_conversations[df_conversations["Category"] == "Other"]
other_texts = df_other["Processed_Prompt"].tolist()
other_embeddings = model.encode(other_texts, convert_to_numpy=True)

# One reference sentence per category; each "Other" prompt is assigned the
# category whose reference sentence it is most similar to
category_examples = {
    "Bug Report": "I encountered an error in my code.",
    "Feature Request": "I want to add a new functionality.",
    "Theoretical Question": "Can you explain this concept?",
    "Code Help": "How do I fix this function?",
    "General Inquiry": "What is the best way to do this?"
}

category_names = list(category_examples)
category_embeddings = model.encode(list(category_examples.values()), convert_to_numpy=True)

# Assign the most similar category to "Other" samples.
# A single batched similarity call and one bulk assignment replace the
# per-row cosine_similarity call and per-row .loc write.
if len(other_texts) > 0:
    similarities = cosine_similarity(other_embeddings, category_embeddings)
    best_indices = np.argmax(similarities, axis=1)
    df_conversations.loc[df_other.index, "Category"] = [
        category_names[idx] for idx in best_indices
    ]
# NOTE(review): the "Label" column is not updated here, so reclassified rows
# keep the numeric label of "Other" while "Category" holds the new name.

# Count occurrences of each category
# NOTE(review): these counts come from the "Category" column (keyword rules
# plus the reclassification above); the XGBoost predictions are not part of
# them, despite the heading printed below.
category_counts = Counter(df_conversations["Category"])

# Display results
print("\n Final Category Counts (XGBoost + Reclassification):")
print(pd.DataFrame(category_counts.items(), columns=["Category", "Count"]))

print("\n Classification completed successfully.")


Converting text to embeddings...
Training XGBoost model...
Reclassifying 'Other' samples with nearest category...

 Final Category Counts (XGBoost + Reclassification):
               Category  Count
0       General Inquiry    546
1            Bug Report    363
2       Feature Request    312
3  Theoretical Question    493
4             Code Help    169

 Classification completed successfully.
