In [1]:
import glob
import json
from datetime import datetime
import csv
import pandas as pd


In [2]:
def analyze_prompt_structure(all_data):
    """Flatten sharing records into one row per ChatGPT conversation entry.

    Parameters
    ----------
    all_data : iterable of dict
        Parsed JSON documents. Each document's ``"Sources"`` list is walked;
        each source's ``"ChatgptSharing"`` list and each sharing's
        ``"Conversations"`` list are walked in turn. Missing or null lists
        are treated as empty.

    Returns
    -------
    list of dict
        One dict per conversation entry. Each dict repeats the source-level
        and sharing-level fields and adds ``"Conversation_Prompt"``,
        ``"Conversation_Answer"``, ``"State"``, ``"Time_Lapsed"`` and
        ``"Detected_Patterns"``. Sources without any conversation contribute
        no rows.

    Raises
    ------
    ValueError
        If both ``"CreatedAt"`` and ``"ClosedAt"`` are set but either does not
        match the ``%Y-%m-%dT%H:%M:%SZ`` format.

    Notes
    -----
    Pattern detection is a case-insensitive *substring* search over the
    source's ``"Body"`` field (not over the conversation prompt), so a keyword
    such as ``"what"`` also matches ``"whatever"``.
    """
    # Ordered (label, keywords) table; the order fixes the order of labels in
    # the "Detected_Patterns" column.
    pattern_keywords = [
        ("Persona Pattern", ["you are", "act as", "pretend to be", "pretend you are"]),
        ("Recipe Pattern", ["step-by-step", "recipe", "guide"]),
        ("Template Pattern", ["template", "formatting"]),
        ("Output Automator Pattern", ["script", "code", "executable"]),
        ("Simple Instruction Pattern", ["explain", "describe", "list", "tell me", "give me"]),
        ("Context and Instruction Pattern", ["based on", "with this information"]),
        ("Question Pattern", ["what", "where", "when", "who", "why"]),
    ]
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    def get_patterns(body):
        # Lower-case once instead of once per keyword.
        lowered = body.lower()
        return [
            label
            for label, keywords in pattern_keywords
            if any(keyword in lowered for keyword in keywords)
        ]

    rows = []
    for data in all_data:
        for source in data.get("Sources") or []:
            # A JSON null yields None even with a .get default, so coerce it
            # to an empty string before the text search.
            body = source.get("Body") or ""
            created_at = source.get("CreatedAt")
            closed_at = source.get("ClosedAt")
            state = "Closed" if closed_at is not None else "Open"

            time_lapsed = None
            if created_at and closed_at:
                created_at_dt = datetime.strptime(created_at, timestamp_format)
                closed_at_dt = datetime.strptime(closed_at, timestamp_format)
                time_lapsed = closed_at_dt - created_at_dt

            detected = "; ".join(get_patterns(body))

            # "or []" covers both a missing key and an explicit null.
            for sharing in source.get("ChatgptSharing") or []:
                for conversation in sharing.get("Conversations") or []:
                    rows.append({
                        # Source fields
                        "Type": source.get("Type"),
                        "URL": source.get("URL"),
                        "Author": source.get("Author"),
                        "RepoName": source.get("RepoName"),
                        "RepoLanguage": source.get("RepoLanguage"),
                        "Number": source.get("Number"),
                        "Title": source.get("Title"),
                        "Body": source.get("Body"),
                        "MergedAt": source.get("MergedAt"),
                        "UpdatedAt": source.get("UpdatedAt"),
                        "State": state,
                        # Compare against None: a zero-length timedelta is
                        # falsy but is still a valid duration ("0:00:00").
                        "Time_Lapsed": str(time_lapsed) if time_lapsed is not None else None,
                        "Created_At": created_at,
                        "Closed_At": closed_at,
                        "Additions": source.get("Additions"),
                        "Deletions": source.get("Deletions"),
                        "ChangedFiles": source.get("ChangedFiles"),
                        "CommitsTotalCount": source.get("CommitsTotalCount"),

                        # ChatgptSharing fields
                        "CSharing_URL": sharing.get("URL"),
                        "CSharing_Status": sharing.get("Status"),
                        "CSharing_DateOfConversation": sharing.get("DateOfConversation"),
                        "CSharing_Title": sharing.get("Title"),
                        "CSharing_NumberOfPrompts": sharing.get("NumberOfPrompts"),
                        "CSharing_TokensOfPrompts": sharing.get("TokensOfPrompts"),
                        "CSharing_TokensOfAnswers": sharing.get("TokensOfAnswers"),

                        # Conversation fields
                        "Conversation_Prompt": conversation.get("Prompt"),
                        "Conversation_Answer": conversation.get("Answer"),

                        # Pattern detection results
                        "Detected_Patterns": detected,
                    })

    return rows


# Load every snapshot's pull-request sharing file, flatten it with
# analyze_prompt_structure, and export the rows to a CSV file.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
file_pattern = "C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset/snapshot_*/*_pr_sharings.json"
all_data = []

print("Reading JSON files...")
# glob's result order depends on the file system; sorting keeps the row order
# of the exported CSV reproducible between runs.
for path in sorted(glob.glob(file_pattern)):
    print(f"Processing: {path}")
    # Decode explicitly as UTF-8 rather than the platform's locale encoding,
    # which would mis-decode or reject non-ASCII text in the JSON.
    with open(path, "r", encoding="utf-8") as file:
        all_data.append(json.load(file))

print(f"Total files read: {len(all_data)}")

# Process all data
rows = analyze_prompt_structure(all_data)
print(f"Total rows generated: {len(rows)}")

# Column order of the exported CSV; must list every key of the row dicts,
# otherwise csv.DictWriter raises ValueError on the extra key.
columns = [
    "Type", "URL", "Author", "RepoName", "RepoLanguage", "Number", "Title",
    "Body", "MergedAt", "UpdatedAt", "State", "Time_Lapsed", "Created_At", "Closed_At",
    "Additions", "Deletions", "ChangedFiles", "CommitsTotalCount",
    "CSharing_URL", "CSharing_Status", "CSharing_DateOfConversation", "CSharing_Title",
    "CSharing_NumberOfPrompts", "CSharing_TokensOfPrompts", "CSharing_TokensOfAnswers",
    "Conversation_Prompt", "Conversation_Answer", "Detected_Patterns"
]

# Write to CSV with specified columns; nothing is written when no rows exist.
if rows:
    output_file = "pr_sharing_visualize_without_patterns.csv"
    # newline="" lets the csv module control line endings itself.
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        csv_writer = csv.DictWriter(csvfile, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(rows)
    print(f"CSV file created successfully: {output_file}")
else:
    print("No data to write to CSV.")

Reading JSON files...
Processing: C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230727\20230727_195927_pr_sharings.json
Processing: C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230803\20230803_093947_pr_sharings.json
Processing: C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230810\20230810_123110_pr_sharings.json
Processing: C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230817\20230817_125147_pr_sharings.json
Processing: C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230824\20230824_100450_pr_sharings.json
Processing: C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230831\20230831_060603_pr_sharings.json
Processing: C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230907\20230907_091631_pr_sharings.json
Processing: C:/Users/Dell/Desktop/AI for Software Engineering/Project/Datas

In [3]:
# Read the exported CSV into a DataFrame and preview its first rows.
csv_path = './pr_sharing_visualize_without_patterns.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Type,URL,Author,RepoName,RepoLanguage,Number,Title,Body,MergedAt,UpdatedAt,...,CSharing_URL,CSharing_Status,CSharing_DateOfConversation,CSharing_Title,CSharing_NumberOfPrompts,CSharing_TokensOfPrompts,CSharing_TokensOfAnswers,Conversation_Prompt,Conversation_Answer,Detected_Patterns
0,pull request,https://github.com/labdao/plex/pull/469,AdamGoyer,labdao/plex,Go,469,add readme for openbabel to PLEX,The Chatgpt Thread used to create this pull re...,,2023-07-05T03:30:59Z,...,https://chat.openai.com/share/8bd33825-e8c6-44...,200,"July 5, 2023",Open Babel on PLEX,6,2895,2311,"Good evening Chatgpt,\nI'd like your help to w...",Thanks for sharing the README file for Open Ba...,
1,pull request,https://github.com/labdao/plex/pull/469,AdamGoyer,labdao/plex,Go,469,add readme for openbabel to PLEX,The Chatgpt Thread used to create this pull re...,,2023-07-05T03:30:59Z,...,https://chat.openai.com/share/8bd33825-e8c6-44...,200,"July 5, 2023",Open Babel on PLEX,6,2895,2311,"Here is the PLEX readme, I think this will ans...",Thanks for sharing the README file for the PLE...,
2,pull request,https://github.com/labdao/plex/pull/469,AdamGoyer,labdao/plex,Go,469,add readme for openbabel to PLEX,The Chatgpt Thread used to create this pull re...,,2023-07-05T03:30:59Z,...,https://chat.openai.com/share/8bd33825-e8c6-44...,200,"July 5, 2023",Open Babel on PLEX,6,2895,2311,Essencially PLEX loads an openbabel docker con...,"Based on the given information, it looks like ...",
3,pull request,https://github.com/labdao/plex/pull/469,AdamGoyer,labdao/plex,Go,469,add readme for openbabel to PLEX,The Chatgpt Thread used to create this pull re...,,2023-07-05T03:30:59Z,...,https://chat.openai.com/share/8bd33825-e8c6-44...,200,"July 5, 2023",Open Babel on PLEX,6,2895,2311,"Excellent work chat, I think we should also ex...",These JSON files are indeed configuration file...,
4,pull request,https://github.com/labdao/plex/pull/469,AdamGoyer,labdao/plex,Go,469,add readme for openbabel to PLEX,The Chatgpt Thread used to create this pull re...,,2023-07-05T03:30:59Z,...,https://chat.openai.com/share/8bd33825-e8c6-44...,200,"July 5, 2023",Open Babel on PLEX,6,2895,2311,"Noted,\nTake a look at this docker file, and t...",This Dockerfile is used to create a Docker ima...,


In [4]:
# Report how many Detected_Patterns values are empty strings and how many are
# missing, then keep only the rows where the column is present.
pattern_col = df['Detected_Patterns']
print(pattern_col.eq('').sum())
print(pattern_col.isnull().sum())
df = df[pattern_col.notna()]
df.shape

0
4149


(3827, 28)

In [5]:
# Show the distinct values of the State column.
df.loc[:, 'State'].unique()

array(['Closed', 'Open'], dtype=object)

In [6]:
# Save the filtered frame to a new CSV, leaving out the index column.
df.to_csv(path_or_buf='pr_sharing_visualize_output_with_patterns.csv', index=False)