In [11]:
import glob
import json
from datetime import datetime
import pandas as pd

In [12]:
# Path pattern to locate all JSON files across snapshots
file_pattern = "C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset/snapshot_*/*_issue_sharings.json"

# Initialize an empty list to store all JSON data
all_data = []

# Loop through all matching file paths and load each file
for path in glob.glob(file_pattern):
    print(path)
    with open(path, "r") as file:
        data = json.load(file)
        all_data.append(data)

# all_data now contains data from all JSON files
print(f"Total files read: {len(all_data)}")

C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230727\20230727_195941_issue_sharings.json
C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230803\20230803_094705_issue_sharings.json
C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230810\20230810_123938_issue_sharings.json
C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230817\20230817_130502_issue_sharings.json
C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230824\20230824_101836_issue_sharings.json
C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230831\20230831_061759_issue_sharings.json
C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230907\20230907_092956_issue_sharings.json
C:/Users/Dell/Desktop/AI for Software Engineering/Project/Dataset\snapshot_20230914\20230914_080417_issue_sharings.json
C:/Users/Dell/Desktop/AI for Software En

In [13]:
# Initialize a list for rows that will form the DataFrame
rows = []

# Define the pattern analysis function
def analyze_prompt_structure(data):
    detected_patterns = []
    
    persona_keywords = ["you are", "act as", "pretend to be", "pretend you are"]
    recipe_keywords = ["step-by-step", "recipe", "guide"]
    template_keywords = ["template", "formatting"]
    automator_keywords = ["script", "code", "executable"]
    simple_instruction_keywords = ["explain", "describe", "list", "tell me", "give me"]
    context_instruction_keywords = ["based on", "with this information"]
    question_keywords = ["what", "where", "when", "who", "why"]

    def contains_keywords(text, keywords):
        return any(keyword in text.lower() for keyword in keywords)

    for source in data.get("Sources", []):
        body = source.get("Body", "")
        created_at = source.get("CreatedAt")
        closed_at = source.get("ClosedAt")
        state = "Closed" if closed_at is not None else "Open"

        time_lapsed = None
        if created_at and closed_at:
            created_at_dt = datetime.strptime(created_at, "%Y-%m-%dT%H:%M:%SZ")
            closed_at_dt = datetime.strptime(closed_at, "%Y-%m-%dT%H:%M:%SZ")
            time_lapsed = closed_at_dt - created_at_dt

        body_patterns = []

        # Check for patterns in the body text
        if contains_keywords(body, persona_keywords):
            body_patterns.append("Persona Pattern")
        if contains_keywords(body, recipe_keywords):
            body_patterns.append("Recipe Pattern")
        if contains_keywords(body, template_keywords):
            body_patterns.append("Template Pattern")
        if contains_keywords(body, automator_keywords):
            body_patterns.append("Output Automator Pattern")
        if contains_keywords(body, simple_instruction_keywords):
            body_patterns.append("Simple Instruction Pattern")
        if contains_keywords(body, context_instruction_keywords):
            body_patterns.append("Context and Instruction Pattern")
        if contains_keywords(body, question_keywords):
            body_patterns.append("Question Pattern")

        conversations = source.get("ChatgptSharing", [])
        for item in conversations:
            number_of_prompts = item.get("NumberOfPrompts", "N/A")
            conversation_url = item.get("URL", "N/A")

            for pattern in body_patterns:
                detected_patterns.append((
                    source['Number'],
                    pattern,
                    state,
                    time_lapsed,
                    number_of_prompts,
                    conversation_url
                ))

    return detected_patterns

In [14]:
# Loop through each dictionary in all_data
for data_dict in all_data:
    # Check if 'Sources' key exists in each dictionary
    if 'Sources' in data_dict:
        for item in data_dict['Sources']:
            if 'ChatgptSharing' in item:
                for sharing in item['ChatgptSharing']:
                    if 'Conversations' in sharing:
                        for conversation in sharing['Conversations']:
                            # Adding the fields to row
                            row = {
                                "Type": item.get('Type'),
                                "URL": item.get('URL'),
                                "Author": item.get('Author'),
                                "RepoName": item.get('RepoName'),
                                "RepoLanguage": item.get('RepoLanguage'),
                                "Number": item.get('Number'),
                                "Title": item.get('Title'),
                                "Body": item.get('Body'),
                                "UpdatedAt": item.get('UpdatedAt'),
                                "State": state,
                                "CSharing_URL": sharing.get('URL'),
                                "CSharing_Status": sharing.get('Status'),
                                "CSharing_DateOfConversation": sharing.get('DateOfConversation'),
                                "CSharing_Title": sharing.get('Title'),
                                "CSharing_NumberOfPrompts": sharing.get('NumberOfPrompts'),
                                "CSharing_TokensOfPrompts": sharing.get('TokensOfPrompts'),
                                "CSharing_TokensOfAnswers": sharing.get('TokensOfAnswers'),
                                "Conversation_Prompt": conversation.get('Prompt'),
                                "Conversation_Answer": conversation.get('Answer')
                            }
                            rows.append(row)

# Create DataFrame from rows
df = pd.DataFrame(rows)

In [15]:
# Detect patterns for each source and add to the DataFrame
detected_patterns = analyze_prompt_structure(data_dict)  # Use the last data_dict for patterns

In [16]:
# Prepare a dictionary to store detected patterns for easy lookup
pattern_dict = {}
for number, pattern, state, time_lapsed, number_of_prompts, conversation_url in detected_patterns:
    if number not in pattern_dict:
        pattern_dict[number] = []
    pattern_dict[number].append(pattern)

# Add a new column for detected patterns
df['Detected Patterns'] = df['Number'].map(lambda x: ', '.join(pattern_dict.get(x, [])))

In [17]:
df.head()

Unnamed: 0,Type,URL,Author,RepoName,RepoLanguage,Number,Title,Body,UpdatedAt,State,CSharing_URL,CSharing_Status,CSharing_DateOfConversation,CSharing_Title,CSharing_NumberOfPrompts,CSharing_TokensOfPrompts,CSharing_TokensOfAnswers,Conversation_Prompt,Conversation_Answer,Detected Patterns
0,issue,https://github.com/gakusyutai/gakusyutai.githu...,yuyu31,gakusyutai/gakusyutai.github.io,HTML,31,ハンバーガーメニューの実装,- https://chat.openai.com/share/8b0f517f-1aaf-...,2023-07-23T15:38:42Z,Closed,https://chat.openai.com/share/7958273c-0081-48...,200,"July 6, 2023",ハンバーガーメニュー表示の修正,11,2637,4712,あなたはwebデザイナーです。ハンバーガーメニューを実装したところ、初めからメニューの内容が...,ハンバーガーメニューが初めから表示されており、表示非表示を切り替えることができない理由は、c...,"Question Pattern, Output Automator Pattern, Qu..."
1,issue,https://github.com/gakusyutai/gakusyutai.githu...,yuyu31,gakusyutai/gakusyutai.github.io,HTML,31,ハンバーガーメニューの実装,- https://chat.openai.com/share/8b0f517f-1aaf-...,2023-07-23T15:38:42Z,Closed,https://chat.openai.com/share/7958273c-0081-48...,200,"July 6, 2023",ハンバーガーメニュー表示の修正,11,2637,4712,「ハンバーガーメニュー」とはなんですか。,ハンバーガーメニュー（Hamburger menu）は、ウェブデザインやユーザーインターフェ...,"Question Pattern, Output Automator Pattern, Qu..."
2,issue,https://github.com/gakusyutai/gakusyutai.githu...,yuyu31,gakusyutai/gakusyutai.github.io,HTML,31,ハンバーガーメニューの実装,- https://chat.openai.com/share/8b0f517f-1aaf-...,2023-07-23T15:38:42Z,Closed,https://chat.openai.com/share/7958273c-0081-48...,200,"July 6, 2023",ハンバーガーメニュー表示の修正,11,2637,4712,そうですね。ありがとうございます。\n\n今回は初めから「ハンバーガーメニュー」が表示されて...,申し訳ありません、誤解が生じました。ハンバーガーメニューの問題点について理解しました。メニュ...,"Question Pattern, Output Automator Pattern, Qu..."
3,issue,https://github.com/gakusyutai/gakusyutai.githu...,yuyu31,gakusyutai/gakusyutai.github.io,HTML,31,ハンバーガーメニューの実装,- https://chat.openai.com/share/8b0f517f-1aaf-...,2023-07-23T15:38:42Z,Closed,https://chat.openai.com/share/7958273c-0081-48...,200,"July 6, 2023",ハンバーガーメニュー表示の修正,11,2637,4712,変更点を解説してください。,修正点を以下で解説します。.containerクラスにdisplay: none;のスタイル...,"Question Pattern, Output Automator Pattern, Qu..."
4,issue,https://github.com/gakusyutai/gakusyutai.githu...,yuyu31,gakusyutai/gakusyutai.github.io,HTML,31,ハンバーガーメニューの実装,- https://chat.openai.com/share/8b0f517f-1aaf-...,2023-07-23T15:38:42Z,Closed,https://chat.openai.com/share/7958273c-0081-48...,200,"July 6, 2023",ハンバーガーメニュー表示の修正,11,2637,4712,変更を加えましたが、メニューが最初から表示されていて、ハンバーガーアイコンをクリックしても切...,申し訳ありません、再度問題を確認いたしました。修正が不足していたため、以下の修正を加えてくだ...,"Question Pattern, Output Automator Pattern, Qu..."


In [18]:
print((df['Detected Patterns'] == '').sum())
print(df['Detected Patterns'].isna().sum())
df.shape

1940
0


(14564, 20)

In [9]:
df_to_save = df[(df['Detected Patterns'] != '')]
df_to_save.shape

(12624, 20)

In [10]:
# Save the DataFrame to a CSV file
df_to_save.to_csv('issues_sharing_visualize_output_with_patterns.csv', index=False)