## DevGPT Dataset coding anaylsis

**Name: Zeitan Zhao**

## Question 1:

What types of issues (bugs, feature requests, theoretical questions, etc.) do developers most commonly present to ChatGPT?

In [11]:
# read json file 
import json 
import pandas as pd

# read json file
with open('issue.json') as f:
    data = json.load(f)

# convert json to dataframe
df_issues = pd.json_normalize(data["Sources"])

df_issues.head()

Unnamed: 0,Type,URL,Author,RepoName,RepoLanguage,Number,Title,Body,CreatedAt,ClosedAt,UpdatedAt,State,ChatgptSharing
0,issue,https://github.com/gakusyutai/gakusyutai.githu...,yuyu31,gakusyutai/gakusyutai.github.io,HTML,31,ハンバーガーメニューの実装,- https://chat.openai.com/share/8b0f517f-1aaf-...,2023-07-23T15:38:42Z,,2023-07-23T15:38:42Z,OPEN,[{'URL': 'https://chat.openai.com/share/795827...
1,issue,https://github.com/jabrena/aqa-tests-experimen...,jabrena,jabrena/aqa-tests-experiments,Java,4,Run a test in multiple java distros,- https://chat.openai.com/share/e169e9a7-40c5-...,2023-07-07T20:30:07Z,,2023-07-08T11:56:45Z,OPEN,[{'URL': 'https://chat.openai.com/share/b508dd...
2,issue,https://github.com/OpenVoiceOS/ovos-technical-...,JarbasAl,OpenVoiceOS/ovos-technical-manual,,4,document ovos-classifiers,ovos-classifiers is in pre-alpha but documenta...,2023-06-08T19:13:26Z,,2023-06-08T19:13:26Z,OPEN,[{'URL': 'https://chat.openai.com/share/1c4bc8...
3,issue,https://github.com/SKKUFastech/week1/issues/5,smh9800,SKKUFastech/week1,C,5,7/19,https://chat.openai.com/share/18990fa3-c8c6-41...,2023-07-19T01:36:52Z,,2023-07-19T01:50:48Z,OPEN,[{'URL': 'https://chat.openai.com/share/18990f...
4,issue,https://github.com/SKKUFastech/week1/issues/4,woojinsung-jimmy,SKKUFastech/week1,C,4,16진수,https://chat.openai.com/share/83859c4c-8894-41...,2023-07-18T06:14:42Z,,2023-07-18T06:14:42Z,OPEN,[{'URL': 'https://chat.openai.com/share/83859c...


In [12]:
# A function for defining issue type from string.
def customize_issue_type(content):
    # make column content all lower case for word matching.
    content = content.lower()
    
    # Define all the keywords list I can think about and the list is made by AI for fast testing.
    bug_keywords = ['bug', 'issue', 'glitch', 'fault']
    feature_keywords = ['feature', 'enhancement', 'request', 'improvement']
    error_keywords = ['error', 'failure', 'exception', 'crash']
    question_keywords = ['question', 'theory', 'explanation', 'concept']
    conceptual_keywords = ['concept', 'definition', 'explain', 'meaning', 'understand', 'clarification']
    implementation_keywords = ['how to', 'implement', 'example', 'code', 'usage', 'apply', 'best practice']
    performance_keywords = ['performance', 'optimize', 'efficiency', 'slow', 'improve', 'speed', 'bottleneck']
    security_keywords = ['security', 'vulnerability', 'attack', 'exploit', 'hack', 'safe']
    debug_keywords = ['debug', 'troubleshoot', 'fix', 'diagnose', 'log', 'trace', 'resolve']
    
    # Check if any keyword from the lists is in the body
    if any(word in content for word in bug_keywords):
        return 'Bug'
    elif any(word in content for word in feature_keywords):
        return 'Feature Request'
    elif any(word in content for word in error_keywords):
        return 'Error'
    elif any(word in content for word in question_keywords):
        return 'Theoretical Question'
    elif any(word in content for word in conceptual_keywords):
        return 'Conceptual Question'
    elif any(word in content for word in implementation_keywords):
        return 'Implementation'
    elif any(word in content for word in performance_keywords):
        return 'Performance'
    elif any(word in content for word in security_keywords):
        return 'Security'
    elif any(word in content for word in debug_keywords):
        return 'Debug'
    else:
        return "Others"
    

# now make a function to get the prompt from the nested structure in the ChatgptSharing column
def extract_prompt(chatgpt_sharing):
    # A list to store prompts into a single string.
    prompt_list = []
    if not isinstance(chatgpt_sharing, list):
        return None
    for elements in chatgpt_sharing:
        # check if conversations in the dictionary
        if "Conversations" in elements:
            # access elements in the conversations dict using the key.
            for parts in elements["Conversations"]:
                if "Prompt" in parts:
                    # add nothing if no prompt found.
                    prompt_list.extend([parts.get("Prompt", "")])
    # make all prompt for one conversation into a singel string.
    return " ".join(prompt_list)

# new column called prompt
df_issues["Prompt"] = df_issues["ChatgptSharing"].apply(extract_prompt)

print(df_issues["Prompt"])

df_issues["Issue_type"] = df_issues["Prompt"].apply(customize_issue_type)

print("\n")
print(df_issues["Issue_type"].value_counts())

0      あなたはwebデザイナーです。ハンバーガーメニューを実装したところ、初めからメニューの内容が...
1      How to add a java class in a generic container...
2      explain this code\n\nimport collections\nimpor...
3      #include <stdio.h>\n#include <stdlib.h>\n#incl...
4                                                       
                             ...                        
230    For iPhone 6+ (4K 30 FPS) I got new data.\n7 s...
231    Pourriez-vous expliquer la ligne de commande s...
232    Is "immature tool written by noobs for noobs "...
233    Using docker compose I get the following (usin...
234    In Linux, when you attach an ethernet cable to...
Name: Prompt, Length: 235, dtype: object


Issue_type
Others                  80
Bug                     54
Implementation          49
Error                   22
Feature Request          9
Conceptual Question      8
Theoretical Question     7
Debug                    5
Performance              1
Name: count, dtype: int64


In [13]:
# make the plot for better visutalization
import altair as alt 

# group the data by issue type and count the number of issues
group_issue_df = df_issues.groupby(["Issue_type"]).count().reset_index()

issue_type_chart = alt.Chart(group_issue_df).mark_bar().encode(
    x = alt.X("Issue_type:N", sort = "-y"),
    y = "Number:Q", 
    color="Issue_type:N",
    tooltip='Number'
).properties(
    title = "Issue Type",
    width = 500,
    height = 300
).interactive()

issue_type_chart