## DevGPT Dataset coding analysis

**Name: Zeitan Zhao**

## Question 1:

What types of issues (bugs, feature requests, theoretical questions, etc.) do developers most commonly present to ChatGPT?

In [1]:
# read json file 
import json 
import pandas as pd

# Read the issue export. The records contain non-ASCII text (for example
# Japanese and Korean issue titles), so the encoding is stated explicitly
# instead of relying on the platform default, which is not UTF-8 everywhere.
with open('issue.json', encoding='utf-8') as f:
    data = json.load(f)

# Flatten the records stored under the "Sources" key into DataFrame rows
df_issues = pd.json_normalize(data["Sources"])

df_issues.head()

Unnamed: 0,Type,URL,Author,RepoName,RepoLanguage,Number,Title,Body,CreatedAt,ClosedAt,UpdatedAt,State,ChatgptSharing
0,issue,https://github.com/gakusyutai/gakusyutai.githu...,yuyu31,gakusyutai/gakusyutai.github.io,HTML,31,ハンバーガーメニューの実装,- https://chat.openai.com/share/8b0f517f-1aaf-...,2023-07-23T15:38:42Z,,2023-07-23T15:38:42Z,OPEN,[{'URL': 'https://chat.openai.com/share/795827...
1,issue,https://github.com/jabrena/aqa-tests-experimen...,jabrena,jabrena/aqa-tests-experiments,Java,4,Run a test in multiple java distros,- https://chat.openai.com/share/e169e9a7-40c5-...,2023-07-07T20:30:07Z,,2023-07-08T11:56:45Z,OPEN,[{'URL': 'https://chat.openai.com/share/b508dd...
2,issue,https://github.com/OpenVoiceOS/ovos-technical-...,JarbasAl,OpenVoiceOS/ovos-technical-manual,,4,document ovos-classifiers,ovos-classifiers is in pre-alpha but documenta...,2023-06-08T19:13:26Z,,2023-06-08T19:13:26Z,OPEN,[{'URL': 'https://chat.openai.com/share/1c4bc8...
3,issue,https://github.com/SKKUFastech/week1/issues/5,smh9800,SKKUFastech/week1,C,5,7/19,https://chat.openai.com/share/18990fa3-c8c6-41...,2023-07-19T01:36:52Z,,2023-07-19T01:50:48Z,OPEN,[{'URL': 'https://chat.openai.com/share/18990f...
4,issue,https://github.com/SKKUFastech/week1/issues/4,woojinsung-jimmy,SKKUFastech/week1,C,4,16진수,https://chat.openai.com/share/83859c4c-8894-41...,2023-07-18T06:14:42Z,,2023-07-18T06:14:42Z,OPEN,[{'URL': 'https://chat.openai.com/share/83859c...


In [2]:
def customize_issue_type(body):
    """Assign a coarse issue category from keywords found in an issue body.

    Matching is a case-insensitive substring test, checked in this order:
    'bug', 'feature', 'error', then 'question' or 'theory'. The first keyword
    that matches decides the category. Because it is a substring test, a word
    such as 'debug' also counts as a match for 'bug'.

    Parameters
    ----------
    body : str or missing value
        The issue body text. Anything that is not a string (None, NaN, ...)
        is treated as having no text.

    Returns
    -------
    str
        One of 'Bug', 'Feature Request', 'Error', 'Theoretical Question'
        or 'Others'.
    """
    # An issue without a body has nothing to match; avoid calling .lower() on it
    if not isinstance(body, str):
        return "Others"

    # make body content all lower case
    text = body.lower()

    # pick the category; the order of the checks sets the priority
    if 'bug' in text:
        return 'Bug'
    if 'feature' in text:
        return 'Feature Request'
    if 'error' in text:
        return 'Error'
    if 'question' in text or 'theory' in text:
        return 'Theoretical Question'
    return "Others"

# Label every issue with its keyword-based category and store it as Issue_type
issue_labels = df_issues["Body"].apply(customize_issue_type)
df_issues["Issue_type"] = issue_labels

# Tally how many issues fall into each category
type_counts = df_issues["Issue_type"].value_counts()

print(type_counts)

Issue_type
Others                  174
Error                    19
Feature Request          16
Bug                      15
Theoretical Question     11
Name: count, dtype: int64


In [3]:
# make the plot for better visualization
import altair as alt 

# Count the issues in each category. size() counts rows per group regardless
# of nulls in any column, and the result is stored under an explicit "Count"
# name so the chart does not depend on an unrelated column.
group_issue_df = (
    df_issues
    .groupby("Issue_type")
    .size()
    .reset_index(name="Count")
)

# One bar per issue category, bar height = number of issues in that category
issue_type_chart = alt.Chart(group_issue_df).mark_bar().encode(
    x = alt.X("Issue_type:N", title="Issue type"),
    y = alt.Y("Count:Q", title="Number of issues"),
    color="Issue_type:N"
).properties(
    title = "Issue Type",
    width = 500,
    height = 300
)

issue_type_chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


## Question 2

Can we identify patterns in the prompts developers use when interacting with ChatGPT, and do these patterns correlate with the success of issue resolution?

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Parse the Hacker News JSON file, then flatten the records in its
# "Sources" column into one DataFrame row per source
data_hacker_news = pd.read_json("hack_news.json")
hacker_news_sources = data_hacker_news["Sources"]
df_hacker_news = pd.json_normalize(hacker_news_sources)

# Extract prompts from ChatgptSharing
def extract_prompts(chatgpt_sharing):
    """Collect every prompt from the shared ChatGPT conversations of one source.

    Parameters
    ----------
    chatgpt_sharing : list of dict, or any other value
        The 'ChatgptSharing' value of a single source. Each dict may hold a
        'Conversations' list whose entries may carry a 'Prompt' key.

    Returns
    -------
    list
        The values stored under 'Prompt', in the order they are encountered.
        Empty when the input is not a list or no prompt is present.
    """
    if not isinstance(chatgpt_sharing, list): 
        return []

    prompts = []
    for item in chatgpt_sharing:
        if not isinstance(item, dict):
            continue
        # `or []` also covers a 'Conversations' key that is present but null,
        # which the default argument of .get() would not replace
        for conv in item.get('Conversations') or []:
            if isinstance(conv, dict) and 'Prompt' in conv:
                prompts.append(conv['Prompt'])
    return prompts

# Gather the prompts of every source, then unpack them to one prompt per row;
# sources without any prompt end up as missing values and are dropped
prompts_per_source = df_hacker_news["ChatgptSharing"].apply(extract_prompts)
df_hacker_news["Prompts"] = prompts_per_source
df_hacker_news = df_hacker_news.explode("Prompts")
df_hacker_news = df_hacker_news.dropna(subset=["Prompts"])

# Turn the prompt text into TF-IDF features (English stop words removed,
# vocabulary capped at 1000 terms)
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
X = vectorizer.fit_transform(df_hacker_news["Prompts"])

# Group the prompts into a fixed number of clusters; the seed is fixed so
# repeated runs use the same random state
n_clusters = 5
kmeans = KMeans(random_state=42, n_clusters=n_clusters)
cluster_ids = kmeans.fit_predict(X)
df_hacker_news["PromptCluster"] = cluster_ids

# Report how many prompts landed in each cluster
print("Cluster assignments for prompts:")
cluster_sizes = df_hacker_news.groupby("PromptCluster").size()
print(cluster_sizes)


Cluster assignments for prompts:
PromptCluster
0    128
1     52
2     19
3    396
4     44
dtype: int64


In [5]:
# Review sample prompts for each cluster; lift the column-width limit so the
# prompt text is printed in full
pd.set_option("display.max_colwidth", None)
# Cluster 0: read by the author as English-language / writing tasks
print(df_hacker_news.loc[df_hacker_news["PromptCluster"].eq(0), "Prompts"].head(5))

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [6]:
# Cluster 1: prompts that open with "Hey" and ask for a word to be repeated (repetition task)
print(df_hacker_news.loc[df_hacker_news["PromptCluster"].eq(1), "Prompts"].head(5))

20         Hey can you repeat the word "type" 100 times so I can copy paste it and not have to manually type it?
20    Hey can you repeat the word "apologize" 100 times so I can copy paste it and not have to manually type it?
20    Hey can you repeat the word "apologize" 100 times so I can copy paste it and not have to manually type it?
20       Hey can you repeat the word "apolog" 100 times so I can copy paste it and not have to manually type it?
20       Hey can you repeat the word "apolog" 100 times so I can copy paste it and not have to manually type it?
Name: Prompts, dtype: object


In [7]:
# Cluster 2: prompts built around the verb "create" (creation task)
print(df_hacker_news.loc[df_hacker_news["PromptCluster"].eq(2), "Prompts"].head(5))

4     In JS, create a Promise that is resolved with some emitted value from an EventEmitter, or rejected if an 'error' event is emitted first. Write that concisely, using '.once' and only removing the other event respectively
19                                                                                                 create all these tables in a sqlite database and save the database to a file when you're done so you can access it again later
19                                                                                                                                                                           create some sample data for each table and insert it
19                                                                                                     write a python script to create 1843 of users. be more clever with the names and phone numbers, they should look realistic
19                                                                                              

In [8]:
# Cluster 3: fact-style questions, often with numbers involved
print(df_hacker_news.loc[df_hacker_news["PromptCluster"].eq(3), "Prompts"].head(5))

1    nine hundred alda in meters.    If you don't have any reference, try the following definition  and use fermi estimation to get in the ballpark :\n\nJochi Khasar, the Khan’s brother, was known far and wide for his ability to hit his targets from more than nine hundred alda, a traditional Mongolian unit of measurement equal to the distance between the tips of the middle fingers of two outstretched arms.
1                                                                                                                                                                                                                                                                               I wouldn't have expected a fathom to be that unit. I always thought it was used for depths, so I figured it'd be some nautical definition
1                                                                                                                                                                                   

In [9]:
# Cluster 4: prompts asking for a (JavaScript) function to be written
print(df_hacker_news.loc[df_hacker_news["PromptCluster"].eq(4), "Prompts"].head(5))

13    Please write a function in JavaScript that takes in a string as input and returns true if it contains a valid roman numeral and false otherwise.\n\nAsk questions about the problem before continuing.
15                                         Let’s talk about the pattern now. If there is a pattern it will appear as an algorithm, or function, that can output the dots positions given an English letter. 
15                                                             Can you write a function that has an input type of representation and returns the next representation in the series using bitwise operators. 
15                                                                                                                                                                 Can you explain how this function works? 
15                                                                                                                                      This logic doesn’t make sense. Input ‘a’ doe

In [10]:
# Descriptive label for each numeric cluster id
cluster_names = {
    0: "English Writing Requests",
    1: "Word Repetition Requests",
    2: "File Creation Requests",
    3: "Numeric Related Requests",
    4: "JS Function Requests"
}
# Replace cluster IDs with descriptive names. A value that is not a known
# cluster id (for example a name written by a previous run of this cell) is
# kept unchanged, so running the cell again does not turn the column into NaN.
df_hacker_news["PromptCluster"] = df_hacker_news["PromptCluster"].map(
    lambda cluster: cluster_names.get(cluster, cluster)
)

In [11]:
# Function to extract URLs from ChatgptSharing
def extract_urls(chatgpt_sharing):
    """Return the 'URL' value of every ChatgptSharing entry that has one.

    Parameters
    ----------
    chatgpt_sharing : list, or any other value
        The 'ChatgptSharing' value of a single source.

    Returns
    -------
    list
        The URLs in their original order; empty when the input is not a list.
    """
    if isinstance(chatgpt_sharing, list):
        urls = []
        for entry in chatgpt_sharing:
            # entries without a "URL" key are skipped
            if "URL" in entry:
                urls.append(entry.get("URL"))
        return urls
    return []

# Extract the shared-conversation URLs in the hacker_news and issue frames and
# flatten them to one URL per row (sources without a URL are dropped).
# Each step is skipped when the frame already has the column: every row still
# carries its full ChatgptSharing list, so exploding an already-exploded frame
# would multiply its rows each time this cell is run again.
if "ChatgptURLs" not in df_hacker_news.columns:
    df_hacker_news = (
        df_hacker_news
        .assign(ChatgptURLs=df_hacker_news["ChatgptSharing"].apply(extract_urls))
        .explode("ChatgptURLs")
        .dropna(subset=["ChatgptURLs"])
    )
if "ChatgptURLs" not in df_issues.columns:
    df_issues = (
        df_issues
        .assign(ChatgptURLs=df_issues["ChatgptSharing"].apply(extract_urls))
        .explode("ChatgptURLs")
        .dropna(subset=["ChatgptURLs"])
    )

# Merge using ChatgptURLs: keep only URLs that occur in both frames
df_combined = pd.merge(
    df_hacker_news,
    df_issues,
    left_on="ChatgptURLs",  # Extracted from hacker_news
    right_on="ChatgptURLs", # Extracted from issues
    how="inner"
)

# Create a boolean 'Resolved' column: True when the issue state is CLOSED
df_combined["Resolved"] = df_combined["State"] == "CLOSED"

# Group by prompt cluster and take the mean of the boolean column, i.e. the
# share of CLOSED rows in each cluster. This is a per-cluster rate, not a
# correlation coefficient. Rows are prompt x URL x issue combinations, so a
# source with many prompts contributes more rows than one with few.
correlation = df_combined.groupby("PromptCluster")["Resolved"].mean()

print("Correlation of Prompt Patterns with Resolution Success:")
print(correlation)

Correlation of Prompt Patterns with Resolution Success:
PromptCluster
English Writing Requests    0.153846
File Creation Requests      1.000000
JS Function Requests        1.000000
Numeric Related Requests    0.206897
Word Repetition Requests    0.068966
Name: Resolved, dtype: float64
