In [1]:
# Research Question 6. What types of quality issues are common in ChatGPT-generated code, focusing on Python, JavaScript, and Bash?
# Group 4: Shiqi Zhang, Tyler Stevenson, Zefeng Pei

In [37]:
import pandas as pd
import json
import os
import subprocess
import re
import altair as alt
from collections import Counter


In [3]:
# Load the JSON file
with open("20231012_234250_file_sharings.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract ListOfCode content
code_snippets = []

for chat in data.get("Sources", []):
    for conversation in chat.get("ChatgptSharing", []):
        for conv in conversation.get("Conversations", []):
            for code in conv.get("ListOfCode", []):
                code_snippets.append({
                    "Type": code.get("Type", "Unknown"),  # Handle missing Type
                    "Content": code.get("Content", ""),  # Handle missing Content
                })

# Convert to DataFrame
df = pd.DataFrame(code_snippets)

# Display the DataFrame
df

Unnamed: 0,Type,Content
0,bash,sudo apt update\nsudo apt install git\n
1,bash,sudo dnf install git\n
2,bash,sudo yum install git\n
3,bash,sudo dnf install git\n
4,bash,sudo zypper install git\n
...,...,...
14127,python,m = folium.Map()\n
14128,python,geojson_data = gdf.to_json()\n
14129,python,folium.GeoJson(geojson_data).add_to(m)\n
14130,python,m.save('map.html')\n# OR\nm\n


In [4]:
df['Type'].unique()

array(['bash', 'python', 'csv', 'json', 'typescript', 'yaml', 'lua',
       'javascript', 'css', 'csharp', 'SudoLang', 'c', 'go', 'text',
       'arduino', None, 'makefile', 'powershell', 'swift', 'dart',
       'plaintext', 'cmd', 'markdown', 'scss', 'java', 'ql', 'codeql',
       'shell', 'R', 'dot', 'html', 'cpp', 'vbnet', 'excel', 'hcl',
       'prisma', 'ts', 'rust', 'matlab', 'perl', 'ejs', 'solidity',
       'ruby', 'less', 'sql', 'php', 'plantuml', 'liquid', 'diff',
       'gradle', 'kotlin', 'groovy', 'xml', 'CPP', 'jsx', 'vba',
       'haskell', 'http', 'toml', 'vue', 'Dockerfile', 'mathematica',
       'verilog', 'js', 'ini', 'nginx', 'tsx', 'hlsl', 'sh', 'regex',
       'qmake', 'elixir', 'mermaid', 'glsl', 'cmake', 'make', 'objective',
       'c#', 'C#', 'latex', 'pseudo', 'elisp', 'batch', 'jsonl', 'razor',
       'julia', 'apex', 'yml', 'ocaml', 'r', 'turtle', 'vim', 'proto',
       'graphql', 'objc', 'emacs', 'lisp', 'applescript', 'clojure',
       'env', 'tf', 'dslx',

In [6]:
# Load the dataset
df_filtered = df[df["Type"].isin(["python", "javascript", "bash"])]

# Sample 100 snippets per language
df_sampled = df_filtered.groupby("Type").apply(lambda x: x.sample(n=500, random_state=42)).reset_index(drop=True)

df_sampled

  df_sampled = df_filtered.groupby("Type").apply(lambda x: x.sample(n=500, random_state=42)).reset_index(drop=True)


Unnamed: 0,Type,Content
0,bash,pip install pygame\n
1,bash,pipenv install django\n
2,bash,source ~/.bashrc # If using bash\n# or\nsourc...
3,bash,export PATH=$PATH:/mingw64/bin\n
4,bash,git checkout <your-branch>\n
...,...,...
1495,python,"genrule(\n name = ""build_docker_image"",\n ..."
1496,python,"q_learning_instance = QLearning(initial_board,..."
1497,python,# Use loc to select rows based on a condition\...
1498,python,This is a text file.\n@other_file.txt\nThis is...


In [7]:
output_dir = "sampled_code_snippets"
os.makedirs(output_dir, exist_ok=True)

# File extensions per language
file_extensions = {"python": "py", "javascript": "js", "bash": "sh"}

# Save each sampled snippet as a separate file
for idx, row in df_sampled.iterrows():
    lang = row["Type"]
    file_path = os.path.join(output_dir, f"{lang}_code_{idx}.{file_extensions[lang]}")
    
    with open(file_path, "w") as f:
        f.write(row["Content"])

print("Sampled code snippets saved as files in 'sampled_code_snippets/'")

Sampled code snippets saved as files in 'sampled_code_snippets/'


In [8]:
# Define the directory containing sampled snippets
code_dir = "sampled_code_snippets"

# Initialize results storage
linter_results = []

# Define linter commands for each language (only key checks for efficiency)
linters = {
    "python": ["pylint"],
    "javascript": ["eslint"],
    "bash": ["shellcheck"]
}

# Get all sampled code files
code_files = [f for f in os.listdir(code_dir) if f.endswith((".py", ".js", ".sh"))]

# Process each file
for file in code_files:
    file_path = os.path.join(code_dir, file)
    lang = "python" if file.endswith(".py") else "javascript" if file.endswith(".js") else "bash"

    # Run the linter
    linter_cmd = linters[lang] + [file_path]
    result = subprocess.run(linter_cmd, capture_output=True, text=True)
    
    # Store only the first 5000 characters to avoid huge logs
    linter_results.append({"File": file, "Language": lang, "Linter Output": result.stdout[:5000]})

print("Linter completed. Saving results...")

# Save results to CSV
df_results = pd.DataFrame(linter_results)
df_results.to_csv("linter_results_sampled.csv", index=False)

print("Linter results saved to 'linter_results_sampled.csv'")

Linter completed. Saving results...
Linter results saved to 'linter_results_sampled.csv'


In [9]:
# start point for analysis
df_results = pd.read_csv("linter_results_sampled.csv")


In [10]:
# Combine all linter messages into a single text
all_linter_text = " ".join(df_results["Linter Output"].dropna())

print("Total length of combined linter text:", len(all_linter_text))

Total length of combined linter text: 886567


In [29]:
# Define non-meaningful errors
non_meaningful_shellcheck = {
    "SC2148" # unknown shell
}

non_meaningful_pylint = {
    "E0401", # import error
    "E0602", # undefined variable
    "F0002" # astroid-error
}

non_meaningful_eslint = {  # Fetch full list from: https://denar90.github.io/eslint.github.io/docs/rules/
    "no-undef", "import/no-unresolved", "import/named", "import/namespace", "import/default", "import/no-extraneous-dependencies"
}

# Combine all non-meaningful error sets with file path
non_meaningful_errors = non_meaningful_shellcheck | non_meaningful_pylint | non_meaningful_eslint | {"/Users/shiqizhang/DATA542/Data-542-Group-4"}



In [30]:
# Split text into sentences using regex (splitting on line breaks, periods, or semicolons)
linter_sentences = re.split(r'[\n;.]+', all_linter_text)

print("Total extracted sentences:", len(linter_sentences))
print("Sample sentence before filtering:", linter_sentences[:5])  # Preview first few sentences

# Keep only meaningful sentences
filtered_sentences = [
    sentence for sentence in linter_sentences
    if not any(error_code in sentence for error_code in non_meaningful_errors)
]

print("Remaining valid sentences:", len(filtered_sentences))
print("Sample sentence after filtering:", filtered_sentences[:5])  # Preview first few sentences


Total extracted sentences: 20169
Sample sentence before filtering: ['\x1b[0m\x1b[0m', '\x1b[0m\x1b[4m/Users/shiqizhang/DATA542/Data-542-Group-4/sampled_code_snippets/javascript_code_956', 'js\x1b[24m\x1b[0m', "\x1b[0m  \x1b[2m1:24\x1b[22m  \x1b[31merror\x1b[39m  'axios' is not defined     \x1b[2mno-undef\x1b[22m\x1b[0m", "\x1b[0m  \x1b[2m9:1\x1b[22m   \x1b[31merror\x1b[39m  'setimage' is not defined  \x1b[2mno-undef\x1b[22m\x1b[0m"]
Remaining valid sentences: 16409


In [31]:
# Reconstruct cleaned linter output as a single text block
cleaned_linter_text = " ".join(filtered_sentences)

print("Total length after filtering:", len(cleaned_linter_text))

# Save cleaned text to a file
with open("cleaned_linter_output.txt", "w") as f:
    f.write(cleaned_linter_text)

print("Filtered linter output saved as cleaned_linter_output.txt")


Total length after filtering: 590416
Filtered linter output saved as cleaned_linter_output.txt


In [40]:
import re

# Function to remove ANSI escape sequences
def remove_ansi_codes(text):
    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
    return ansi_escape.sub('', text)

# Read raw linter output
with open("cleaned_linter_output.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print("Total characters before ANSI cleaning:", len(raw_text))

# Remove ANSI escape codes
cleaned_text = remove_ansi_codes(raw_text)

# Save the cleaned output back to the file
with open("cleaned_linter_output.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print("Total characters after ANSI cleaning:", len(cleaned_text))


Total characters before ANSI cleaning: 590416
Total characters after ANSI cleaning: 545463


In [41]:
# Updated regex to capture all error types
error_pattern = r'\b[A-Z]{1,2}\d{3,4}\b|\bno-[a-z-]+\b'  # Matches SC1063, E0602, and ESLint errors like no-unused-vars

# Extract all error codes
error_codes = re.findall(error_pattern, cleaned_text)

# Count occurrences of each error
error_counts = Counter(error_codes)

# Convert to DataFrame for visualization
import pandas as pd
df_error_counts = pd.DataFrame(error_counts.items(), columns=["Error Code", "Count"]).sort_values(by="Count", ascending=False)

# Display results
df_error_counts

Unnamed: 0,Error Code,Count
2,C0114,460
6,C0103,303
49,C0303,294
4,C0116,287
0,no-unused-vars,245
...,...,...
1,no-drop,1
52,W0640,1
62,W2301,1
54,W0702,1


In [42]:
print("Top 10 Most Frequent Issues:")
print(df_error_counts.head(10))

Top 10 Most Frequent Issues:
        Error Code  Count
2            C0114    460
6            C0103    303
49           C0303    294
4            C0116    287
0   no-unused-vars    245
9            C0301    191
3            C0115     80
20          SC2164     73
23           W0621     61
18          SC2086     61


In [43]:
# Create bar chart using Altair
chart = alt.Chart(df_error_counts.head(10)).mark_bar().encode(
    x=alt.X("Error Code:N", title="Error Code", sort="-y"),
    y=alt.Y("Count:Q", title="Frequency"),
    tooltip=["Error Code", "Count"]
).properties(
    title="Top 10 Most Frequent Linter Issues",
    width=600,
    height=400
)

# Display chart
chart

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [39]:
# C0114: missing-module-docstring, C0103: invalid-name, C0303: trailing-whitespace, C0116: missing-function-docstring,
# no-unused-vars: disallow unused variables, C0301: line-too-long, C0115: missing-class-docstring, SC2164: Use `cd ... || exit` in case `cd` fails, 
# W0621: redefined-outer-name, SC2086: Double quote to prevent globbing and word splitting
