diff --git a/README.md b/README.md
index 976367e..4d881f1 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ With Code2Prompt, you can easily create a well-structured and informative docume
 - Respects .gitignore files to exclude unwanted files and directories
 - Generates a table of contents with links to each file section
 - Provides file metadata such as extension, size, creation time, and modification time
+- Optionally strips comments from code files to focus on the core code
 - Includes the actual code content of each file in fenced code blocks
 - Handles binary files and files with encoding issues gracefully
 
@@ -102,6 +103,7 @@ code2prompt --path /path/to/your/codebase --output output.md
 - `--output` (optional): Name of the output Markdown file. If not provided, the output will be displayed in the console.
 - `--gitignore` (optional): Path to a custom .gitignore file. If not provided, the tool will look for a .gitignore file in the specified directory.
 - `--filter` (optional): Filter pattern to include specific files (e.g., "*.py" to include only Python files).
+- `--suppress-comments` (optional): Strip comments from the code files. If not provided, comments will be included.
 
 ### Examples
 
@@ -120,6 +122,11 @@ code2prompt --path /path/to/your/codebase --output output.md
    code2prompt --path /path/to/your/project --output project.md --gitignore /path/to/custom/.gitignore
    ```
 
+4. Generate a Markdown file with comments stripped from code files:
+   ```
+   code2prompt --path /path/to/your/project --output project.md --suppress-comments
+   ```
+
 ## Build
 
 To build a distributable package of Code2Prompt using Poetry, follow these steps:
diff --git a/code2prompt/comment_stripper.py b/code2prompt/comment_stripper.py
new file mode 100644
index 0000000..3552ebe
--- /dev/null
+++ b/code2prompt/comment_stripper.py
@@ -0,0 +1,226 @@
+""" A collection of functions to strip comments from code strings based on the programming language. """
+
+import re
+
+# Quoted string literals with backslash escapes. Every pattern below lists
+# them next to its comment syntax so comment markers inside strings survive.
+_STRING_LITERALS = r'\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"'
+
+_C_STYLE_PATTERN = re.compile(
+    r"//.*?$|/\*.*?\*/|" + _STRING_LITERALS, re.DOTALL | re.MULTILINE
+)
+_HTML_STYLE_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)
+_PYTHON_STYLE_PATTERN = re.compile(
+    r'#.*?$|\'\'\'.*?\'\'\'|""".*?"""|' + _STRING_LITERALS,
+    re.DOTALL | re.MULTILINE,
+)
+_SQL_STYLE_PATTERN = re.compile(
+    r"--.*?$|/\*.*?\*/|" + _STRING_LITERALS, re.DOTALL | re.MULTILINE
+)
+# MATLAB strings escape a quote by doubling it and never span lines. A quote
+# right after an identifier, closing bracket, dot or quote is the transpose
+# operator, not the start of a string.
+_MATLAB_STYLE_PATTERN = re.compile(
+    r"%[^\n]*"
+    r"|(?<![\w)\]}.'])'(?:''|[^'\n])*'"
+    r'|"(?:""|[^"\n])*"'
+)
+_R_STYLE_PATTERN = re.compile(r"#.*?$|" + _STRING_LITERALS, re.DOTALL | re.MULTILINE)
+
+
+def _drop_comment_matches(pattern, code: str) -> str:
+    """
+    Removes the comments matched by a pattern while keeping string literals.
+
+    :param pattern: Compiled regex whose alternatives match comments and string literals.
+    :param code: The code string to strip comments from.
+    :return: The code string with the matched comments removed.
+    """
+
+    def replace(match):
+        text = match.group(0)
+        if text.startswith(("'", '"')):
+            return text
+        # A block comment may be the only separator between two tokens
+        # ("int/**/x"), so it collapses to one space instead of vanishing.
+        return " " if text.startswith("/*") else ""
+
+    return pattern.sub(replace, code)
+
+
+def strip_c_style_comments(code: str) -> str:
+    """
+    Strips C-style comments from the given code string.
+    Supports single-line comments (//), multi-line comments (/* */), and string literals.
+
+    :param code: The code string to strip comments from.
+    :return: The code string with C-style comments removed.
+    """
+    return _drop_comment_matches(_C_STYLE_PATTERN, code)
+
+
+def strip_html_style_comments(code: str) -> str:
+    """
+    Strips HTML-style comments from the given code string.
+    Supports both single-line and multi-line comments (<!-- -->).
+
+    :param code: The code string to strip comments from.
+    :return: The code string with HTML-style comments removed.
+    """
+    return _HTML_STYLE_PATTERN.sub("", code)
+
+
+def strip_python_style_comments(code: str) -> str:
+    """
+    Strips Python-style comments from the given code string.
+    Supports single-line comments (#) and triple-quoted blocks (''' ''' or \"\"\" \"\"\")
+    that stand alone on their line; string literals used as values are kept.
+
+    :param code: The code string to strip comments from.
+    :return: The code string with Python-style comments removed.
+    """
+
+    def replace(match):
+        text = match.group(0)
+        if text.startswith("#"):
+            return ""
+        if text.startswith(("'''", '"""')):
+            # Only a block that opens its line is a docstring or comment; one that
+            # follows other code (x = """...""") is data and must survive.
+            line_start = code.rfind("\n", 0, match.start()) + 1
+            if not code[line_start : match.start()].strip():
+                return ""
+        return text
+
+    return _PYTHON_STYLE_PATTERN.sub(replace, code)
+
+
+def _shell_comment_start(line: str) -> int:
+    """
+    Finds where a # comment starts in a single shell line.
+
+    :param line: One line of shell code.
+    :return: The index of the # that opens a comment, or -1 if there is none.
+    """
+    quote = ""
+    escaped = False
+    for index, char in enumerate(line):
+        if escaped:
+            escaped = False
+        elif quote == "'":
+            # Nothing is special inside single quotes except the closing quote.
+            if char == "'":
+                quote = ""
+        elif char == "\\":
+            escaped = True
+        elif quote == '"':
+            if char == '"':
+                quote = ""
+        elif char in "'\"":
+            quote = char
+        elif char == "#" and (index == 0 or line[index - 1].isspace()):
+            # The shell only treats # as a comment at the start of a word, which
+            # keeps $#, ${#var} and a#b intact.
+            return index
+    return -1
+
+
+def strip_shell_style_comments(code: str) -> str:
+    """
+    Strips shell-style comments from the given code string.
+    Supports single-line comments (#) and multi-line comments (: ' ').
+    Shebang lines are kept, and # inside quotes or inside a word is not a comment.
+
+    :param code: The code string to strip comments from.
+    :return: The code string with shell-style comments removed.
+    """
+    new_lines = []
+    in_multiline_comment = False
+
+    for line in code.split("\n"):
+        stripped = line.strip()
+        if stripped.startswith("#!"):
+            # Preserve shebang lines
+            new_lines.append(line)
+        elif in_multiline_comment:
+            if stripped.endswith("'"):
+                in_multiline_comment = False
+        elif stripped.startswith(": '"):
+            # ": 'text'" opens and closes on the same line; only an unclosed
+            # block swallows the lines that follow.
+            in_multiline_comment = len(stripped) == 3 or not stripped.endswith("'")
+        else:
+            comment_start = _shell_comment_start(line)
+            if comment_start == -1:
+                new_lines.append(line)
+            elif line[:comment_start].strip():
+                # Drop the comment; a line holding nothing else disappears.
+                new_lines.append(line[:comment_start].rstrip())
+
+    return "\n".join(new_lines).strip()
+
+
+def strip_sql_style_comments(code: str) -> str:
+    """
+    Strips SQL-style comments from the given code string.
+    Supports single-line comments (--), multi-line comments (/* */), and string literals.
+
+    :param code: The code string to strip comments from.
+    :return: The code string with SQL-style comments removed.
+    """
+    return _drop_comment_matches(_SQL_STYLE_PATTERN, code)
+
+
+def strip_matlab_style_comments(code: str) -> str:
+    """
+    Strips MATLAB-style comments from the given code string.
+    Supports single-line comments (%), string literals, and the transpose operator (').
+
+    :param code: The code string to strip comments from.
+    :return: The code string with MATLAB-style comments removed.
+    """
+    return _drop_comment_matches(_MATLAB_STYLE_PATTERN, code)
+
+
+def strip_r_style_comments(code: str) -> str:
+    """
+    Strips R-style comments from the given code string.
+    Supports single-line comments (#) and string literals.
+
+    :param code: The code string to strip comments from.
+    :return: The code string with R-style comments removed.
+    """
+    return _drop_comment_matches(_R_STYLE_PATTERN, code)
+
+
+# Each stripper paired with the language names that use its comment syntax.
+_LANGUAGE_GROUPS = (
+    (
+        strip_c_style_comments,
+        "c cpp java javascript csharp php go rust kotlin swift scala dart",
+    ),
+    (strip_python_style_comments, "python ruby perl"),
+    (strip_shell_style_comments, "bash powershell shell"),
+    (strip_html_style_comments, "html xml"),
+    (strip_sql_style_comments, "sql plsql tsql"),
+    (strip_matlab_style_comments, "matlab octave"),
+    (strip_r_style_comments, "r"),
+)
+_STRIPPERS_BY_LANGUAGE = {
+    language: stripper
+    for stripper, languages in _LANGUAGE_GROUPS
+    for language in languages.split()
+}
+
+
+def strip_comments(code: str, language: str) -> str:
+    """
+    Strips comments from the given code string based on the specified programming language.
+    Code in a language without a registered stripper is returned unchanged.
+
+    :param code: The code string to strip comments from.
+    :param language: The programming language of the code.
+    :return: The code string with comments removed.
+    """
+    stripper = _STRIPPERS_BY_LANGUAGE.get(language)
+    return stripper(code) if stripper else code
diff --git a/code2prompt/language_inference.py b/code2prompt/language_inference.py
new file mode 100644
index 0000000..8c8c53b
--- /dev/null
+++ b/code2prompt/language_inference.py
@@ -0,0 +1,48 @@
+""" This module contains the function to infer the programming language based on the file extension. """
+
+import os
+
+# File extension (lowercase, with the dot) to language name.
+_LANGUAGE_BY_EXTENSION = {
+    ".c": "c",
+    ".h": "c",
+    ".cpp": "cpp",
+    ".hpp": "cpp",
+    ".cc": "cpp",
+    ".cxx": "cpp",
+    ".java": "java",
+    ".js": "javascript",
+    ".jsx": "javascript",
+    ".cs": "csharp",
+    ".php": "php",
+    ".go": "go",
+    ".rs": "rust",
+    ".kt": "kotlin",
+    ".swift": "swift",
+    ".scala": "scala",
+    ".dart": "dart",
+    ".py": "python",
+    ".rb": "ruby",
+    ".pl": "perl",
+    ".pm": "perl",
+    ".sh": "bash",
+    ".ps1": "powershell",
+    ".html": "html",
+    ".htm": "html",
+    ".xml": "xml",
+    ".sql": "sql",
+    ".m": "matlab",
+    ".r": "r",
+}
+
+
+def infer_language(filename: str) -> str:
+    """
+    Infers the programming language based on the file extension.
+    The match is case-insensitive; unrecognised extensions give "unknown".
+
+    :param filename: The name of the file.
+    :return: The inferred programming language.
+    """
+    _, extension = os.path.splitext(filename)
+    return _LANGUAGE_BY_EXTENSION.get(extension.lower(), "unknown")
diff --git a/code2prompt/main.py b/code2prompt/main.py
index 81d89cd..6388047 100644
--- a/code2prompt/main.py
+++ b/code2prompt/main.py
@@ -1,9 +1,13 @@
 """ Main module for the code2prompt package. """
+
 from datetime import datetime
 from pathlib import Path
 from fnmatch import fnmatch
 import click
 
+from code2prompt.language_inference import infer_language
+from code2prompt.comment_stripper import strip_comments
+
 
 def parse_gitignore(gitignore_path):
     """Parse the .gitignore file and return a set of patterns."""
@@ -98,7 +102,14 @@ def is_binary(file_path):
 @click.option(
     "--filter", "-f", type=str, help='Filter pattern to include files (e.g., "*.py").'
 )
-def create_markdown_file(path, output, gitignore, filter):
+@click.option(
+    "--suppress-comments",
+    "-s",
+    is_flag=True,
+    help="Strip comments from the code files.",
+    default=False,
+)
+def create_markdown_file(path, output, gitignore, filter, suppress_comments):
     """Create a Markdown file with the content of files in a directory."""
     content = []
     table_of_contents = []
@@ -127,6 +138,10 @@ def create_markdown_file(path, output, gitignore, filter):
             try:
                 with file_path.open("r", encoding="utf-8") as f:
                     file_content = f.read()
+                    if suppress_comments:
+                        language = infer_language(file_path.name)
+                        if language != "unknown":
+                            file_content = strip_comments(file_content, language)
             except UnicodeDecodeError:
                 # Ignore files that cannot be decoded
                 continue
@@ -158,4 +173,5 @@ def create_markdown_file(path, output, gitignore, filter):
 
 
 if __name__ == "__main__":
+    # pylint: disable=no-value-for-parameter
     create_markdown_file()
diff --git a/pyproject.toml b/pyproject.toml
index 7755a47..860297e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "code2prompt"
-version = "0.1.3"
+version = "0.2.0"
 description = ""
 authors = ["Raphael MANSUY "]
 readme = "README.md"
diff --git a/tests/test_comment_stripper.py b/tests/test_comment_stripper.py
new file mode 100644
index 0000000..ffc1e3e
--- /dev/null
+++ b/tests/test_comment_stripper.py
@@ -0,0 +1,165 @@
+"""Tests for the comment_stripper module."""
+import re
+from textwrap import dedent
+from code2prompt.comment_stripper import (
+    strip_c_style_comments,
+    strip_python_style_comments,
+    strip_shell_style_comments,
+    strip_html_style_comments,
+    strip_matlab_style_comments,
+    strip_sql_style_comments,
+    strip_r_style_comments,
+)
+
+
+def normalize_whitespace(text):
+    """ Normalize the whitespace in a string."""
+    return re.sub(r'\s+', ' ', text.strip())
+
+
+def test_strip_c_style_comments():
+    """Test the strip_c_style_comments function."""
+    code = """
+        int main() {
+            // Single-line comment
+            /* Multi-line
+               comment */
+            printf("Hello, World!"); // Inline comment
+        }
+    """
+    expected = """
+        int main() {
+            printf("Hello, World!");
+        }
+    """
+    assert normalize_whitespace(strip_c_style_comments(code)) == normalize_whitespace(dedent(expected))
+
+
+def test_strip_python_style_comments():
+    """Test the strip_python_style_comments function."""
+    code = """
+        def main():
+            # Single-line comment
+            '''
+            Multi-line
+            comment
+            '''
+            print("Hello, World!") # Inline comment
+    """
+    expected = """
+        def main():
+            print("Hello, World!")
+    """
+    assert normalize_whitespace(strip_python_style_comments(code)) == normalize_whitespace(dedent(expected))
+
+
+def test_strip_shell_style_comments():
+    """Test the strip_shell_style_comments function."""
+    code = """
+        #!/bin/bash
+        # Single-line comment
+        : '
+        Multi-line
+        comment
+        '
+        echo "Hello, World!" # Inline comment
+    """
+    expected = """
+        #!/bin/bash
+        echo "Hello, World!"
+    """
+    assert normalize_whitespace(strip_shell_style_comments(code)) == normalize_whitespace(dedent(expected))
+
+
+def test_strip_html_style_comments():
+    """Test the strip_html_style_comments function."""
+    code = """
+        <html>
+        <!-- Single-line comment -->
+        <body>
+            <!-- Multi-line
+                 comment -->
+            <p>Hello, World!</p>
+        </body>
+        </html>
+    """
+    expected = """
+        <html>
+        <body>
+            <p>Hello, World!</p>
+        </body>
+        </html>
+    """
+    assert normalize_whitespace(strip_html_style_comments(code)) == normalize_whitespace(dedent(expected))
+
+
+def test_strip_matlab_style_comments():
+    """Test the strip_matlab_style_comments function."""
+    code = """
+        % Single-line comment
+        function y = foo(x)
+            % Multi-line
+            % comment
+            y = x + 1; % Inline comment
+        end
+    """
+    expected = """
+        function y = foo(x)
+            y = x + 1;
+        end
+    """
+    assert normalize_whitespace(strip_matlab_style_comments(code)) == normalize_whitespace(dedent(expected))
+
+
+def test_strip_sql_style_comments():
+    """Test the strip_sql_style_comments function."""
+    code = """
+        SELECT *
+        FROM table
+        -- Single-line comment
+        /* Multi-line
+           comment */
+        WHERE condition; -- Inline comment
+    """
+    expected = """
+        SELECT *
+        FROM table
+        WHERE condition;
+    """
+    assert normalize_whitespace(strip_sql_style_comments(code)) == normalize_whitespace(dedent(expected))
+
+
+def test_strip_r_style_comments():
+    """Test the strip_r_style_comments function."""
+    code = """
+        # Single-line comment
+        foo <- function(x) {
+            # Multi-line
+            # comment
+            return(x + 1) # Inline comment
+        }
+    """
+    expected = """
+        foo <- function(x) {
+            return(x + 1)
+        }
+    """
+    assert normalize_whitespace(strip_r_style_comments(code)) == normalize_whitespace(dedent(expected))
+
+
+def test_strip_python_keeps_triple_quoted_values():
+    """A triple-quoted string used as a value is not a comment."""
+    code = 'query = """SELECT 1"""  # run it'
+    assert strip_python_style_comments(code).rstrip() == 'query = """SELECT 1"""'
+
+
+def test_strip_shell_keeps_hash_inside_words_and_quotes():
+    """Only a # that starts a word outside quotes opens a comment."""
+    code = 'echo "$# args: #1" # count'
+    assert strip_shell_style_comments(code) == 'echo "$# args: #1"'
+
+
+def test_strip_matlab_handles_transpose():
+    """The transpose operator must not be mistaken for a string quote."""
+    code = "y = x'; % transpose\nz = 'a%b'; % keep the string"
+    assert strip_matlab_style_comments(code) == "y = x'; \nz = 'a%b'; "
diff --git a/tests/test_language_inference.py b/tests/test_language_inference.py
new file mode 100644
index 0000000..452f686
--- /dev/null
+++ b/tests/test_language_inference.py
@@ -0,0 +1,27 @@
+from code2prompt.language_inference import infer_language
+
+def test_infer_language():
+    """ Test the infer_language function."""
+    assert infer_language("main.c") == "c"
+    assert infer_language("main.cpp") == "cpp"
+    assert infer_language("Main.java") == "java"
+    assert infer_language("script.js") == "javascript"
+    assert infer_language("Program.cs") == "csharp"
+    assert infer_language("index.php") == "php"
+    assert infer_language("main.go") == "go"
+    assert infer_language("lib.rs") == "rust"
+    assert infer_language("app.kt") == "kotlin"
+    assert infer_language("main.swift") == "swift"
+    assert infer_language("Main.scala") == "scala"
+    assert infer_language("main.dart") == "dart"
+    assert infer_language("script.py") == "python"
+    assert infer_language("script.rb") == "ruby"
+    assert infer_language("script.pl") == "perl"
+    assert infer_language("script.sh") == "bash"
+    assert infer_language("script.ps1") == "powershell"
+    assert infer_language("index.html") == "html"
+    assert infer_language("data.xml") == "xml"
+    assert infer_language("query.sql") == "sql"
+    assert infer_language("script.m") == "matlab"
+    assert infer_language("script.r") == "r"
+    assert infer_language("file.txt") == "unknown"