In [30]:
import requests
from docx import Document
import re
import pandas as pd

def extract_java_files_from_github(repo_owner, repo_name, branch='main', timeout=30):
    """List the paths of every ``.java`` file on one branch of a GitHub repository.

    The listing comes from a single call to the GitHub "git trees" REST
    endpoint with ``recursive=1``.

    Args:
        repo_owner: Account or organisation that owns the repository.
        repo_name: Repository name.
        branch: Branch, tag or commit SHA to list. Defaults to ``'main'``;
            pass e.g. ``'master'`` for repositories that use another default.
        timeout: Seconds to wait for GitHub before giving up, so a stalled
            connection cannot hang the notebook.

    Returns:
        A list of repository-relative paths ending in ``.java``. An empty list
        is returned (after printing a message) when the request fails or the
        API answers with a non-200 status.
    """
    url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/git/trees/{branch}?recursive=1'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network failures are reported the same way as HTTP errors.
        print(f'Error fetching repository data: {exc}')
        return []

    if response.status_code != 200:
        print(f'Error fetching repository data: {response.status_code}')
        return []

    payload = response.json()
    if payload.get('truncated'):
        # GitHub caps the size of a recursive listing and flags the cut-off.
        print(f'Warning: file tree of {repo_owner}/{repo_name} was truncated by GitHub; '
              'some .java files may be missing')

    # Only blobs are files; a directory that happens to end in ".java" is skipped.
    return [
        entry['path']
        for entry in payload.get('tree', [])
        if entry.get('type') == 'blob' and entry['path'].endswith('.java')
    ]

def fetch_java_file_content(repo_owner, repo_name, file_path, branch='main', timeout=30):
    """Download the raw text of one file from a GitHub repository.

    Args:
        repo_owner: Account or organisation that owns the repository.
        repo_name: Repository name.
        file_path: Repository-relative path of the file.
        branch: Branch, tag or commit SHA to read from. Defaults to ``'main'``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The file body as a string, or ``None`` (after printing a message) when
        the request fails or the server answers with a non-200 status.
    """
    # Local import keeps the function usable no matter which cells have run.
    from urllib.parse import quote

    # Percent-encode the path so characters such as ' ', '#', '?' or '%' are
    # sent as part of the file name instead of being read as URL syntax.
    url = (
        f'https://raw.githubusercontent.com/{repo_owner}/{repo_name}/'
        f'{quote(branch)}/{quote(file_path)}'
    )
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable file should not abort the whole export.
        print(f'Error fetching file {file_path}: {exc}')
        return None

    if response.status_code == 200:
        return response.text

    print(f'Error fetching file {file_path}: {response.status_code}')
    return None

def filter_java_code(content):
    """Strip comments and blank lines from Java source text.

    The text is scanned one character at a time so that comment markers are
    only honoured where Java would honour them:

    * ``//`` and ``/* ... */`` inside string, char or text-block literals are
      kept (e.g. ``"http://example.com"`` survives intact);
    * a block comment may start and end on the same line, and code that shares
      a line with a comment is preserved;
    * ``//`` inside a block comment does not hide the closing ``*/``.

    Args:
        content: Java source code as a single string.

    Returns:
        The remaining code, with each line stripped of surrounding whitespace,
        empty lines removed, and lines joined by ``'\\n'``.
    """
    kept = []          # characters that survive comment removal
    state = 'code'     # code | line_comment | block_comment | string | char | text_block
    i = 0
    size = len(content)

    while i < size:
        ch = content[i]
        pair = content[i:i + 2]

        if state == 'code':
            if pair == '//':
                state = 'line_comment'
                i += 2
            elif pair == '/*':
                state = 'block_comment'
                i += 2
            elif content.startswith('"""', i):
                state = 'text_block'
                kept.append('"""')
                i += 3
            else:
                if ch == '"':
                    state = 'string'
                elif ch == "'":
                    state = 'char'
                kept.append(ch)
                i += 1
        elif state == 'line_comment':
            # The comment ends at the line break, which itself is kept.
            if ch in '\r\n':
                state = 'code'
                kept.append(ch)
            i += 1
        elif state == 'block_comment':
            if pair == '*/':
                state = 'code'
                # A comment separates tokens (``int/**/x``), so leave one
                # space behind unless whitespace is already there.
                if kept and not kept[-1].isspace():
                    kept.append(' ')
                i += 2
            else:
                i += 1
        elif state == 'text_block':
            if ch == '\\':
                kept.append(pair)  # keep the escape and the escaped char together
                i += 2
            elif content.startswith('"""', i):
                kept.append('"""')
                state = 'code'
                i += 3
            else:
                kept.append(ch)
                i += 1
        else:  # inside a "string" or 'char' literal
            closing = '"' if state == 'string' else "'"
            if ch == '\\':
                kept.append(pair)  # an escaped quote must not end the literal
                i += 2
            else:
                # A raw line break means the literal was malformed; resume
                # normal scanning rather than swallowing the rest of the file.
                if ch == closing or ch in '\r\n':
                    state = 'code'
                kept.append(ch)
                i += 1

    stripped = (line.strip() for line in ''.join(kept).splitlines())
    return '\n'.join(line for line in stripped if line)

def save_to_word(doc, java_files, repo_owner, repo_name):
    """Append one heading/paragraph pair per Java file to ``doc``.

    Each path in ``java_files`` is downloaded with ``fetch_java_file_content``
    and cleaned with ``filter_java_code``. Files that cannot be fetched, or
    that are empty after cleaning, are skipped. The document is modified in
    place and is not saved here.
    """
    for java_file in java_files:
        raw_source = fetch_java_file_content(repo_owner, repo_name, java_file)
        if not raw_source:
            continue  # download failed or the file is empty

        cleaned_source = filter_java_code(raw_source)
        if not cleaned_source:
            continue  # nothing but comments and blank lines

        doc.add_heading(f'{java_file} (from {repo_owner}/{repo_name})', level=2)
        doc.add_paragraph(cleaned_source)

# Settings a reader may want to tune.
REPO_LIST_CSV = 'results.csv'   # must contain a 'name' column holding "owner/repo"
OUTPUT_DOCX = 'Dataset.docx'
MAX_REPOS = 1                   # number of leading CSV rows to process

# Create the Document once outside the loop
doc = Document()
doc.add_heading('Java Files from GitHub Repositories', level=1)

df = pd.read_csv(REPO_LIST_CSV)

# Iterate over the values themselves: the error handler can then report the
# offending entry without repeating a lookup that may be what failed, and an
# empty CSV simply yields no iterations.
for full_name in df['name'].head(MAX_REPOS):
    try:
        repo_owner, repo_name = full_name.split('/')
        print(repo_owner, repo_name)
        java_files = extract_java_files_from_github(repo_owner, repo_name)

        if java_files:
            save_to_word(doc, java_files, repo_owner, repo_name)
    except Exception as e:
        # Best effort: report the failing repository and carry on.
        print(f'Error processing {full_name}: {e}')

# Save the Document after processing all repositories
doc.save(OUTPUT_DOCX)
print(f'Filtered Java files saved to {OUTPUT_DOCX}')


spring-projects spring-framework
Filtered Java files saved to Dataset.docx


In [None]:
import requests
from docx import Document
import os
import re

def extract_java_files_from_github(repo_owner, repo_name, branch='main', timeout=30):
    """Return the paths of all ``.java`` files found on a repository branch.

    A single request is made to the GitHub "git trees" REST endpoint with
    ``recursive=1`` and the resulting entries are filtered by file extension.

    Args:
        repo_owner: Account or organisation that owns the repository.
        repo_name: Repository name.
        branch: Branch, tag or commit SHA to list. Defaults to ``'main'``;
            pass e.g. ``'master'`` for repositories that use another default.
        timeout: Seconds to wait for GitHub before giving up, so a stalled
            connection cannot hang the notebook.

    Returns:
        A list of repository-relative paths ending in ``.java``. An empty list
        is returned (after printing a message) when the request fails or the
        API answers with a non-200 status.
    """
    url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/git/trees/{branch}?recursive=1'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network failures are reported the same way as HTTP errors.
        print(f'Error fetching repository data: {exc}')
        return []

    if response.status_code != 200:
        print(f'Error fetching repository data: {response.status_code}')
        return []

    payload = response.json()
    if payload.get('truncated'):
        # GitHub caps the size of a recursive listing and flags the cut-off.
        print(f'Warning: file tree of {repo_owner}/{repo_name} was truncated by GitHub; '
              'some .java files may be missing')

    # Only blobs are files; a directory that happens to end in ".java" is skipped.
    return [
        entry['path']
        for entry in payload.get('tree', [])
        if entry.get('type') == 'blob' and entry['path'].endswith('.java')
    ]

def fetch_java_file_content(repo_owner, repo_name, file_path, branch='main', timeout=30):
    """Fetch the content of one ``.java`` file from raw.githubusercontent.com.

    Args:
        repo_owner: Account or organisation that owns the repository.
        repo_name: Repository name.
        file_path: Repository-relative path of the file.
        branch: Branch, tag or commit SHA to read from. Defaults to ``'main'``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The file body as a string, or ``None`` (after printing a message) when
        the request fails or the server answers with a non-200 status.
    """
    # Local import keeps the function usable no matter which cells have run.
    from urllib.parse import quote

    # Percent-encode the path so characters such as ' ', '#', '?' or '%' are
    # sent as part of the file name instead of being read as URL syntax.
    url = (
        f'https://raw.githubusercontent.com/{repo_owner}/{repo_name}/'
        f'{quote(branch)}/{quote(file_path)}'
    )
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable file should not abort the whole export.
        print(f'Error fetching file {file_path}: {exc}')
        return None

    if response.status_code == 200:
        return response.text

    print(f'Error fetching file {file_path}: {response.status_code}')
    return None

def filter_java_code(content):
    """Remove comments and blank lines from the Java code.

    The text is scanned one character at a time so that comment markers are
    only honoured where Java would honour them:

    * ``//`` and ``/* ... */`` inside string, char or text-block literals are
      kept (e.g. ``"http://example.com"`` survives intact);
    * a block comment may start and end on the same line, and code that shares
      a line with a comment is preserved;
    * ``//`` inside a block comment does not hide the closing ``*/``.

    Args:
        content: Java source code as a single string.

    Returns:
        The remaining code, with each line stripped of surrounding whitespace,
        empty lines removed, and lines joined by ``'\\n'``.
    """
    kept = []          # characters that survive comment removal
    state = 'code'     # code | line_comment | block_comment | string | char | text_block
    i = 0
    size = len(content)

    while i < size:
        ch = content[i]
        pair = content[i:i + 2]

        if state == 'code':
            if pair == '//':
                state = 'line_comment'
                i += 2
            elif pair == '/*':
                state = 'block_comment'
                i += 2
            elif content.startswith('"""', i):
                state = 'text_block'
                kept.append('"""')
                i += 3
            else:
                if ch == '"':
                    state = 'string'
                elif ch == "'":
                    state = 'char'
                kept.append(ch)
                i += 1
        elif state == 'line_comment':
            # The comment ends at the line break, which itself is kept.
            if ch in '\r\n':
                state = 'code'
                kept.append(ch)
            i += 1
        elif state == 'block_comment':
            if pair == '*/':
                state = 'code'
                # A comment separates tokens (``int/**/x``), so leave one
                # space behind unless whitespace is already there.
                if kept and not kept[-1].isspace():
                    kept.append(' ')
                i += 2
            else:
                i += 1
        elif state == 'text_block':
            if ch == '\\':
                kept.append(pair)  # keep the escape and the escaped char together
                i += 2
            elif content.startswith('"""', i):
                kept.append('"""')
                state = 'code'
                i += 3
            else:
                kept.append(ch)
                i += 1
        else:  # inside a "string" or 'char' literal
            closing = '"' if state == 'string' else "'"
            if ch == '\\':
                kept.append(pair)  # an escaped quote must not end the literal
                i += 2
            else:
                # A raw line break means the literal was malformed; resume
                # normal scanning rather than swallowing the rest of the file.
                if ch == closing or ch in '\r\n':
                    state = 'code'
                kept.append(ch)
                i += 1

    # Strip leading and trailing spaces, then ignore empty lines
    stripped = (line.strip() for line in ''.join(kept).splitlines())

    # Join the filtered lines into a string
    return '\n'.join(line for line in stripped if line)

def append_to_word(java_files, repo_owner, repo_name, file_name='java_files_filtered.docx'):
    """Add the cleaned Java sources of one repository to a Word file.

    If ``file_name`` already exists it is opened and extended; otherwise a new
    document is started with a level-1 heading naming the repository. Every
    path in ``java_files`` is downloaded and filtered, and each non-empty
    result is written as a level-2 heading followed by a paragraph. The
    document is saved back to ``file_name`` at the end.
    """
    file_already_present = os.path.exists(file_name)
    if not file_already_present:
        # Start a fresh document with a title for this repository
        doc = Document()
        doc.add_heading(f'Java Files from {repo_owner}/{repo_name}', level=1)
    else:
        # Re-open the existing document so new sections land at its end
        doc = Document(file_name)
        print(f'Appending data to {file_name}')

    for java_file in java_files:
        raw_source = fetch_java_file_content(repo_owner, repo_name, java_file)
        if not raw_source:
            continue  # download failed or the file is empty

        cleaned_source = filter_java_code(raw_source)
        if not cleaned_source:
            continue  # nothing but comments and blank lines

        # One section per Java file: heading, then the filtered code
        doc.add_heading(f'{java_file} (from {repo_owner}/{repo_name})', level=2)
        doc.add_paragraph(cleaned_source)

    # Write the document, including any previously existing content, to disk
    doc.save(file_name)
    print(f'Java files from {repo_owner}/{repo_name} appended to {file_name}')

# Example usage: export the Java sources of a single repository.
# The pair below is (owner of the repository, name of the repository).
repo_owner, repo_name = 'spring-projects', 'spring-framework'

java_files = extract_java_files_from_github(repo_owner, repo_name)

# Leave the Word file untouched when the listing came back empty.
if java_files:
    append_to_word(java_files, repo_owner, repo_name)

    