# In [5]:
import requests
from docx import Document
import re
import pandas as pd

def extract_java_files_from_github(repo_owner, repo_name, branch='main', timeout=30):
    """List the paths of every ``.java`` file in a GitHub repository.

    Requests the recursive Git tree of ``branch`` from the GitHub REST API and
    keeps the entries that are files whose path ends in ``.java``.

    Args:
        repo_owner: Account or organisation that owns the repository.
        repo_name: Repository name.
        branch: Branch, tag or tree SHA to list. Defaults to ``'main'``.
        timeout: Seconds to wait for GitHub before giving up, so that a
            stalled connection cannot hang the whole run.

    Returns:
        A list of repository-relative paths. The list is empty when the
        request fails or GitHub answers with a non-200 status; the reason is
        printed in both cases.
    """
    url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/git/trees/{branch}?recursive=1'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Follow the same "report and return nothing" contract as HTTP errors.
        print(f'Error fetching repository data: {exc}')
        return []

    if response.status_code != 200:
        print(f'Error fetching repository data: {response.status_code}')
        return []

    payload = response.json()
    if payload.get('truncated'):
        # GitHub caps the size of recursive listings; make the gap visible.
        print(f'Warning: file listing for {repo_owner}/{repo_name} was truncated by GitHub')

    # Only 'blob' entries are files; a directory named "x.java" must be skipped.
    return [
        entry['path']
        for entry in payload.get('tree', [])
        if entry.get('type') == 'blob' and entry['path'].endswith('.java')
    ]

def fetch_java_file_content(repo_owner, repo_name, file_path, branch='main', timeout=30):
    """Download the raw text of one file from a GitHub repository.

    Args:
        repo_owner: Account or organisation that owns the repository.
        repo_name: Repository name.
        file_path: Repository-relative path of the file.
        branch: Branch, tag or commit to read from. Defaults to ``'main'``.
        timeout: Seconds to wait for GitHub before giving up.

    Returns:
        The file content as text, or ``None`` when the request fails or the
        server answers with a non-200 status; the reason is printed in both
        cases.
    """
    from urllib.parse import quote

    # Quote the path so characters such as '#', '?' or '%' are sent as part of
    # the path instead of being interpreted as URL syntax.
    url = f'https://raw.githubusercontent.com/{repo_owner}/{repo_name}/{branch}/{quote(file_path)}'
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable file should not abort the rest of the repository.
        print(f'Error fetching file {file_path}: {exc}')
        return None

    if response.status_code != 200:
        print(f'Error fetching file {file_path}: {response.status_code}')
        return None

    return response.text

# Matches, in priority order, the Java tokens that decide whether "//" or "/*"
# really starts a comment: text blocks, string literals, char literals, line
# comments and block comments (an unterminated block comment runs to the end
# of the input).
_JAVA_LITERAL_OR_COMMENT = re.compile(
    r'"""(?:\\[\s\S]|[^\\])*?"""'
    r'|"(?:\\[^\r\n]|[^"\\\r\n])*"'
    r"|'(?:\\[^\r\n]|[^'\\\r\n])*'"
    r'|//[^\r\n]*'
    r'|/\*[\s\S]*?(?:\*/|\Z)'
)


def filter_java_code(content):
    """Strip comments, indentation and blank lines from Java source text.

    Comments are recognised token-wise, so ``//`` or ``/*`` inside a string or
    character literal is left alone, a one-line ``/* ... */`` does not swallow
    the code that follows it, and code sharing a line with a block comment is
    kept.

    Args:
        content: Java source code as a single string.

    Returns:
        The remaining lines, each stripped of leading and trailing
        whitespace, joined with ``'\\n'``. Lines that end up empty are
        dropped; an input with no code yields ``''``.
    """
    def _keep_literal_drop_comment(match):
        token = match.group(0)
        # Only the comment alternatives start with '/'. A comment becomes a
        # single space so the tokens on either side of it stay separated.
        return ' ' if token.startswith('/') else token

    without_comments = _JAVA_LITERAL_OR_COMMENT.sub(_keep_literal_drop_comment, content)
    stripped_lines = (line.strip() for line in without_comments.splitlines())
    return '\n'.join(line for line in stripped_lines if line)

def save_to_word(doc, java_files, repo_owner, repo_name):
    """Append the filtered source of each listed Java file to ``doc``.

    Every path in ``java_files`` is fetched with ``fetch_java_file_content``
    and passed through ``filter_java_code``. A file contributes a level-2
    heading of the form ``<path> (from <owner>/<repo>)`` followed by one
    paragraph holding the filtered code. Files whose fetch returns nothing, or
    whose filtered text is empty, are skipped. The document is modified in
    place and nothing is returned.
    """
    for path in java_files:
        source = fetch_java_file_content(repo_owner, repo_name, path)
        if not source:
            continue

        cleaned = filter_java_code(source)
        if not cleaned:
            continue

        doc.add_heading(f'{path} (from {repo_owner}/{repo_name})', level=2)
        doc.add_paragraph(cleaned)

# Create the Document once, outside the loop, so every repository is appended
# to the same output file.
doc = Document()


df = pd.read_csv('results.csv')
# Process rows 100..999, clamped to the rows that actually exist: a missing
# row would otherwise raise again inside the error handler and stop the run.
for i in range(100, min(1000, len(df))):
    # Read the name outside the try block so the handler can always report it.
    full_name = df.at[i, 'name']
    try:
        # The 'name' column is expected to hold "owner/repo"; anything else
        # fails the unpacking and is reported by the handler below.
        repo_owner, repo_name = full_name.split('/')
        print(repo_owner, repo_name)
        java_files = extract_java_files_from_github(repo_owner, repo_name)

        if java_files:
            save_to_word(doc, java_files, repo_owner, repo_name)
            # Save after every repository so an interrupted run keeps the
            # work done so far.
            doc.save('Dataset.docx')
            print('Filtered Java files saved to Dataset.docx')

    except Exception as e:
        # Per-repository boundary: report the failure and move on to the next.
        print(f'Error processing {full_name}: {e}')



# Recorded notebook output:
# ImportError: cannot import name 'Github' from 'github' (c:\Users\HP\AppData\Local\Programs\Python\Python310\lib\site-packages\github\__init__.py)