In [1]:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm  # Use tqdm.notebook for Jupyter Notebooks

In [7]:
# Read the Java test dataset from the relative data folder into `df`.
# Later cells read its 'url', 'focal_class_file' and 'test_class_file' columns.
df = pd.read_csv(filepath_or_buffer="data/java_test_dataset.csv")

- Consistent download process over a stable connection; retry if the connection drops
- Show the progress of each processing stage in the terminal
- Use threads for faster network communication
- Error handling: skip a file or repository that doesn't exist (try `main` instead of `master` if the file content can't be fetched).

In [9]:
def get_requests_session(pool_maxsize=20):
    """Create a ``requests.Session`` that retries transient gateway errors.

    Requests answered with HTTP 502, 503 or 504 are retried up to 5 times
    with a backoff factor of 1, for both ``http://`` and ``https://`` URLs.

    Args:
        pool_maxsize: Maximum number of connections each adapter keeps per
            host. The default of 20 matches the 20 worker threads used by the
            downloader in this notebook; with the library default (10),
            connections beyond the tenth are discarded after use instead of
            being reused.

    Returns:
        The configured ``requests.Session``.
    """
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
    # One adapter per scheme, as before, but sized for concurrent use.
    for scheme in ('http://', 'https://'):
        session.mount(scheme, HTTPAdapter(max_retries=retries, pool_maxsize=pool_maxsize))
    return session

def get_raw_content_url(github_url, file_path, branch='master'):
    """Build the raw.githubusercontent.com URL for a file in a GitHub repository.

    Args:
        github_url: Repository URL of the form
            ``https://github.com/<user>/<repo>``. Extra trailing segments,
            a trailing slash, surrounding whitespace and a ``.git`` suffix on
            the repository name are tolerated.
        file_path: Path of the file inside the repository. Leading slashes
            are dropped so the resulting URL has no empty path segment.
        branch: Branch name to read the file from.

    Returns:
        The raw-content URL as a string.

    Raises:
        IndexError: If ``github_url`` has fewer than five ``/``-separated
            parts (i.e. no user/repository segments).
    """
    parts = github_url.strip().split('/')
    # parts == ['https:', '', 'github.com', '<user>', '<repo>', ...]
    user, repo = parts[3], parts[4]
    if repo.endswith('.git'):
        # Clone-style URLs carry a '.git' suffix that is not part of the repo name.
        repo = repo[:-len('.git')]
    relative_path = file_path.lstrip('/')
    return f'https://raw.githubusercontent.com/{user}/{repo}/{branch}/{relative_path}'

def download_file_content(args, timeout=(10, 60)):
    """Download the focal-class and test-class source files for one dataset row.

    For each of the two files the 'master' branch is tried first, then 'main'.
    A file that cannot be fetched from either branch is recorded as ``None``
    instead of raising, so one missing file never aborts a batch.

    Args:
        args: Tuple ``(url, focal_file_path, test_file_path, session)`` where
            ``url`` is the GitHub repository URL, the two paths are
            repository-relative file paths and ``session`` is the
            ``requests.Session`` used for the HTTP calls.
        timeout: ``(connect, read)`` timeout in seconds passed to
            ``session.get``. Without a timeout a stalled connection would
            block its worker thread indefinitely.

    Returns:
        Dict with keys ``'focal_class_code'`` and ``'test_class_code'``; each
        value is the file's text, or ``None`` when it could not be downloaded.
    """
    url, focal_file_path, test_file_path, session = args
    results = {}
    for file_type, file_path in [('focal_class_code', focal_file_path), ('test_class_code', test_file_path)]:
        results[file_type] = None  # Stays None unless a branch succeeds.
        if not isinstance(url, str) or not isinstance(file_path, str):
            # Missing values (e.g. NaN read from the CSV) cannot form a URL.
            continue
        for branch in ['master', 'main']:  # Try with 'master' first, then 'main'
            try:
                raw_url = get_raw_content_url(url, file_path, branch)
            except IndexError:
                break  # Malformed repository URL: no branch can succeed.
            try:
                response = session.get(raw_url, timeout=timeout)
                response.raise_for_status()
                results[file_type] = response.text
                break  # Got the content; no need to try the other branch.
            except requests.RequestException:
                continue  # Try the next branch.
    return results

def download_files_concurrently(df, max_workers=20):
    """Download focal/test class sources for every row of ``df`` in parallel.

    The results are written in place into the ``'focal_class_code'`` and
    ``'test_class_code'`` columns of ``df`` (created here if absent); a file
    that could not be downloaded is stored as ``None``. Progress is shown
    with a tqdm bar labelled "Downloading".

    Args:
        df: DataFrame with ``'url'``, ``'focal_class_file'`` and
            ``'test_class_file'`` columns. Results are matched to rows by
            index label, so the index may be any labelling, not only
            ``0..n-1``. Assumes index labels are unique -- TODO confirm.
        max_workers: Number of worker threads used for the downloads.

    Returns:
        None. ``df`` is modified in place.
    """
    # Make the function self-contained: do not rely on another cell having
    # created the destination columns.
    for column in ('focal_class_code', 'test_class_code'):
        if column not in df.columns:
            df[column] = None

    # The session is closed on exit, after the executor has finished.
    # NOTE(review): a single Session is shared by all worker threads, as in
    # the original design -- confirm this is acceptable.
    with get_requests_session() as session, ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Key each future by the row's index *label* so that df.at (which is
        # label-based) writes the result back to the row it came from.
        futures = {
            executor.submit(download_file_content, (url, focal_path, test_path, session)): label
            for label, url, focal_path, test_path in zip(
                df.index, df['url'], df['focal_class_file'], df['test_class_file']
            )
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading"):
            result = future.result()
            label = futures[future]
            # Assign the results to the new columns
            df.at[label, 'focal_class_code'] = result['focal_class_code']
            df.at[label, 'test_class_code'] = result['test_class_code']


In [10]:
# Add the two result columns, filled with None, in a fixed order:
# focal class source first, then test class source.
for _new_column in ('focal_class_code', 'test_class_code'):
    df[_new_column] = None

In [11]:
# Perform the downloading and updating.
# The call returns nothing: it fills df's 'focal_class_code' and
# 'test_class_code' columns in place, one row per completed download.
download_files_concurrently(df)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=78388.0, style=ProgressStyle(descriptio…

In [6]:
# Preview the first rows of the dataframe.
# NOTE(review): the saved output of this cell lists 'class_file' and
# 'raw_content' columns, which no cell above creates -- presumably it comes
# from an earlier run; confirm with Restart Kernel -> Run All.
df.head()

Unnamed: 0,url,class_file,raw_content
0,https://github.com/bytefish/JavaElasticSearchE...,JavaElasticSearchExperiment/src/main/java/csv/...,// Copyright (c) Philipp Wagner. All rights re...


In [5]:
# Print the 'raw_content' entry labelled 0, then write the dataframe to
# "test.csv" without the index column.
# NOTE(review): 'raw_content' is not created by any cell shown above (the
# download step writes 'focal_class_code' / 'test_class_code') -- presumably
# a leftover from an earlier version of the notebook; confirm which column
# is intended before re-running on a fresh kernel.
print(df['raw_content'][0])
df.to_csv("test.csv", index=False)

// Copyright (c) Philipp Wagner. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

package csv.converter;

import de.bytefish.jtinycsvparser.typeconverter.ITypeConverter;
import utils.StringUtils;

import java.lang.reflect.Type;
import java.util.Arrays;
import java.util.List;

public class IgnoreMissingValuesConverter implements ITypeConverter<Float> {

    private List<String> missingValueRepresentation;

    public IgnoreMissingValuesConverter(String... missingValueRepresentation) {
        this(Arrays.asList(missingValueRepresentation));
    }

    public IgnoreMissingValuesConverter(List<String> missingValueRepresentation) {
        this.missingValueRepresentation = missingValueRepresentation;
    }

    @Override
    public Float convert(final String s) {

        if(StringUtils.isNullOrWhiteSpace(s)) {
            return null;
        }

        boolean isMissingValue = missingValueRepresentation
          