In [29]:
# We have scraped 25 representative repositories that use python from github.
# The goal is to sample N = m * 25 files in total, where m is the number of files sampled from each repository.

# The sampling strategy for each repository is as follows:
# 1. Count the number of lines in each file, on the assumption that a developer spends more time on a file with more lines.
# 2. Calculate the probability of sampling each file and sample the files without replacement.

In [30]:
import os
import random
from tqdm import tqdm

# Seed Python's global `random` generator so that the weighted file sampling
# below draws the same sequence of random numbers on every fresh run.
random.seed(42)

In [31]:
def count_lines(file_path):
    """
    Return how many lines the UTF-8 text file at `file_path` contains.

    The file is streamed line by line, so it is never held in memory as a
    whole. An empty file yields 0.
    """
    line_total = 0
    with open(file_path, 'r', encoding='utf-8') as handle:
        for _line in handle:
            line_total += 1
    return line_total

def sample_files(repo_dir, m):
    """
    Sample up to m distinct files from a repository, weighted by line count.

    Only `.py` files located directly inside `repo_dir` are considered
    (sub-directories are not traversed). Each draw picks one of the remaining
    files with probability proportional to its number of lines, and the
    chosen file is then removed from the pool, i.e. the sampling is done
    WITHOUT replacement, so no file can appear twice in the result.

    Args:
        repo_dir: Path of the repository directory to sample from.
        m: Maximum number of files to return.

    Returns:
        A list of file paths. It holds `min(m, number of non-empty .py files)`
        entries, and is empty when the directory has no `.py` file with at
        least one line (or when m <= 0).
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # result depend only on the RNG state, not on the file system.
    file_paths = sorted(
        os.path.join(repo_dir, file)
        for file in os.listdir(repo_dir)
        if file.endswith('.py')
    )

    # Files without any line have zero weight and can never be drawn, so they
    # are dropped up front. This also covers the "nothing to sample" case.
    pool = [
        (file_path, line_count)
        for file_path, line_count in (
            (file_path, count_lines(file_path)) for file_path in file_paths
        )
        if line_count > 0
    ]

    sampled_files = []
    for _ in range(min(m, len(pool))):
        # random.choices() normalises the weights itself, so the raw line
        # counts of the files still in the pool can be used directly.
        weights = [line_count for _, line_count in pool]
        picked_index = random.choices(range(len(pool)), weights=weights, k=1)[0]
        # Removing the pick from the pool is what prevents duplicates.
        sampled_files.append(pool.pop(picked_index)[0])

    return sampled_files

def get_file_contents(file_paths):
    """
    Read every file in `file_paths` as UTF-8 text.

    Returns a list with the full text of each file, in the same order as
    the paths were given.
    """
    def _read_text(path):
        # Each file is opened, read completely and closed again before the next one.
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read()

    return [_read_text(path) for path in file_paths]

In [32]:
# Assuming output directory from the previous scraping script
output_dir = "scraped_repos"
m = 3  # Number of files to sample from each repository
sampled_files_contents = []

# os.listdir() returns entries in arbitrary order. The seeded RNG only
# reproduces the same sample if the repositories are visited in the same
# order on every run, hence the sort.
for repo_name in tqdm(sorted(os.listdir(output_dir))):
    repo_dir = os.path.join(output_dir, repo_name)

    # Stray files next to the repository folders are not repositories.
    if not os.path.isdir(repo_dir):
        continue

    sampled_files = sample_files(repo_dir, m)

    if not sampled_files:
        print(f"Skipping {repo_name} because there are no files to sample from.")
        continue  # actually skip, as the message says

    sampled_files_contents.extend(get_file_contents(sampled_files))

100%|██████████| 25/25 [00:00<00:00, 142.30it/s]


In [33]:
len(sampled_files_contents)

75