In [None]:
import pandas as pd
from datetime import datetime, timedelta, timezone
import requests
import time
import os
from dotenv import load_dotenv
from tqdm import tqdm

# Annotate every project listed in projects.csv with the timestamp of its most
# recent commit (as reported by the GitHub API) and a flag telling whether that
# commit is at most 90 days old. The CSV file is rewritten in place.
load_dotenv()

GITHUB_TOKEN = os.getenv("GITHUB_API_TOKEN")

project_file = "../data/projects.csv"
df_projects = pd.read_csv(project_file)
api_base = "https://api.github.com/repos"
headers = {"Accept": "application/vnd.github.v3+json"}
if GITHUB_TOKEN:
    headers["Authorization"] = f"token {GITHUB_TOKEN}"
else:
    # Sending "token None" would make every request fail with bad credentials,
    # so fall back to anonymous access instead.
    print("Warning: GITHUB_API_TOKEN is not set; sending unauthenticated requests.")

# Seconds to wait for a response before giving up on a repository.
request_timeout = 30
# Pause between requests. One request per second stays below the authenticated
# limit (5000 requests/hour); anonymous access only allows 60 requests/hour.
request_delay = 1 if GITHUB_TOKEN else 60

last_commit_dates = []
is_recent = []
# The API returns commit timestamps in UTC ("...Z"), so the cutoff has to be
# expressed in UTC as well; a naive local-time cutoff would be off by the local
# UTC offset.
cutoff_date = datetime.now(timezone.utc) - timedelta(days=90)

for index, row in tqdm(df_projects.iterrows(), total=len(df_projects)):
    repo = row["full_name"]
    print(f"Checking {repo}...")

    url = f"{api_base}/{repo}/commits?per_page=1"
    # Defaults recorded when the lookup fails or the repository has no commits.
    commit_date_str = None
    recent = False
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
        response.raise_for_status()
        data = response.json()

        if isinstance(data, list) and data:
            date_str = data[0]["commit"]["committer"]["date"]
            commit_date = datetime.strptime(
                date_str, "%Y-%m-%dT%H:%M:%SZ"
            ).replace(tzinfo=timezone.utc)
            commit_date_str = date_str
            recent = commit_date >= cutoff_date

    except requests.RequestException as e:
        print(f"Error for {repo}: {e}")
    except (KeyError, TypeError, ValueError) as e:
        # A malformed payload for one repository must not abort the whole run.
        print(f"Unexpected response for {repo}: {e}")

    # Exactly one entry per row keeps both lists aligned with the DataFrame.
    last_commit_dates.append(commit_date_str)
    is_recent.append(recent)

    time.sleep(request_delay)

# Add to DataFrame
df_projects["last_commit_date"] = last_commit_dates
df_projects["updated_recently"] = is_recent

df_projects.to_csv(project_file, index=False)

#with open("projects_final.txt", "w", encoding="utf-8") as dest:#
#    for index, row in df_projects.iterrows():
#        dest.write(f"python3 /tmp/ssimon/config-space/experiments/analysis.py --url={row['html_url']} --name={row['name']}\n")

In [None]:
import shlex

# Select the projects whose `relevance` column is "y" and whose
# `updated_recently` flag is true, save them as CSV, and write one analysis
# command line per selected project to projects_final.txt.
df = pd.read_csv("../data/projects.csv")

df_relevant = df[(df["relevance"] == "y") & (df["updated_recently"] == True)]

print("Length of df_relevant:", len(df_relevant))

df_relevant.to_csv("../data/projects_final.csv", index=False)

with open("projects_final.txt", "w", encoding="utf-8") as dest:
    for html_url, name in zip(df_relevant["html_url"], df_relevant["name"]):
        # The lines are meant to be run by a shell, so quote both values:
        # ordinary URLs and names are written unchanged, while whitespace or
        # shell metacharacters can no longer alter the command. str() guards
        # against non-string cells (e.g. a name that pandas parsed as NaN).
        url_arg = shlex.quote(str(html_url))
        name_arg = shlex.quote(str(name))
        dest.write(f"python3 /tmp/ssimon/config-space/experiments/analysis.py --url={url_arg} --name={name_arg}\n")

In [None]:
# Print summary statistics (size, languages, stars) for the final selection.
df = pd.read_csv("../data/projects_final.csv")
total_count = len(df)

# Average size
avg_size = df['size'].mean()
# Largest and smallest size
max_size = df['size'].max()
min_size = df['size'].min()
# Range of languages
languages = df['language'].dropna().unique()
# Average stars
avg_stars = df['stargazers_count'].mean()

language_counts = df['language'].value_counts()
language_modes = df['language'].mode()
if language_modes.empty:
    # No repository has a language value, so there is no mode to index into.
    print("Most common language: n/a (0 repositories)")
else:
    most_common_language = language_modes[0]
    print(f"Most common language: {most_common_language} ({language_counts[most_common_language]} repositories)")

# The prints below treat `size` as kB: /1024 gives MB and /1024/1024 gives GB.
print(f"Average repository size: {avg_size/1024:.2f} MB")
print(f"Largest repository size: {max_size/1024:.2f} MB")
print(f"Smallest repository size: {min_size} kB")
print(f"Number of unique languages: {len(languages)}")
print(f"Languages: {languages}")
print(f"Average number of stars: {avg_stars:.2f}")

# .get() avoids a KeyError when the selection contains no Python repository.
python_count = language_counts.get('Python', 0)
python_percent = (python_count / total_count) * 100 if total_count else 0.0
print(f"Percentage of repositories written in Python: {python_percent:.2f}%")

if df['size'].notna().any():
    largest_project = df.loc[df['size'].idxmax()]
    print(f"Largest project: {largest_project['name']} ({largest_project['full_name']})")
    print(f"Size: {largest_project['size']/1024/1024:.2f} GB")
    print(f"URL: {largest_project['html_url']}")
else:
    # idxmax() raises when there is no size value to compare.
    print("Largest project: n/a")

In [None]:
# List the URLs of all projects whose size is below 1000 kB.
is_small = df['size'].lt(1000)
small_projects = df.loc[is_small, ['name', 'full_name', 'size', 'html_url']]
print(small_projects["html_url"])


**Copy files from tesla to local machine**

In [None]:
import os
import posixpath
import shlex
import paramiko
from scp import SCPClient
from dotenv import load_dotenv

# Copy every *.json file found in the sub-folders of `projects_dir` to
# `local_target_dir` on the target host, skipping files that already exist
# there.
load_dotenv()

projects_dir = 'projects'
local_host = '172.26.92.174'
local_user = 'ssimon'
local_password = os.getenv('LOCAL_PASSWORD')
local_target_dir = '/home/ssimon/github/config-space/data/projects'

ssh = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy accepts unknown host keys without verification;
# consider loading known host keys instead if the network is not trusted.
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

try:
    ssh.connect(local_host, username=local_user, password=local_password)

    with SCPClient(ssh.get_transport()) as scp:
        for folder in os.listdir(projects_dir):
            folder_path = os.path.join(projects_dir, folder)
            if not os.path.isdir(folder_path):
                continue
            for f in os.listdir(folder_path):
                if not f.endswith('.json'):
                    continue
                file_path = os.path.join(folder_path, f)
                # The target path is evaluated by the remote shell, so it is
                # always joined with forward slashes.
                local_file = posixpath.join(local_target_dir, f)
                # Check whether the file already exists on the target host.
                # The path is shell-quoted so that unusual file names cannot
                # change the command.
                stdin, stdout, stderr = ssh.exec_command(
                    f'test -f {shlex.quote(local_file)} && echo exists || echo missing'
                )
                result = stdout.read().decode().strip()
                if result == 'exists':
                    print(f"Übersprungen (existiert bereits): {local_file}")
                    continue
                print(f"Kopiere {file_path} nach {local_file}")
                scp.put(file_path, remote_path=local_target_dir)
                print(f"Kopiert: {file_path} -> {local_file}")
finally:
    # Release the connection even when listing or copying fails part-way.
    ssh.close()

**Copy files from tesla to other VM**

In [None]:
import os
import posixpath
import shlex
import paramiko
from scp import SCPClient
from dotenv import load_dotenv

# Copy every *.json file found in the sub-folders of `projects_dir` to
# `second_vm_target_dir` on the second VM, skipping files that already exist
# there.
load_dotenv()

projects_dir = 'projects'

# Second VM credentials
second_vm_host = '172.26.92.174'  # replace with the second VM's IP or hostname
second_vm_user = 'ssimon'
second_vm_password = os.getenv('VM_PASSWORD')
second_vm_target_dir = '/home/ssimon/github/config-space/data/projects'

ssh = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy accepts unknown host keys without verification;
# consider loading known host keys instead if the network is not trusted.
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

try:
    ssh.connect(second_vm_host, username=second_vm_user, password=second_vm_password)

    with SCPClient(ssh.get_transport()) as scp:
        for folder in os.listdir(projects_dir):
            folder_path = os.path.join(projects_dir, folder)
            if not os.path.isdir(folder_path):
                continue
            for f in os.listdir(folder_path):
                if not f.endswith('.json'):
                    continue
                file_path = os.path.join(folder_path, f)
                # The target path is evaluated by the remote shell, so it is
                # always joined with forward slashes.
                remote_file = posixpath.join(second_vm_target_dir, f)

                # Check if the file exists on the second VM. The path is
                # shell-quoted so that unusual file names cannot change the
                # command.
                stdin, stdout, stderr = ssh.exec_command(
                    f'test -f {shlex.quote(remote_file)} && echo exists || echo missing'
                )
                result = stdout.read().decode().strip()
                if result == 'exists':
                    print(f"Übersprungen (existiert bereits): {remote_file}")
                    continue
                print(f"Kopiere {file_path} nach {remote_file}")
                scp.put(file_path, remote_path=second_vm_target_dir)
                print(f"Kopiert: {file_path} -> {remote_file}")
finally:
    # Release the connection even when listing or copying fails part-way.
    ssh.close()


Übersprungen (existiert bereits): /home/ssimon/github/config-space/data/projects/ExplorerPatcher.json
Übersprungen (existiert bereits): /home/ssimon/github/config-space/data/projects/ComfyUI.json
Übersprungen (existiert bereits): /home/ssimon/github/config-space/data/projects/react-spring.json
Übersprungen (existiert bereits): /home/ssimon/github/config-space/data/projects/lo.json
Kopiere projects/haystack/haystack.json nach /home/ssimon/github/config-space/data/projects/haystack.json
Kopiert: projects/haystack/haystack.json -> /home/ssimon/github/config-space/data/projects/haystack.json
Übersprungen (existiert bereits): /home/ssimon/github/config-space/data/projects/maui.json
Kopiere projects/GSYVideoPlayer/GSYVideoPlayer.json nach /home/ssimon/github/config-space/data/projects/GSYVideoPlayer.json
Kopiert: projects/GSYVideoPlayer/GSYVideoPlayer.json -> /home/ssimon/github/config-space/data/projects/GSYVideoPlayer.json
Übersprungen (existiert bereits): /home/ssimon/github/config-space/

In [None]:
import os

# Report how many project folders exist under `projects_dir` and which of them
# do not contain any *.json file.
projects_dir = 'projects'

# Only sub-directories are counted as projects; plain files lying next to them
# are ignored. Sorting makes the printed report independent of the order in
# which the file system returns the entries.
folders = sorted(
    entry for entry in os.listdir(projects_dir)
    if os.path.isdir(os.path.join(projects_dir, entry))
)

print("Number of projects analyzed:", len(folders))

folders_without_json = [
    folder for folder in folders
    if not any(
        f.endswith('.json')
        for f in os.listdir(os.path.join(projects_dir, folder))
    )
]

print(f"Folders without JSON files: {len(folders_without_json)}")
for folder in folders_without_json:
    print(folder)