In [1]:
from time import sleep

import requests
import json
import pandas as pd
import time
from collections import defaultdict
from datetime import datetime

from dotenv import load_dotenv
import os


In [5]:
# Load variables from the local 'app.env' file into the process environment,
# then read the GitHub token that the request helpers put in their
# Authorization header.
load_dotenv(dotenv_path='app.env')
token = os.environ.get('GITHUB_TOKEN')

In [14]:
# Load the keyword table from disk.
file_path = 'Data/Application_types.csv'
df = pd.read_csv(file_path)

# The 'Keyword' column, as a plain Python list, supplies the search topics.
topic_list = df['Keyword'].to_list()

# Echo the keywords so the cell output shows what was loaded.
print(topic_list)


def write_json_data(fileName, data):
    """Write ``data`` to ``fileName`` as JSON indented by four spaces.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(fileName, mode='w') as out_handle:
        json.dump(data, out_handle, indent=4)


def append_json_data(fileName, data):
    """Append ``data`` as one more element of the JSON list stored in ``fileName``.

    The whole file is read, the new element is appended in memory, and the
    file is rewritten with four-space indentation.

    Args:
        fileName: Path of the JSON file. A missing file, or a file that is
            empty or contains only whitespace, is treated as an empty list.
        data: JSON-serializable value to append.

    Raises:
        json.JSONDecodeError: If the file has content that is not valid JSON.
        AttributeError: If the file's top-level JSON value has no ``append``
            method (for example an object instead of an array).
    """
    try:
        with open(fileName, 'r') as file:
            raw_text = file.read()
    except FileNotFoundError:
        raw_text = ''  # No file yet: start from an empty list

    # A zero-length file (e.g. left behind by an interrupted run) is not valid
    # JSON, so parse only when there is something to parse.
    existing_data = json.loads(raw_text) if raw_text.strip() else []

    # Append new data to existing data
    existing_data.append(data)

    # Write the updated data back to the file
    with open(fileName, 'w') as file:
        json.dump(existing_data, file, indent=4)


def sleep_if_rate_exhausted(response):
    """Block until the API rate limit resets when no requests remain.

    Reads the ``x-ratelimit-remaining`` and ``x-ratelimit-reset`` headers of
    ``response``. If requests remain, or either header is absent, the function
    returns immediately. Otherwise it sleeps until the reset time plus a
    five-second safety margin.

    Args:
        response: Object exposing a ``headers`` mapping with a ``get`` method
            (such as a ``requests`` response).

    Returns:
        None.
    """
    # Use the public mapping API instead of the private ``_store`` attribute.
    headers = response.headers
    rate_left = headers.get('x-ratelimit-remaining')
    reset_time = headers.get('x-ratelimit-reset')

    # Without the headers there is nothing to base a wait on.
    if rate_left is None or int(rate_left) > 0:
        return
    if reset_time is None:
        return

    reset_timestamp = datetime.fromtimestamp(int(reset_time))
    sleep_duration = (reset_timestamp - datetime.now()).total_seconds()
    print(f"Rate limit reached. Sleeping until reset at {reset_timestamp}")
    # Clamp at zero: time.sleep() raises ValueError for negative durations,
    # which happens when the reset moment is already well in the past.
    time.sleep(max(0.0, sleep_duration + 5))  # Sleep until reset time + some delay time
    print("Rate limit reset. Resuming requests.")


['Dashboard', 'CMS', 'E-commerce', 'Portfolio', 'Blog', 'Chatbot', 'Social Network', 'Messaging', 'Forum', 'Community', 'News', 'Wiki', 'File Manager', 'Analytics', 'Project Management', 'Task Manager', 'Calendar', 'Notes', 'Kanban', 'ERP', 'CRM', 'POS', 'Inventory', 'To-do', 'Video Streaming', 'Music Player', 'Photo Gallery', 'Weather', 'Finance', 'Budget Tracker', 'Expense Tracker', 'Banking', 'Travel', 'Booking', 'Reservation', 'Hotel Management', 'Real Estate', 'Job Portal', 'Classroom', 'Learning Management System', 'Online Exam', 'Survey', 'Poll', 'E-learning', 'Quiz', 'Recipe', 'Cookbook', 'Portfolio', 'Resume Builder', 'Event Management', 'Gaming', 'Game Tracker', 'Game Leaderboard', 'VR', 'AR', '3D Modeling', 'Fitness', 'Workout Tracker', 'Health Tracker', 'Diet Planner', 'Meal Planner', 'Meditation', 'Habit Tracker', 'Productivity', 'Chat', 'Collaboration', 'Help Desk', 'Customer Support', 'Feedback', 'Documentation', 'Startup', 'Portfolio', 'Map', 'Geo-location', 'Travel Gui

In [15]:
def get_github_projects_of_topic(num_results, topic, timeout=30):
    """Search GitHub for the most-starred JavaScript/TypeScript repositories matching a topic.

    The query is restricted to repositories with more than 50 stars and is
    sorted by stars in descending order; only the first result page is
    requested.

    Args:
        num_results: Value sent as the ``per_page`` query parameter.
        topic: Keyword inserted into the search query string.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A ``(projects, response)`` tuple.
        On HTTP 200, ``projects`` is a list with one dict per repository
        (id, name, owner, watchers, created_at, updated_at, size, stars,
        open_issues_count, forks).
        On any other status, ``projects`` is ``None`` and ``response`` is the
        response object.
        If the request itself fails (connection error, timeout, ...), both
        elements are ``None``.
    """
    headers = {'Authorization': f'token {token}'}
    url = 'https://api.github.com/search/repositories'
    params = {
        'q': f'(language:JavaScript OR language:TypeScript) {topic} stars:>50',
        'sort': 'stars',
        'order': 'desc',
        'per_page': num_results,
        'page': 1
    }

    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as error:
        # Transport-level failure: report it and signal "no response" so the
        # caller can skip this topic instead of crashing mid-run.
        print(f"Error: {error}")
        return None, None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None, response

    data = response.json()
    # Keep only the fields needed downstream; key order fixes the CSV column order.
    projects = [
        {
            'id': item['id'],
            'name': item['name'],
            'owner': item['owner']['login'],
            'watchers': item['watchers_count'],
            'created_at': item['created_at'],
            'updated_at': item['updated_at'],
            'size': item['size'],
            'stars': item['stargazers_count'],
            'open_issues_count': item['open_issues_count'],
            'forks': item['forks'],
        }
        for item in data['items']
    ]
    return projects, response


# Collect the top repositories for the first `num_topics` keywords.
topic_dict = {}
num_topics = 40
# Slice rather than index with range(num_topics): a keyword list shorter than
# num_topics would otherwise raise IndexError.
selected_topics = topic_list[:num_topics]
for i, currentTopic in enumerate(selected_topics):
    projects, response = get_github_projects_of_topic(num_results=10, topic=currentTopic)
    if projects is not None:
        topic_dict[currentTopic] = projects
    # No rate-limit wait is needed after the final request.
    if response is not None and i < len(selected_topics) - 1:
        sleep_if_rate_exhausted(response)

print(topic_dict)








Rate limit reached. Sleeping until reset at 2024-11-08 23:20:57
Rate limit reset. Resuming requests.
{'Dashboard': [{'id': 88464704, 'name': 'vue-element-admin', 'owner': 'PanJiaChen', 'watchers': 87925, 'created_at': '2017-04-17T03:35:49Z', 'updated_at': '2024-11-08T11:06:05Z', 'size': 15433, 'stars': 87925, 'open_issues_count': 1351, 'forks': 30460}, {'id': 15111821, 'name': 'grafana', 'owner': 'grafana', 'watchers': 64921, 'created_at': '2013-12-11T15:59:56Z', 'updated_at': '2024-11-08T15:07:50Z', 'size': 1171156, 'stars': 64921, 'open_issues_count': 4253, 'forks': 12119}, {'id': 43441403, 'name': 'strapi', 'owner': 'strapi', 'watchers': 63694, 'created_at': '2015-09-30T15:34:48Z', 'updated_at': '2024-11-08T14:51:26Z', 'size': 528051, 'stars': 63694, 'open_issues_count': 904, 'forks': 8104}, {'id': 20619036, 'name': 'pi-hole', 'owner': 'pi-hole', 'watchers': 49129, 'created_at': '2014-06-08T15:02:55Z', 'updated_at': '2024-11-08T14:39:47Z', 'size': 8178, 'stars': 49129, 'open_issues_

In [17]:
# Flatten topic_dict into one list of project records, tagging every record
# with the topic it was found under.
filename = 'github_40_projects_with_topics'
all_projects = []
for topic, projects in topic_dict.items():
    for project in projects:
        project['topic'] = topic  # stored on the project dict itself
    all_projects.extend(projects)

# One row per project record, written to CSV without the index column.
df = pd.DataFrame(all_projects)
df.to_csv(filename + '.csv', index=False)

print(f"Data saved to {filename}.csv")

error_from = ""

Data saved to github_40_projects_with_topics.csv


In [19]:
def get_Sbom_of_projects(repo_df):
    """Fetch a repository's SBOM from GitHub and record its npm dependencies.

    On HTTP 200, every package whose first external reference locator starts
    with ``pkg:npm`` and that has a non-empty ``versionInfo`` is stored as
    ``name -> version`` under the repository name, both in the module-level
    ``sbom_dict`` and in a per-call dict that is appended to
    'Data/40_Projects/append_dependencies_40.json'. A leading ``npm:`` on a
    package name is stripped. npm packages lacking ``versionInfo`` or ``name``
    are printed and skipped.

    On any other status the status code is printed and nothing is recorded.

    Args:
        repo_df: Mapping providing the ``'owner'`` and ``'name'`` of the
            repository.

    Returns:
        The response object of the SBOM request, whatever its status.
    """
    headers = {'Authorization': f'token {token}'}
    owner = repo_df['owner']
    repo = repo_df['name']
    url = f'https://api.github.com/repos/{owner}/{repo}/dependency-graph/sbom'

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return response

    packages = response.json()['sbom']['packages']
    pkg_dict = defaultdict(dict)
    for package in packages:
        # Entries with no external references cannot be npm packages; skip
        # them instead of failing on the [0] lookup.
        external_refs = package.get('externalRefs') or [{}]
        if not external_refs[0].get('referenceLocator', '').startswith('pkg:npm'):
            continue

        # Check for presence before measuring length, so a missing version is
        # handled explicitly rather than through a caught exception.
        version = package.get('versionInfo')
        if version is None:
            print(package)  # npm package listed without a version
            continue
        if len(version) == 0:
            continue

        name = package.get('name')
        if name is None:
            print(package)  # npm package listed without a name
            continue
        if name.startswith('npm:'):
            name = name[4:]

        sbom_dict[repo][name] = version
        pkg_dict[repo][name] = version

    append_json_data('Data/40_Projects/append_dependencies_40.json', pkg_dict)
    return response


# Accumulates repo name -> {package name: version}; filled in by
# get_Sbom_of_projects as each repository is processed.
sbom_dict = defaultdict(dict)
for topic in topic_dict:
    projects = topic_dict[topic]
    for project in projects:
        # Fetch the SBOM, then pause if that call used up the rate limit.
        response = get_Sbom_of_projects(project)
        sleep_if_rate_exhausted(response)
    print(f'{topic} done')

{'name': 'gensync', 'SPDXID': 'SPDXRef-npm-gensync-4d6d2a', 'downloadLocation': 'NOASSERTION', 'filesAnalyzed': False, 'externalRefs': [{'referenceCategory': 'PACKAGE-MANAGER', 'referenceType': 'purl', 'referenceLocator': 'pkg:npm/gensync'}]}
{'name': 'kbar', 'SPDXID': 'SPDXRef-npm-kbar-f58513', 'downloadLocation': 'NOASSERTION', 'filesAnalyzed': False, 'externalRefs': [{'referenceCategory': 'PACKAGE-MANAGER', 'referenceType': 'purl', 'referenceLocator': 'pkg:npm/kbar'}]}
{'name': '@strapi/plugin-users-permissions', 'SPDXID': 'SPDXRef-npm-strapi-plugin-users-permissions-b951db', 'downloadLocation': 'NOASSERTION', 'filesAnalyzed': False, 'externalRefs': [{'referenceCategory': 'PACKAGE-MANAGER', 'referenceType': 'purl', 'referenceLocator': 'pkg:npm/%40strapi/plugin-users-permissions'}]}
{'name': '@strapi/strapi', 'SPDXID': 'SPDXRef-npm-strapi-strapi-ec90fc', 'downloadLocation': 'NOASSERTION', 'filesAnalyzed': False, 'externalRefs': [{'referenceCategory': 'PACKAGE-MANAGER', 'referenceType