In [1]:
import requests
import json
import pandas as pd
from datetime import datetime as dt
import pytz
import time

In [2]:
grants = pd.read_excel('gr15_grants.xlsx')

gr_aplic = pd.read_json('grants_applications_gr15.json').T

df  = gr_aplic.merge(grants, on = 'grant_id' )

In [3]:
df.columns

Index(['grant_id', 'active_x', 'approved', 'address_x', 'title_x', 'url',
       'description_x', 'created_on_x', 'active_y', 'title_y', 'address_y',
       'amount_received', 'amount_received_in_round', 'contribution_count',
       'contributor_count', 'description_y', 'website', 'github_project_url',
       'twitter_handle_2', 'twitter_handle_1', 'twitter_verified',
       'created_on_y', 'last_update'],
      dtype='object')

In [52]:

## functions 


def github_code_stats( owner, repo):
    url = "https://api.github.com/repos/{owner}/{repo}/stats/code_frequency"
    headers = {
     'X-GitHub-Api-Version': '2022-11-28', 
     'accept':'application/vnd.github+json'}
    

    return { 'status_code':requests.get(url.format(owner=owner, repo=repo), headers=headers).status_code , 
            'repo_data' : requests.get(url.format(owner=owner, repo=repo), headers=headers).json()}




## get the owner and the repo name of projects github urls, used on the 'start_stats_aggregation' function return a df
def get_owner_repo(github_urls):
    
    owner = []
    repo = []
    url = []


    for i in github_urls:
        try:
            if i.split('/')[2] == 'github.com':
                try:
                    if type(i.split('/')[4]) == str:
                        repo.append(i.split('/')[4])
                        owner.append(i.split('/')[3])
                        url.append(i)

                except IndexError:
                    # Handle the IndexError when the split method doesn't produce enough elements
                    repo.append(None)
                    owner.append(None)
                    url.append(None)

            else: 
                repo.append(None)
                owner.append(None)
                url.append(None)

        except IndexError:
            # Handle the IndexError when the split method doesn't produce enough elements
            repo.append(None)
            owner.append(None)
            url.append(None)

    
    github_owner_repo = pd.DataFrame(data= [url, owner, repo]).T
    github_owner_repo.columns = ['url','owner', 'repo']
    
    return github_owner_repo


# start the github stats aggregation as its is not in cache, returns 'good to go' whe all the data have been initialized             
def start_stats_aggregation(github_df_repos):
    start_time = time.time()
    for i in range(0,len(github_df_repos['owner'])):
        # gettin owner and repo
        git_owner = github_df_repos.iloc[i]['owner']
        git_repo = github_df_repos.iloc[i]['repo']

        # pocking the APi to start gathering the stats
        github_code_stats(git_owner, git_repo)

        
    time.sleep(60)

    status = {'start_status':'it did not run yet'}
    counting_waiting_loops = 0
    
    while github_code_stats(github_df_repos.iloc[-1]['owner'], github_df_repos.iloc[-1]['repo'])['status_code'] == 202:
        
        time.sleep(60)
        counting_waiting_loops += 1 
        status['start_status'] = f" passed the {counting_waiting_loops} waiting loop"
     
    
    else: 
        status_code = github_code_stats(github_df_repos.iloc[-1]['owner'],github_df_repos.iloc[-1]['repo'])['status_code']
        
        if status_code in [400,401,402]:
            
            status['start_status'] = f"error status code {status_code}"
 
        elif status_code == 404:
    
            print('github url not valid or doesnt exist')


        elif status_code == 200:
            
            status['start_status'] = 'good to go'
        
        else:
            
            status['start_status'] = 'unknown error'
    
    return  status['start_status']




def sunday_timestamp(week_number, year):
    # Create a datetime object for the first day of the given year
    first_day = datetime.datetime(year, 1, 1, tzinfo=pytz.utc)
    
    # Calculate the number of days to the first Sunday of the year
    days_to_first_sunday = (6 - first_day.weekday()) % 7
    
    # Calculate the number of days to the Sunday of the given week
    days_to_sunday = (week_number - 1) * 7 + days_to_first_sunday
    
    # Create a datetime object for the Sunday of the given week
    sunday = first_day + datetime.timedelta(days=days_to_sunday)
    
    # Convert the datetime object to a UTC timestamp
    return int(sunday.timestamp())


# repo_data in json 
# start and end date aggregation in number of weeks

def timeframing_data(repo_data, start_date_aggregation, end_date_aggregation):
    weeks = []
    addition = []
    deletions = []

    for i in range(0,len(repo_data)):
        weeks.append(repo_data[i][0])
        addition.append(repo_data[i][1])
        deletions.append(repo_data[i][2])


    week_addition = pd.DataFrame( data = [weeks, addition, deletions]).T

    week_addition.columns = ['weeks', 'addition', 'deletions']

    additions_by_week = week_addition[(week_addition['weeks']<= sunday_timestamp(start_date_aggregation,2022)) & 
                  (week_addition['weeks'] > (sunday_timestamp(start_date_aggregation,2022) - end_date_aggregation* 604800 ))]
    
    return additions_by_week


## pipeline

In [16]:
sample =  df[0:20]['github_project_url']

In [17]:
sample

0                      https://github.com/socathie/zkML
1                  https://github.com/DIMCHERRY/NFT-Ads
2                        https://github.com/Energy-Node
3         https://github.com/SaveWithBuckets/Buckets_v1
4                     https://github.com/pshdev0/dexode
5                     https://github.com/pshdev0/dexode
6                             https://github.com/anspar
7                   https://github.com/Nawarat-Protocol
8                           https://github.com/VMLVaske
9            https://github.com/AthanorLabs/atomic-swap
10                                                  NaN
11                        https://github.com/dailyfeeds
12                                                  NaN
13                              https://github.com/0xpm
14               https://github.com/holic/web3-scaffold
15             https://github.com/lenstube-xyz/lenstube
16          https://github.com/heacare/habitat-sdk-dart
17          https://github.com/organizations/Met

In [53]:
# from a list get the owner and repo names
sample =  df[5:17]['github_project_url']
sample = sample[~sample.isna()]
owner_repo_names  = get_owner_repo(sample)

#clean the none values
owner_repo_names = owner_repo_names[~owner_repo_names['repo'].isna()]

# poke the github APi to start aggregation of data for the repos we want 
aggregation_status = start_stats_aggregation(owner_repo_names[['owner', 'repo']])

# aqui eu tenho um df com 




In [54]:
aggregation_status

'good to go'

In [43]:

sample =  df[5:17]['github_project_url']
sample = sample[~sample.isna()]

owner_repo_names  = get_owner_repo(sample)

owner_repo_names = owner_repo_names[~owner_repo_names['repo'].isna()]

owner_repo_names

Unnamed: 0,url,owner,repo
0,https://github.com/pshdev0/dexode,pshdev0,dexode
4,https://github.com/AthanorLabs/atomic-swap,AthanorLabs,atomic-swap
7,https://github.com/holic/web3-scaffold,holic,web3-scaffold
8,https://github.com/lenstube-xyz/lenstube,lenstube-xyz,lenstube
9,https://github.com/heacare/habitat-sdk-dart,heacare,habitat-sdk-dart


In [58]:
# if aggregation_status = 'good to go':

end_date_aggregation  = 27 # !!!!! precisa ser definido pelo usu[arios]!!!
start_date_aggregation = 23 # !!!precisa ser definido pelo usu[arios]!!!


# build df to receive data 
data = []
columns_names = ['url','weeks', 'addition']
repo_hitorie = pd.DataFrame(data = data, columns = columns_names )


for i in range(0, len(owner_repo_names['url'])):
    
        data_json = github_code_stats(owner_repo_names.iloc[i]['owner'], owner_repo_names.iloc[i]['repo'])['repo_data'] 
        df_url_data = timeframing_data(repo_data = data_json, start_date_aggregation = start_date_aggregation, 
                            end_date_aggregation =  end_date_aggregation)[['weeks', 'addition']]   
        df_url_data.insert(loc=0,
          column='url',
          value= i )
        repo_data = pd.concat([repo_hitorie, df_url_data])
        

KeyError: 0

In [24]:
repo_historie

Unnamed: 0,"url,weeks",additions


In [19]:

# repo_data in json 
# start and end date aggregation in number of weeks
def timeframing_data(repo_data, start_date_aggregation, end_date_aggregation):
    weeks = []
    addition = []
    deletions = []

    for i in range(0,len(repo_data)):
        weeks.append(repo_data[i][0])
        addition.append(repo_data[i][1])
        deletions.append(repo_data[i][2])


    week_addition = pd.DataFrame( data = [weeks, addition, deletions]).T

    week_addition.columns = ['weeks', 'addition', 'deletions']

    additions_by_week = week_addition[(week_addition['weeks']<= sunday_timestamp(start_date_aggregation,2022)) & 
                  (week_addition['weeks'] > (sunday_timestamp(start_date_aggregation,2022) - end_date_aggregation* 604800 ))]
    
    return additions_by_week


In [68]:

# remaining rate limit

import requests
import time
import json

# Define API endpoint and personal access token
url = 'https://api.github.com/user/repos'
headers = {'Authorization': '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'}

# Make API request
response = requests.get(url, headers=headers)

# Access X-RateLimit-Remaining header
remaining_requests = int(response.headers['X-RateLimit-Remaining'])
print(f'You have {remaining_requests} API requests remaining')

# Handle API response
if response.status_code == 200:
    data = json.loads(response.text)
    # Process data as needed
else:
    print(f'API request failed with status code {response.status_code}')

# Wait if rate limit is close to being reached
if remaining_requests < 10:
    print('Rate limit approaching. Waiting before making additional requests...')
    time.sleep(60)  # Wait for 60 seconds

You have 59 API requests remaining
API request failed with status code 401


In [51]:
requests.get(url, headers=headers).headers

{'Server': 'GitHub.com', 'Date': 'Sat, 15 Apr 2023 14:54:23 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '149', 'X-GitHub-Media-Type': 'github.v3; format=json', 'x-github-api-version-selected': '2022-11-28', 'X-RateLimit-Limit': '60', 'X-RateLimit-Remaining': '18', 'X-RateLimit-Reset': '1681573299', 'X-RateLimit-Used': '42', 'X-RateLimit-Resource': 'core', 'Access-Control-Expose-Headers': 'ETag, Link, Location, Retry-After, X-GitHub-OTP, X-RateLimit-Limit, X-RateLimit-Remaining, X-RateLimit-Used, X-RateLimit-Resource, X-RateLimit-Reset, X-OAuth-Scopes, X-Accepted-OAuth-Scopes, X-Poll-Interval, X-GitHub-Media-Type, X-GitHub-SSO, X-GitHub-Request-Id, Deprecation, Sunset', 'Access-Control-Allow-Origin': '*', 'Strict-Transport-Security': 'max-age=31536000; includeSubdomains; preload', 'X-Frame-Options': 'deny', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '0', 'Referrer-Policy': 'origin-when-cross-origin, strict-origin-when-cross-origin', 'Content-