In [None]:
import json
import os
import time

import pandas as pd
import requests as rq

### Functions for searching users and fetching user profiles

In [None]:
# Read the GitHub token from the GITHUB_TOKEN environment variable so the
# credential is never stored in the notebook. The original placeholder string
# is kept as the fallback so behaviour is unchanged when the variable is unset.
token = os.environ.get("GITHUB_TOKEN", "github_key")

# Every request in this notebook sends the token as a Bearer credential.
headers = {"Authorization": f"Bearer {token}"}

# Root of the GitHub REST API; endpoint paths are appended to it.
base_url = "https://api.github.com"
def get_users_from_place_with_follower(place="",min_follower=""):
    """Search GitHub users by location and minimum follower count.

    Pages through ``GET /search/users`` with the query
    ``location:{place} followers:>{min_follower}`` and collects the ``items``
    list of every page until an empty page is returned or the search result
    cap is reached.

    Args:
        place: Text interpolated into the ``location:`` search qualifier.
        min_follower: Exclusive lower bound used in the ``followers:>``
            qualifier.

    Returns:
        list: The user entries returned by the search, in API order.

    Raises:
        requests.HTTPError: If the API answers with an error status (for
            example when the request is rate limited), instead of failing
            later with a ``KeyError`` on the missing ``items`` key.
    """
    url = f"{base_url}/search/users"
    per_page = 100      # GitHub's maximum page size
    max_results = 1000  # the Search API only exposes the first 1000 matches
    last_page = max_results // per_page

    full_data = []
    for page in range(1, last_page + 1):
        params = {
            'q': f'location:{place} followers:>{min_follower}',
            'per_page': per_page,
            'page': page,
        }

        response = rq.get(url=url, params=params, headers=headers)
        # Surface error payloads explicitly rather than indexing into them.
        response.raise_for_status()

        # Parse the body once; a missing 'items' key counts as an empty page.
        items = response.json().get('items') or []
        if not items:
            break

        full_data.extend(items)

    return full_data

def get_user_info(username):
    """Return the decoded JSON body of ``GET /users/{username}``."""
    profile_response = rq.get(f"{base_url}/users/{username}", headers=headers)
    profile = profile_response.json()
    return profile


In [None]:
# Search for accounts located in Zurich with more than 50 followers.
searched_users = res = get_users_from_place_with_follower("Zurich", 50)

# Look up the profile of every account found, pausing 0.1 s between requests.
all_user_info = list()
for user in searched_users:
    user_info = get_user_info(username=user['login'])
    all_user_info += [user_info]
    time.sleep(0.1)

##### Convert into DataFrame

In [None]:
# Build the user table, keeping only the listed profile fields.
user_info_df = pd.DataFrame(
    data=all_user_info,
    columns=[
        'login', 'name', 'company', 'location', 'email', 'hireable',
        'bio', 'public_repos', 'followers', 'following', 'created_at',
    ],
)

In [None]:
def clean_company_name(name):
    """Normalise a company string for grouping and comparison.

    Trims surrounding whitespace, upper-cases the text and removes the first
    ``@`` it contains.

    Args:
        name: Raw company value; may be ``None`` or a non-string missing
            marker such as ``float('nan')``.

    Returns:
        str: The cleaned company name, or ``''`` when no company is given.
    """
    # Missing values arrive as None from the API and as NaN after a CSV
    # round-trip; neither has string methods, so map both to ''.
    if not isinstance(name, str):
        return ''
    without_at = name.strip().upper().replace("@", "", 1)
    # Strip again: removing a leading "@" can expose whitespace ("@ acme").
    return without_at.strip()


# Normalise company names element by element.
user_info_df["company"] = user_info_df["company"].map(clean_company_name)
# Collapse hireable to a strict boolean: only an explicit True stays True.
user_info_df["hireable"] = user_info_df["hireable"].eq(True)

#### Save User data as CSV

In [None]:
# Persist the user table without the positional index column.
user_info_df.to_csv(path_or_buf='users.csv', index=False)

#### Function for fetching repositories by username 

In [None]:
def get_user_repositories(username, max_repos=500):
    """Fetch a user's repositories, most recently pushed first.

    Pages through ``GET /users/{username}/repos`` (100 per page, sorted by
    ``pushed`` descending) until the listing is exhausted or ``max_repos``
    repositories have been collected.

    Args:
        username: GitHub login whose repositories are listed.
        max_repos: Upper bound on the number of repositories returned.
            Defaults to 500, the limit that was previously hard-coded.

    Returns:
        list: Repository entries as returned by the API, at most
        ``max_repos`` of them.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    url = f"{base_url}/users/{username}/repos"
    per_page = 100  # Maximum allowed per page
    repos = []
    page = 1
    while len(repos) < max_repos:
        params = {
            'per_page': per_page,
            'page': page,
            'sort': 'pushed',
            'direction': 'desc',
        }
        response = rq.get(url, headers=headers, params=params)
        # An error body is a dict: extending the list with it would append its
        # keys as bogus "repositories" and keep paging, so fail explicitly.
        response.raise_for_status()
        data = response.json()

        if not data:  # Exit loop if no more repos
            break

        repos.extend(data)
        if len(data) < per_page:  # A short page is the last page
            break

        page += 1
        time.sleep(0.1)  # Avoid rate limits

    # Trim in case max_repos is not a multiple of the page size.
    return repos[:max_repos]

In [None]:
# Collect the repositories of every user in the user table. `all_repos` was
# not defined anywhere in the notebook, so this cell raised NameError on a
# fresh kernel.
all_repos = []
for login in user_info_df["login"]:
    all_repos.extend(get_user_repositories(login))

# "login" and "license_name" are not API fields; they are created empty here
# and filled in by the statements that follow.
columns=["login","full_name","created_at","stargazers_count","watchers_count","language","has_projects","has_wiki","license","license_name"]
all_repos_df = pd.DataFrame(all_repos,columns=columns)

def get_login(fullame):
    """Return the text before the first '/' of an ``owner/repo`` full name."""
    owner, _, _ = fullame.partition('/')
    return owner
# Derive the owner login from the "owner/repo" full name.
all_repos_df["login"] = all_repos_df["full_name"].map(get_login)

# Flatten the license object to its key ('' when there is none), then drop
# the raw license column.
all_repos_df["license_name"] = all_repos_df["license"].map(
    lambda lic: lic['key'] if lic else ''
)
all_repos_df = all_repos_df.drop(columns="license")

In [None]:
# Preview the first five repository rows.
all_repos_df.head(n=5)

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,IDouble,IDouble/Binary-Calculator-Android-Java-App,2018-10-10T07:36:38Z,30,30,Java,True,True,mit
1,IDouble,IDouble/Binary-Calculator-JavaScript,2018-06-24T09:30:35Z,90,90,JavaScript,True,True,mit
2,IDouble,IDouble/Blazor-Razor-Playground-C-Sharp,2019-08-22T04:19:05Z,26,26,HTML,True,True,mit
3,IDouble,IDouble/BMI-Calculator-Android-Java-App,2018-10-17T06:00:02Z,27,27,Java,True,True,mit
4,IDouble,IDouble/BMI-Calculator-JavaScript,2018-10-15T02:35:49Z,35,35,JavaScript,True,True,mit


#### Save repos as CSV from DataFrame

In [None]:

# Persist the repository table without the positional index column.
all_repos_df.to_csv(path_or_buf='repositories.csv', index=False)

In [None]:
# Leader score: followers divided by (1 + following); the +1 avoids division
# by zero for accounts that follow nobody.
user_info_df['leader'] = user_info_df['followers'].div(user_info_df['following'].add(1))

In [None]:

# Pearson correlation between follower count and number of public repos.
user_info_df['followers'].corr(other=user_info_df['public_repos'], method='pearson')


0.0653228767047157

In [None]:
# Logins of the five users with the highest leader score.
user_info_df.sort_values("leader", ascending=False)["login"].head(5).to_list()

['riscv', 'bpasero', 'Seldaek', 'egamma', 'ethz-asl']

In [None]:
# Same followers vs. public_repos correlation as computed in an earlier cell.
user_info_df.followers.corr(user_info_df.public_repos)

0.0653228767047157

In [None]:

# Correlation between a repository having a wiki and having projects enabled.
all_repos_df.has_wiki.corr(all_repos_df.has_projects)

0.3116476904795809

In [None]:
# Average number of accounts followed by users marked hireable.
user_info_df.loc[user_info_df['hireable'].eq(True), 'following'].mean()

nan

In [None]:
# Keep only users that published an email address. `email != None` is
# evaluated element by element and is True for every row (missing values
# included), so it filtered nothing; `notna()` is the null-aware test.
user_info_df[user_info_df.email.notna()]

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at,leader
0,IDouble,Alp ₿📈🚀🌕,IDEX/USD IDEXIO,"Zurich, Switzerland",,False,🗽 Be greedy when others are fearful and be fea...,61,33111,323036,2016-03-31T09:16:13Z,0.102499
1,TheOfficialFloW,Andy Nguyen,,Zurich,theofficialflow1996@gmail.com,False,Information Security Engineer,39,4599,32,2015-09-12T08:16:45Z,139.363636
2,Seldaek,Jordi Boggiano,PACKAGIST,"Zürich, Zurich, Switzerland",j.boggiano@seld.be,False,\r\n Working on https://packagist.com and h...,259,4564,1,2010-01-16T18:28:47Z,2282.000000
3,riscv,RISC-V,,"Zurich, CH",info@riscv.org,False,The Open-Standard Instruction Set Architecture,55,3189,0,2015-02-05T21:49:09Z,3189.000000
4,JonnyBurger,Jonny Burger,REMOTION-DEV,"Zurich, Switzerland",hi@jonny.io,False,Creative hacker @remotion-dev \r\n,239,2463,30,2012-04-10T14:57:36Z,79.451613
...,...,...,...,...,...,...,...,...,...,...,...,...
469,vxsx,Vadim Sikora,DIVIO,Zurich,vadim.sikora@gmail.com,False,Boring guy.,110,51,15,2010-08-17T05:26:11Z,3.187500
470,bianan,Yatao (An) Bian,ETH ZURICH,Zurich,,False,Machine Learning Researcher,43,51,28,2016-12-17T23:40:15Z,1.758621
471,dmengelt,Dominik Mengelt,GOOGLE,Zurich,dominik.mengelt@gmail.com,False,"Developer Relations Engineer, Payments at Google",14,51,15,2013-05-09T17:57:36Z,3.187500
472,xarthurx,ZhaoMA,ETH ZURICH,Zurich,ma@arch.ethz.ch,False,Senior Researcher @ ETH Zurich.,17,51,46,2012-07-04T12:00:24Z,1.085106


In [None]:
# Summarise the user table: column dtypes, non-null counts and memory usage.
user_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 474 entries, 0 to 473
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   login         474 non-null    object 
 1   name          467 non-null    object 
 2   company       474 non-null    object 
 3   location      474 non-null    object 
 4   email         228 non-null    object 
 5   hireable      474 non-null    bool   
 6   bio           316 non-null    object 
 7   public_repos  474 non-null    int64  
 8   followers     474 non-null    int64  
 9   following     474 non-null    int64  
 10  created_at    474 non-null    object 
 11  leader        474 non-null    float64
dtypes: bool(1), float64(1), int64(3), object(7)
memory usage: 41.3+ KB
