### Github Access Token

In [5]:
import os
from dotenv import load_dotenv
import requests
import csv
from datetime import datetime

load_dotenv()

GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN")
HEADERS = {"Authorization": f"token {GITHUB_ACCESS_TOKEN}"}

### Fetch Users

In [8]:
def fetch_users(location="Toronto", min_followers=100):
    url = "https://api.github.com/search/users"
    params = {
        "q": f"location:{location} followers:>{min_followers}",
        "per_page": 100,
        "page": 1
    }
    users = []
    while True:
        response = requests.get(url, headers=HEADERS, params=params).json()
        users.extend(response.get("items", []))
        if "next" not in response or not response["items"]:
            break
        params["page"] += 1
    return users

In [9]:
def fetch_user_details(username):
    url = f"https://api.github.com/users/{username}"
    return requests.get(url, headers=HEADERS).json()


### Fetch Repos

In [10]:
def fetch_repositories(username, max_repos=500):
    url = f"https://api.github.com/users/{username}/repos"
    params = {"per_page": 100, "page": 1}
    repos = []
    while len(repos) < max_repos:
        response = requests.get(url, headers=HEADERS, params=params).json()
        if not response:
            break
        repos.extend(response)
        params["page"] += 1
    return repos[:max_repos]

### Cleaning

In [11]:
def clean_company_name(name):
    if not name:
        return ""
    name = name.strip()
    if name.startswith("@"):
        name = name[1:]
    return name.upper()


### Generating the CSV files

In [12]:
def write_users_csv(users):
    with open("users.csv", "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow([
            "login", "name", "company", "location", "email", "hireable", "bio",
            "public_repos", "followers", "following", "created_at"
        ])
        for user in users:
            user_details = fetch_user_details(user['login'])
            writer.writerow([
                user_details.get("login", ""),
                user_details.get("name", ""),
                clean_company_name(user_details.get("company", "")),
                user_details.get("location", ""),
                user_details.get("email", ""),
                user_details.get("hireable", ""),
                user_details.get("bio", ""),
                user_details.get("public_repos", 0),
                user_details.get("followers", 0),
                user_details.get("following", 0),
                user_details.get("created_at", "")
            ])

In [15]:
def write_repositories_csv(users):
    with open("repositories.csv", "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow([
            "login", "full_name", "created_at", "stargazers_count",
            "watchers_count", "language", "has_projects", "has_wiki", "license_name"
        ])
        for user in users:
            repos = fetch_repositories(user['login'])
            for repo in repos:
                writer.writerow([
                    user['login'],
                    repo.get("full_name", ""),
                    repo.get("created_at", ""),
                    repo.get("stargazers_count", 0),
                    repo.get("watchers_count", 0),
                    repo.get("language", ""),
                    repo.get("has_projects", False),
                    repo.get("has_wiki", False),
                    repo.get("license", {}).get("name", "") if repo.get("license") else ""
                ])

In [16]:
users = fetch_users()
write_users_csv(users)
write_repositories_csv(users)

Question 1

In [17]:
import duckdb

con = duckdb.connect()

In [21]:
query = "SELECT * FROM read_csv_auto('users.csv')"
con.execute(query).fetchdf()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,aneagoie,Andrei Neagoie,,"Toronto, Canada",,True,Senior Software Dev turned Instructor. Founder...,145,10278,1,2015-01-30 17:05:43
1,ZhangMYihua,Yihua Zhang,,Toronto,yihuazhang2@gmail.com,,Toronto Software Developer,143,5801,11,2015-01-18 00:01:02
2,susanli2016,Susan Li,,Toronto Canada,,,Chief Data Scientist,34,4921,68,2016-11-28 04:22:39
3,thedaviddias,David Dias,KIJIJICA,"Toronto, Canada",,,💻 Passionate Front-End Dev & 🎨 UI/UX fan. Cont...,88,4546,303,2010-04-05 14:40:12
4,ange-yaghi,Ange Yaghi,,Toronto,me@angeyaghi.com,,C++ Developer,32,4023,11,2016-07-13 21:01:21
...,...,...,...,...,...,...,...,...,...,...,...
95,nikolovlazar,Lazar Nikolov,GETSENTRY,"Toronto, Canada",hello@nikolovlazar.com,True,Full stack engineer & educator. @getsentry + @...,85,532,46,2013-09-06 02:07:23
96,mobinni,Mo Binni,ZERO TO MASTERY,"Toronto, Ontario",mo@binni.io,,,71,529,6,2012-09-17 09:17:31
97,kulkarniankita,Ankita Kulkarni,HTTPS://FRONTENDSNACKS.DEV/,"Toronto, ON",kulkarni.ankita09@gmail.com,True,I'm a Creator and Educator!\r\n\r\nTo help you...,70,527,0,2012-10-14 22:46:52
98,ploopyco,,PLOOPY,"Toronto, Canada",,,,18,523,1,2019-09-04 22:37:14


In [22]:
query = "SELECT login from read_csv_auto('users.csv') ORDER BY followers DESC LIMIT 5"
con.execute(query).fetchdf()

Unnamed: 0,login
0,aneagoie
1,ZhangMYihua
2,susanli2016
3,thedaviddias
4,ange-yaghi


In [24]:
query = "SELECT login from read_csv_auto('users.csv') ORDER BY created_at ASC LIMIT 5"
con.execute(query).fetchdf()

Unnamed: 0,login
0,michaelklishin
1,vito
2,benvinegar
3,shazow
4,petertodd


In [25]:
query = "SELECT license_name from read_csv_auto('repositories.csv') WHERE license_name != '' GROUP BY license_name ORDER BY COUNT(*) DESC LIMIT 3"
con.execute(query).fetchdf()

Unnamed: 0,license_name
0,MIT License
1,Other
2,Apache License 2.0


In [28]:
query = "SELECT company, COUNT(*) as count from read_csv_auto('users.csv') WHERE company != '' GROUP BY company ORDER BY COUNT(*) DESC"
con.execute(query).fetchdf()

Unnamed: 0,company,count
0,NX,2
1,GETSENTRY,2
2,GITHUB,2
3,UNIVERSITY OF TORONTO,2
4,NVIDIA,2
5,PINTEREST,1
6,SNAPLII @SNAPPAYINC,1
7,KIJIJICA,1
8,UBER @UBER-ATG @UBER-RESEARCH,1
9,PAGERDUTY,1


In [33]:
query = f"""
    SELECT language
    FROM read_csv_auto('repositories.csv') AS r
    JOIN read_csv_auto('users.csv') AS u ON r.login = u.login
    WHERE u.created_at > '2020-01-01' AND language != ''
    GROUP BY language
    ORDER BY COUNT(*) DESC
    LIMIT 1 OFFSET 1
"""
con.execute(query).fetchdf()

Unnamed: 0,language
0,JavaScript


In [36]:
query = f"""
    select language, avg(stargazers_count) as avg_stars
    from read_csv_auto('repositories.csv') as r
    group by language
    order by avg_stars desc
"""
con.execute(query).fetchdf()

Unnamed: 0,language,avg_stars
0,Forth,1191.000000
1,ASP.NET,414.000000
2,Jupyter Notebook,246.060150
3,Cython,209.000000
4,SCSS,180.454545
...,...,...
123,Blade,0.000000
124,Logos,0.000000
125,Apex,0.000000
126,Processing,0.000000


In [37]:
query = """
    select login, followers / (1 + following) as leader_strength
    from read_csv_auto('users.csv')
    order by leader_strength desc
    limit 5
"""
con.execute(query).fetchdf()

Unnamed: 0,login,leader_strength
0,aneagoie,5139.0
1,nayuki,3546.0
2,GrapheneOS,3524.0
3,hlissner,2424.0
4,rspivak,2180.0


In [None]:
query = f"""
    SELECT followers, public_repos
    FROM read_csv_auto('users.csv')
"""

toronto_users_df = con.execute(query).fetchdf()
correlation = toronto_users_df['followers'].corr(toronto_users_df['public_repos'])
print(correlation)

-0.05782766813920423


In [43]:
import pandas as pd
import statsmodels.api as sm

In [48]:
X = toronto_users_df['public_repos']
y = toronto_users_df['followers']
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
slope = model.params['public_repos']

print(slope)

-0.35360211676936415


In [49]:
query = "select * from read_csv_auto('repositories.csv')"
repos_df = con.execute(query).fetchdf()

In [54]:
print(toronto_users_df)

    followers  public_repos
0       10278           145
1        5801           143
2        4921            34
3        4546            88
4        4023            32
..        ...           ...
95        532            85
96        529            71
97        527            70
98        523            18
99        505           176

[100 rows x 2 columns]


In [51]:
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])
print(correlation)

0.47926561266968015


In [55]:
query = "select * from read_csv_auto('users.csv')"
users_df = con.execute(query).fetchdf()

In [58]:
print(users_df)

             login             name                      company  \
0         aneagoie   Andrei Neagoie                         None   
1      ZhangMYihua      Yihua Zhang                         None   
2      susanli2016         Susan Li                         None   
3     thedaviddias       David Dias                     KIJIJICA   
4       ange-yaghi       Ange Yaghi                         None   
..             ...              ...                          ...   
95    nikolovlazar    Lazar Nikolov                    GETSENTRY   
96         mobinni         Mo Binni              ZERO TO MASTERY   
97  kulkarniankita  Ankita Kulkarni  HTTPS://FRONTENDSNACKS.DEV/   
98        ploopyco             None                       PLOOPY   
99       DylanVann       Dylan Vann                         None   

                      location                        email hireable  \
0              Toronto, Canada                         None     True   
1                      Toronto        y

In [62]:
import numpy as np

In [None]:
avg_following_hireable = users_df[users_df['hireable'] == True]['following'].mean()
print(avg_following_hireable)
avg_following_non_hireable = users_df[users_df['hireable'] == False]['following'].mean()
print(avg_following_non_hireable)
difference = avg_following_hireable - avg_following_non_hireable
print(difference)

58.0
nan
nan


In [65]:
query = f"""
    SELECT bio, followers
    FROM read_csv_auto('users.csv')
    WHERE bio IS NOT NULL AND bio != ''
"""

users_df = con.execute(query).fetchdf()

users_df['bio_word_count'] = users_df['bio'].str.split().str.len()

X = users_df['bio_word_count']
y = users_df['followers']
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
slope = model.params['bio_word_count']

print(slope)

-3.706201683805979


In [66]:
query = f"""
    SELECT login, created_at
    FROM read_csv_auto('repositories.csv')
"""

repos_df = con.execute(query).fetchdf()

repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

repos_df['is_weekend'] = repos_df['created_at'].dt.dayofweek >= 5

weekend_repos_count = repos_df[repos_df['is_weekend']].groupby('login').size().reset_index(name='count')

top_weekend_users = weekend_repos_count.sort_values(by='count', ascending=False).head(5)

top_users_logins = top_weekend_users['login'].tolist()

print("Top 5 users who created the most repositories on weekends (UTC):", ', '.join(top_users_logins))

Top 5 users who created the most repositories on weekends (UTC): GunterMueller, jsoref, vladikoff, vito, rokon12


In [68]:
query = f"""
    SELECT name
    FROM read_csv_auto('users.csv')
    WHERE name IS NOT NULL AND name != ''
"""

users_df = con.execute(query).fetchdf()

users_df['surname'] = users_df['name'].str.strip().str.split().str[-1]

surname_counts = users_df['surname'].value_counts()

max_count = surname_counts.max()

most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()

most_common_surnames.sort()

print("Most common surname(s):", ', '.join(most_common_surnames))

Most common surname(s): Ahmed
