In [None]:
!pip install PyGithub

Collecting PyGithub
  Downloading PyGithub-2.4.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from PyGithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading PyGithub-2.4.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.6/362.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynacl, PyGithub
Successfully installed PyGithub-2.4.0 pynacl-1.5.0


In [None]:
import os

import requests
from github import Github

# Step 1: Define the GitHub API token and headers
# The token is read from the GITHUB_TOKEN environment variable so the credential
# is never stored in the notebook; it falls back to '' when the variable is unset.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '') # GitHub token
# Only send an Authorization header when there is a token to put in it, instead
# of sending a header that carries an empty credential.
HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'} if GITHUB_TOKEN else {}

# Step 2: Define functions to get users and their repositories
def get_all_users_from_Tokyo(min_followers=200):
    """Search GitHub for users located in Tokyo with more than ``min_followers`` followers.

    Pages through ``https://api.github.com/search/users`` 100 results at a
    time, sending the module-level ``HEADERS`` with every request.

    Args:
        min_followers: Exclusive lower bound on the follower count (default 200).

    Returns:
        A list of ``(login, followers_url)`` tuples in the order the API
        returned them.

    Raises:
        requests.HTTPError: If GitHub answers with an error status (for
            example bad credentials or rate limiting), instead of silently
            returning a partial list.
    """
    url = "https://api.github.com/search/users"
    per_page = 100
    # The search API serves at most 1,000 results per query; requesting a page
    # beyond that is answered with an error, so stop paging once it is reached.
    max_results = 1000
    params = {'q': f"location:tokyo followers:>{min_followers}", 'per_page': per_page}

    users = []
    page = 1

    while len(users) < max_results:
        params['page'] = page
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, headers=HEADERS, params=params, timeout=30)
        response.raise_for_status()

        items = response.json().get('items', [])
        users.extend((user['login'], user['followers_url']) for user in items)

        # A short page means there is nothing left to fetch.
        if len(items) < per_page:
            break
        page += 1

    return users

def get_repositories(username, max_repos=500):
    """Fetch summary records for a user's most recently pushed public repositories.

    Pages through ``https://api.github.com/users/{username}/repos`` (sorted by
    push time, newest first) 100 repositories at a time, sending the
    module-level ``HEADERS``, and stops as soon as ``max_repos`` repositories
    have been collected so no unused pages are downloaded.

    Args:
        username: GitHub login whose repositories are listed.
        max_repos: Maximum number of repositories to return (default 500).

    Returns:
        A list of dicts with the keys ``login``, ``full_name``, ``created_at``,
        ``stargazers_count``, ``watchers_count``, ``language``,
        ``has_projects``, ``has_wiki`` and ``license_name``. ``license_name``
        is the string ``'No license'`` when the repository has no license.

    Raises:
        requests.HTTPError: If GitHub answers with an error status. Without
            this check an error payload (a dict) would be appended as if it
            were a list of repositories and fail later with a confusing
            ``TypeError``.
    """
    url = f"https://api.github.com/users/{username}/repos"
    per_page = 100  # Max per page
    params = {
        'sort': 'pushed',  # Sort by most recently pushed
        'direction': 'desc',  # Descending order
        'per_page': per_page,
        'page': 1  # Start at the first page
    }

    all_repos = []

    while len(all_repos) < max_repos:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, headers=HEADERS, params=params, timeout=30)
        response.raise_for_status()
        repos = response.json()

        if not repos:
            break

        all_repos.extend(repos)

        # A short page means this was the last one.
        if len(repos) < per_page:
            break

        params['page'] += 1  # Move to the next page

    return [
        {
            'login': username,
            'full_name': repo['full_name'],
            'created_at': repo['created_at'],
            'stargazers_count': repo['stargazers_count'],
            'watchers_count': repo['watchers_count'],
            'language': repo['language'],
            'has_projects': repo['has_projects'],
            'has_wiki': repo['has_wiki'],
            'license_name': repo['license']['name'] if repo['license'] else 'No license'
        }
        for repo in all_repos[:max_repos]  # Limit to the most recently pushed
    ]

In [None]:
# PyGithub client used for the per-user profile lookups below
g = Github(GITHUB_TOKEN)

# Step 3: Search for Tokyo users above the default follower threshold
users = get_all_users_from_Tokyo()

# One profile record per user, plus the repository records of all users
user_details, repo_details = [], []

for username, _ in users:
    user = g.get_user(username)

    # Missing text fields become '' and a missing hireable flag becomes False;
    # the company is trimmed, stripped of leading '@' and upper-cased.
    profile = {
        'login': user.login,
        'name': user.name or '',
        'company': (user.company or '').strip().lstrip('@').upper(),
        'location': user.location or '',
        'email': user.email or '',
        'hireable': user.hireable or False,
        'bio': user.bio or '',
        'public_repos': user.public_repos,
        'followers': user.followers,
        'following': user.following,
        'created_at': user.created_at.isoformat()
    }
    user_details.append(profile)

    # Collect this user's repository records as well
    repo_details += get_repositories(username)



In [None]:
import pandas as pd

# Step 4: Turn the collected records into DataFrames and save each one as CSV
df_users = pd.DataFrame(user_details)
df_repos = pd.DataFrame(repo_details)

for frame, csv_path in ((df_users, 'users.csv'), (df_repos, 'repositories.csv')):
    frame.to_csv(csv_path, index=False)

In [None]:
import pandas as pd

# Load both tables back from the CSV files written earlier
df_users, df_repos = (
    pd.read_csv(csv_path) for csv_path in ('users.csv', 'repositories.csv')
)

1. Who are the top 5 users in Tokyo with the highest number of followers? List their login in order, comma-separated.
Users

In [None]:
# Order every user in the table by follower count, largest first
top_users_tokyo = df_users.sort_values(by='followers', ascending=False)

# Keep the logins of the five leading rows
top_logins = list(top_users_tokyo['login'].head(5))

# Print them as one comma-separated line
result = ','.join(top_logins)
print(result)

dennybritz,wasabeef,dai-shi,rui314,domenic


2. Who are the 5 earliest registered GitHub users in Tokyo? List their login in ascending order of created_at, comma-separated.
Users

In [None]:
# Order users by their created_at value, oldest first
earliest = df_users.sort_values(by='created_at', ascending=True)

# Keep the logins of the first five rows
top_logins = list(earliest['login'].iloc[:5])

# Print them as one comma-separated line
result = ','.join(top_logins)
print(result)

kana,kakutani,mootoh,lhl,walf443


3. What are the 3 most popular licenses among these users? Ignore missing licenses. List the license_name in order, comma-separated.
Licenses

In [None]:
df = df_repos.copy()
# Step 1: Filter out missing licenses.
# get_repositories() stores the placeholder string 'No license' for repositories
# without a license, so dropping NaN alone would still count them; exclude both.
has_license = df['license_name'].notna() & (df['license_name'] != 'No license')
filtered_licenses = df.loc[has_license, 'license_name']

# Step 2: Count occurrences of each license
license_counts = filtered_licenses.value_counts()

# Step 3: Get the top 3 licenses
top_licenses = license_counts.nlargest(3)

# Get license names in order, comma-separated
result = ','.join(top_licenses.index)

print(result)

No license,MIT License,Apache License 2.0


4. Which company do the majority of these developers work at?
Company (cleaned up as explained above)

In [None]:
# Tally users per company; value_counts returns the counts in descending order
company_counts = df_users['company'].value_counts()

# The first entry is therefore the most frequent company
top_companies = company_counts.head(1)
print(top_companies.index[0])

GOOGLE


5. Which programming language is most popular among these users?
Language

In [None]:
# Tally repositories per language; value_counts returns the counts in descending order
language_counts = df_repos['language'].value_counts()

# The first entry is therefore the most frequent language
top_language = language_counts.iloc[:1]
print(top_language.index[0])

JavaScript


6. Which programming language is the second most popular among users who joined after 2020?
Language

In [None]:
repos = df_repos.copy()
users = df_users.copy()

# Parse created_at into a datetime column so the join year can be compared
users['join_date'] = pd.to_datetime(users['created_at'])

# Keep users whose join year is 2020 or later
# NOTE(review): "after 2020" could also mean 2021 onwards -- confirm the intended cutoff
joined_recently = users['join_date'].dt.year >= 2020
filtered_users = users[joined_recently]

# Attach each remaining user's repositories via the shared login column
merged_df = filtered_users.merge(repos, on='login')

# Count repositories per language and take the two most frequent
language_counts = merged_df['language'].value_counts()
top_language = language_counts.nlargest(2)

# The second entry is the runner-up language
print(top_language.index[1])

Rust


7. Which language has the highest average number of stars per repository?
Language

In [None]:
# Mean stargazers_count per language, ordered from highest to lowest average
df = (
    df_repos
    .groupby('language')[['stargazers_count']]
    .mean()
    .sort_values(by='stargazers_count', ascending=False)
)
print(df.index[0])

Assembly


8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.
User login

In [None]:
df = df_users.copy()

# leader_strength = followers / (1 + following)
df['leader_strength'] = df['followers'].div(df['following'].add(1))

# Rank users from strongest to weakest
top_users = df.sort_values(by='leader_strength', ascending=False)

# Logins of the five strongest users
top_5_logins = top_users['login'].iloc[:5]

# Print them as one comma-separated line
result = ','.join(top_5_logins)

print(result)

blueimp,dai-shi,asahilina,pilcrowonpaper,marcan


9. What is the correlation between the number of followers and the number of public repositories among users in Tokyo?
Correlation between followers and repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [None]:
# Correlation between follower count and public repository count
users = df_users.copy()
followers, public_repos = users['followers'], users['public_repos']
correlation = followers.corr(public_repos)
round(correlation, 3)

0.051

10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.
Regression slope of followers on repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Build the inputs from df_users directly so this cell does not depend on a
# `users` variable left behind by whichever cell happened to run before it.
users = df_users.copy()

X = users['public_repos'].values.reshape(-1,1)  # Independent variable, as a single-column 2-D array
y = users['followers']  # Dependent variable

# Perform linear regression
model = LinearRegression()
model.fit(X, y)

# Get the slope (coefficient) of the regression line
slope = model.coef_[0]  # This is the coefficient for public_repos
slope=round(slope,3)
print(slope)

0.279


11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?
Correlation between projects and wiki enabled (to 3 decimal places, e.g. 0.123 or -0.123)

In [None]:
# Correlation between the has_projects and has_wiki flags, rounded to 3 decimals
repos = df_repos.copy()
projects_enabled, wiki_enabled = repos['has_projects'], repos['has_wiki']
correlation = round(projects_enabled.corr(wiki_enabled), 3)
print(correlation)

0.427


As the correlation is weakly positive, an association between the two variables cannot be justified; thus the answer is NaN.

12. Do hireable users follow more people than those who are not hireable?
Average of following per user for hireable=true minus the average following for the rest (to 3 decimal places, e.g. 12.345 or -12.345)

In [None]:
df = df_users.copy()

# Row masks for the two groups being compared
hireable_mask = df['hireable'] == True
not_hireable_mask = df['hireable'] == False

# Mean number of accounts followed within each group
avg_hireable = df.loc[hireable_mask, 'following'].mean()
avg_non_hireable = df.loc[not_hireable_mask, 'following'].mean()

# Hireable mean minus non-hireable mean, to 3 decimals
difference = round(avg_hireable - avg_non_hireable, 3)

print("Difference in average following (hireable - non-hireable):", difference)

Difference in average following (hireable - non-hireable): -79.652


As the difference is negative, non-hireable users follow more people than hireable users; thus the answer is NaN.

13. Some developers write long bios. Does that help them get more followers? What's the correlation of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)
Regression slope of followers on bio word count (to 3 decimal places, e.g. 12.345 or -12.345)

In [None]:
from sklearn.linear_model import LinearRegression
df = df_users.copy()
# Step 1: Filter out entries without bios.
# The explicit .copy() makes the filtered frame independent, so adding a column
# below does not assign into a slice of another frame (chained-assignment warning).
has_bio = df['bio'].notna() & (df['bio'] != '')
df = df[has_bio].copy()

# Step 2: Calculate the word count of each bio (whitespace-separated tokens)
df['bio_word_count'] = df['bio'].str.strip().str.split().str.len()

# Step 3: Prepare data for regression
X = df[['bio_word_count']]
y = df['followers']

# Step 4: Perform linear regression
model = LinearRegression()
model.fit(X, y)

# Get the slope
slope = model.coef_[0]

# Print the slope rounded to three decimal places
print(f'Slope of followers on bio word count: {slope:.3f}')

Slope of followers on bio word count: 18.520


14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated
Users login

In [None]:
df = df_repos.copy()

# Parse created_at so the day of the week can be read off
df['created_at'] = pd.to_datetime(df['created_at'])

# dayofweek numbers Monday as 0, so 5 and 6 are Saturday and Sunday
is_weekend = df['created_at'].dt.dayofweek.isin([5, 6])
weekend_df = df[is_weekend]

# Weekend-created repositories per user
user_counts = weekend_df['login'].value_counts()

# Five users with the most weekend-created repositories
top_users = user_counts.nlargest(5)

# Print their logins as one comma-separated line
top_users_login = ','.join(top_users.index)

print(top_users_login)

azu,suzuki-shunsuke,yuiseki,xuwei-k,zchee


15. Do people who are hireable share their email addresses more often?
[fraction of users with email when hireable=true] minus [fraction of users with email for the rest] (to 3 decimal places, e.g. 0.123 or -0.123)

In [None]:
df = df_users.copy()

# Flag users whose email is neither missing nor an empty string
email = df['email']
df['has_email'] = email.notna() & (email != '')

# Share of users with an email inside each hireable group
hireable = df['hireable']
fraction_hireable = df.loc[hireable, 'has_email'].mean()
fraction_non_hireable = df.loc[~hireable, 'has_email'].mean()

# Hireable share minus non-hireable share, to 3 decimals
email_fraction_difference = round(fraction_hireable - fraction_non_hireable, 3)

print(email_fraction_difference)

0.132


As the difference is minimal, the above statement cannot be justified; thus the answer is NaN.

16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)
Number of users with the most common surname

In [None]:
df = df_users.copy()

# Surname = last whitespace-separated word of each non-missing, trimmed name
names = df['name'].dropna()
df['surname'] = names.str.strip().str.split().str[-1]

# How many users carry each surname
surname_counts = df['surname'].value_counts()

# Every surname that reaches the highest count, in alphabetical order
most_common_count = surname_counts.max()
is_most_common = surname_counts == most_common_count
most_common_surnames = sorted(surname_counts.index[is_most_common])

# Number of users with the most common surname (0 when there are no surnames)
if most_common_surnames:
    count_most_common = surname_counts[most_common_surnames[0]]
else:
    count_most_common = 0

# Report the tied surnames and their shared count
most_common_surnames_str = ','.join(most_common_surnames)

print(f"Most common surname(s): {most_common_surnames_str}")
print(f"Number of users with the most common surname: {count_most_common}")

Most common surname(s): Kato,Tanaka
Number of users with the most common surname: 5
