In [1]:
import requests
import pandas as pd
import time
import re

GITHUB_TOKEN = "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"
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

# Function to get users from Tokyo with over 200 followers
def get_users_in_tokyo(min_followers=200):
    url = f"https://api.github.com/search/users?q=location:tokyo+followers:>{min_followers}&per_page=100"
    response = requests.get(url, headers=HEADERS)
    if response.status_code == 200:
        return response.json().get('items', [])
    else:
        print("Error fetching users:", response.status_code, response.text)
        return []

# Function to get detailed information about a user
def get_user_details(username):
    url = f"https://api.github.com/users/{username}"
    response = requests.get(url, headers=HEADERS)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching user details for {username}: {response.status_code}")
        return None

# Function to clean company names
def clean_company_name(company):
    if company:
        company = company.lstrip('@').strip().upper()
        company = re.sub(r"\s+", " ", company)  # Remove multiple spaces
    return company if company else ""

# Function to clean names
def clean_name(name):
    if name:
        name = re.sub(r"\s+", " ", name).strip()  # Remove multiple spaces and trim
    return name if name else ""

# Function to clean bio
def clean_bio(bio):
    if bio:
        bio = re.sub(r"\s+", " ", bio).strip()  # Remove multiple spaces and trim
    return bio if bio else ""

# Function to get up to 500 of a user's repositories
def get_user_repos(username, max_repos=500):
    url = f"https://api.github.com/users/{username}/repos"
    params = {"per_page": 100, "sort": "updated"}
    repos = []
    while url and len(repos) < max_repos:
        response = requests.get(url, headers=HEADERS, params=params)
        if response.status_code == 200:
            repos.extend(response.json())
            url = response.links.get('next', {}).get('url')  # Move to the next page if available
        else:
            print(f"Error fetching repos for {username}: {response.status_code}")
            break
        time.sleep(1)  # Rate limit control
    return repos[:max_repos]

# Data storage
users_data = []
repos_data = []

# Get users from Tokyo with over 200 followers
users = get_users_in_tokyo()
print(f"Found {len(users)} users in Tokyo with over 200 followers.")

# Process each user
for user in users:
    username = user['login']
    
    # Get detailed user info
    user_details = get_user_details(username)
    if user_details:
        # Clean and process user data
        user_data = {
            "login": user_details.get("login", ""),
            "name": clean_name(user_details.get("name", "")),
            "company": clean_company_name(user_details.get("company", "")),
            "location": user_details.get("location", ""),
            "email": user_details.get("email", ""),
            "hireable": str(user_details.get("hireable", "")),
            "bio": clean_bio(user_details.get("bio", "")),
            "public_repos": user_details.get("public_repos", 0),
            "followers": user_details.get("followers", 0),
            "following": user_details.get("following", 0),
            "created_at": user_details.get("created_at", "")
        }
        users_data.append(user_data)
        
        # Get repositories for the user
        user_repos = get_user_repos(username)
        for repo in user_repos:
            # Process repository data
            repo_data = {
                "login": username,
                "full_name": repo.get("full_name", ""),
                "created_at": repo.get("created_at", ""),
                "stargazers_count": repo.get("stargazers_count", 0),
                "watchers_count": repo.get("watchers_count", 0),
                "language": repo.get("language", ""),
                "has_projects": str(repo.get("has_projects", False)).lower(),
                "has_wiki": str(repo.get("has_wiki", False)).lower(),
                "license_name": repo.get("license", {}).get("name", "") if repo.get("license") else ""
            }
            repos_data.append(repo_data)
    
    # Rate limit handling
    time.sleep(1)

# Convert lists to DataFrames
users_df = pd.DataFrame(users_data)
repos_df = pd.DataFrame(repos_data)

# Save to CSV
users_df.to_csv("users.csv", index=False)
repos_df.to_csv("repositories.csv", index=False)

print("Data saved to users.csv and repositories.csv.")


Found 100 users in Tokyo with over 200 followers.
Data saved to users.csv and repositories.csv.


In [4]:
import pandas as pd

users_df = pd.read_csv('users.csv')
repos_df = pd.read_csv('repositories.csv')

# Convert 'created_at' column to datetime format
users_df['created_at'] = pd.to_datetime(users_df['created_at'], errors='coerce')

# 1. Top 5 users in Tokyo with the highest number of followers
top_5_followers = users_df.nlargest(5, 'followers')['login'].tolist()
print("Top 5 users in Tokyo by followers:", ', '.join(top_5_followers))

# 2. 5 earliest registered GitHub users in Tokyo
earliest_users = users_df.nsmallest(5, 'created_at')['login'].tolist()
print("Earliest registered users in Tokyo:", ', '.join(earliest_users))

# 3. 3 most popular licenses among these users
popular_licenses = repos_df['license_name'].dropna().value_counts().nlargest(3).index.tolist()
print("3 most popular licenses:", ', '.join(popular_licenses))

# 4. Company with the majority of developers
most_common_company = users_df['company'].value_counts().idxmax()
print("Company with majority of developers:", most_common_company)

# 5. Most popular programming language
most_popular_language = repos_df['language'].dropna().value_counts().idxmax()
print("Most popular programming language:", most_popular_language)

# 6. Second most popular programming language among users who joined after 2020
users_after_2020 = users_df[users_df['created_at'] > '2020-01-01']
second_popular_language = repos_df[repos_df['login'].isin(users_after_2020['login'])]['language'].dropna().value_counts().nlargest(2).index.tolist()[-1]
print("Second most popular programming language (after 2020):", second_popular_language)

# 7. Language with the highest average number of stars per repository
avg_stars_per_language = repos_df.groupby('language')['stargazers_count'].mean().idxmax()
print("Language with highest average stars per repository:", avg_stars_per_language)

# 8. Top 5 users in terms of leader_strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_5_leader_strength = users_df.nlargest(5, 'leader_strength')['login'].tolist()
print("Top 5 users by leader strength:", ', '.join(top_5_leader_strength))

# 9. Correlation between followers and public repositories
correlation_followers_repos = users_df['followers'].corr(users_df['public_repos'])
print("Correlation between followers and public repositories:", round(correlation_followers_repos, 3))

# 10. Regression slope of followers on repos
import statsmodels.api as sm

X = users_df['public_repos']
y = users_df['followers']
X = sm.add_constant(X)  # Add a constant term to the predictor
model = sm.OLS(y, X).fit()
slope_followers_repos = model.params[1]  # slope for public_repos
print("Slope of followers on public repositories:", round(slope_followers_repos, 3))

# 11. Correlation between having projects and having wiki enabled
correlation_projects_wiki = repos_df['has_projects'].astype(int).corr(repos_df['has_wiki'].astype(int))
print("Correlation between projects and wiki enabled:", round(correlation_projects_wiki, 3))

# 12. Average following per user for hireable true vs false
avg_following_hireable = users_df[users_df['hireable'] == 'true']['following'].mean()
avg_following_not_hireable = users_df[users_df['hireable'] != 'true']['following'].mean()
difference_hireable = avg_following_hireable - avg_following_not_hireable
print("Difference in average following (hireable vs not):", round(difference_hireable, 3))

# 13. Correlation of bio length with followers
users_df['bio_length'] = users_df['bio'].apply(lambda x: len(str(x).split()) if pd.notnull(x) else 0)
correlation_bio_followers = users_df['bio_length'].corr(users_df['followers'])
print("Correlation of bio length with followers:", round(correlation_bio_followers, 3))

# 14. Top 5 users who created the most repositories on weekends (UTC)
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
repos_df['is_weekend'] = repos_df['created_at'].dt.dayofweek >= 5
weekend_repos = repos_df[repos_df['is_weekend']]
top_5_weekend_users = weekend_repos['login'].value_counts().nlargest(5).index.tolist()
print("Top 5 users with most weekend repositories:", ', '.join(top_5_weekend_users))

# 15. Do people who are hireable share their email addresses more often?
fraction_hireable = users_df[users_df['hireable'] == 'true']['email'].notnull().mean()
fraction_not_hireable = users_df[users_df['hireable'] != 'true']['email'].notnull().mean()
email_difference = fraction_hireable - fraction_not_hireable
print("Difference in email sharing (hireable vs not):", round(email_difference, 3))

# 16. Most common surname
users_df['surname'] = users_df['name'].apply(lambda x: str(x).strip().split()[-1] if pd.notnull(x) else '')
common_surnames = users_df['surname'].value_counts().nlargest(1).index.tolist()
common_surname_count = users_df['surname'].value_counts().max()
print("Most common surname(s):", ', '.join(common_surnames), "Count:", common_surname_count)


Top 5 users in Tokyo by followers: dennybritz, wasabeef, dai-shi, rui314, domenic
Earliest registered users in Tokyo: kana, naoya, ko1, amatsuda, masuidrive
3 most popular licenses: MIT License, Apache License 2.0, Other
Company with majority of developers: MERCARI
Most popular programming language: JavaScript
Second most popular programming language (after 2020): JavaScript
Language with highest average stars per repository: Assembly
Top 5 users by leader strength: blueimp, dai-shi, asahilina, pilcrowonpaper, marcan
Correlation between followers and public repositories: -0.131
Slope of followers on public repositories: -0.733
Correlation between projects and wiki enabled: 0.497
Difference in average following (hireable vs not): nan
Correlation of bio length with followers: 0.141
Top 5 users with most weekend repositories: azu, xuwei-k, rhysd, lambdalisue, making
Difference in email sharing (hireable vs not): nan
Most common surname(s):  Count: 3


  slope_followers_repos = model.params[1]  # slope for public_repos


In [7]:
#Updated code for question 12 with data processing and cleaning
import pandas as pd

# Load the CSV files into DataFrames
users_df = pd.read_csv('users.csv')

# Clean the 'hireable' column
# Replace NaN with False (assuming NaN means not hireable)
users_df['hireable'] = users_df['hireable'].fillna(False)

# Ensure that the hireable column is boolean
users_df['hireable'] = users_df['hireable'].astype(bool)

# Count hireable and non-hireable users
hireable_users_count = users_df[users_df['hireable'] == True].shape[0]
non_hireable_users_count = users_df[users_df['hireable'] == False].shape[0]

# Calculate average following for hireable users
avg_following_hireable = users_df[users_df['hireable'] == True]['following'].mean()

# Calculate average following for non-hireable users
avg_following_not_hireable = users_df[users_df['hireable'] == False]['following'].mean()

# Check if either average is NaN and handle it
if pd.isna(avg_following_hireable):
    avg_following_hireable = 0  # Assign a default value if needed

if pd.isna(avg_following_not_hireable):
    avg_following_not_hireable = 0  # Assign a default value if needed

# Calculate the difference
difference_hireable = avg_following_hireable - avg_following_not_hireable

# Output the result
print("Hireable users count:", hireable_users_count)
print("Non-hireable users count:", non_hireable_users_count)
print("Difference in average following (hireable vs not):", round(difference_hireable, 3))


Hireable users count: 37
Non-hireable users count: 63
Difference in average following (hireable vs not): -767.787


  users_df['hireable'] = users_df['hireable'].fillna(False)


In [6]:
#Updated code for question 15 with data processing and cleaning
import pandas as pd

# Load the CSV files into DataFrames
users_df = pd.read_csv('users.csv')

# Clean the 'hireable' column
# Replace NaN with False (assuming NaN means not hireable)
users_df['hireable'] = users_df['hireable'].fillna(False)

# Ensure that the hireable column is boolean
users_df['hireable'] = users_df['hireable'].astype(bool)

# Calculate the fraction of users with email addresses for hireable users
fraction_hireable = users_df[users_df['hireable'] == True]['email'].notnull().mean()

# Calculate the fraction of users with email addresses for non-hireable users
fraction_not_hireable = users_df[users_df['hireable'] == False]['email'].notnull().mean()

# Check if either fraction is NaN and handle it
if pd.isna(fraction_hireable):
    fraction_hireable = 0  # Assign a default value if needed

if pd.isna(fraction_not_hireable):
    fraction_not_hireable = 0  # Assign a default value if needed

# Calculate the difference in fractions
email_difference = fraction_hireable - fraction_not_hireable

# Output the result
print("Fraction of hireable users with email:", round(fraction_hireable, 3))
print("Fraction of non-hireable users with email:", round(fraction_not_hireable, 3))
print("Difference in email sharing (hireable vs not):", round(email_difference, 3))


Fraction of hireable users with email: 0.541
Fraction of non-hireable users with email: 0.508
Difference in email sharing (hireable vs not): 0.033


  users_df['hireable'] = users_df['hireable'].fillna(False)
