In [2]:
import requests
import pandas as pd

# Replace with your GitHub Personal Access Token
token = ""  # Replace with your actual token
headers = {"Authorization": f"token {token}"}

# Function to fetch all users in Melbourne with over 100 followers
def fetch_all_users():
    all_users = []
    page = 1

    # Loop to paginate through users
    while True:
        url = f"https://api.github.com/search/users?q=location:Melbourne+followers:>100&per_page=100&page={page}"
        response = requests.get(url, headers=headers)

        # Check for rate limit
        if response.status_code == 403:
            print("Rate limit exceeded. Waiting for reset...")
            continue

        data = response.json()
        users = data.get('items', [])

        # Stop if no users are returned
        if not users:
            break

        all_users.extend(users)
        print(f"Fetched {len(users)} users from page {page}")
        page += 1

    return all_users

# Fetch all user data
all_users_data = fetch_all_users()
print(f"Total users fetched: {len(all_users_data)}")

# Extract detailed information for each user
user_details = []
for user in all_users_data:
    user_url = f"https://api.github.com/users/{user['login']}"
    response = requests.get(user_url, headers=headers)
    user_data = response.json()

    # Collect relevant fields
    user_details.append({
        'login': user_data.get('login', ''),
        'name': user_data.get('name', ''),
        'company': user_data.get('company', '').lstrip('@').strip().upper() if user_data.get('company') else '',
        'location': user_data.get('location', ''),
        'email': user_data.get('email', ''),
        'hireable': user_data.get('hireable', ''),
        'bio': user_data.get('bio', ''),
        'public_repos': user_data.get('public_repos', 0),
        'followers': user_data.get('followers', 0),
        'following': user_data.get('following', 0),
        'created_at': user_data.get('created_at', '')
    })

# Convert user data to DataFrame and save to CSV
users_df = pd.DataFrame(user_details)
users_df.to_csv("users.csv", index=False)
print("Saved user details to users.csv")


# Function to fetch repositories for each user with pagination
def get_user_repositories(user_logins):
    repo_details = []
    for login in user_logins:
        page = 1
        while True:
            repos_url = f"https://api.github.com/users/{login}/repos?per_page=100&page={page}"
            response = requests.get(repos_url, headers=headers)

            # Check for rate limit
            if response.status_code == 403:
                print("Rate limit exceeded. Waiting for reset...")
                continue

            repos = response.json()

            # Stop if no repos are returned
            if not repos:
                break

            for repo in repos:
                repo_details.append({
                    'login': login,
                    'full_name': repo.get('full_name', ''),
                    'created_at': repo.get('created_at', ''),
                    'stargazers_count': repo.get('stargazers_count', 0),
                    'watchers_count': repo.get('watchers_count', 0),
                    'language': repo.get('language', ''),
                    'has_projects': repo.get('has_projects', False),
                    'has_wiki': repo.get('has_wiki', False),
                    'license_name': repo.get('license', {}).get('key', '') if repo.get('license') else ''
                })

            print(f"Fetched page {page} of repos for user {login}")
            page += 1

    return repo_details

# Fetch repository data for each user in users.csv
user_logins = users_df['login'].tolist()
repo_details = get_user_repositories(user_logins)

# Convert repository data to DataFrame and save to CSV
repos_df = pd.DataFrame(repo_details)
repos_df.to_csv("repositories.csv", index=False)
print("Saved repository details to repositories.csv")


Fetched 100 users from page 1
Fetched 100 users from page 2
Fetched 100 users from page 3
Fetched 32 users from page 4
Total users fetched: 332
Saved user details to users.csv
Fetched page 1 of repos for user mosh-hamedani
Fetched page 1 of repos for user TheCherno
Fetched page 1 of repos for user haileys
Fetched page 2 of repos for user haileys
Fetched page 3 of repos for user haileys
Fetched page 4 of repos for user haileys
Fetched page 1 of repos for user rstacruz
Fetched page 2 of repos for user rstacruz
Fetched page 3 of repos for user rstacruz
Fetched page 4 of repos for user rstacruz
Fetched page 5 of repos for user rstacruz
Fetched page 6 of repos for user rstacruz
Fetched page 1 of repos for user jesseduffield
Fetched page 1 of repos for user basarat
Fetched page 2 of repos for user basarat
Fetched page 3 of repos for user basarat
Fetched page 4 of repos for user basarat
Fetched page 5 of repos for user basarat
Fetched page 1 of repos for user markdalgleish
Fetched page 2 of r

In [3]:
import pandas as pd
from scipy.stats import linregress

# Load the users and repositories data
users_df = pd.read_csv("users.csv")
repos_df = pd.read_csv("repositories.csv")


Ques 1


In [4]:
# Find the top 5 users by number of followers
top_5_users = users_df.nlargest(5, 'followers')['login'].tolist()
print("Top 5 users by followers:", ', '.join(top_5_users))


Top 5 users by followers: mosh-hamedani, TheCherno, haileys, rstacruz, jesseduffield


Ques 2

In [5]:
# Ensure 'created_at' is a datetime object
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Find the 5 earliest registered users
earliest_users = users_df.nsmallest(5, 'created_at')['login'].tolist()
print("Earliest users by registration date:", ', '.join(earliest_users))


Earliest users by registration date: toolmantim, crafterm, dgoodlad, Sutto, mdub


Ques 3

In [6]:
# Find the top 3 licenses by counting occurrences, ignoring missing licenses
top_licenses = repos_df['license_name'].dropna().value_counts().nlargest(3).index.tolist()
print("Top 3 licenses:", ', '.join(top_licenses))


Top 3 licenses: mit, other, apache-2.0


Ques 4

In [7]:
# Identify the most common company
most_common_company = users_df['company'].mode()[0]
print("Most common company:", most_common_company)


Most common company: MONASH UNIVERSITY


Ques 5

In [8]:
# Identify the most popular programming language
most_common_language = repos_df['language'].mode()[0]
print("Most popular language:", most_common_language)


Most popular language: JavaScript


Ques 6

In [9]:
# Filter users who joined after 2020
recent_users = users_df[users_df['created_at'] > '2020-01-01']

# Find repositories for recent users and identify the second most common language
recent_repos = repos_df[repos_df['login'].isin(recent_users['login'])]
language_counts = recent_repos['language'].value_counts()

# Check if there is a second most popular language
if len(language_counts) > 1:
    second_most_popular_language = language_counts.index[1]
    print("Second most popular language for users joined after 2020:", second_most_popular_language)
else:
    print("No second popular language available.")


Second most popular language for users joined after 2020: JavaScript


Ques 7

In [10]:
# Calculate average stars per language and find the language with the highest average
avg_stars_per_language = repos_df.groupby('language')['stargazers_count'].mean()
top_language_by_stars = avg_stars_per_language.idxmax()
print("Language with highest average stars:", top_language_by_stars)


Language with highest average stars: D


Ques 8

In [11]:
# Calculate leader_strength for each user
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Sort by leader_strength in descending order and get the top 5 users
top_5_leader_strength = users_df.nlargest(5, 'leader_strength')['login'].tolist()

# Print the top 5 users by leader_strength as a comma-separated list
print("Top 5 users by leader strength:", ', '.join(top_5_leader_strength))


Top 5 users by leader strength: mosh-hamedani, binarythistle, TheCherno, TuPayChain, rogerclarkmelbourne


Ques 9

In [12]:
# Calculate the correlation between followers and public repositories
followers_repos_corr = users_df['followers'].corr(users_df['public_repos'])
print(f"Correlation between followers and public repos: {followers_repos_corr:.3f}")


Correlation between followers and public repos: 0.188


Ques 10

In [13]:
# Linear regression of followers on public repos
slope, intercept, r_value, p_value, std_err = linregress(users_df['public_repos'], users_df['followers'])
print(f"Estimated followers per additional repo: {slope:.3f}")


Estimated followers per additional repo: 2.243


Ques 11

In [32]:
# Assuming repos_df contains repository information with 'has_projects' and 'has_wiki' columns

# Convert 'has_projects' and 'has_wiki' columns to integers for correlation calculation
repos_df['has_projects'] = repos_df['has_projects'].astype(int)
repos_df['has_wiki'] = repos_df['has_wiki'].astype(int)

# Calculate the correlation between 'has_projects' and 'has_wiki'
projects_wiki_corr = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Print the correlation, rounded to 3 decimal places
print(f"Correlation between projects and wiki enabled: {projects_wiki_corr:.3f}")


Correlation between projects and wiki enabled: 0.377


Ques 12

In [34]:
# Ensure 'hireable' has no NaN values by filling missing entries with False
users_df['hireable'] = users_df['hireable'].fillna(False)

# Calculate the average 'following' count for hireable users
hireable_avg_following = users_df[users_df['hireable'] == True]['following'].mean()

# Calculate the average 'following' count for non-hireable users
non_hireable_avg_following = users_df[users_df['hireable'] == False]['following'].mean()

# Calculate the difference between hireable and non-hireable average following counts
following_difference = hireable_avg_following - non_hireable_avg_following

# Print the result, rounded to 3 decimal places
print(f"Difference in following between hireable and non-hireable users: {following_difference:.3f}")


Difference in following between hireable and non-hireable users: 45.901


Ques 13

In [16]:
# Filter out users with empty bios
users_with_bios = users_df[users_df['bio'].notna()]
users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))

# Perform linear regression of followers on bio word count
slope, intercept, r_value, p_value, std_err = linregress(users_with_bios['bio_word_count'], users_with_bios['followers'])
print(f"Impact of bio length on followers: {slope:.3f} followers per word")


Impact of bio length on followers: 7.352 followers per word


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


Ques 14

In [17]:
# Convert 'created_at' to datetime to enable weekday extraction
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter for repos created on weekends (Saturday=5, Sunday=6)
weekend_repos = repos_df[repos_df['created_at'].dt.weekday >= 5]

# Count repositories created on weekends by each user and get top 5
top_weekend_creators = weekend_repos['login'].value_counts().nlargest(5).index.tolist()
print("Top 5 users who created most repos on weekends:", ', '.join(top_weekend_creators))


Top 5 users who created most repos on weekends: roachhd, wolfeidau, karkranikhil, rstacruz, plutext


Ques 15

In [35]:
# Calculate the fraction of hireable users who have an email
hireable_with_email_fraction = users_df[(users_df['hireable'] == True) & (users_df['email'].notna())].shape[0] / users_df[users_df['hireable'] == True].shape[0]

# Calculate the fraction of non-hireable users who have an email
non_hireable_with_email_fraction = users_df[(users_df['hireable'] == False) & (users_df['email'].notna())].shape[0] / users_df[users_df['hireable'] == False].shape[0]

# Calculate the difference
email_fraction_difference = hireable_with_email_fraction - non_hireable_with_email_fraction

# Print the result, rounded to 3 decimal places
print(f"Fraction difference (hireable vs non-hireable) sharing emails: {email_fraction_difference:.3f}")


Fraction difference (hireable vs non-hireable) sharing emails: -0.048


In [36]:
# Ensure 'name' column has no NaN values by replacing them with an empty string
users_df['name'] = users_df['name'].fillna('')

# Extract the last word in the 'name' column as the surname, handling empty and whitespace-only names
users_df['surname'] = users_df['name'].apply(lambda x: x.strip().split()[-1] if x.strip() else '')

# Count occurrences of each surname
surname_counts = users_df['surname'].value_counts()

# Find the maximum occurrence count
most_common_count = surname_counts.max()

# Get all surnames with this maximum count (to handle ties)
most_common_surnames = surname_counts[surname_counts == most_common_count].index.tolist()

# Print the most common surname(s), sorted alphabetically
print("Most common surname(s):", ', '.join(sorted(most_common_surnames)))


Most common surname(s): 
