In [1]:
import requests
import time
import csv
from IPython.display import clear_output, display
import pandas as pd

In [47]:
# Initialize request counter
request_count = 0

In [48]:
base_url = "https://api.github.com/search/users"
token = "HIDDEN"

headers={
    "Accept": "application/vnd.github+json",
    "Authorization": f"Bearer {token}",
    "X-GitHub-Api-Version": "2022-11-28"
}

In [49]:
def get_all_users_in_location(location, min_followers=100):

    clear_output(wait=True)
    display(f"Getting all users")
    
    global request_count

    users = []
    page = 1
    per_page = 100  # Maximum allowed by Github
    
    while True:
        
        params = {
            "q": f"location:{location} followers:>{min_followers}",
            "per_page": per_page,
            "page": page
        }

        # Make the request
        response = requests.get(base_url, headers=headers, params=params)

        request_count += 1

        clear_output(wait=True)
        display(f"Requests made: {request_count}")        
        
        
        # Check for rate limit and handle if reached
        if (response.status_code == 403 or response.status_code == 429) and int(response.headers.get("X-RateLimit-Remaining", 1)) == 0:
            reset_time = int(response.headers.get("X-RateLimit-Reset"))
            wait_time = max(reset_time - time.time(), 0)
            print(f"Rate limit exceeded, sleeping for {wait_time} seconds.")
            time.sleep(wait_time)
            continue  # Retry after waiting
        
        if response.status_code != 200:
            print(f"Failed to retrieve data: {response.status_code}")
            break
        
        data = response.json()
        items = data.get("items", [])
        users.extend(items)
        
        # Check if there are more pages
        if len(items) < per_page:  # Last page might have fewer results
            break
        
        page += 1
        time.sleep(3)  # Avoid hitting the rate limit too quickly
    
    return users

In [50]:
def get_user_details(username):

    clear_output(wait=True)
    display(f"Getting details of {username}")

    global request_count
    
    url = f"https://api.github.com/users/{username}"
    response = requests.get(url, headers=headers)
    
    request_count += 1

    clear_output(wait=True)
    display(f"Requests made: {request_count}")
    
    # Handle rate limit
    if (response.status_code == 403 or response.status_code == 429) and int(response.headers.get("X-RateLimit-Remaining", 1)) == 0:
        reset_time = int(response.headers.get("X-RateLimit-Reset"))
        wait_time = max(reset_time - time.time(), 0)
        print(f"Rate limit exceeded, sleeping for {wait_time} seconds.")
        time.sleep(wait_time)
        return get_user_details(username)  # Retry after waiting
    
    if response.status_code != 200:
        print(f"Failed to retrieve data for {username}: {response.status_code}")
        return None
    
    return response.json()

In [51]:
def clean_company_name(company):
    # Trim whitespace, Strip leading @, convert to uppercase
    if company:
        return company.strip().lstrip('@').upper()
    return None

In [52]:
def fetch_user_details_and_save(users, filename):
    
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Header
        writer.writerow(["login", "name", "company", "location", "email", "hireable", "bio", 
                         "public_repos", "followers", "following", "created_at"])
        
        for user in users:
            username = user["login"]
            user_details = get_user_details(username)
            if user_details:
                
                row = [
                    user_details.get("login"),
                    user_details.get("name"),
                    clean_company_name(user_details.get("company")),
                    user_details.get("location"),
                    user_details.get("email"),
                    user_details.get("hireable"),
                    user_details.get("bio"),
                    user_details.get("public_repos"),
                    user_details.get("followers"),
                    user_details.get("following"),
                    user_details.get("created_at")
                ]
                writer.writerow(row)
                time.sleep(2.5)

In [65]:
# Get 500 most recently pushed repositories for a given username

def get_repo_details(username):

    clear_output(wait=True)
    display(f"Getting Repo details of {username}")
    
    global request_count
    
    repos = []
    page = 1
    per_page = 100  # Max per page

    while len(repos) < 500:

        url = f"https://api.github.com/users/{username}/repos"
        
        params = {
            "sort": "pushed",  # Sort by the last pushed date
            "direction": "desc",  # Get the most recent first
            "per_page": per_page,
            "page": page
        }

        response = requests.get(url, headers=headers, params=params)

        request_count += 1

        clear_output(wait=True)
        display(f"Requests made: {request_count}")

        # Check for rate limit and handle if reached
        if (response.status_code == 403 or response.status_code == 429) and int(response.headers.get("X-RateLimit-Remaining", 1)) == 0:
            reset_time = int(response.headers.get("X-RateLimit-Reset"))
            wait_time = max(reset_time - time.time(), 0)
            print(f"Rate limit exceeded, sleeping for {wait_time} seconds.")
            time.sleep(wait_time)
            continue  # Retry after waiting
        
        if response.status_code != 200:
            print(f"Failed to retrieve data: {response.status_code}")
            break
        
        data = response.json()
        repos.extend(data)

        if len(data) < per_page:
            break  # No more repositories to fetch
        
        page += 1
        time.sleep(3)  # Avoid hitting the rate limit too quickly

    return repos[:500]  # Return only the first 500 repositories


In [75]:
def fetch_repo_details_and_save(users, filename):
    
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Header
        writer.writerow(["login", "full_name", "created_at", "stargazers_count", "watchers_count", "language", "has_projects", "has_wiki", "license_name"])
        
        for user in users:
            username = user["login"]
            repo_details = get_repo_details(username)
            for repo in repo_details:
                row = [
                    username,
                    repo.get("full_name"),
                    repo.get("created_at"),
                    repo.get("stargazers_count"),
                    repo.get("watchers_count"),
                    repo.get("language"),
                    repo.get("has_projects"),
                    repo.get("has_wiki"),
                    repo.get("license").get("key") if repo.get("license") else None
                ]
                writer.writerow(row)
            time.sleep(2.5)

In [76]:
# Main execution
location = "Stockholm"
min_followers = 100


all_users = get_all_users_in_location(location, min_followers)

# Fetch user details and save 
fetch_user_details_and_save(all_users, 'users.csv')

# Fetch repo details and save 
fetch_repo_details_and_save(all_users, 'repositories.csv')

clear_output(wait=True)
display(f"Total requests made: {request_count}")
display("User data has been saved to users.csv.")
display("Repository data has been saved to repositories.csv")

'Total requests made: 994'

'User data has been saved to users.csv.'

'Repository data has been saved to repositories.csv'

In [2]:
users_df = pd.read_csv('users.csv')
repositories_df = pd.read_csv('repositories.csv')

**Q1**

In [81]:
top_5_users = users_df.sort_values(by='followers', ascending=False).head(5)
top_5_logins = ','.join(top_5_users['login'].tolist())

print(top_5_logins)

emmabostian,emilk,mpj,hrydgard,eriklindernoren


**Q2**

In [82]:
earliest_users = users_df.sort_values(by='created_at', ascending=True).head(5)
earliest_logins = ','.join(earliest_users['login'].tolist())

print(earliest_logins)

Mange,kallepersson,fesplugas,etnt,pirelenito


**Q3**

In [83]:
# Filter out rows with missing license names and count the occurrences
popular_licenses = (
    repositories_df['license_name']
    .dropna()  # Remove missing values
    .value_counts()  # Count occurrences of each license
    .head(3)  # Select the top 3 most common licenses
)

top_3_licenses = ','.join(popular_licenses.index.tolist())

print(top_3_licenses)

mit,apache-2.0,other


**Q4**

In [84]:
most_common_company = users_df['company'].value_counts().idxmax()

print(most_common_company)

SPOTIFY


**Q5**

In [85]:
most_common_language = repositories_df['language'].dropna().value_counts().idxmax()

print(most_common_language)

JavaScript


**Q6**

In [90]:
users_after_2020 = users_df[users_df['created_at'] > '2020-01-01']

merged_df = pd.merge(users_after_2020, repositories_df, on='login')

language_counts = merged_df['language'].value_counts()

second_most_common_language = language_counts.index[1]

print(second_most_common_language)

TypeScript


**Q7**

In [91]:
average_stars_per_language = repositories_df.groupby('language')['stargazers_count'].mean()

highest_average_language = average_stars_per_language.idxmax()

print(highest_average_language)

RAML


**Q8**

In [92]:
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

top_5_leader_strength = users_df.sort_values(by='leader_strength', ascending=False).head(5)

top_5_logins = ','.join(top_5_leader_strength['login'].tolist())

print(top_5_logins)

spotify,Mojang,fornwall,joearms,EmbarkStudios


**Q9**

In [93]:
correlation = users_df['followers'].corr(users_df['public_repos'])

print(correlation)

0.033207482463390205


**Q10**

In [96]:
import statsmodels.api as sm

# Step 1: Define the independent (X) and dependent (y) variables
X = users_df['public_repos']  # Independent variable (number of public repositories)
y = users_df['followers']      # Dependent variable (number of followers)

# Step 2: Add a constant to the independent variable
X = sm.add_constant(X)

# Step 3: Fit the regression model
model = sm.OLS(y, X).fit()

# Step 4: Get the slope (coefficient) for public_repos
slope = model.params['public_repos']

print(f"The regression slope of followers on public repositories is: {slope}")


The regression slope of followers on public repositories is: 0.2169532566541143


**Q11**

In [101]:
correlation = repositories_df['has_projects'].corr(repositories_df['has_wiki'])

print(f"The correlation between projects enabled and wiki enabled is: {correlation}")

The correlation between projects enabled and wiki enabled is: 0.3747562458571214


In [102]:
# Step 1: Create a cross-tabulation of the two boolean columns
crosstab = pd.crosstab(repositories_df['has_projects'], repositories_df['has_wiki'], rownames=['Projects Enabled'], colnames=['Wiki Enabled'])

# Step 2: Calculate the correlation from the crosstab
print("Crosstab of Projects Enabled and Wiki Enabled:")
print(crosstab)

# If you want to calculate the correlation in another way:
# Calculate proportions
proportions = crosstab.div(crosstab.sum(axis=1), axis=0)

# Correlation of proportions
correlation = proportions.loc[True].corr(proportions.loc[False])

print(f"The correlation between projects enabled and wiki enabled based on proportions is: {correlation}")

Crosstab of Projects Enabled and Wiki Enabled:
Wiki Enabled      False  True 
Projects Enabled              
False              1082     49
True               5084  29128
The correlation between projects enabled and wiki enabled based on proportions is: -1.0


**Q12**

In [106]:
average_followers_hireable = users_df[users_df['hireable'] == True]['followers'].mean()

average_followers_non_hireable = users_df[users_df['hireable'] == False]['followers'].mean()

average_difference = average_followers_hireable - average_followers_non_hireable

print(f"Average followers for hireable users: {average_followers_hireable}")
print(f"Average followers for non-hireable users: {average_followers_non_hireable}")
print(f"Difference: {average_difference}")

Average followers for hireable users: 340.3838383838384
Average followers for non-hireable users: nan
Difference: nan


**Q13**

In [4]:
import statsmodels.api as sm

# Filter out users without bios
users_df = users_df[users_df['bio'].notnull()]

# Calculate the length of each bio in words
users_df['bio_word_count'] = users_df['bio'].str.split().str.len()

# Prepare the independent variable (X) and dependent variable (y)
X = users_df['bio_word_count']
y = users_df['followers'] # Adjust the column name as per your dataset

# Add a constant to the independent variable (for the intercept)
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the slope (coefficient of the bio_word_count)
slope = model.params['bio_word_count']

# Print the regression slope rounded to three decimal places
print(f"\nRegression slope of followers on bio word count: {slope:.3f}")


Regression slope of followers on bio word count: 6.554


**Q14**

In [111]:
# Step 1: Convert 'created_at' to datetime if it's not already
repositories_df['created_at'] = pd.to_datetime(repositories_df['created_at'])

# Step 2: Filter for weekend entries (Saturday = 5, Sunday = 6)
repositories_df['week_day'] = repositories_df['created_at'].dt.dayofweek
weekend_repos = repositories_df[repositories_df['week_day'].isin([5, 6])]

# Step 3: Group by user login and count the number of repositories
top_users = weekend_repos.groupby('login').size().reset_index(name='repo_count')

# Step 4: Sort the users by the number of repositories in descending order
top_users_sorted = top_users.sort_values(by='repo_count', ascending=False)

top_5_users = top_users_sorted.head(5)

top_5_logins = ','.join(top_5_users['login'])

print(f"Top 5 users who created the most repositories on weekends: {top_5_logins}")


Top 5 users who created the most repositories on weekends: HaraldNordgren,Nyholm,lydell,linhduongtuan,LinusU


**Q15**

In [112]:
hireable_users = users_df[users_df['hireable'] == True]
fraction_hireable_with_email = hireable_users['email'].notna().mean()

non_hireable_users = users_df[users_df['hireable'] == False]
fraction_non_hireable_with_email = non_hireable_users['email'].notna().mean()

fraction_difference = fraction_hireable_with_email - fraction_non_hireable_with_email

print(fraction_difference)

nan


**Q16**

In [115]:
# Step 1: Filter out missing names and trim whitespace
valid_names = users_df['name'].dropna().str.strip()

# Step 2: Extract surnames (last word in the name)
surnames = valid_names.str.split().str[-1]  # Get the last word (surname)

# Step 3: Count occurrences of each surname
surname_counts = surnames.value_counts()

# Step 4: Identify the most common surname(s)
most_common_surname_count = surname_counts.max()  # Get the highest count
most_common_surnames = surname_counts[surname_counts == most_common_surname_count].index.tolist()  # Get all surnames with the highest count

# Step 5: Sort surnames alphabetically
most_common_surnames.sort()

# Prepare output
common_surnames_str = ','.join(most_common_surnames)
print(f"Most common surname(s): {common_surnames_str}")


Most common surname(s): Gustafsson,Persson
