In [None]:
import requests
import pandas as pd
import time
import logging
from typing import List, Dict, Any

class GitHubScraper:
    def __init__(self, token: str):
        """
        Initialize the GitHub scraper with your API token.

        Args:
            token (str): GitHub Personal Access Token
        """
        self.headers = {
            'Authorization': f'token {token}',
            'Accept': 'application/vnd.github.v3+json'
        }
        self.base_url = 'https://api.github.com'

        # Setup logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def _make_request(self, url: str, params: dict = None) -> Dict:
        """
        Make a request to the GitHub API with rate limit handling.
        """
        while True:
            response = requests.get(url, headers=self.headers, params=params)

            if response.status_code == 200:
                return response.json()
            elif response.status_code == 403:
                reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
                sleep_time = max(reset_time - time.time(), 0) + 1
                self.logger.warning(f"Rate limit hit. Sleeping for {sleep_time} seconds")
                time.sleep(sleep_time)
            else:
                self.logger.error(f"Error {response.status_code}: {response.text}")
                response.raise_for_status()

    def clean_company_name(self, company: str) -> str:
        """
        Clean up company names according to specifications.
        """
        if not company:
            return ""

        # Strip whitespace and @ symbol
        cleaned = company.strip().lstrip('@')

        # Convert to uppercase
        return cleaned.upper()

    def search_users(self, location: str, min_followers: int) -> List[Dict]:
        """
        Search for GitHub users in a specific location with minimum followers.
        """
        users = []
        page = 1

        while True:
            self.logger.info(f"Fetching users page {page}")

            query = f"location:{location} followers:>={min_followers}"
            params = {
                'q': query,
                'per_page': 100,
                'page': page
            }

            url = f"{self.base_url}/search/users"
            response = self._make_request(url, params)

            if not response['items']:
                break

            for user in response['items']:
                user_data = self._make_request(user['url'])

                # Extract only the required fields with exact matching names
                cleaned_data = {
                    'login': user_data['login'],
                    'name': user_data['name'] if user_data['name'] else "",
                    'company': self.clean_company_name(user_data.get('company')),
                    'location': user_data['location'] if user_data['location'] else "",
                    'email': user_data['email'] if user_data['email'] else "",
                    'hireable': user_data['hireable'] if user_data['hireable'] is not None else False,
                    'bio': user_data['bio'] if user_data['bio'] else "",
                    'public_repos': user_data['public_repos'],
                    'followers': user_data['followers'],
                    'following': user_data['following'],
                    'created_at': user_data['created_at']
                }

                users.append(cleaned_data)

            page += 1

        return users

    def get_user_repositories(self, username: str, max_repos: int = 500) -> List[Dict]:
        """
        Get repositories for a specific user.
        """
        repos = []
        page = 1

        while len(repos) < max_repos:
            self.logger.info(f"Fetching repositories for {username}, page {page}")

            params = {
                'sort': 'pushed',
                'direction': 'desc',
                'per_page': 100,
                'page': page
            }

            url = f"{self.base_url}/users/{username}/repos"
            response = self._make_request(url, params)

            if not response:
                break

            for repo in response:
                # Extract only the required fields with exact matching names
                repo_data = {
                    'login': username,  # Adding owner's login as required
                    'full_name': repo['full_name'],
                    'created_at': repo['created_at'],
                    'stargazers_count': repo['stargazers_count'],
                    'watchers_count': repo['watchers_count'],
                    'language': repo['language'] if repo['language'] else "",
                    'has_projects': repo['has_projects'],
                    'has_wiki': repo['has_wiki'],
                    'license_name': repo['license']['key'] if repo.get('license') else ""
                }

                repos.append(repo_data)

            if len(response) < 100:
                break

            page += 1

        return repos[:max_repos]

def main():
    # Get GitHub token
    token = input("Enter your GitHub token: ").strip()
    if not token:
        print("Token is required. Exiting...")
        return

    # Initialize scraper
    scraper = GitHubScraper(token)

    # Search for users in Basel with >10 followers
    users = scraper.search_users(location='Basel', min_followers=10)

    # Save users to CSV
    users_df = pd.DataFrame(users)
    users_df.to_csv('users.csv', index=False)

    # Get repositories for each user
    all_repos = []
    for user in users:
        repos = scraper.get_user_repositories(user['login'])
        all_repos.extend(repos)

    # Save repositories to CSV
    repos_df = pd.DataFrame(all_repos)
    repos_df.to_csv('repositories.csv', index=False)

    print(f"Scraped {len(users)} users and {len(all_repos)} repositories")

    # Create README.md
    with open('README.md', 'w') as f:
        f.write(f"""# GitHub Users in Basel

This repository contains data about GitHub users in Basel with over 10 followers and their repositories.

## Files

1. `users.csv`: Contains information about {len(users)} GitHub users in Basel with over 10 followers
2. `repositories.csv`: Contains information about {len(all_repos)} public repositories from these users
3. `gitscrap.py`: Python script used to collect this data

## Data Collection

- Data collected using GitHub API
- Date of collection: {time.strftime('%Y-%m-%d')}
- Only included users with 100+ followers
- Up to 500 most recently pushed repositories per user
""")

if __name__ == "__main__":
    main()

Enter your GitHub token: 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
Scraped 375 users and 14301 repositories


In [None]:
import requests
import csv

GITHUB_TOKEN = "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"
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

def get_users_in_basel():
    users = []
    query = "location:Basel+followers:>10"
    page = 1
    per_page = 100
    total_users = 0

    while True:
        url = f"https://api.github.com/search/users?q={query}&per_page={per_page}&page={page}"
        response = requests.get(url, headers=HEADERS)
        print(f"Fetching page {page}...")

        if response.status_code != 200:
            print("Error fetching data:", response.json())
            break

        data = response.json()
        users.extend(data['items'])
        total_users += len(data['items'])

        if len(data['items']) < per_page:
            break

        page += 1

    detailed_users = []
    for user in users:
        user_info = get_user_details(user['login'])
        detailed_users.append(user_info)

    return detailed_users

def get_user_details(username):
    user_url = f"https://api.github.com/users/{username}"
    user_data = requests.get(user_url, headers=HEADERS).json()

    return {
        'login': user_data['login'],
        'name': user_data['name'],
        'company': clean_company_name(user_data['company']),
        'location': user_data['location'],
        'email': user_data['email'],
        'hireable': user_data['hireable'],
        'bio': user_data['bio'],
        'public_repos': user_data['public_repos'],
        'followers': user_data['followers'],
        'following': user_data['following'],
        'created_at': user_data['created_at'],
    }

def clean_company_name(company):
    if company:
        company = company.strip().upper()
        if company.startswith('@'):
            company = company[1:]
    return company

def get_user_repos(username):
    repos_url = f"https://api.github.com/users/{username}/repos?per_page=500"
    response = requests.get(repos_url, headers=HEADERS)
    repos_data = response.json()

    repos = []
    for repo in repos_data:
        repos.append({
            'login': username,
            'full_name': repo['full_name'],
            'created_at': repo['created_at'],
            'stargazers_count': repo['stargazers_count'],
            'watchers_count': repo['watchers_count'],
            'language': repo['language'],
            'has_projects': repo['has_projects'],
            'has_wiki': repo['has_wiki'],
            'license_name': repo['license']['key'] if repo['license'] else None,
        })

    return repos

def save_users_to_csv(users):
    with open('users.csv', mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['login', 'name', 'company', 'location', 'email', 'hireable', 'bio', 'public_repos', 'followers', 'following', 'created_at'])
        writer.writeheader()
        writer.writerows(users)

def save_repos_to_csv(repos):
    with open('repositories.csv', mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['login', 'full_name', 'created_at', 'stargazers_count', 'watchers_count', 'language', 'has_projects', 'has_wiki', 'license_name'])
        writer.writeheader()
        writer.writerows(repos)

if __name__ == "__main__":
    users = get_users_in_basel()
    save_users_to_csv(users)

    all_repos = []
    for user in users:
        repos = get_user_repos(user['login'])
        all_repos.extend(repos)

    save_repos_to_csv(all_repos)
    print("Done")

Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Done


In [2]:
from google.colab import files

uploaded = files.upload()

Saving users.csv to users.csv


In [34]:
# Question 1:find Who are the top 5 users with the highest number of followers? List their login in order, comma-separated.

import pandas as pd

# Assuming 'users.csv' is in the current working directory
df_users = pd.read_csv('users.csv')

# Sort users by followers in descending order
df_sorted = df_users.sort_values('followers', ascending=False)

# Get the top 5 users
top_5_users = df_sorted.head(5)['login'].tolist()

# Print the logins in comma-separated format
print(','.join(top_5_users))

tarsius,aalmiray,marcoroth,klmr,MrNeRF


In [9]:
#2. Who are the 5 earliest registered GitHub users ? List their login in ascending order of created_at, comma-separated.

# Assuming 'users.csv' is in the current working directory
df_users = pd.read_csv('users.csv')

# Convert 'created_at' to datetime objects for proper sorting
df_users['created_at'] = pd.to_datetime(df_users['created_at'])

# Sort users by 'created_at' in ascending order
df_sorted = df_users.sort_values('created_at')

# Get the logins of the 5 earliest registered users
earliest_5_users = df_sorted.head(5)['login'].tolist()

# Print the logins in comma-separated format
print(','.join(earliest_5_users))

bennyzen,aalmiray,pvillega,tarsius,amaunz


In [10]:
# 3.What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

import pandas as pd

# Assuming 'repositories.csv' is in the current working directory
df_repos = pd.read_csv('repositories.csv')

# Remove rows with missing licenses
df_repos = df_repos.dropna(subset=['license_name'])

# Count the occurrences of each license
license_counts = df_repos['license_name'].value_counts()

# Get the top 3 most popular licenses
top_3_licenses = license_counts.head(3).index.tolist()

# Print the license names in comma-separated format
print(','.join(top_3_licenses))

mit,apache-2.0,other


In [11]:
# 4.use users.csv and find Which company do the majority of these developers work at?

# Assuming 'users.csv' is in the current working directory
df_users = pd.read_csv('users.csv')

# Count the occurrences of each company
company_counts = df_users['company'].value_counts()

# Get the company with the maximum count
most_frequent_company = company_counts.idxmax()

most_frequent_company

'ADOBE'

In [12]:
# 5.find Which programming language is most popular among these users?

# Assuming 'repositories.csv' is in the current working directory
df_repos = pd.read_csv('repositories.csv')

# Count the occurrences of each programming language
language_counts = df_repos['language'].value_counts()

# Get the most popular programming language
most_popular_language = language_counts.idxmax()

most_popular_language

'JavaScript'

In [16]:
# 6.find Which programming language is the second most popular among users who joined after 2020?
# users_after_2020 = users[users['created_at'] > '2020-01-01']
# users_after_2020.head()
# repos_2020 = repos[repos['login'].isin(users_after_2020['login'].tolist())]
# repos_2020['language'].value_counts().head()

# Assuming 'users.csv' and 'repositories.csv' are in the current working directory
df_users = pd.read_csv('users.csv')
df_repos = pd.read_csv('repositories.csv')

# Convert 'created_at' to datetime objects for proper filtering
df_users['created_at'] = pd.to_datetime(df_users['created_at'])

# Filter users who joined after 2020
users_after_2020 = df_users[df_users['created_at'] > '2020-01-01']

# Filter repositories for users who joined after 2020
repos_2020 = df_repos[df_repos['login'].isin(users_after_2020['login'].tolist())]

# Count the occurrences of each programming language among these users
language_counts = repos_2020['language'].value_counts()

# Get the second most popular programming language
second_most_popular_language = language_counts.index[1] if len(language_counts) > 1 else None

second_most_popular_language

'PHP'

In [35]:
# 7. Which language has the highest average number of stars per repository?

# Assuming 'repositories.csv' is in the current working directory
df_repos = pd.read_csv('repositories.csv')

# Group by language and calculate the average stargazers_count
language_avg_stars = df_repos.groupby('language')['stargazers_count'].mean()

# Find the language with the highest average stargazers_count
highest_avg_stars_language = language_avg_stars.idxmax()

highest_avg_stars_language

'PureScript'

In [36]:
#8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

# Assuming 'users.csv' is in the current working directory
df_users = pd.read_csv('users.csv')

# Calculate leader_strength
df_users['leader_strength'] = df_users['followers'] / (1 + df_users['following'])

# Sort users by leader_strength in descending order
df_sorted = df_users.sort_values('leader_strength', ascending=False)

# Get the top 5 users
top_5_users = df_sorted.head(5)['login'].tolist()

# Print the logins in comma-separated format
print(','.join(top_5_users))

dpryan79,wasserth,ravage84,elanmart,quadbiolab


In [37]:
#9. What is the correlation between the number of followers and the number of public repositories among users

# Assuming 'users.csv' is in the current working directory
df_users = pd.read_csv('users.csv')

# Calculate the correlation between followers and public repositories
correlation = df_users['followers'].corr(df_users['public_repos'])

correlation

0.34406396712642345

In [38]:
# 10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

import pandas as pd
from sklearn.linear_model import LinearRegression

# Assuming 'users.csv' is in the current working directory
df_users = pd.read_csv('users.csv')

# Create a linear regression model
model = LinearRegression()

# Fit the model using followers as the dependent variable and public_repos as the independent variable
X = df_users[['public_repos']]
y = df_users['followers']
model.fit(X, y)

# Get the coefficient of the independent variable (public_repos)
coefficient = model.coef_[0]

print(f"Estimated additional followers per additional public repository: {coefficient}")

Estimated additional followers per additional public repository: 0.6716655813586768


In [39]:
# 11.Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?
# Assuming 'repositories.csv' is in the current working directory
repos = pd.read_csv('repositories.csv')

if repos['has_projects'].dtype == 'object':
    repos['has_projects'] = repos['has_projects'].map({'true': True, 'false': False})
if repos['has_wiki'].dtype == 'object':
    repos['has_wiki'] = repos['has_wiki'].map({'true': True, 'false': False})

correlation = repos['has_projects'].corr(repos['has_wiki'])
print(round(correlation, 3))

0.312


In [40]:
users['hireable'] = users['hireable'].fillna(False).astype(bool)

In [41]:
# 12. Do hireable users follow more people than those who are not hireable?
# Average of following per user for hireable=true minus the average following for the rest

# Assuming 'users.csv' is in the current working directory
df_users = pd.read_csv('users.csv')

# Calculate the average following for hireable users
avg_following_hireable = df_users[df_users['hireable'] == True]['following'].mean()

# Calculate the average following for non-hireable users
avg_following_non_hireable = df_users[df_users['hireable'] == False]['following'].mean()

# Calculate the difference
difference = avg_following_hireable - avg_following_non_hireable

print(f"Difference in average following: {difference}")

Difference in average following: 46.94048144876325


In [54]:
# 13.Some developers write long bios. Does that help them get more followers? What's the correlation of the word count of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)

import pandas as pd
from sklearn.linear_model import LinearRegression

# Assuming 'users.csv' is in the current working directory
df_users = pd.read_csv('users.csv')

users_with_bio = df_users[(df_users['bio'].notna()) & (df_users['bio'] != '')].copy()
users_with_bio.loc[:, 'bio_len'] = users_with_bio['bio'].str.split().str.len()

X = users_with_bio['bio_len'].values.reshape(-1, 1)
y = users_with_bio['followers']

lr2 = LinearRegression()
lr2.fit(X, y)

lr2.coef_[0]

2.4652312189270567

In [56]:
# 14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

# Assuming 'repositories.csv' is in the current working directory
df_repos = pd.read_csv('repositories.csv')

# Convert 'created_at' to datetime objects
df_repos['created_at'] = pd.to_datetime(df_repos['created_at'])

# Extract the day of the week (0 = Monday, 6 = Sunday)
df_repos['day_of_week'] = df_repos['created_at'].dt.dayofweek

# Filter for weekend repositories (Saturday and Sunday)
weekend_repos = df_repos[df_repos['day_of_week'].isin([5, 6])]

# Count the number of repositories created by each user on weekends
user_weekend_repo_counts = weekend_repos.groupby('login')['full_name'].count()

# Sort users by the number of weekend repositories in descending order
top_5_users = user_weekend_repo_counts.sort_values(ascending=False).head(5).index.tolist()

# Print the top 5 users' logins, comma-separated
print(','.join(top_5_users))

marcossegovia,tbreuss,ioolkos,BaselHack,maysam


In [49]:
# 15. Do people who are hireable share their email addresses more often?
# [fraction of users with email when hireable=true] minus [fraction of users with email for the rest] (to 3 decimal places, e.g. 0.123 or -0.123)

# Assuming 'users.csv' is in the current working directory
df_users = pd.read_csv('users.csv')

# Calculate the fraction of users with email when hireable is True
hireable_true_with_email = df_users[(df_users['hireable'] == True) & (df_users['email'].notna())].shape[0]
hireable_true_total = df_users[df_users['hireable'] == True].shape[0]
fraction_hireable_true = hireable_true_with_email / hireable_true_total if hireable_true_total > 0 else 0

# Calculate the fraction of users with email when hireable is False or NaN
hireable_false_with_email = df_users[(df_users['hireable'] == False) & (df_users['email'].notna())].shape[0]
hireable_false_total = df_users[df_users['hireable'] == False].shape[0]
fraction_hireable_false = hireable_false_with_email / hireable_false_total if hireable_false_total > 0 else 0


# Calculate the difference
difference = fraction_hireable_true - fraction_hireable_false

print(round(difference, 3))

0.051


In [57]:
#16.Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

# Assuming 'users.csv' is in the current working directory
df_users = pd.read_csv('users.csv')

# Clean and extract surnames
def get_surname(name):
  if isinstance(name, str):
    name_parts = name.strip().split()
    if name_parts:
      return name_parts[-1]
  return None

df_users['surname'] = df_users['name'].apply(get_surname)

# Count surname occurrences
surname_counts = df_users['surname'].value_counts()

# Find the most common surnames (handling ties)
max_count = surname_counts.max()
most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()

# Sort and print the result
print(','.join(sorted(most_common_surnames)))

Arnold,Brand,Christensen,Fink,GmbH,Group,Guggisberg,Landolt,Roth,Tan
