In [None]:
!pip install tqdm



In [1]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm
import re

# Replace 'your_github_token_here' with your actual GitHub token
GITHUB_TOKEN = 'GITHUB_TOKEN'
HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'}
BASE_URL = 'https://api.github.com'


# Modify functions to add progress bars

# Retrieve users with over 50 followers in Dublin
def get_dublin_users(min_followers=50):
    users = []
    page = 1
    with tqdm(desc="Fetching Dublin Users", unit="page") as pbar:
        while True:
            # Search for users in Dublin with the specified number of followers
            url = f"{BASE_URL}/search/users?q=location:Dublin+followers:>{min_followers}&page={page}&per_page=100"
            response = requests.get(url, headers=HEADERS)
            data = response.json()

            if 'items' not in data:
                break
            users.extend(data['items'])

            if len(data['items']) < 100:
                break
            page += 1
            pbar.update(1)  # Update progress bar for each page
    return users

# Fetch user details
def get_user_details(username):
    url = f"{BASE_URL}/users/{username}"
    response = requests.get(url, headers=HEADERS)
    return response.json()

# Fetch repositories for a user
def get_user_repos(username, max_repos=500):
    repos = []
    page = 1
    while len(repos) < max_repos:
        url = f"{BASE_URL}/users/{username}/repos?per_page=100&page={page}"
        response = requests.get(url, headers=HEADERS)
        data = response.json()
        if not data:
            break
        repos.extend(data[:max_repos - len(repos)])
        page += 1
    return repos[:max_repos]
# Fetch user details and add progress bar for each user
dublin_users = get_dublin_users()

# Initialize lists for user and repository data
users_data = []
repos_data = []

for user in tqdm(dublin_users, desc="Processing Users", unit="user"):
    details = get_user_details(user['login'])

    # Clean company name
    company = details.get('company', '') or ''
    company = re.sub(r'^@', '', company.strip()).upper()

    # Append to users data
    users_data.append({
        'login': details['login'],
        'name': details.get('name', ''),
        'company': company,
        'location': details.get('location', ''),
        'email': details.get('email', ''),
        'hireable': details.get('hireable', ''),
        'bio': details.get('bio', ''),
        'public_repos': details.get('public_repos', 0),
        'followers': details.get('followers', 0),
        'following': details.get('following', 0),
        'created_at': details.get('created_at', '')
    })

    # Fetch up to 500 repositories per user
    repos = get_user_repos(details['login'])
    for repo in repos:
        repos_data.append({
            'login': details['login'],
            'full_name': repo['full_name'],
            'created_at': repo['created_at'],
            'stargazers_count': repo['stargazers_count'],
            'watchers_count': repo['watchers_count'],
            'language': repo['language'] or '',
            'has_projects': repo['has_projects'],
            'has_wiki': repo['has_wiki'],
            'license_name': repo['license']['key'] if repo['license'] else ''
        })

# Convert to DataFrames
users_df = pd.DataFrame(users_data)
repos_df = pd.DataFrame(repos_data)

# Save as CSV files
users_df.to_csv('users.csv', index=False)
repos_df.to_csv('repositories.csv', index=False)

# Write README.md content
readme_content = """
* This project scrapes GitHub data for users in Dublin with over 50 followers.
* Analysis shows interesting patterns in user engagement and repository activity.
* Developers should consider enabling wiki and project features to improve repository visibility.

"""

with open('README.md', 'w') as f:
    f.write(readme_content)

# Display data for question analysis
users_df.head(), repos_df.head()

Fetching Dublin Users: 4page [00:02,  1.88page/s]
Processing Users: 100%|██████████| 477/477 [15:12<00:00,  1.91s/user]


(                  login                    name  \
 0                  orta             Orta Therox   
 1         jeromeetienne          Jerome Etienne   
 2            jonataslaw            Jonny Borges   
 3  steventroughtonsmith  Steven Troughton-Smith   
 4                  axic        Alex Beregszaszi   
 
                                    company  \
 0                                            
 1                   MAKING WEBAR A REALITY   
 2                                     IRIS   
 3                    HIGH CAFFEINE CONTENT   
 4  ETHEREUM @IPSILON @SPEARBIT @ETHEREUMJS   
 
                                        location                     email  \
 0  Huddersfield / NYC / Dublin / Rio de Janeiro               git@orta.io   
 1                               Dublin, Ireland  jerome.etienne@gmail.com   
 2                               Dublin, Ireland                      None   
 3                               Dublin, Ireland                      None   
 4          

In [None]:
# Sort users by the number of followers in descending order and select the top 5
top_5_users = users_df.sort_values(by='followers', ascending=False).head(5)['login'].tolist()

# Join the login names as a comma-separated string
top_5_users_str = ','.join(top_5_users)
top_5_users_str


'orta,jeromeetienne,jonataslaw,steventroughtonsmith,axic'

In [None]:
# Convert the 'created_at' column to datetime format if not already in that format
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Sort by 'created_at' in ascending order and select the top 5
earliest_5_users = users_df.sort_values(by='created_at').head(5)['login'].tolist()

# Join the login names as a comma-separated string
earliest_5_users_str = ','.join(earliest_5_users)
earliest_5_users_str


'paulca,adrian,GavinJoyce,amir,ciaranlee'

In [None]:
# Filter out repositories without a license
repos_with_license = repos_df[repos_df['license_name'] != '']

# Count the occurrences of each license and get the top 3
top_3_licenses = repos_with_license['license_name'].value_counts().head(3).index.tolist()

# Join the license names as a comma-separated string
top_3_licenses_str = ','.join(top_3_licenses)
top_3_licenses_str


'mit,apache-2.0,other'

In [None]:
# Filter out empty company values
companies = users_df[users_df['company'] != '']

# Count occurrences of each company and get the most common one
most_common_company = companies['company'].value_counts().idxmax()
most_common_company


'AWS'

In [None]:
#  Which programming language is most popular among these users?

In [None]:
language_counts = repos_df.groupby('language')['login'].nunique().sort_values(ascending=False)

In [None]:
most_popular_language = language_counts.index[0]
print(f"The most popular programming language is: {most_popular_language}")

The most popular programming language is: 


In [None]:
# Filter out empty language values
languages = repos_df[repos_df['language'] != '']

# Count occurrences of each language and get the most common one
most_popular_language = languages['language'].value_counts().idxmax()
most_popular_language


'JavaScript'

In [None]:
# Ensure 'created_at' is in datetime format
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Filter users who joined after 2020
users_after_2020 = users_df[users_df['created_at'] > '2020-12-31']

# Get logins of these users
logins_after_2020 = users_after_2020['login'].tolist()

# Filter repositories for users who joined after 2020
repos_after_2020 = repos_df[repos_df['login'].isin(logins_after_2020)]

# Filter out empty languages and count occurrences of each language
language_counts = repos_after_2020['language'].value_counts()

# Get the second most popular language
second_most_popular_language = language_counts.index[1] if len(language_counts) > 1 else None
second_most_popular_language


'Python'

In [None]:
# Filter out repositories without a specified language
repos_with_language = repos_df[repos_df['language'] != '']

# Calculate the average number of stars per language
average_stars_per_language = repos_with_language.groupby('language')['stargazers_count'].mean()

# Get the language with the highest average number of stars
highest_avg_stars_language = average_stars_per_language.idxmax()
highest_avg_stars_language


'MDX'

In [None]:
# Define leader_strength as followers / (1 + following)
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Sort by leader_strength in descending order and get the top 5 users
top_5_leader_strength = users_df.sort_values(by='leader_strength', ascending=False).head(5)['login'].tolist()

# Join the login names as a comma-separated string
top_5_leader_strength_str = ','.join(top_5_leader_strength)
top_5_leader_strength_str


'flaviohenriquealmeida,zalando,AnikSarker,wix,CardinalHealth'

In [None]:
# Calculate the Pearson correlation coefficient between followers and public_repos
correlation_followers_repos = users_df['followers'].corr(users_df['public_repos'])

# Round to 3 decimal places
correlation_followers_repos = round(correlation_followers_repos, 3)
correlation_followers_repos


0.555

In [None]:
from scipy.stats import linregress

# Perform linear regression with public_repos as independent variable and followers as dependent variable
slope, intercept, r_value, p_value, std_err = linregress(users_df['public_repos'], users_df['followers'])

# Round the slope to 3 decimal places
slope = round(slope, 3)
slope


2.825

In [None]:
# Calculate leader_strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Sort by leader_strength in descending order and get the top 5 users
top_leaders = users_df.sort_values(by='leader_strength', ascending=False).head(5)

# Extract the 'login' column and join as a comma-separated string
top_leader_logins = ', '.join(top_leaders['login'].tolist())
top_leader_logins


'flaviohenriquealmeida, zalando, AnikSarker, wix, CardinalHealth'

In [None]:
# Calculate correlation between followers and public repositories
correlation = users_df['followers'].corr(users_df['public_repos'])

# Display the result rounded to 3 decimal places
correlation_rounded = round(correlation, 3)
correlation_rounded


0.555

In [None]:
import statsmodels.api as sm

# Define the independent (public_repos) and dependent (followers) variables
X = users_df['public_repos']
y = users_df['followers']

# Add a constant to the independent variable (for intercept)
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Extract the slope (coefficient of public_repos)
slope_repos_followers = round(model.params['public_repos'], 3)
slope_repos_followers


2.825

In [None]:
import pandas as pd

# Convert 'created_at' to datetime format
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter for weekend repositories (Saturday: 5, Sunday: 6)
repos_df['day_of_week'] = repos_df['created_at'].dt.dayofweek
weekend_repos = repos_df[repos_df['day_of_week'].isin([5, 6])]

# Count the number of weekend repositories created by each user
weekend_counts = weekend_repos['login'].value_counts().head(5)

# Get the top 5 users
top_5_users = weekend_counts.index.tolist()
top_5_users_logins = ', '.join(top_5_users)

top_5_users_logins


'orta, joshuacassidy, No9, wafuwafu13, lmammino'

In [None]:
from collections import Counter

# Filter out users with missing names
names = users_df['name'].dropna().str.strip()

# Extract surnames (last word)
surnames = names.str.split().str[-1]

# Count the occurrences of each surname
surname_counts = Counter(surnames)

# Find the maximum count
max_count = max(surname_counts.values())

# Get all surnames with the maximum count and sort them alphabetically
most_common_surnames = sorted([surname for surname, count in surname_counts.items() if count == max_count])

# Join the surnames into a comma-separated string
most_common_surnames_str = ', '.join(most_common_surnames)

most_common_surnames_str


"Chen, Kenny, O'Sullivan, Quinn"