In [3]:
import requests
import pandas as pd
from datetime import datetime
import time

# GitHub Personal Access Token (optional if you face rate limits)
GITHUB_TOKEN = ''
HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'}


# Helper function to clean company names
def clean_company_name(company):
    if company:
        company = company.strip().lstrip('@').upper()
    return company

# Fetch users in Delhi with more than 100 followers


def get_github_users(location='Delhi', min_followers=100):
    url = f"https://api.github.com/search/users?q=location:{location}+followers:>{min_followers}&per_page=100"
    users = []
    while url:
        response = requests.get(url, headers=HEADERS).json()
        users.extend(response.get('items', []))
        if 'next' in response:
            url = response['next']
        else:
            url = None
        time.sleep(2)
    return users

# Fetch detailed user info


def get_user_details(username):
    url = f"https://api.github.com/users/{username}"
    return requests.get(url, headers=HEADERS).json()

# Fetch repositories for each user


def get_user_repositories(username):
    repos = []
    url = f"https://api.github.com/users/{username}/repos?per_page=100"

    while url:
        response = requests.get(url, headers=HEADERS)
        response_data = response.json()
        repos.extend(response_data)

        # Check for pagination in headers
        if 'Link' in response.headers:
            links = response.headers['Link']
            next_link = [link for link in links.split(
                ',') if 'rel="next"' in link]
            if next_link:
                url = next_link[0].split(';')[0].strip()[
                    1:-1]  # Extract the URL
            else:
                url = None  # No more pages
        else:
            url = None  # No pagination header

        time.sleep(2)  # To respect the rate limits

    return repos


# Fetch users
users = get_github_users()

# Initialize lists to hold user and repository data
user_data = []
repo_data = []

# Fetch details and repos for each user
for user in users:
    user_details = get_user_details(user['login'])

    user_info = {
        'login': user_details.get('login'),
        'name': user_details.get('name'),
        'company': clean_company_name(user_details.get('company')),
        'location': user_details.get('location'),
        'email': user_details.get('email'),
        'hireable': user_details.get('hireable', False),
        'bio': user_details.get('bio'),
        'public_repos': user_details.get('public_repos'),
        'followers': user_details.get('followers'),
        'following': user_details.get('following'),
        'created_at': user_details.get('created_at')
    }

    user_data.append(user_info)

    # Fetch user repositories
    repos = get_user_repositories(user['login'])
    for repo in repos[:500]:  # Limit to 500 repos per user
        repo_info = {
            'login': user['login'],
            'full_name': repo.get('full_name'),
            'created_at': repo.get('created_at'),
            'stargazers_count': repo.get('stargazers_count', 0),
            'watchers_count': repo.get('watchers_count', 0),
            'language': repo.get('language'),
            'has_projects': repo.get('has_projects', False),
            'has_wiki': repo.get('has_wiki', False),
            'license_name': repo.get('license')['key'] if repo.get('license') else None
        }
        repo_data.append(repo_info)

# Convert to DataFrames
df_users = pd.DataFrame(user_data)
df_repos = pd.DataFrame(repo_data)

# Save to CSV files
df_users.to_csv('users.csv', index=False)
df_repos.to_csv('repositories.csv', index=False)

## Step 2: Perform Data Analysis

In [10]:
from scipy.stats import linregress


# Load the data from CSV files
df_users = pd.read_csv('users.csv')
df_repos = pd.read_csv('repositories.csv')

df_users['created_at'] = pd.to_datetime(
    df_users['created_at'], errors='coerce')

# 1. Top 5 users by followers
top_5_followers = df_users.nlargest(5, 'followers')['login'].tolist()
print(f"Top 5 users by followers: {', '.join(top_5_followers)}")

# 2. Top 5 earliest registered users
earliest_users = df_users.nsmallest(5, 'created_at')['login'].tolist()
print(f"Top 5 earliest users: {', '.join(earliest_users)}")

# 3. Top 3 most popular licenses
popular_licenses = df_repos['license_name'].dropna(
).value_counts().head(3).index.tolist()
print(f"Top 3 licenses: {', '.join(popular_licenses)}")

# 4. Most common company
most_common_company = df_users['company'].mode()[0]
print(f"Most common company: {most_common_company}")

# 5. Most popular language
most_popular_language = df_repos['language'].mode()[0]
print(f"Most popular language: {most_popular_language}")

# 6. Second most popular language for users who joined after 2020
df_recent_users = df_users[pd.to_datetime(
    df_users['created_at']).dt.year > 2020]
second_popular_language = df_repos[df_repos['login'].isin(
    df_recent_users['login'])]['language'].value_counts().index[1]
print(f"Second most popular language (after 2020): {second_popular_language}")

# 7. Language with the highest average stars per repository
avg_stars_per_language = df_repos.groupby(
    'language')['stargazers_count'].mean().idxmax()
print(
    f"Language with highest average stars per repo: {avg_stars_per_language}")

# 8. Top 5 by leader_strength (followers / (1 + following))
df_users['leader_strength'] = df_users['followers'] / \
    (1 + df_users['following'])
top_5_leader_strength = df_users.nlargest(
    5, 'leader_strength')['login'].tolist()
print(f"Top 5 by leader_strength: {', '.join(top_5_leader_strength)}")

# 9. Correlation between followers and public repos
correlation_followers_repos = df_users[[
    'followers', 'public_repos']].corr().loc['followers', 'public_repos']
print(
    f"Correlation between followers and public repos: {correlation_followers_repos:.3f}")

# 10. Regression slope of followers on public repos
slope, intercept, r_value, p_value, std_err = linregress(
    df_users['public_repos'], df_users['followers'])
print(f"Regression slope of followers on repos: {slope:.3f}")

# 11. Correlation between projects enabled and wiki enabled
correlation_projects_wiki = df_repos[[
    'has_projects', 'has_wiki']].corr().loc['has_projects', 'has_wiki']
print(
    f"Correlation between projects and wiki enabled: {correlation_projects_wiki:.3f}")

# 12. Do hireable users follow more people than those who are not hireable?
hireable_users_following = df_users[df_users['hireable']
                                    == True]['following'].mean()
non_hireable_users_following = df_users[df_users['hireable']
                                        == False]['following'].mean()
difference_in_following = hireable_users_following - non_hireable_users_following
print(
    f"Average following for hireable minus non-hireable users: {difference_in_following:.3f}")

# 13. Correlation between bio length and followers
df_users['bio_length'] = df_users['bio'].apply(lambda x: len(
    x) if isinstance(x, str) else 0)  # Length of bio in Unicode characters
slope_bio_followers, intercept, r_value, p_value, std_err = linregress(
    df_users['bio_length'], df_users['followers'])
print(
    f"Regression slope of followers on bio length: {slope_bio_followers:.3f}")

# 14. Who created the most repositories on weekends (UTC)?
df_repos['created_at'] = pd.to_datetime(df_repos['created_at'])
# 5 and 6 correspond to Saturday and Sunday
df_repos['is_weekend'] = df_repos['created_at'].dt.dayofweek >= 5
weekend_repo_counts = df_repos[df_repos['is_weekend']].groupby(
    'login')['full_name'].count().nlargest(5).index.tolist()
print(
    f"Top 5 users who created the most repositories on weekends: {', '.join(weekend_repo_counts)}")

# 15. Do hireable users share their email addresses more often?
hireable_with_email = df_users[df_users['hireable']
                               == True]['email'].notna().mean()
non_hireable_with_email = df_users[df_users['hireable']
                                   == False]['email'].notna().mean()
difference_in_email_sharing = hireable_with_email - non_hireable_with_email
print(
    f"Fraction of hireable users with email minus non-hireable: {difference_in_email_sharing:.3f}")

# 16. Most common surname
df_users['surname'] = df_users['name'].apply(lambda x: x.split(
)[-1].upper() if isinstance(x, str) and len(x.split()) > 1 else None)
most_common_surname = df_users['surname'].dropna(
).value_counts().nlargest(1).index.tolist()
print(f"Most common surname: {', '.join(most_common_surname)}")

Top 5 users by followers: amitshekhariitbhu, shradha-khapra, loveBabbar, Nakshatra05, Anuj-Kumar-Sharma
Top 5 earliest users: dufferzafar, nathvarun, aviaryan, softvar, rishikksh20
Top 3 licenses: mit, apache-2.0, gpl-3.0
Most common company: CODING BLOCKS
Most popular language: JavaScript
Second most popular language (after 2020): HTML
Language with highest average stars per repo: Java
Top 5 by leader_strength: Anuj-Kumar-Sharma, Ignitetechnologies, shradha-khapra, loveBabbar, amitshekhariitbhu
Correlation between followers and public repos: -0.134
Regression slope of followers on repos: -2.379
Correlation between projects and wiki enabled: 0.228
Average following for hireable minus non-hireable users: nan
Regression slope of followers on bio length: 9.109
Top 5 users who created the most repositories on weekends: Ayush7614, shivaylamba, amitsrivastava4all, manrajgrover, AkshayAnand2002
Fraction of hireable users with email minus non-hireable: nan
Most common surname: SINGH


Correlation between projects and wiki enabled: 0.228
Average following for hireable minus non-hireable users: nan
Regression slope of followers on bio length: 9.109
Top 5 users who created the most repositories on weekends: Ayush7614, shivaylamba, amitsrivastava4all, manrajgrover, AkshayAnand2002
Fraction of hireable users with email minus non-hireable: nan
Most common surname: SINGH
