In [16]:
import requests
import pandas as pd
import time
import re
from urllib.parse import parse_qs, urlparse
import sys

def validate_token(token):
    """Validate the GitHub token before proceeding"""
    headers = {"Authorization": f"token {token}"}
    response = requests.get("https://api.github.com/user", headers=headers)
    
    if response.status_code == 401:
        print("Error: Invalid GitHub token. Please check your token and try again.")
        print("Make sure you've replaced 'your_personal_access_token' with an actual token.")
        print("\nTo create a new token:")
        print("1. Go to GitHub.com → Settings → Developer settings → Personal access tokens → Tokens (classic)")
        print("2. Generate new token (classic)")
        print("3. Select at least these scopes: 'public_repo', 'read:user', 'user:email'")
        return False
    elif response.status_code != 200:
        print(f"Error: Unable to validate token. Status code: {response.status_code}")
        print(f"Response: {response.text}")
        return False
    return True

def get_users_in_tokyo(token, min_followers=200):
    base_url = "https://api.github.com/search/users"
    headers = {"Authorization": f"token {token}"}
    params = {
        "q": f"location:tokyo followers:>{min_followers}",
        "per_page": 100,
        "page": 1
    }
    all_users = []
    
    while True:
        try:
            response = requests.get(base_url, headers=headers, params=params)
            
            if response.status_code == 200:
                data = response.json()
                items = data.get('items', [])
                all_users.extend(items)
                
                # Print progress
                print(f"Fetched page {params['page']}, Total users so far: {len(all_users)}")
                total_count = data.get('total_count', 0)
                print(f"Total users available according to GitHub: {total_count}")
                
                # Check if there are more pages
                if 'next' not in response.links or not items:
                    break
                    
                # Extract next page number from the 'next' URL
                next_url = response.links['next']['url']
                parsed_url = urlparse(next_url)
                params['page'] = int(parse_qs(parsed_url.query)['page'][0])
                
                # Handle rate limiting
                remaining = int(response.headers.get('X-RateLimit-Remaining', 0))
                if remaining < 5:
                    reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
                    sleep_time = max(reset_time - time.time(), 0) + 1
                    print(f"Rate limit nearly exceeded. Sleeping for {sleep_time:.0f} seconds...")
                    time.sleep(sleep_time)
                else:
                    time.sleep(2)  # Normal rate limit control
            else:
                print(f"Error fetching users: {response.status_code}")
                print(f"Response: {response.text}")
                if response.status_code == 403:  # Rate limit exceeded
                    reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
                    sleep_time = max(reset_time - time.time(), 0) + 1
                    print(f"Rate limit exceeded. Sleeping for {sleep_time:.0f} seconds...")
                    time.sleep(sleep_time)
                    continue
                break
                
        except requests.exceptions.RequestException as e:
            print(f"Network error occurred: {e}")
            time.sleep(5)  # Wait before retrying
            continue
        except Exception as e:
            print(f"Unexpected error occurred: {e}")
            break

    print(f"Total users found: {len(all_users)}")
    return all_users

def get_user_details(username, token):
    url = f"https://api.github.com/users/{username}"
    headers = {"Authorization": f"token {token}"}
    max_retries = 3
    retry_count = 0
    
    while retry_count < max_retries:
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.json()
            elif response.status_code == 403:  # Rate limit exceeded
                reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
                sleep_time = max(reset_time - time.time(), 0) + 1
                print(f"Rate limit exceeded. Sleeping for {sleep_time:.0f} seconds...")
                time.sleep(sleep_time)
                retry_count += 1
            else:
                print(f"Error fetching user details for {username}: {response.status_code}")
                print(f"Response: {response.text}")
                return None
        except requests.exceptions.RequestException as e:
            print(f"Network error occurred while fetching user {username}: {e}")
            time.sleep(5)
            retry_count += 1
    return None

def get_user_repos(username, token, max_repos=500):
    headers = {"Authorization": f"token {token}"}
    base_url = f"https://api.github.com/users/{username}/repos"
    params = {"per_page": 100, "sort": "updated"}
    repos = []
    page = 1
    
    while len(repos) < max_repos:
        try:
            params['page'] = page
            response = requests.get(base_url, headers=headers, params=params)
            
            if response.status_code == 200:
                page_repos = response.json()
                if not page_repos:  # No more repos
                    break
                repos.extend(page_repos)
                page += 1
                
                remaining = int(response.headers.get('X-RateLimit-Remaining', 0))
                if remaining < 5:
                    reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
                    sleep_time = max(reset_time - time.time(), 0) + 1
                    print(f"Rate limit nearly exceeded. Sleeping for {sleep_time:.0f} seconds...")
                    time.sleep(sleep_time)
                else:
                    time.sleep(1)
            elif response.status_code == 403:  # Rate limit exceeded
                reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
                sleep_time = max(reset_time - time.time(), 0) + 1
                print(f"Rate limit exceeded. Sleeping for {sleep_time:.0f} seconds...")
                time.sleep(sleep_time)
                continue
            else:
                print(f"Error fetching repos for {username}: {response.status_code}")
                print(f"Response: {response.text}")
                break
        except requests.exceptions.RequestException as e:
            print(f"Network error occurred while fetching repos for {username}: {e}")
            time.sleep(5)
            continue
        except Exception as e:
            print(f"Unexpected error occurred while fetching repos: {e}")
            break

    return repos[:max_repos]

def clean_company_name(company):
    """Clean company name"""
    if company:
        company = company.lstrip('@').strip().upper()
        company = re.sub(r"\s+", " ", company)
    return company if company else ""

def clean_name(name):
    """Clean name"""
    if name:
        name = re.sub(r"\s+", " ", name).strip()
    return name if name else ""

def clean_bio(bio):
    """Clean bio"""
    if bio:
        bio = re.sub(r"\s+", " ", bio).strip()
    return bio if bio else ""

def clean_email(email):
    """Clean email"""
    return email.strip() if email else ""

def safe_get_license_name(repo):
    """Safely get license name from repository data"""
    try:
        license_data = repo.get('license')
        if isinstance(license_data, dict):
            return license_data.get('name', '')
        return ''
    except:
        return ''

def main():
    # Get GitHub token
    GITHUB_TOKEN = "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"  # Replace with your actual token
    
    # Validate the token before proceeding
    if not validate_token(GITHUB_TOKEN):
        sys.exit(1)
        
    print("GitHub token validated successfully!")
    
    # Data storage
    users_data = []
    repos_data = []

    try:
        # Get users from Tokyo with over 200 followers
        users = get_users_in_tokyo(GITHUB_TOKEN)
        if not users:
            print("No users found. Exiting...")
            sys.exit(1)
            
        print(f"Processing {len(users)} users...")

        # Process each user
        for i, user in enumerate(users, 1):
            username = user['login']
            print(f"\nProcessing user {i}/{len(users)}: {username}")
            
            try:
                # Get detailed user info
                user_details = get_user_details(username, GITHUB_TOKEN)
                if user_details:
                    # Clean and process user data
                    user_data = {
                        "login": user_details.get("login", ""),
                        "name": clean_name(user_details.get("name", "")),
                        "company": clean_company_name(user_details.get("company", "")),
                        "location": user_details.get("location", ""),
                        "email": clean_email(user_details.get("email", "")),
                        "hireable": str(user_details.get("hireable", "")),
                        "bio": clean_bio(user_details.get("bio", "")),
                        "public_repos": user_details.get("public_repos", 0),
                        "followers": user_details.get("followers", 0),
                        "following": user_details.get("following", 0),
                        "created_at": user_details.get("created_at", "")
                    }
                    users_data.append(user_data)
                    
                    # Get repositories for the user
                    print(f"Fetching repositories for {username}")
                    user_repos = get_user_repos(username, GITHUB_TOKEN)
                    for repo in user_repos:
                        try:
                            repo_data = {
                                "login": username,
                                "full_name": repo.get("full_name", ""),
                                "created_at": repo.get("created_at", ""),
                                "stargazers_count": repo.get("stargazers_count", 0),
                                "watchers_count": repo.get("watchers_count", 0),
                                "language": repo.get("language", ""),
                                "has_projects": repo.get("has_projects", False),
                                "has_wiki": repo.get("has_wiki", False),
                                "license_name": safe_get_license_name(repo)
                            }
                            repos_data.append(repo_data)
                        except Exception as e:
                            print(f"Error processing repository data for {username}: {e}")
                            continue
                    
                    print(f"Added {len(user_repos)} repositories for {username}")
            except Exception as e:
                print(f"Error processing user {username}: {e}")
                continue

        # Convert lists to DataFrames
        users_df = pd.DataFrame(users_data)
        repos_df = pd.DataFrame(repos_data)

        # Save to CSV with timestamp
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        users_csv = f"tokyo_users_{timestamp}.csv"
        repos_csv = f"tokyo_repos_{timestamp}.csv"
        
        users_df.to_csv(users_csv, index=False)
        repos_df.to_csv(repos_csv, index=False)

        print(f"\nData saved to {users_csv} and {repos_csv}")
        print(f"Total users processed: {len(users_data)}")
        print(f"Total repositories processed: {len(repos_data)}")

    except Exception as e:
        print(f"An error occurred in the main process: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()

GitHub token validated successfully!
Fetched page 1, Total users so far: 100
Total users available according to GitHub: 542
Fetched page 2, Total users so far: 200
Total users available according to GitHub: 542
Fetched page 3, Total users so far: 300
Total users available according to GitHub: 542
Fetched page 4, Total users so far: 400
Total users available according to GitHub: 542
Fetched page 5, Total users so far: 500
Total users available according to GitHub: 542
Fetched page 6, Total users so far: 542
Total users available according to GitHub: 542
Total users found: 542
Processing 542 users...

Processing user 1/542: dennybritz
Fetching repositories for dennybritz
Added 61 repositories for dennybritz

Processing user 2/542: wasabeef
Fetching repositories for wasabeef
Added 46 repositories for wasabeef

Processing user 3/542: dai-shi
Fetching repositories for dai-shi
Added 128 repositories for dai-shi

Processing user 4/542: rui314
Fetching repositories for rui314
Added 44 reposito

In [18]:
import pandas as pd
import numpy as np
from datetime import datetime
import re

def load_data(users_file, repos_file):
    users_df = pd.read_csv(users_file)
    repos_df = pd.read_csv(repos_file)
    return users_df, repos_df

def analyze_github_data(users_df, repos_df):
    results = {}
    
    # 1. Top 5 users by followers
    results['q1'] = ','.join(users_df.nlargest(5, 'followers')['login'].tolist())
    
    # 2. 5 earliest registered users
    results['q2'] = ','.join(users_df.sort_values('created_at').head(5)['login'].tolist())
    
    # 3. Top 3 licenses
    license_counts = repos_df['license_name'].value_counts()
    license_counts = license_counts[license_counts.index != '']  # Ignore missing licenses
    results['q3'] = ','.join(license_counts.head(3).index.tolist())
    
    # 4. Most common company
    companies = users_df['company'][users_df['company'] != ''].value_counts()
    results['q4'] = companies.index[0] if not companies.empty else ''
    
    # 5. Most popular language
    lang_counts = repos_df['language'].value_counts()
    results['q5'] = lang_counts.index[0]
    
    # 6. Second most popular language for users who joined after 2020
    recent_users = users_df[pd.to_datetime(users_df['created_at']).dt.year > 2020]['login']
    recent_repos = repos_df[repos_df['login'].isin(recent_users)]
    recent_langs = recent_repos['language'].value_counts()
    results['q6'] = recent_langs.index[1] if len(recent_langs) > 1 else ''
    
    # 7. Language with highest average stars
    avg_stars = repos_df.groupby('language')['stargazers_count'].mean()
    results['q7'] = avg_stars.idxmax()
    
    # 8. Top 5 by leader_strength
    users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
    results['q8'] = ','.join(users_df.nlargest(5, 'leader_strength')['login'].tolist())
    
    # 9. Correlation between followers and public repos
    results['q9'] = f"{users_df['followers'].corr(users_df['public_repos']):.3f}"
    
    # 10. Regression slope of followers on repos
    from scipy import stats
    slope, _, _, _, _ = stats.linregress(users_df['public_repos'], users_df['followers'])
    results['q10'] = f"{slope:.3f}"
    
    # 11. Correlation between projects and wiki
    results['q11'] = f"{repos_df['has_projects'].corr(repos_df['has_wiki']):.3f}"
    
    # 12. Difference in following between hireable and non-hireable
    hireable_following = users_df[users_df['hireable'] == 'True']['following'].mean()
    non_hireable_following = users_df[users_df['hireable'] != 'True']['following'].mean()
    results['q12'] = f"{(hireable_following - non_hireable_following):.3f}"
    
    # 13. Correlation between bio length and followers
    users_df['bio_words'] = users_df['bio'].fillna('').apply(lambda x: len(str(x).split()))
    users_with_bio = users_df[users_df['bio_words'] > 0]
    slope, _, _, _, _ = stats.linregress(users_with_bio['bio_words'], users_with_bio['followers'])
    results['q13'] = f"{slope:.3f}"
    
    # 14. Top 5 weekend repository creators
    repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
    repos_df['is_weekend'] = repos_df['created_at'].dt.dayofweek.isin([5, 6])
    weekend_creators = repos_df[repos_df['is_weekend']].groupby('login').size()
    results['q14'] = ','.join(weekend_creators.nlargest(5).index.tolist())
    
    # 15. Email sharing difference between hireable and non-hireable
    hireable_email = (users_df[users_df['hireable'] == 'True']['email'] != '').mean()
    non_hireable_email = (users_df[users_df['hireable'] != 'True']['email'] != '').mean()
    results['q15'] = f"{(hireable_email - non_hireable_email):.3f}"
    
    # 16. Most common surname
    def get_surname(name):
        if pd.isna(name) or name.strip() == '':
            return None
        return name.strip().split()[-1]
    
    surnames = users_df['name'].apply(get_surname).value_counts()
    max_count = surnames.max()
    most_common = ','.join(sorted(surnames[surnames == max_count].index.tolist()))
    results['q16'] = most_common
    
    return results

def main():
    # Replace these with your actual file paths
    users_df, repos_df = load_data('tokyo_users_20241031_004140.csv', 'tokyo_repos_20241031_004140.csv')
    results = analyze_github_data(users_df, repos_df)
    
    # Print results
    for q in range(1, 17):
        print(f"Q{q}: {results[f'q{q}']}")

if __name__ == "__main__":
    main()

Q1: dennybritz,wasabeef,dai-shi,rui314,domenic
Q2: kana,kakutani,mootoh,lhl,walf443
Q3: MIT License,Apache License 2.0,Other
Q4: GOOGLE
Q5: JavaScript
Q6: JavaScript
Q7: Assembly
Q8: blueimp,dai-shi,asahilina,pilcrowonpaper,marcan
Q9: 0.050
Q10: 0.271
Q11: 0.425
Q12: nan
Q13: 18.742
Q14: azu,suzuki-shunsuke,yuiseki,xuwei-k,rhysd
Q15: nan
Q16: Kato,Tanaka


In [20]:
def analyze_hireable_metrics(users_df):
    """
    Analyze metrics related to hireable status
    Returns: (following_diff, email_diff)
    """
    # Convert hireable to boolean, handling different string representations
    users_df['hireable_clean'] = users_df['hireable'].fillna('False')
    users_df['hireable_clean'] = users_df['hireable_clean'].apply(
        lambda x: str(x).lower() in ['true', '1', 't', 'yes']
    )
    
    # Question 12: Following difference
    hireable_following = users_df[users_df['hireable_clean']]['following'].mean()
    non_hireable_following = users_df[~users_df['hireable_clean']]['following'].mean()
    following_diff = hireable_following - non_hireable_following
    
    # Question 15: Email sharing difference
    users_df['has_email'] = users_df['email'].notna() & (users_df['email'] != '')
    hireable_email_rate = users_df[users_df['hireable_clean']]['has_email'].mean()
    non_hireable_email_rate = users_df[~users_df['hireable_clean']]['has_email'].mean()
    email_diff = hireable_email_rate - non_hireable_email_rate
    
    # Print debug information
    print("\nDebug Information:")
    print(f"Total users: {len(users_df)}")
    print(f"Hireable users: {users_df['hireable_clean'].sum()}")
    print(f"Non-hireable users: {(~users_df['hireable_clean']).sum()}")
    print(f"Hireable following avg: {hireable_following:.2f}")
    print(f"Non-hireable following avg: {non_hireable_following:.2f}")
    print(f"Hireable email rate: {hireable_email_rate:.2f}")
    print(f"Non-hireable email rate: {non_hireable_email_rate:.2f}")
    
    return following_diff, email_diff

# Example usage:
import pandas as pd

# Load the data
users_df = pd.read_csv('tokyo_users_20241031_004140.csv')

# Calculate the metrics
following_diff, email_diff = analyze_hireable_metrics(users_df)

print(f"\nQuestion 12 - Following difference: {following_diff:.3f}")
print(f"Question 15 - Email sharing difference: {email_diff:.3f}")


Debug Information:
Total users: 542
Hireable users: 180
Non-hireable users: 362
Hireable following avg: 182.14
Non-hireable following avg: 260.63
Hireable email rate: 0.66
Non-hireable email rate: 0.53

Question 12 - Following difference: -78.491
Question 15 - Email sharing difference: 0.131
