## Importing the required libs

In [75]:
import pandas as pd
import requests
import time
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')

In [None]:
## setting the api token
from google.colab import userdata
headers = {
    'Authorization': userdata.get("GITHUB_TOKEN")
}

### Supporting Functions

In [None]:
def get_users_in_location_with_followers(min_followers=200, location="Paris"):
  users = []
  page = 1
  while True:
      # GitHub Search API for users based on location and followers
      url = f"https://api.github.com/search/users?q=location:{location}+followers:>{min_followers}&per_page=30&page={page}"
      response = requests.get(url, headers=headers)

      if response.status_code != 200:
          print("Error fetching users:", response.json())
          break

      data = response.json()
      users.extend(data['items'])

      # Check if there are more pages
      if 'next' not in response.links:
          break  # Exit if no more pages

      page += 1
      time.sleep(2)  # Sleep to respect rate limits
  return users

In [None]:
def get_user_details(username):
  url = f"https://api.github.com/users/{username}"
  response = requests.get(url, headers=headers)
  if response.status_code != 200:
      print(f"Error fetching details for {username}:", response.json())
      return None
  return response.json()

In [None]:
def clean_company(company):
  if company:
      return company.strip().lstrip('@').upper()
  return company

In [None]:
def safe_strip(value):
  return value.strip() if value else ""

def handle_bool_vab(value):
  return "" if value is None else value


In [None]:
def get_users_csv(min_followers=200, location="Paris"):
  users = get_users_in_location_with_followers(min_followers, location)
  user_details = []

  csv_filename = 'users.csv'
  fieldnames = [
      'login', 'name', 'company', 'location', 'email',
      'hireable', 'bio', 'public_repos', 'followers',
      'following', 'created_at'
  ]

  for user in users:
    username = user['login']
    details = get_user_details(username)
    if details:
      user_record = {
          'login': safe_strip(details.get('login', '')),
          'name': safe_strip(details.get('name', '')),
          'company': clean_company(details.get('company')),
          'location': safe_strip(details.get('location', '')),
          'email': safe_strip(details.get('email', '')),
          'hireable': handle_bool_vab(details.get('hireable')),
          'bio': safe_strip(details.get('bio', '')),
          'public_repos': details.get('public_repos', 0),
          'followers': details.get('followers', 0),
          'following': details.get('following', 0),
          'created_at': details.get('created_at')
      }
      user_details.append(user_record)

  df = pd.DataFrame(user_details,columns=fieldnames)
  df.to_csv(csv_filename, index=False)
  print(f"CSV file '{csv_filename}' created successfully.")
  return df.head()

In [None]:
def get_user_repositories(username, max_repos=500):
  repos = []
  page = 1
  while len(repos) < max_repos:
      # GitHub API for listing a user's repositories
      url = f"https://api.github.com/users/{username}/repos?sort=pushed&per_page=100&page={page}"
      response = requests.get(url, headers=headers)

      if response.status_code != 200:
          print(f"Error fetching repositories for {username}:", response.json())
          break

      data = response.json()
      repos.extend(data)

      if len(data) < 100:  # Exit if fewer than 100 repos on the page (no more pages)
          break

      page += 1
      time.sleep(1)  # Sleep to respect rate limits

  return repos[:max_repos]

In [None]:
def get_repos_csv(filename='users.csv'):
  # Load user data from users.csv
  users_df = pd.read_csv(filename)
  usernames = users_df['login'].tolist()

  # Prepare CSV file for repositories
  csv_filename = 'repositories.csv'
  fieldnames = [
      'login', 'full_name', 'created_at', 'stargazers_count',
      'watchers_count', 'language', 'has_projects', 'has_wiki',
      'license_name'
  ]

  repos_list = []
  for username in usernames:
    #print(f"Fetching repositories for user: {username}")
    repos = get_user_repositories(username)

    for repo in repos:
      # Structure and clean repository data
      repo_data = {
          'login': username,
          'full_name': repo.get('full_name', ''),
          'created_at': repo.get('created_at', ''),
          'stargazers_count': repo.get('stargazers_count', 0),
          'watchers_count': repo.get('watchers_count', 0),
          'language': repo.get('language', ''),
          'has_projects': repo.get('has_projects', False),
          'has_wiki': repo.get('has_wiki', False),
          'license_name': repo.get('license', {}).get('key', '') if repo.get('license') else ''
      }
      repos_list.append(repo_data)

      time.sleep(1)  # Sleep to respect rate limits

  df = pd.DataFrame(repos_list, columns=fieldnames)
  df.to_csv(csv_filename, index=False)

  print(f"Data saved to '{csv_filename}'.")
  return df.head()

### Creating the CSV files

In [None]:
get_users_csv()

In [None]:
get_repos_csv()

In [4]:
users = pd.read_csv('users.csv')
repos = pd.read_csv('repositories.csv')

## Solving questions related to assignment

In [72]:
def list_to_csv_string(lst):
    if not lst:
        print("")
    else:
     print(str(lst)[2:-2].replace("', '",","))

In [None]:
## 1. Who are the top 5 users in Paris with the highest number of followers? List their login in order, comma-separated.

ans = users.sort_values(by='followers', ascending=False).head(5)['login'].to_list()
list_to_csv_string(ans)

huggingface,brunosimon,fabpot,Charles-Chrismann,posva


In [None]:
##  Who are the 5 earliest registered GitHub users in Paris? List their login in ascending order of created_at, comma-separated.

ans = users.sort_values(by='created_at', ascending=True).head(5)['login'].to_list()
list_to_csv_string(ans)

sunny,nkallen,nono,tdd,luislavena


In [None]:
## 3. What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

ans = repos["license_name"].value_counts().head(3).index.to_list()
list_to_csv_string(ans)

mit,apache-2.0,other


In [None]:
## 4. Which company do the majority of these developers work at?
ans = users["company"].value_counts().head(1).index.to_list()
list_to_csv_string(ans)

HUGGINGFACE


In [None]:
## 5. Which programming language is most popular among these users?
ans = repos["language"].value_counts().head(1).index.to_list()
list_to_csv_string(ans)

JavaScript


In [None]:
## 6. Which programming language is the second most popular among users who joined after 2020?
ans_df = pd.merge(users[['login','created_at']], repos[['login','language']], on='login')
ans = ans_df[ans_df['created_at'] > '2020-01-01']['language'].value_counts().head(2).index.to_list()
list_to_csv_string([ans[1]])

Python


In [None]:
## 7. Which language has the highest average number of stars per repository?
ans = repos.groupby('language')['stargazers_count'].mean().sort_values(ascending=False).head(1).index.to_list()
list_to_csv_string(ans)

Blade


In [76]:
## 8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.
ans_df = users[['login','followers','following']]
ans_df['leader_strength'] = ans_df['followers'] / (1 + ans_df['following'])
ans = ans_df.sort_values(by='leader_strength', ascending=False).head(5)['login'].to_list()
list_to_csv_string(ans)

huggingface,brunosimon,fabpot,lewagon,BartoszMilewski


In [None]:
## 9. What is the correlation between the number of followers and the number of public repositories among users in Paris?
users[['followers','public_repos']].corr()

Unnamed: 0,followers,public_repos
followers,1.0,0.084415
public_repos,0.084415,1.0


In [44]:
## 10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

model = LinearRegression()
X = users['public_repos'].values.reshape(-1, 1)
y = users['followers']
model.fit(X, y)
ans = model.coef_[0]
print(ans)

1.813231075703095


In [77]:
## 11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

ans_df = repos[['has_wiki','has_projects']]
ans_df['has_wiki'] = ans_df['has_wiki'].astype(int)
ans_df['has_projects'] = ans_df['has_projects'].astype(int)
ans_df.corr()

Unnamed: 0,has_wiki,has_projects
has_wiki,1.0,0.358866
has_projects,0.358866,1.0


In [34]:
## 12. Do hireable users follow more people than those who are not hireable?
## Average of following per user for hireable=true minus the average following for the rest (to 3 decimal places, e.g. 12.345 or -12.345)

ans_df = users[['hireable','following']]
ans_T = ans_df[ans_df['hireable']==True]['following'].mean()
ans_A = ans_df['following'].mean()
print(ans_T)
print(ans_A)

131.1237113402062
772.9554973821989


In [68]:
## 13. Some developers write long bios. Does that help them get more followers? What's the correlation of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)
## Regression slope of followers on bio word count (to 3 decimal places, e.g. 12.345 or -12.345)

users['bio_word_count'] = users['bio'].apply(lambda x: len(str(x).split()) if isinstance(x, str) else 0)
users_with_bios = users[users['bio_word_count'] > 0]

model = LinearRegression()
X = users_with_bios['bio_word_count'].values.reshape(-1, 1)
y = users_with_bios['followers']
model.fit(X, y)
ans = model.coef_[0]
print(f'{ans:.3f}')

-16.579


In [109]:
##  14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

ans_df = repos[['login','created_at']]
ans_df['created_at'] = pd.to_datetime(repos['created_at'])
ans_df['day_of_week'] = ans_df['created_at'].dt.dayofweek  # Monday=0, Sunday=6
weekend_repos = ans_df[ans_df['day_of_week'] >= 5]  # Select weekend repos (Saturday and Sunday)
ans = weekend_repos.groupby('login').count().sort_values(by='created_at',ascending=False).head(5).index.to_list()
list_to_csv_string(ans)

MysteriousSonOfGod,rishistyping,vincentbernat,gre,KOUISAmine


In [107]:
## 15. Do people who are hireable share their email addresses more often?

hireable_email_fraction = users[users['hireable'] == True]['email'].count() / len(users[users['hireable'] == True])

non_hireable_email_fraction = users[users['hireable'] != True]['email'].count() / len(users[users['hireable'] != True])

# Calculate the difference and print the result to 3 decimal places
print(f"{hireable_email_fraction - non_hireable_email_fraction:.3f}")

-0.045


In [108]:
## 16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically), users already loded

def most_common_surname(users_df):
    surnames = []
    for name in users_df['name']:
        if isinstance(name, str):
            name_parts = name.strip().split()
            if name_parts:
                surnames.append(name_parts[-1])

    if not surnames:
        return ""

    surname_counts = {}
    for surname in surnames:
        surname_counts[surname] = surname_counts.get(surname, 0) + 1

    max_count = 0
    most_common = []
    for surname, count in surname_counts.items():
        if count > max_count:
            most_common = [surname]
            max_count = count
        elif count == max_count:
            most_common.append(surname)

    most_common.sort()
    return ",".join(most_common)

print(most_common_surname(users))

Simon
