In [61]:
!pip install sf-hamilton-sdk

In [62]:
import requests
%load_ext hamilton.plugins.jupyter_magic

In [138]:
%%cell_to_module -m enricher --display

import pandas as pd
from hamilton.htypes import Parallelizable, Collect
import time
import math
import requests
import json
import os

def base_data(base_data_path: str) -> pd.DataFrame:
    """Load the scraped user CSV and keep only the rows that still need an email.

    :param base_data_path: path to a CSV file with at least ``username`` and ``email`` columns.
    :return: the rows that have a usable username and a missing ``email``. ``username`` is
        reduced to its first whitespace-separated token.
    """
    _raw = pd.read_csv(base_data_path)
    # Copy the filtered slice so the column assignment below writes to an independent frame
    # instead of a possible view of `_raw` (no chained-assignment warning, no lost write).
    _df = _raw[_raw["username"].notna()].copy()
    # Keep only the first token. A whitespace-only username has no tokens; it becomes None
    # (and is dropped below) instead of raising IndexError.
    _df["username"] = _df["username"].map(lambda name: (str(name).split() or [None])[0])
    _df = _df[_df["username"].notna()]
    return _df[_df["email"].isna()]  # only users whose email is still unknown

def headers(access_token: str) -> dict:
    """Build the HTTP headers sent with each GitHub API request.

    :param access_token: GitHub token placed in the ``Authorization`` header.
    :return: dict with the ``Authorization`` and ``Accept`` headers.
    """
    request_headers = {}
    request_headers['Authorization'] = f'token {access_token}'
    request_headers["Accept"] = "application/vnd.github.v3.star+json"
    return request_headers

def username(base_data: pd.DataFrame, directory: str) -> Parallelizable[str]:
    """Yield every username that still needs a GitHub lookup.

    A row is skipped when it already has an email, or when ``<directory>/<username>.json``
    exists (a previous run already cached a result for that user).

    :param base_data: frame with ``username`` and ``email`` columns.
    :param directory: directory holding the per-user JSON cache files.
    :return: generator of usernames to process; prints how many were skipped due to the cache.
    """
    skipped = 0
    for _username, _email in zip(base_data["username"], base_data["email"]):
        # pd.isna copes with NaN, None and strings alike; math.isnan raised TypeError as soon
        # as a row carried a string email.
        if not pd.isna(_email):
            continue
        if os.path.exists(os.path.join(directory, f"{_username}.json")):
            skipped += 1
            continue
        yield _username
    print(f"skipped {skipped}")

def user_emails(username: str, headers: dict, directory: str, max_repos: int = 5, request_timeout: float = 30.0,) -> tuple[str,set[str]]:
    """Collect the commit-author emails a GitHub user used in their own repositories.

    Looks at up to ``max_repos`` repositories of the user, reads each one's commit list and
    keeps the git author email of every commit whose GitHub author login is ``username``.
    When the repository list was usable, the result is also written to
    ``<directory>/<username>.json``.

    :param username: GitHub login to look up.
    :param headers: HTTP headers (including authorization) sent with every request.
    :param directory: directory that receives the ``<username>.json`` cache file.
    :param max_repos: maximum number of repositories to inspect.
    :param request_timeout: seconds to wait for each HTTP response before giving up.
    :return: ``(username, emails)``; ``emails`` is empty when nothing was found.
    """
    repos_url = f"https://api.github.com/users/{username}/repos"
    # Without a timeout a stalled connection would block the whole run indefinitely.
    repos_response = requests.get(repos_url, headers=headers, timeout=request_timeout)
    repos = repos_response.json()
    time.sleep(1)  # throttle between API calls
    if not repos:
        # Nothing is written to the cache here, so this user is looked up again next run.
        return username, set()
    if not isinstance(repos, list):
        # A non-list payload is an API error object (e.g. unknown user, rate limit), not an
        # empty repository list. It is not cached either.
        detail = repos.get("message") if isinstance(repos, dict) else repos
        print(f"{username}: unexpected repos payload -- skipping ({detail})")
        return username, set()

    found_emails = set()
    expected_login = username.casefold()  # GitHub logins are case-insensitive
    for repo in repos[:max_repos]:
        repo_name = repo["name"]
        commits_url = f"https://api.github.com/repos/{username}/{repo_name}/commits"
        commits_response = requests.get(commits_url, headers=headers, timeout=request_timeout)
        time.sleep(1)
        commits = commits_response.json()

        if not commits:
            continue
        if not isinstance(commits, list):
            print(f"{username}: dict for commits -- skipping")
            continue
        for commit in commits:
            if not isinstance(commit, dict):
                continue
            # "author" is the GitHub account attached to the commit; it can be null.
            author = commit.get("author") or {}
            login = author.get("login")
            if not isinstance(login, str) or login.casefold() != expected_login:
                continue
            email = ((commit.get("commit") or {}).get("author") or {}).get("email")
            if email:
                found_emails.add(email)
    with open(os.path.join(directory, f"{username}.json"), "w") as f:
        # Sorted so the cache file content is deterministic.
        json.dump({"username": username, "emails": sorted(found_emails)}, f)
    return username, found_emails



def emails_df(user_emails: Collect[tuple[str, set[str]]]) -> pd.DataFrame:
    """Assemble the collected ``(username, emails)`` results into a single frame.

    :param user_emails: collected ``(username, emails)`` pairs, one per processed user.
    :return: frame with the columns ``username`` and ``emails``.
    """
    rows = list(user_emails)
    # Report only a count: printing the collection itself wrote every user's email
    # addresses into the notebook output.
    print(f"collected email results for {len(rows)} users")
    return pd.DataFrame(rows, columns=["username", "emails"])


def cached_emails_df(directory: str) -> pd.DataFrame:
    """Load every cached per-user result found in ``directory``.

    Each regular file whose name ends in ``.json`` is parsed and must contain the keys
    ``username`` and ``emails``.

    :param directory: directory holding the ``<username>.json`` cache files.
    :return: frame with the columns ``username`` and ``emails``, one row per cache file.
    """
    json_paths = []
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        if entry.endswith(".json") and os.path.isfile(entry_path):
            json_paths.append(entry_path)

    rows = []
    for json_path in json_paths:
        with open(json_path, 'r') as handle:
            payload = json.load(handle)
            rows.append((payload["username"], payload["emails"]))

    return pd.DataFrame(rows, columns=["username", "emails"])

def enriched_df(emails_df: pd.DataFrame, base_data: pd.DataFrame, cached_emails_df: pd.DataFrame) -> pd.DataFrame:
    """Attach the discovered emails to the base data.

    Freshly fetched and cached results are combined, users without any email are dropped,
    and the remainder is outer-merged onto ``base_data`` by ``username``.

    :param emails_df: freshly fetched results with ``username`` and ``emails`` columns.
    :param base_data: base rows with a ``username`` column.
    :param cached_emails_df: cached results with ``username`` and ``emails`` columns.
    :return: ``base_data`` columns plus ``emails``; ``emails`` is missing where nothing was found.
    """
    _emails_df = pd.concat([emails_df, cached_emails_df], ignore_index=True)
    # Keep only users we actually found at least one email for.
    _has_emails = _emails_df["emails"].map(
        lambda found: isinstance(found, (list, set, tuple)) and len(found) > 0
    ).astype(bool)
    # A user can appear both in the fresh and in the cached results; keep the first (fresh) row.
    _emails_df = _emails_df[_has_emails].drop_duplicates(subset="username", keep="first")
    # merge matches on the `username` column of both frames. DataFrame.join(on=...) would
    # match against the other frame's index and fail on the overlapping `username` column.
    return base_data.merge(_emails_df, on="username", how="outer")
    

In [139]:
from hamilton import driver
from hamilton.execution import executors
from hamilton.io.materialization import to
from hamilton_sdk import adapters

# Tracking adapter from the Hamilton SDK; the values below identify this run.
tracker = adapters.HamiltonTracker(
   project_id=43,  # modify this as needed
   username="elijah@dagworks.io",
   dag_name="enrich_df",
   tags={"environment": "DEV", "team": "MY_TEAM", "version": "X"}
)

# Build the driver over the `enricher` module, with a synchronous local task executor,
# a CSV materializer and the tracker attached.
# NOTE(review): dynamic execution is presumably enabled because `enricher` uses
# Parallelizable/Collect -- confirm against the Hamilton documentation.
dr = (
    driver.Builder()
    .enable_dynamic_execution(allow_experimental_mode=True)
    .with_modules(enricher)
    .with_remote_executor(executors.SynchronousLocalTaskExecutor())
    # The materializer "saver" depends on `enriched_df` and targets enriched1.csv.
    .with_materializers(to.csv(id="saver", dependencies=["enriched_df"], path="enriched1.csv"))
    .with_adapters(tracker)
    .build()
)


In [140]:
import os

# Run-time inputs for the `enricher` DAG.
# The GitHub token is read from the GITHUB_TOKEN environment variable so that it never
# lives in the notebook.
# NOTE(review): a personal access token used to be hard-coded in this cell; treat that
# token as leaked and revoke it.
# The file locations can be overridden through ENRICHER_BASE_DATA_PATH and
# ENRICHER_DIRECTORY; the fallbacks are the paths this notebook was written with.
inputs = {
    "base_data_path": os.environ.get(
        "ENRICHER_BASE_DATA_PATH", "/Users/stefankrawczyk/scraping/stitchfixgithub/results.csv"
    ),
    "access_token": os.environ.get("GITHUB_TOKEN", ""),
    "max_repos": 4,
    "directory": os.environ.get("ENRICHER_DIRECTORY", "/Users/stefankrawczyk/scraping/stitchfixgithub/"),
}
if not inputs["access_token"]:
    print("GITHUB_TOKEN is not set -- export it before executing the DAG.")

In [141]:

# Execute the DAG, requesting `enriched_df` and the "saver" materializer (the CSV writer
# registered on the driver), with `inputs` as the run-time inputs.
result = dr.execute(["enriched_df", "saver"], inputs=inputs)

In [154]:
# Display the merged frame `e`.
# NOTE(review): in the visible notebook `e` is first assigned in a later cell (execution
# count 170), so this cell fails on a fresh top-to-bottom run.
e

In [170]:
# Rebuild the enrichment outside the DAG: cached lookup results merged onto the base data.
c = cached_emails_df(inputs["directory"])  # (username, emails) rows read from the cached JSON files
d = base_data(inputs["base_data_path"])  # base rows loaded and filtered from the CSV
e = d.merge(c, on="username", how="outer")  # outer merge keeps usernames present on either side
def choose_email(row):
    """Pick the single address to use for one merged row.

    ``row`` must expose ``username``, ``email`` and ``emails``. An existing ``email`` string
    wins. Otherwise the addresses in ``emails`` are considered, ignoring any that contain
    "github.com": a gmail address is preferred (the last one listed, if several), else the
    address sharing the most distinct characters with the username (first one on ties).

    :param row: object with ``username``, ``email`` and ``emails`` attributes.
    :return: the chosen address, or None when there is no usable candidate.
    """
    # A known address is a non-empty string; missing values arrive as NaN/None.
    # (math.isnan raised TypeError for strings, so this branch could never be reached.)
    if isinstance(row.email, str) and row.email:
        return row.email
    if not isinstance(row.emails, list):
        return None

    gmail_address = ""
    candidates = []
    for email in row.emails:
        if "github.com" in email:
            continue
        if email.endswith("@gmail.com"):
            gmail_address = email
        # Score = number of distinct characters shared by the username and the address.
        candidates.append((email, len(set(row.username).intersection(set(email)))))

    if not candidates:
        return None
    if gmail_address:
        return gmail_address
    # max() returns the first best-scoring candidate, like a stable descending sort would.
    return max(candidates, key=lambda pair: pair[1])[0]
# Save the merged frame first, before the `email_to_use` column is added.
e.to_csv(inputs["directory"]+"/enriched_results_raw.csv")
# Then choose one address per row (choose_email receives each row as a Series).
e["email_to_use"] = e[["username", "email", "emails"]].apply(choose_email, axis=1)

In [172]:
# Number of rows with a non-null `email_to_use`.
e.email_to_use.count()

In [178]:
# Write only the rows that have a non-null `email_to_use` to valid_emails.csv.
e[~e.email_to_use.isna()].to_csv(inputs["directory"] + "/valid_emails.csv")

In [158]:
# Scratch check of the overlap score used in choose_email: the distinct characters a
# candidate address shares with the username.
set('huyenntkvn@gmail.com').intersection(set('chiphuyen'))

In [159]:
# Same scratch check for a second candidate address of the same username.
set('huyenn@stanford.edu').intersection(set('chiphuyen'))

In [161]:
# Scratch check: sorting (address, score) pairs by score, descending, puts the
# higher-scoring address first.
sorted([('huyenntkvn@gmail.com',7), ('huyenn@stanford.edu',5)], key=lambda x: x[1], reverse=True)

In [44]:
import requests
import os

max_repos = 5  # upper bound on the repositories inspected per user
# The GitHub token is read from the GITHUB_TOKEN environment variable so that it never
# lives in the notebook.
# NOTE(review): a personal access token used to be hard-coded here; treat that token as
# leaked and revoke it.
access_token = os.environ.get("GITHUB_TOKEN", "")
if not access_token:
    print("GITHUB_TOKEN is not set -- export it before calling the GitHub API.")
headers = {
        'Authorization': f'token {access_token}',
        "Accept": "application/vnd.github.v3.star+json",
    }
def user_emails(username, pause_seconds=2):
    """Collect the commit-author emails a GitHub user used in their own repositories.

    Reads the notebook-level globals ``headers``, ``max_repos`` and ``requests``. At most
    ``max_repos`` repositories are inspected; for each one, the git author email of every
    commit whose GitHub author login equals ``username`` is kept.

    :param username: GitHub login to look up.
    :param pause_seconds: seconds to sleep after each repository, to throttle API calls.
    :return: set of emails, or None when the user has no repositories or the repository
        list could not be read.
    """
    import time  # kept local: this cell does not import `time` at the top

    # Step 1: Get the list of repositories
    repos_url = f"https://api.github.com/users/{username}/repos"
    # A timeout keeps a stalled connection from blocking the notebook indefinitely.
    repos = requests.get(repos_url, headers=headers, timeout=30).json()
    if not repos:
        return None
    if not isinstance(repos, list):
        # API errors come back as a JSON object; iterating it would fail on repo['name'].
        print("unexpected repos payload -- skipping")
        return None

    # Step 2: Collect the author emails from each repository's commit list
    found_emails = set()
    # Slicing enforces the limit exactly; counting after each repo let one extra through,
    # and repos with an error response were not counted at all.
    for repo in repos[:max_repos]:
        repo_name = repo['name']
        commits_url = f"https://api.github.com/repos/{username}/{repo_name}/commits"
        commits = requests.get(commits_url, headers=headers, timeout=30).json()

        if commits and isinstance(commits, dict):
            print("dict for commits -- skipping")
        elif commits:
            for commit in commits:
                if not commit:
                    continue
                if isinstance(commit, str):
                    print(commit)
                    continue
                author = commit.get("author", {})
                if not author or author.get("login", None) != username:
                    continue
                found_emails.add(commit["commit"]["author"]["email"])
        # Pause after every repository, including the skipped ones.
        time.sleep(pause_seconds)

    return found_emails

# Example usage: look up a single GitHub account. The commented-out names are
# alternative accounts to swap in.
# _username = "amcclosky"
# _username = "saareliad"
# _username = "NeroHin"
# _username = "perrygeo"
_username = "benhadad"
_user_emails = user_emails(_username)
print(_user_emails)
# An empty set and None both fall through to the "nothing found" message.
print(
    f"{_username} emails: {_user_emails}"
    if _user_emails
    else "No public commits found for this user."
)


In [3]:
# Fetch the commit list of one repository to inspect the response shape.
# The request is sent without headers, i.e. unauthenticated.
repo_name = "hello-run-fastapi"
username = "amcclosky"
commits_url = f"https://api.github.com/repos/{username}/{repo_name}/commits"
commits_response = requests.get(commits_url)
commits = commits_response.json()  # parsed JSON body, indexed like a list in the cells below

In [7]:
# Number of entries in the parsed commits response.
len(commits)

In [8]:
# Inspect the first entry of the response.
commits[0]

In [9]:
# Inspect the second entry of the response.
commits[1]

In [10]:
# Inspect the third entry of the response.
commits[2]

In [11]:
# Check whether the first two commits carry the same author `node_id`.
commits[0]["author"]["node_id"] == commits[1]["author"]["node_id"]

In [12]:
# Fetch the commit list of a repository under a different account.
# The request is sent without headers, i.e. unauthenticated.
repo_name = "property-app"
username = "almostprod"
commits_url = f"https://api.github.com/repos/{username}/{repo_name}/commits"
commits_response = requests.get(commits_url)
commits = commits_response.json()  # replaces the `commits` fetched by the earlier cell

In [14]:
# Inspect the first entry of this repository's response.
commits[0]

In [15]:
# The git author email sits under commit -> author -> email.
commits[0]["commit"]["author"]["email"]

In [19]:
# Pretty-print the first entry to make the nested structure readable.
import pprint
pprint.pprint(commits[0])

In [22]:
# Switch back to the first repository and fetch its commit list again.
# The request is sent without headers, i.e. unauthenticated.
repo_name = "hello-run-fastapi"
username = "amcclosky"
commits_url = f"https://api.github.com/repos/{username}/{repo_name}/commits"
commits_response = requests.get(commits_url)
commits = commits_response.json()  # `commits` and `username` are read by the email loop in the next cell

In [29]:
# Author emails of the commits whose GitHub author login equals `username`.
# Empty entries and commits without a linked GitHub author are ignored.
emails = {
    entry["commit"]["author"]["email"]
    for entry in commits
    if entry
    and entry.get("author", {})
    and entry.get("author", {}).get("login", None) == username
}

In [30]:
# Show the collected emails.
emails

In [116]:
# (username, emails) pairs whose email set is empty.
# NOTE(review): presumably pasted from a previous run's output -- confirm provenance.
no_repos = [('rinsoft-sf', set()), ('lss1017', set()), ('wbr-certi', set()), ('jacopone', set()), ('nbilenko-sf', set()), ('gibson552', set()), ('nulluint', set()), ('akhorshidiarz', set()), ('rasol0909', set()), ('philippatterson', set()), ('Doppp', set()), ('dansmarandaspire', set()), ('ian-andriot', set()), ('jeschultz12', set()), ('andywang416', set()), ('BSeboo', set()), ('lipsa-faction', set()), ('teofilak-michal', set()), ('kurtosis-dd', set()), ('nwt-patrick', set()), ('nickvazztau', set()), ('stackbeard', set()), ('submartingales', set()), ('DominiqueLade', set()), ('fabrikant-alex', set()), ('vietphan-billidentity', set()), ('kylashk', set()), ('dereckwpaul', set()), ('riteley', set()), ('ecolson', set()), ('arcticgreen', set()), ('meowmeow69420', set())]


In [117]:
# Number of entries in `no_repos`.
len(no_repos)