## Preparation

In [4]:
import pandas as pd
import numpy as np
import re

### Identify AI-related topics by looking at the topics of the top 2000 projects

In [23]:
# Keyword seeds used to flag a topic as AI-related.
# `is_known` tests each key as a *substring* of the topic name, so short keys
# such as 'ai', 'ml', 'cv' or 'att' also match unrelated topics that merely
# contain those letters. Only the keys are iterated by `is_known`; every value
# is the constant 1.
seed = {
    'neural': 1,
    'ml': 1,
    'learning': 1,
    'gpt': 1,
    'llm': 1,
    'nlp': 1,
    'assistant': 1,
    'cnn': 1,
    'rnn': 1,
    'gan': 1,
    'model': 1,
    'transformer': 1,
    'recognition': 1,
    'detection': 1,
    'ai': 1,
    'diffusion': 1,
    'yolo': 1,
    'classifier': 1,
    'classification': 1,
    'artificial': 1,
    'bert': 1,
    'torch': 1,
    'jax': 1,
    'tensor': 1,
    'seq2seq': 1,
    'agent': 1,
    'bot': 1,
    'gpu': 1,
    'img': 1,
    'image': 1,
    'film': 1,
    'face': 1,
    'music': 1,
    'caffe': 1,
    'mxnet': 1,
    'vector': 1,
    'scikit': 1,
    'deep': 1,
    'natural': 1,
    'ocr': 1,
    'cv': 1,
    'llama': 1,
    'bard': 1,
    'claude': 1,
    'tpu': 1,
    'parameters': 1,
    'regression': 1,
    'intelligent': 1,
    'intelligence': 1,
    'att': 1,
    'adaboost': 1,
    'age-prediction': 1,
    'alpaca': 1,
    'bayesian': 1,
    'bias': 1,
    'causal': 1,
    'chatglm': 1,
    'computer-vision': 1,
    'decision-tree': 1,
    'embeddings': 1,
    'factorization': 1,
    'game-theory': 1,
    'cuda': 1,
}

def load_main(file):
    """Read the main repository CSV into a de-duplicated, typed DataFrame.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame with ``pushed_at``, ``created_at`` and ``updated_at`` parsed
        as dates, exact duplicate rows removed (index renumbered from 0), and
        ``owner_type``, ``default_branch``, ``language`` and ``lang`` stored as
        categoricals.
    """
    date_columns = ["pushed_at", "created_at", "updated_at"]
    category_columns = ['owner_type', 'default_branch', 'language', 'lang']

    frame = pd.read_csv(file, parse_dates=date_columns)
    deduplicated = frame.drop_duplicates(ignore_index=True)
    return deduplicated.astype({column: 'category' for column in category_columns})

def is_known(topic, keywords=None):
    """Return True if ``topic`` contains any keyword as a substring.

    Matching is plain substring containment, so a short keyword such as
    ``'ai'`` also matches topics like ``'aaai'``.

    Args:
        topic: Topic name (string) to test.
        keywords: Optional iterable of keyword strings. When omitted, the keys
            of the module-level ``seed`` dict are used.

    Returns:
        bool: True when at least one keyword occurs inside ``topic``.
    """
    if keywords is None:
        keywords = seed
    return any(keyword in topic for keyword in keywords)


def _split_by_topics(row):
    topics = row['topics']
    row['topic'] = [m.group(1) for m in re.finditer(r"'([-\w]+)'", topics)]
    if len(row['topic']) == 0: row['topic'] = 'NOTOPIC'
    row['primary_topic'] = row['topic'][0]    
    return row

def _walk_dataframe(df, func):
    df = df.apply(func, axis=1)
    return df

def expand(df):
    """Return ``df`` with one row per (original row, topic) pair.

    Each row is first passed through ``_split_by_topics`` to obtain its
    ``topic`` list, then the frame is exploded on that column.
    """
    with_topic_lists = _walk_dataframe(df, _split_by_topics)
    return with_topic_lists.explode(['topic'])

# Load the per-commit summary and the repository metadata.
df_commit = pd.read_csv(
    "commits-sum.csv",
    parse_dates=["author_date"],
    dtype={"verified": "int8", "branch": "category"},
)
df_main = load_main("main.csv")

# Rank repositories by number of distinct commit authors, keep the 5000
# largest, attach their metadata, then expand to one row per (repo, topic).
df_topics = (
    df_commit
    .groupby(["full_name"], as_index=False)
    .agg(contributors=("author_name", "nunique"))
    .sort_values("contributors", ascending=False)
    .head(5000)
    .merge(df_main, how="inner", on="full_name")
)
df_topics = expand(df_topics)

# Number of rows carrying each topic, most frequent first.
df_top = (
    df_topics
    .groupby("topic", as_index=False)
    .agg(cnt=("full_name", "count"))
    .sort_values("cnt", ascending=False)
)

In [27]:
# Summary statistics of the per-topic counts (`cnt`), with extra upper
# percentiles requested to show how the counts are distributed in the tail.
df_top.describe(percentiles=[.75, .90, .95, .97, .98, .99])

Unnamed: 0,cnt
count,11143.0
mean,3.899578
std,38.588373
min,1.0
50%,1.0
75%,2.0
90%,5.0
95%,9.0
97%,15.0
98%,21.0


In [30]:
# Write the first 250 rows of df_top (sorted by cnt, descending) to CSV
# without the index — presumably for manual picking, going by the file name.
df_top.head(250).to_csv("to-pick.csv", index=False)

In [11]:
    
# Print every distinct topic that matches a seed keyword, in sorted order.
# Output format is "<match number>-<position in sorted list>: <topic>".
# Invert the condition below to list the topics that are NOT matched instead.
topics = sorted(df_topics['topic'].unique())
j = 0  # running number of matched topics
for i, topic in enumerate(topics):
    if not is_known(topic):
        continue
    print(f"{j}-{i}: {topic}")
    j += 1

0-0: 100-days-of-ml-code
1-2: 100daysofmlcode
2-11: 3d-computer-vision
3-25: aaai
4-35: accuracy-models
5-43: action-recognition
6-45: active-learning
7-60: adversarial-attack-and-defense
8-61: adversarial-attacks
9-64: adversarial-learning
10-65: adversarial-machine-learning
11-68: adversarial-training
12-70: aerial-imagery
13-74: age-prediction
14-75: agent
15-76: agent-based
16-77: agent-llm
17-78: agentgpt
18-79: agents
19-85: ai
20-86: ai-agent
21-87: ai-agents
22-88: ai-aimbot
23-89: ai-alignment
24-90: ai-api
25-91: ai-application-deployment
26-92: ai-application-development
27-93: ai-applications
28-94: ai-art
29-95: ai-as-a-service
30-96: ai-assistant
31-97: ai-assistants
32-98: ai-bot
33-99: ai-challenges
34-100: ai-commit
35-101: ai-commits
36-102: ai-experiments
37-103: ai-for-science
38-104: ai-framework
39-105: ai-functions
40-106: ai-ide
41-107: ai-inference
42-108: ai-machine-learning
43-109: ai-models
44-110: ai-native
45-111: ai-pipelines
46-112: ai-powered-search
47-