In [1]:
import pandas as pd


# The input is JSON Lines: every line of the file is parsed as one record
df = pd.read_json(path_or_buf='data/aws_repos.jsonl', lines=True)
df.head(n=3)

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,mirror_url,archived,disabled,open_issues_count,license,forks,open_issues,watchers,default_branch,score
0,61861755,MDEwOlJlcG9zaXRvcnk2MTg2MTc1NQ==,alexa-skills-kit-sdk-for-nodejs,alexa/alexa-skills-kit-sdk-for-nodejs,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-skills-kit-sdk-...,The Alexa Skills Kit SDK for Node.js helps you...,False,https://api.github.com/repos/alexa/alexa-skill...,...,,False,False,10,"{'key': 'apache-2.0', 'name': 'Apache License ...",662,10,2774,2.0.x,1
1,84138837,MDEwOlJlcG9zaXRvcnk4NDEzODgzNw==,alexa-cookbook,alexa/alexa-cookbook,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-cookbook,A series of sample code projects to be used fo...,False,https://api.github.com/repos/alexa/alexa-cookbook,...,,False,False,15,"{'key': 'apache-2.0', 'name': 'Apache License ...",899,15,1534,master,1
2,63275452,MDEwOlJlcG9zaXRvcnk2MzI3NTQ1Mg==,skill-sample-nodejs-fact,alexa/skill-sample-nodejs-fact,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/skill-sample-nodejs-fact,Build An Alexa Fact Skill,False,https://api.github.com/repos/alexa/skill-sampl...,...,,False,False,4,"{'key': 'apache-2.0', 'name': 'Apache License ...",1174,4,989,master,1


## Get READMEs for each repository

The [GitHub README API](https://developer.github.com/v3/repos/contents/#get-the-readme) makes it very easy to download the README of a project. Let's fetch the README of every Amazon open source project on GitHub.

In [None]:
import base64
import os
import time

import github


# Read the API token from the environment and stop immediately when it is missing or empty
GITHUB_PERSONAL_TOKEN = os.getenv('GITHUB_PERSONAL_TOKEN')
if GITHUB_PERSONAL_TOKEN is None or len(GITHUB_PERSONAL_TOKEN) == 0:
    raise Exception('GITHUB_PERSONAL_TOKEN undefined!')

# Client object used by the cells below for every GitHub request
g = github.Github(GITHUB_PERSONAL_TOKEN)

In [None]:
def get_readme(full_name):
    """Fetch the README of a repository and return it as text.

    Parameters
    ----------
    full_name : str
        Repository identifier in ``owner/repo`` form, passed to ``g.get_repo``.

    Returns
    -------
    str
        The README content: base64 -> bytes -> UTF-8 text. Bytes that are not
        valid UTF-8 are replaced with U+FFFD instead of raising, so a single
        oddly-encoded README cannot abort a long-running crawl.

    Notes
    -----
    Uses the module-level client ``g``. Exceptions raised by the client calls
    are not caught here.
    """
    repo = g.get_repo(full_name)
    readme_b64 = repo.get_readme().content
    return base64.b64decode(readme_b64).decode('utf-8', errors='replace')

# Fetch the README for every project full_name (owner/repo). `readmes` receives exactly one
# entry per row of `df`, in row order, so it can be attached as a column afterwards.
readmes = []
for full_name in df['full_name']:
    # Retry the same repository until it either succeeds or is reported as unknown;
    # a rate-limit error is the only outcome that loops again.
    while True:
        try:
            readmes.append(get_readme(full_name))
        except github.UnknownObjectException as e:
            # Nothing to fetch for this repository: record an empty README to keep row alignment
            print(e)
            readmes.append('')
        except github.RateLimitExceededException as e:
            print(e)

            # Sleep for an hour + a 2 minute safety margin, if we hit the 5,000/hr rate limit
            time.sleep(60 * 62)

            # Try this repository again, with the same error handling as the first attempt
            continue
        break

In [None]:
# Attach the READMEs positionally. Building the Series on df's own index avoids label
# misalignment (which would silently produce NaN) and raises if the lengths differ.
df['readme'] = pd.Series(readmes, index=df.index)

In [None]:
# Preview the first three rows
df.head(n=3)

## Store the Data for Hand Labeling of a Sample

Store the data as CSV for hand labeling to guide our Labeling Function development. Also store to Parquet.

In [22]:
import pyarrow


# Feature subset that is persisted; the same columns go to both output files
readme_columns = [
    'id', 'full_name', 'description', 'fork', 'forks_count', 'language', 'homepage',
    'open_issues_count', 'watchers', 'readme',
]
readme_subset = df[readme_columns]

# CSV copy, used for hand labeling
readme_subset.to_csv('data/aws_readmes.csv', index_label='index')

# Parquet copy of the same features, for safe keeping
readme_subset.to_parquet('data/aws_readmes.parquet', engine='pyarrow')

In [12]:
# # Temporary load from other machine
# import pyarrow

# readme_df = pd.read_parquet('data/aws_readmes.parquet', engine='pyarrow')

# # Join READMEs in and drop duplicate ID column
# df_join = df.join(readme_df, rsuffix='_readme')
# del df_join['id_readme']

# df = df_join

# df.head(3)

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,archived,disabled,open_issues_count,license,forks,open_issues,watchers,default_branch,score,readme
0,61861755,MDEwOlJlcG9zaXRvcnk2MTg2MTc1NQ==,alexa-skills-kit-sdk-for-nodejs,alexa/alexa-skills-kit-sdk-for-nodejs,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-skills-kit-sdk-...,The Alexa Skills Kit SDK for Node.js helps you...,False,https://api.github.com/repos/alexa/alexa-skill...,...,False,False,10,"{'key': 'apache-2.0', 'name': 'Apache License ...",662,10,2774,2.0.x,1,"<p align=""center"">\n <img src=""https://m.medi..."
1,84138837,MDEwOlJlcG9zaXRvcnk4NDEzODgzNw==,alexa-cookbook,alexa/alexa-cookbook,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-cookbook,A series of sample code projects to be used fo...,False,https://api.github.com/repos/alexa/alexa-cookbook,...,False,False,15,"{'key': 'apache-2.0', 'name': 'Apache License ...",899,15,1534,master,1,\n# Alexa Skill Building Cookbook\n\n<div styl...
2,63275452,MDEwOlJlcG9zaXRvcnk2MzI3NTQ1Mg==,skill-sample-nodejs-fact,alexa/skill-sample-nodejs-fact,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/skill-sample-nodejs-fact,Build An Alexa Fact Skill,False,https://api.github.com/repos/alexa/skill-sampl...,...,False,False,4,"{'key': 'apache-2.0', 'name': 'Apache License ...",1174,4,989,master,1,"# Build An Alexa Fact Skill\n<img src=""https:/..."


## Create spaCy Documents from READMEs

Set up the large English language model and have it merge multi-token named entities.

In [13]:
import spacy
from spacy.pipeline import merge_entities


# Enable a GPU if you have one
spacy.prefer_gpu()

SPACY_MODEL = 'en_core_web_lg'

# Load the spaCy English model, downloading it only when it is not installed yet
# (spacy.load raises OSError for a model it cannot find), so re-running this cell
# does not trigger a fresh download every time.
try:
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)

# Merge multi-token entities together
nlp.add_pipe(merge_entities)

nlp.pipeline


[93m    Linking successful[0m
    /home/rjurney/anaconda3/envs/amazon/lib/python3.7/site-packages/en_core_web_lg
    -->
    /home/rjurney/anaconda3/envs/amazon/lib/python3.7/site-packages/spacy/data/en_core_web_lg

    You can now load the model via spacy.load('en_core_web_lg')



[('tagger', <spacy.pipeline.Tagger at 0x7f1544a32a50>),
 ('parser', <spacy.pipeline.DependencyParser at 0x7f14fda1da10>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x7f14fda1dfb0>),
 ('merge_entities', <function spacy.pipeline.merge_entities>)]

In [14]:
# Run every README through the spaCy pipeline and keep the result next to the raw text
readme_docs = df['readme'].apply(nlp)
df['spacy'] = readme_docs
df.head(n=3)

## Load the Gold Labeled Data

Data was labeled via a [Google Sheet](https://docs.google.com/spreadsheets/d/1ULt0KxIdb5HUJCEMt_AmOuPbTvN1zg8UA_4RvjlVwXQ/edit?usp=sharing) and exported to CSV at [data/Amazon_Open_Source_Analysis_Gold.csv](data/Amazon_Open_Source_Analysis_Gold.csv).

### Submitting Corrections or Additions

If you feel any labels are wrong, first read the definitions in the README and comment on the sheet. You may also copy the Google Sheet and continue labeling yourself if you want to ensure the accuracy of this analysis.

In [60]:
# Read the full export of the labeling sheet (all 2,469 records), then keep only labeled rows
gold_sheet = pd.read_csv('data/Amazon_Open_Source_Analysis_Gold.csv')
gold_labeled = gold_sheet[gold_sheet['label'].notnull()]
print(f'Gold labeled records: {len(gold_labeled.index):,}')

# Columns present on both sides of the join; the sheet's copies get the `_gold` suffix
# and are dropped again right after joining
duplicate_columns = [
    'full_name_gold', 'url_gold', 'description_gold', 'fork_gold', 'forks_count_gold',
    'language_gold', 'homepage_gold', 'open_issues_count_gold', 'watchers_gold',
    'readme_gold',
]

# Inner join on the repository id, so only repositories with a gold label remain
df_gold = (
    df.set_index('id')
    .join(
        gold_labeled.set_index('id'),
        how='inner',
        on='id',
        rsuffix='_gold',
    )
    .drop(duplicate_columns, axis=1)
)

# Map the label codes used in the Google Sheet onto the numeric label constants
def label_to_number(x):
    """Translate a string label from the Google Sheet into its numeric constant.

    Returns the matching module-level constant (ABSTAIN, GENERAL, API, EDUCATION
    or DATASET), or None when `x` equals none of the known codes.
    """
    code_to_constant = (
        ('ABSTAIN', ABSTAIN),
        ('GENERAL', GENERAL),
        ('API', API),
        ('EDUCATION', EDUCATION),
        ('DATASET', DATASET),
    )
    for code, constant in code_to_constant:
        if x == code:
            return constant
    return None

# Preview the first three rows
df_gold.head(n=3)

Gold labeled records: 203


Unnamed: 0_level_0,node_id,name,full_name,private,owner,html_url,description,fork,url,forks_url,...,license,forks,open_issues,watchers,default_branch,score,readme,spacy,index,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
61861755,MDEwOlJlcG9zaXRvcnk2MTg2MTc1NQ==,alexa-skills-kit-sdk-for-nodejs,alexa/alexa-skills-kit-sdk-for-nodejs,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-skills-kit-sdk-...,The Alexa Skills Kit SDK for Node.js helps you...,False,https://api.github.com/repos/alexa/alexa-skill...,https://api.github.com/repos/alexa/alexa-skill...,...,"{'key': 'apache-2.0', 'name': 'Apache License ...",662,10,2774,2.0.x,1,"<p align=""center"">\n <img src=""https://m.medi...","(<, p, align=""center, "", >, \n , <, img, src=...",0,API
84138837,MDEwOlJlcG9zaXRvcnk4NDEzODgzNw==,alexa-cookbook,alexa/alexa-cookbook,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-cookbook,A series of sample code projects to be used fo...,False,https://api.github.com/repos/alexa/alexa-cookbook,https://api.github.com/repos/alexa/alexa-cookb...,...,"{'key': 'apache-2.0', 'name': 'Apache License ...",899,15,1534,master,1,\n# Alexa Skill Building Cookbook\n\n<div styl...,"(\n, #, Alexa, Skill, Building, Cookbook, \n\n...",1,EDUCATION
63275452,MDEwOlJlcG9zaXRvcnk2MzI3NTQ1Mg==,skill-sample-nodejs-fact,alexa/skill-sample-nodejs-fact,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/skill-sample-nodejs-fact,Build An Alexa Fact Skill,False,https://api.github.com/repos/alexa/skill-sampl...,https://api.github.com/repos/alexa/skill-sampl...,...,"{'key': 'apache-2.0', 'name': 'Apache License ...",1174,4,989,master,1,"# Build An Alexa Fact Skill\n<img src=""https:/...","(#, Build, An, Alexa, Fact, Skill, \n, <, img,...",2,EDUCATION


## Now Create a Random Forest Model using a Sparse Representation to Pick Keyword Label Functions

We will use the spaCy doc we created to lemmatize as we tokenize the words, giving us better representations for feature importances.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier


def lemmatize(doc):
    """Return the lemma of every token in `doc` as a list of strings.

    `doc` is any iterable of objects exposing a `lemma_` attribute (the notebook
    passes the spaCy documents stored in the `spacy` column). The list must be
    returned explicitly: a tokenizer that returns None gives the vectorizer
    nothing to work with.
    """
    return [token.lemma_ for token in doc]

# TF-IDF over the lemmas of each README. The inputs are spaCy documents rather than strings,
# so lowercasing is switched off: the default preprocessor calls `.lower()` on every input,
# which a spaCy document does not provide.
vectorizer = TfidfVectorizer(
    analyzer='word',
    lowercase=False,
    min_df=3,
    stop_words='english',
    tokenizer=lemmatize,
    token_pattern=None,
)

# The `label` column comes from the gold CSV joined into `df_gold`, so both the features
# and the labels are taken from `df_gold` to keep them row-aligned.
text_vec = vectorizer.fit_transform(
    df_gold['spacy']
)
labels = df_gold['label']

# Fixed seed so the importance ranking is reproducible between runs
left_clf = RandomForestClassifier(n_estimators=500, random_state=0)
left_clf.fit(text_vec, labels)

# Newer scikit-learn releases replaced `get_feature_names` with `get_feature_names_out`;
# use whichever one this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# (term, importance) pairs ranked from most to least important
sorted(zip(feature_names, left_clf.feature_importances_), key=lambda x: x[1], reverse=True)

In [23]:
%%html
<style>
table {float:left}
</style>

## Label Schema

The labels for this dataset are:

| Number | Code      | Description                      |
|--------|-----------|----------------------------------|
| -1     | ABSTAIN   | No vote, for Labeling Functions  |
| 0      | GENERAL   | A FOSS project of general appeal |
| 1      | API       | An API library for AWS           |
| 2      | EDUCATION | An educational library for AWS   |
| 3      | DATASET   | An open dataset by Amazon        |

In [26]:
%%html
<style>
table {float:left}
</style>

## Labeling Functions

Labeling functions each weakly label the data and need only be better than random. Snorkel's
unsupervised generative graphical model combines these weak labels into strong labels by 
looking at the overlap, conflict and coverage of each weak label set.

| Logic                         | Fields                               | Label       | 200 Sample Accuracy |
|-------------------------------|--------------------------------------|-------------|---------------------|
| If 'sdk' is in                | `full_name`, `description`, `readme` | `API`       |                     |
| If 'sample' is in             | `full_name`, `description`, `readme` | `EDUCATION` |                     |
| If 'dataset' is in            | `full_name`, `description`, `readme` | `DATASET`   |                     |
| If 'demonstrate'/'demo' is in | `full_name`, `description`, `readme` | `EDUCATION` |                     |
| If 'walkthrough' is in        | `full_name`, `description`, `readme` | `EDUCATION` |                     |
| If 'skill' is in              | `full_name`, `description`           | `EDUCATION` |                     |
| If 'kit' is in                | `full_name`, `description`           | `EDUCATION` |                     |
| If 'toolbox' is in            | `description`                        | `GENERAL`   |                     |
| If 'extension' is in          | `description`                        | `API`       |                     |
