In [1]:
import pandas as pd


df = pd.read_json('data/aws_repos.jsonl', lines=True)
df.head(3)

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,mirror_url,archived,disabled,open_issues_count,license,forks,open_issues,watchers,default_branch,score
0,61861755,MDEwOlJlcG9zaXRvcnk2MTg2MTc1NQ==,alexa-skills-kit-sdk-for-nodejs,alexa/alexa-skills-kit-sdk-for-nodejs,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-skills-kit-sdk-...,The Alexa Skills Kit SDK for Node.js helps you...,False,https://api.github.com/repos/alexa/alexa-skill...,...,,False,False,10,"{'key': 'apache-2.0', 'name': 'Apache License ...",662,10,2774,2.0.x,1
1,84138837,MDEwOlJlcG9zaXRvcnk4NDEzODgzNw==,alexa-cookbook,alexa/alexa-cookbook,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-cookbook,A series of sample code projects to be used fo...,False,https://api.github.com/repos/alexa/alexa-cookbook,...,,False,False,15,"{'key': 'apache-2.0', 'name': 'Apache License ...",899,15,1534,master,1
2,63275452,MDEwOlJlcG9zaXRvcnk2MzI3NTQ1Mg==,skill-sample-nodejs-fact,alexa/skill-sample-nodejs-fact,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/skill-sample-nodejs-fact,Build An Alexa Fact Skill,False,https://api.github.com/repos/alexa/skill-sampl...,...,,False,False,4,"{'key': 'apache-2.0', 'name': 'Apache License ...",1174,4,989,master,1


## Get READMEs for each repository

The [Github README API](https://developer.github.com/v3/repos/contents/#get-the-readme) makes it very easy to download the README of a project. Let's fetch the README of every Amazon open source project on Github.

In [None]:
import base64
import os
import time

import github


GITHUB_PERSONAL_TOKEN = os.environ.get('GITHUB_PERSONAL_TOKEN')
if not GITHUB_PERSONAL_TOKEN:
    raise Exception('GITHUB_PERSONAL_TOKEN undefined!')

g = github.Github(GITHUB_PERSONAL_TOKEN)

In [None]:
def get_readme(full_name):
    """Get the repo for full_name, get the readme, then decode base64->bytes->utf8"""
    repo = g.get_repo(full_name)
    readme_bin = repo.get_readme().content
    readme = base64.b64decode(readme_bin).decode()
    return readme

# Given a project full_name (owner/repo), fetch the README and return as a UTF-8 string
readmes = []
for i, full_name in df['full_name'].iteritems():
    try:
        readme = get_readme(full_name)
        readmes.append(readme)
    except github.UnknownObjectException as e:
        print(e)
        readmes.append('')
    except github.RateLimitExceededException as e:
        print(e)
        
        # Sleep for an hour + a 2 minute safety margin, if we hit the 5,000/hr rate limit
        time.sleep(60 * 62)
        
        # Get the repo again, now that we aren't rate limited
        readme = get_readme(full_name)
        readmes.append(readme)

In [None]:
df['readme'] = pd.Series(readmes)

In [None]:
df.head(3)

## Store the Data for Hand Labeling of a Sample

Store the data as CSV for hand labeling to guide our Labeling Function development. Also store to Parquet.

In [22]:
import pyarrow


# Save to CSV for hand labeling
df[[
    'id', 'full_name', 'description', 'fork', 'forks_count', 'language', 'homepage',
    'open_issues_count', 'watchers', 'readme', 
]].to_csv(
    'data/aws_readmes.csv',
    index_label='index',
)

# Save relevant features to Parquet for safe keeping
df[['id', 'full_name', 'description', 'fork', 'forks_count', 'language', 'homepage',
    'open_issues_count', 'watchers', 'readme',
]].to_parquet(
    'data/aws_readmes.parquet',
    engine='pyarrow'
)

In [12]:
# # Temporary load from other machine
# import pyarrow

# readme_df = pd.read_parquet('data/aws_readmes.parquet', engine='pyarrow')

# # Join READMEs in and drop duplicate ID column
# df_join = df.join(readme_df, rsuffix='_readme')
# del df_join['id_readme']

# df = df_join

# df.head(3)

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,archived,disabled,open_issues_count,license,forks,open_issues,watchers,default_branch,score,readme
0,61861755,MDEwOlJlcG9zaXRvcnk2MTg2MTc1NQ==,alexa-skills-kit-sdk-for-nodejs,alexa/alexa-skills-kit-sdk-for-nodejs,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-skills-kit-sdk-...,The Alexa Skills Kit SDK for Node.js helps you...,False,https://api.github.com/repos/alexa/alexa-skill...,...,False,False,10,"{'key': 'apache-2.0', 'name': 'Apache License ...",662,10,2774,2.0.x,1,"<p align=""center"">\n <img src=""https://m.medi..."
1,84138837,MDEwOlJlcG9zaXRvcnk4NDEzODgzNw==,alexa-cookbook,alexa/alexa-cookbook,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-cookbook,A series of sample code projects to be used fo...,False,https://api.github.com/repos/alexa/alexa-cookbook,...,False,False,15,"{'key': 'apache-2.0', 'name': 'Apache License ...",899,15,1534,master,1,\n# Alexa Skill Building Cookbook\n\n<div styl...
2,63275452,MDEwOlJlcG9zaXRvcnk2MzI3NTQ1Mg==,skill-sample-nodejs-fact,alexa/skill-sample-nodejs-fact,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/skill-sample-nodejs-fact,Build An Alexa Fact Skill,False,https://api.github.com/repos/alexa/skill-sampl...,...,False,False,4,"{'key': 'apache-2.0', 'name': 'Apache License ...",1174,4,989,master,1,"# Build An Alexa Fact Skill\n<img src=""https:/..."


## Create spaCy Documents from READMEs

Setup the large english language model and have it merge multi-token named entities.

In [13]:
import spacy
from spacy.pipeline import merge_entities


# Enable a GPU if you have one
spacy.prefer_gpu()

# Download the spaCy english model
spacy.cli.download('en_core_web_lg')
nlp = spacy.load("en_core_web_lg")

# Merge multi-token entities together
nlp.add_pipe(merge_entities)

nlp.pipeline


[93m    Linking successful[0m
    /home/rjurney/anaconda3/envs/amazon/lib/python3.7/site-packages/en_core_web_lg
    -->
    /home/rjurney/anaconda3/envs/amazon/lib/python3.7/site-packages/spacy/data/en_core_web_lg

    You can now load the model via spacy.load('en_core_web_lg')



[('tagger', <spacy.pipeline.Tagger at 0x7f1544a32a50>),
 ('parser', <spacy.pipeline.DependencyParser at 0x7f14fda1da10>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x7f14fda1dfb0>),
 ('merge_entities', <function spacy.pipeline.merge_entities>)]

In [14]:
df['spacy'] = df['readme'].apply(nlp)
df.head(3)

## Load the Gold Labeled Data

Data was labeled via a [Google Sheet](https://docs.google.com/spreadsheets/d/1ULt0KxIdb5HUJCEMt_AmOuPbTvN1zg8UA_4RvjlVwXQ/edit?usp=sharing) and exported to CSV at [data/Amazon_Open_Source_Analysis_Gold.csv](data/Amazon_Open_Source_Analysis_Gold.csv).

### Submitting Corrections or Additions

If you feel any labels are wrong, first read the definitions in the README and comment on the sheet. You may also copy the Google Sheet and continue labeling yourself if you want to ensure the accuracy of this analysis.

In [None]:
# Load all 2,469 records and then filter out the unlabeled ones
df_gold = pd.read_csv('data/Amazon_Open_Source_Analysis_Gold.csv')

df_gold = df_gold[df_gold['label'].notnull()]
print(f'Gold labeled records: {len(df_gold.index):,}')

df_gold = df.set_index('id').join(
    df_gold.set_index('id'),
    how='inner',
    on='id',
    rsuffix='_gold'
)

# Drop duplicate columns
df_gold = df_gold.drop(
    [
        'full_name_gold','url_gold','description_gold','fork_gold','forks_count_gold',
        'language_gold','homepage_gold','open_issues_count_gold','watchers_gold', 
        'readme_gold',
        
    ],
    axis=1
)

### Defining Label Schema

The labels for this dataset are:

| Number | Code      | Description                      |
|--------|-----------|----------------------------------|
| -1     | ABSTAIN   | No vote, for Labeling Functions  |
| 0      | GENERAL   | A FOSS project of general appeal |
| 1      | API       | An API library for AWS           |
| 2      | EDUCATION | An educational library for AWS   |
| 3      | DATASET   | An open dataset by Amazon        |

In [63]:
ABSTAIN   = -1
GENERAL   = 0
API       = 1
EDUCATION = 2
DATASET   = 3

In [164]:
# Map the labels to their numeric label numbers
def label_to_number(x):
    """Convert string labels from the Google Sheet to their numeric values"""
    if x == 'ABSTAIN':
        return ABSTAIN
    if x == 'GENERAL':
        return GENERAL
    if x == 'API':
        return API
    if x == 'EDUCATION':
        return EDUCATION
    if x == 'DATASET':
        return DATASET


def number_to_label(x):
    """Convert numeric labels to their values in the Google Sheet"""
    if x == ABSTAIN:
        return 'ABSTAIN'
    if x == GENERAL:
        return 'GENERAL'
    if x == API:
        return 'API'
    if x == EDUCATION:
        return 'EDUCATION'
    if x == DATASET:
        return 'DATASET'


class_names = ['ABSTAIN', 'GENERAL', 'API', 'EDUCATION', 'DATASET']

def number_to_index(x):
    if x == ABSTAIN:
        return 0
    if x == GENERAL:
        return 1
    if x == API:
        return 2
    if x == EDUCATION:
        return 3
    if x == DATASET:
        return 4


df_gold['label_num'] = df_gold['label'].apply(label_to_number)
df_gold[['full_name','label','label_num']].head(3)

Unnamed: 0_level_0,full_name,label,label_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
61861755,alexa/alexa-skills-kit-sdk-for-nodejs,API,1
84138837,alexa/alexa-cookbook,EDUCATION,2
63275452,alexa/skill-sample-nodejs-fact,EDUCATION,2


## Now Create a Random Forest Model using a Sparse Representation to Pick Keyword Label Functions

We will use the spaCy doc we created to lemmatize as we tokenize the words, giving us better representations for feature importances.

In [165]:
def lemmatize(doc):
    return [token.lemma_ for token in doc]

df_gold['lemmas'] = df_gold['spacy'].apply(lemmatize)
df_gold['lemmas']

id
61861755     [<, p, align="center, ", >, \n  , <, img, src=...
84138837     [\n, #, alexa, skill, building, cookbook, \n\n...
63275452     [#, build, an, alexa, fact, skill, \n, <, img,...
81483877     [#, what, be, the, (, avs, ), ?, \n\n, the, (,...
38904647     [<, p, align="center, ", >, \n  , <, img, src=...
                                   ...                        
105808767    [#, aws, cloud, development, kit, (, aws, cdk,...
574877       [#, aws, for, java, [, !, [, build, status](ht...
2050163      [#, aws, for, ruby, \n\n, [, !, [, gitter](htt...
107600830    [#, aws, lambda, for, go, \n, [, !, [, godoc][...
159005377    [#, container, roadmap, \n\n, this, be, the, p...
Name: lemmas, Length: 203, dtype: object

## TF-IDF Vectorize and Split the Text Data and Labels into Train/Test Sets

We need to vectorize the data in a sparse representation to train the model and get feature importances for each word, so we use sklearn's `TfidfVectorizer` to give more important words more weight.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


vectorizer = TfidfVectorizer(
    analyzer='word',
    min_df=3,
    stop_words='english',
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False,
)

df_gold_train, df_gold_test, train_labels, test_labels = train_test_split(
    df_gold,
    df_gold['label_num'],
    test_size=0.3,
    random_state=1337,
)

train_vec = vectorizer.fit_transform(
    df_gold_train['lemmas']
)
test_vec = vectorizer.transform(
    df_gold_test['lemmas']
)

## Now Train a `RandomForestClassifier` and Determine Overall Feature Importances

A random forest model can give us overall feature importances directly, but it doesn't tell us which class they were important for or in which direction: for or against.

In [183]:
from sklearn.ensemble import RandomForestClassifier


clf = RandomForestClassifier(n_estimators=500)
clf.fit(train_vec, train_labels)

# Display features and importances in a DataFrame
features = pd.DataFrame(
    {'importance': clf.feature_importances_},
    index=vectorizer.get_feature_names()
)
features = features.sort_values(
    by=['importance'],
    ascending=False
)
features[0:20]

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,importance
skill,0.020562
alexa,0.019981
sample,0.019329
ask,0.014249
skills,0.01349
kit,0.012553
learn,0.012319
voice,0.011787
ion,0.011259
amazon,0.011253


## Observe the Feature Importances to Infer Labeling Functions

From the above we can see some candidates for keyword Labeling Functions in Snorkel. For example, `skill` is a common word in Amazon Alexa skill examples, which are `EDUCATION`. Accordingly most `alexa` projects are also `EDUCATION`. Is this always the case? Probably not, there must be an Alexa Skills `API` SDK. But remember, Snorkel's `LabelModel` will combine weak labels and determine a strong label so long as there are enough Labeling Functions that perform better than random - 20% for our 5 classes.

Similarly, `sample` indicates `EDUCATION`, `kit` might mean `API`, `learn` means `EDUCATION`, Amazon has invested a lot in the ion OSS project, so that is probably `GENERAL`. `tutorial` is probably `EDUCATION`. `sdk` means `API`, as perhaps does `amazon` (as in for example, `amazon lambda`).

Are all of these perfect? No. Are most of them probably better than random, so they will help accuracy? I think so, but we don't have to assume. Snorkel's `LFAnalysis` will tell us their accuracy and we can throw out those that don't perform well enough to help.

While we are still at an overall summary level, we've got some good ideas for Labeling Functions already. And note... our `RandomForestClassifier` below can function as a weak labeler itself! :)

In [184]:
import lime
import numpy as np
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline

avg = 'weighted'
pred = clf.predict(test_vec)
print(f"Model weighted F1 score: {f1_score(test_labels, pred, average=avg)}")

c = make_pipeline(vectorizer, clf)

Model weighted F1 score: 0.7318069222380503


  'precision', 'predicted', average, warn_for)


In [185]:
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=class_names)

prob_rows = []
for idx in df_gold_test.index[0:20]:
    exp = explainer.explain_instance(
        df_gold_test['lemmas'].apply(lambda x: ' '.join(x))[idx],
        c.predict_proba,
        num_features=5
    )
    full_name  = df_gold_test['full_name'][idx]
    probs      = c.predict_proba([df_gold_test['lemmas'][idx]])
    pred_index = np.argmax(probs[0])
    pred_class = class_names[pred_index]
    true_class = class_names[number_to_index(test_labels[idx])]
    top_terms  = sorted(exp.as_list(), key=lambda x: x[1])
    
    row = [
        [full_name] + \
        probs[0].tolist() + \
        [pred_class, true_class] + \
        [x[0] for x in top_terms]
    ]
    prob_rows.append(row[0])

pd.DataFrame(
    prob_rows,
    columns=[
        'Full Name', 'P(ABSTAIN)', 'P(GENERAL)', 'P(API)', 'P(EDUCATION)', 
        'P(DATASET)', 'Pred Class', 'True Class', 'Term 1', 'Term 2',
        'Term 3', 'Term 4', 'Term 5',
    ]
)

Unnamed: 0,Full Name,P(ABSTAIN),P(GENERAL),P(API),P(EDUCATION),P(DATASET),Pred Class,True Class,Term 1,Term 2,Term 3,Term 4,Term 5
0,alexa/skill-sample-nodejs-team-lookup,0.0,0.022,0.03,0.948,0.0,EDUCATION,EDUCATION,68747470733a2f2f6d2e6d656469612d616d617a6f6e2e...,db9b9ce26327ad3bac57ec4daf0961a382d75790,94b2,4,alexa
1,aws/containers-roadmap,0.003667,0.290333,0.404,0.296,0.006,API,EDUCATION,ec2,20preview,0,4,roadmap
2,alexa/skill-sample-nodejs-test-automation,0.0051,0.1589,0.2,0.624,0.012,EDUCATION,API,lab08,lab05,lab02,lab06,alexa
3,amzn/smoke-framework-application-generate,0.0005,0.3675,0.488,0.112,0.032,API,GENERAL,5,0,2,service,4
4,amzn/service-model-swift-code-generate,0.009167,0.294833,0.556,0.132,0.008,API,GENERAL,0,2,4,20gitter,5
5,amzn/amazon-pay-sdk-samples,0.03,0.174,0.278,0.518,0.0,EDUCATION,EDUCATION,developer,documentation,payments,please,demonstrate
6,amzn/amzn-drivers,0.079133,0.462867,0.248,0.198,0.012,GENERAL,API,dpdk,rpm,txt,fix,linux
7,amzn/ion-python,0.001,0.643,0.24,0.104,0.012,GENERAL,GENERAL,5,10,16,6,123
8,amzn/MXFusion,0.0,0.588,0.252,0.128,0.032,GENERAL,GENERAL,2,3,license,mxnet,mxfusion
9,amzn/askalono,0.003,0.461,0.268,0.216,0.052,GENERAL,GENERAL,80,b8rensen,e2,askalono,could


In [23]:
%%html
<style>
table {float:left}
</style>

## Label Schema

The labels for this dataset are:

| Number | Code      | Description                      |
|--------|-----------|----------------------------------|
| -1     | ABSTAIN   | No vote, for Labeling Functions  |
| 0      | GENERAL   | A FOSS project of general appeal |
| 1      | API       | An API library for AWS           |
| 2      | EDUCATION | An educational library for AWS   |
| 3      | DATASET   | An open dataset by Amazon        |

In [26]:
%%html
<style>
table {float:left}
</style>

## Labeling Functions

Labeling functions each weakly label the data and need only be better than random. Snorkel's
unsupervised generative graphical model combines these weak labels into strong labels by 
looking at the overlap, conflict and coverage of each weak label set.

| Logic                           | Fields                               | Label       | 200 Sample Accuracy |
|---------------------------------|--------------------------------------|-------------|---------------------|
| If 'sdk' is in                  | `full_name`, `description`, `readme` | `API`       |                     |
| If 'sample' is in               | `full_name`, `description`, `readme` | `EDUCATION` |                     |
| If 'dataset' is in              | `full_name`, `description`, `readme` | `DATASET`   |                     |
| If 'demonstrate' / 'demo' is in | `full_name`, `description`, `readme` | `EDUCATION` |                     |
| If 'walkthrough is in           | `full_name`, `description`, `readme` | `EDUCATION` |                     |
| If 'skill' is in                | `full_name`, `description`           | `EDUCATION` |                     |
| If 'kit' is in                  | `full_name`, `description`           | `EDUCATION` |                     |
| If 'toolbox' is in              | `description`                        | `GENERAL`   |                     |
| if 'extension' is in            | `description`                        | `API`       |                     |
| id 'add amazon' is in           | `description`                        | `API`       |                     |
| if 'integrate' is in            | `description`                        | `API`       |                     |
| if 'ion' is in                  | `full_name`, `description`           | `GENERAL`   |                     |
|                                 |                                      |             |                     |
|                                 |                                      |             |                     |
