In [1]:
import pandas as pd


df = pd.read_json('data/aws_repos.jsonl', lines=True)
df.head(3)

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,mirror_url,archived,disabled,open_issues_count,license,forks,open_issues,watchers,default_branch,score
0,61861755,MDEwOlJlcG9zaXRvcnk2MTg2MTc1NQ==,alexa-skills-kit-sdk-for-nodejs,alexa/alexa-skills-kit-sdk-for-nodejs,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-skills-kit-sdk-...,The Alexa Skills Kit SDK for Node.js helps you...,False,https://api.github.com/repos/alexa/alexa-skill...,...,,False,False,10,"{'key': 'apache-2.0', 'name': 'Apache License ...",662,10,2774,2.0.x,1
1,84138837,MDEwOlJlcG9zaXRvcnk4NDEzODgzNw==,alexa-cookbook,alexa/alexa-cookbook,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-cookbook,A series of sample code projects to be used fo...,False,https://api.github.com/repos/alexa/alexa-cookbook,...,,False,False,15,"{'key': 'apache-2.0', 'name': 'Apache License ...",899,15,1534,master,1
2,63275452,MDEwOlJlcG9zaXRvcnk2MzI3NTQ1Mg==,skill-sample-nodejs-fact,alexa/skill-sample-nodejs-fact,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/skill-sample-nodejs-fact,Build An Alexa Fact Skill,False,https://api.github.com/repos/alexa/skill-sampl...,...,,False,False,4,"{'key': 'apache-2.0', 'name': 'Apache License ...",1174,4,989,master,1


## Get READMEs for each repository

The [Github README API](https://developer.github.com/v3/repos/contents/#get-the-readme) makes it very easy to download the README of a project. Let's fetch the README of every Amazon open source project on Github.

In [None]:
import base64
import os
import time

import github


GITHUB_PERSONAL_TOKEN = os.environ.get('GITHUB_PERSONAL_TOKEN')
if not GITHUB_PERSONAL_TOKEN:
    raise Exception('GITHUB_PERSONAL_TOKEN undefined!')

g = github.Github(GITHUB_PERSONAL_TOKEN)

In [None]:
def get_readme(full_name):
    """Get the repo for full_name, get the readme, then decode base64->bytes->utf8"""
    repo = g.get_repo(full_name)
    readme_bin = repo.get_readme().content
    readme = base64.b64decode(readme_bin).decode()
    return readme

# Given a project full_name (owner/repo), fetch the README and return as a UTF-8 string
readmes = []
for i, full_name in df['full_name'].iteritems():
    try:
        readme = get_readme(full_name)
        readmes.append(readme)
    except github.UnknownObjectException as e:
        print(e)
        readmes.append('')
    except github.RateLimitExceededException as e:
        print(e)
        
        # Sleep for an hour + a 2 minute safety margin, if we hit the 5,000/hr rate limit
        time.sleep(60 * 62)
        
        # Get the repo again, now that we aren't rate limited
        readme = get_readme(full_name)
        readmes.append(readme)

In [None]:
df['readme'] = pd.Series(readmes)

In [None]:
df.head(3)

## Store the Data for Hand Labeling of a Sample

Store the data as CSV for hand labeling to guide our Labeling Function development. Also store to Parquet.

In [22]:
import pyarrow


# Save to CSV for hand labeling
df[[
    'id', 'full_name', 'description', 'fork', 'forks_count', 'language', 'homepage',
    'open_issues_count', 'watchers', 'readme', 
]].to_csv(
    'data/aws_readmes.csv',
    index_label='index',
)

# Save relevant features to Parquet for safe keeping
df[['id', 'full_name', 'description', 'fork', 'forks_count', 'language', 'homepage',
    'open_issues_count', 'watchers', 'readme',
]].to_parquet(
    'data/aws_readmes.parquet',
    engine='pyarrow'
)

In [12]:
# # Temporary load from other machine
# import pyarrow

# readme_df = pd.read_parquet('data/aws_readmes.parquet', engine='pyarrow')

# # Join READMEs in and drop duplicate ID column
# df_join = df.join(readme_df, rsuffix='_readme')
# del df_join['id_readme']

# df = df_join

# df.head(3)

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,archived,disabled,open_issues_count,license,forks,open_issues,watchers,default_branch,score,readme
0,61861755,MDEwOlJlcG9zaXRvcnk2MTg2MTc1NQ==,alexa-skills-kit-sdk-for-nodejs,alexa/alexa-skills-kit-sdk-for-nodejs,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-skills-kit-sdk-...,The Alexa Skills Kit SDK for Node.js helps you...,False,https://api.github.com/repos/alexa/alexa-skill...,...,False,False,10,"{'key': 'apache-2.0', 'name': 'Apache License ...",662,10,2774,2.0.x,1,"<p align=""center"">\n <img src=""https://m.medi..."
1,84138837,MDEwOlJlcG9zaXRvcnk4NDEzODgzNw==,alexa-cookbook,alexa/alexa-cookbook,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-cookbook,A series of sample code projects to be used fo...,False,https://api.github.com/repos/alexa/alexa-cookbook,...,False,False,15,"{'key': 'apache-2.0', 'name': 'Apache License ...",899,15,1534,master,1,\n# Alexa Skill Building Cookbook\n\n<div styl...
2,63275452,MDEwOlJlcG9zaXRvcnk2MzI3NTQ1Mg==,skill-sample-nodejs-fact,alexa/skill-sample-nodejs-fact,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/skill-sample-nodejs-fact,Build An Alexa Fact Skill,False,https://api.github.com/repos/alexa/skill-sampl...,...,False,False,4,"{'key': 'apache-2.0', 'name': 'Apache License ...",1174,4,989,master,1,"# Build An Alexa Fact Skill\n<img src=""https:/..."


## Create spaCy Documents from READMEs

Setup the large english language model and have it merge multi-token named entities.

In [13]:
import spacy
from spacy.pipeline import merge_entities


# Enable a GPU if you have one
spacy.prefer_gpu()

# Download the spaCy english model
spacy.cli.download('en_core_web_lg')
nlp = spacy.load("en_core_web_lg")

# Merge multi-token entities together
nlp.add_pipe(merge_entities)

nlp.pipeline


[93m    Linking successful[0m
    /home/rjurney/anaconda3/envs/amazon/lib/python3.7/site-packages/en_core_web_lg
    -->
    /home/rjurney/anaconda3/envs/amazon/lib/python3.7/site-packages/spacy/data/en_core_web_lg

    You can now load the model via spacy.load('en_core_web_lg')



[('tagger', <spacy.pipeline.Tagger at 0x7f1544a32a50>),
 ('parser', <spacy.pipeline.DependencyParser at 0x7f14fda1da10>),
 ('ner', <spacy.pipeline.EntityRecognizer at 0x7f14fda1dfb0>),
 ('merge_entities', <function spacy.pipeline.merge_entities>)]

In [14]:
df['spacy'] = df['readme'].apply(nlp)
df.head(3)

## Load the Gold Labeled Data

Data was labeled via a [Google Sheet](https://docs.google.com/spreadsheets/d/1ULt0KxIdb5HUJCEMt_AmOuPbTvN1zg8UA_4RvjlVwXQ/edit?usp=sharing) and exported to CSV at [data/Amazon_Open_Source_Analysis_Gold.csv](data/Amazon_Open_Source_Analysis_Gold.csv).

### Submitting Corrections or Additions

If you feel any labels are wrong, first read the definitions in the README and comment on the sheet. You may also copy the Google Sheet and continue labeling yourself if you want to ensure the accuracy of this analysis.

In [None]:
# Load all 2,469 records and then filter out the unlabeled ones
df_gold = pd.read_csv('data/Amazon_Open_Source_Analysis_Gold.csv')

df_gold = df_gold[df_gold['label'].notnull()]
print(f'Gold labeled records: {len(df_gold.index):,}')

df_gold = df.set_index('id').join(
    df_gold.set_index('id'),
    how='inner',
    on='id',
    rsuffix='_gold'
)

# Drop duplicate columns
df_gold = df_gold.drop(
    [
        'full_name_gold','url_gold','description_gold','fork_gold','forks_count_gold',
        'language_gold','homepage_gold','open_issues_count_gold','watchers_gold', 
        'readme_gold',
        
    ],
    axis=1
)

### Defining Label Schema

The labels for this dataset are:

| Number | Code      | Description                      |
|--------|-----------|----------------------------------|
| -1     | ABSTAIN   | No vote, for Labeling Functions  |
| 0      | GENERAL   | A FOSS project of general appeal |
| 1      | API       | An API library for AWS           |
| 2      | EDUCATION | An educational library for AWS   |
| 3      | DATASET   | An open dataset by Amazon        |

In [63]:
ABSTAIN   = -1
GENERAL   = 0
API       = 1
EDUCATION = 2
DATASET   = 3

In [164]:
# Map the labels to their numeric label numbers
def label_to_number(x):
    """Convert string labels from the Google Sheet to their numeric values"""
    if x == 'ABSTAIN':
        return ABSTAIN
    if x == 'GENERAL':
        return GENERAL
    if x == 'API':
        return API
    if x == 'EDUCATION':
        return EDUCATION
    if x == 'DATASET':
        return DATASET


def number_to_label(x):
    """Convert numeric labels to their values in the Google Sheet"""
    if x == ABSTAIN:
        return 'ABSTAIN'
    if x == GENERAL:
        return 'GENERAL'
    if x == API:
        return 'API'
    if x == EDUCATION:
        return 'EDUCATION'
    if x == DATASET:
        return 'DATASET'


class_names = ['ABSTAIN', 'GENERAL', 'API', 'EDUCATION', 'DATASET']

def number_to_index(x):
    if x == ABSTAIN:
        return 0
    if x == GENERAL:
        return 1
    if x == API:
        return 2
    if x == EDUCATION:
        return 3
    if x == DATASET:
        return 4


df_gold['label_num'] = df_gold['label'].apply(label_to_number)
df_gold[['full_name','label','label_num']].head(3)

Unnamed: 0_level_0,full_name,label,label_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
61861755,alexa/alexa-skills-kit-sdk-for-nodejs,API,1
84138837,alexa/alexa-cookbook,EDUCATION,2
63275452,alexa/skill-sample-nodejs-fact,EDUCATION,2


## Now Create a Random Forest Model using a Sparse Representation to Pick Keyword Label Functions

We will use the spaCy doc we created to lemmatize as we tokenize the words, giving us better representations for feature importances.

In [165]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def lemmatize(doc):
    return [token.lemma_ for token in doc]

df_gold['lemmas'] = df_gold['spacy'].apply(lemmatize)
df_gold['lemmas']

id
61861755     [<, p, align="center, ", >, \n  , <, img, src=...
84138837     [\n, #, alexa, skill, building, cookbook, \n\n...
63275452     [#, build, an, alexa, fact, skill, \n, <, img,...
81483877     [#, what, be, the, (, avs, ), ?, \n\n, the, (,...
38904647     [<, p, align="center, ", >, \n  , <, img, src=...
                                   ...                        
105808767    [#, aws, cloud, development, kit, (, aws, cdk,...
574877       [#, aws, for, java, [, !, [, build, status](ht...
2050163      [#, aws, for, ruby, \n\n, [, !, [, gitter](htt...
107600830    [#, aws, lambda, for, go, \n, [, !, [, godoc][...
159005377    [#, container, roadmap, \n\n, this, be, the, p...
Name: lemmas, Length: 203, dtype: object

In [167]:
vectorizer = TfidfVectorizer(
    analyzer='word',
    min_df=3,
    #stop_words='english',
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False,
)

df_gold_train, df_gold_test, train_labels, test_labels = train_test_split(
    df_gold,
    df_gold['label_num'].values,
    test_size=0.3,
    random_state=1337,
)

train_vec = vectorizer.fit_transform(
    df_gold_train['lemmas']
)
test_vec = vectorizer.transform(
    df_gold_test['lemmas']
)


clf = RandomForestClassifier(n_estimators=500)
clf.fit(train_vec, train_labels)

# Display features and importances in a DataFrame
features = pd.DataFrame(
    {'importance': clf.feature_importances_},
    index=vectorizer.get_feature_names()
)
features = features.sort_values(
    by=['importance'],
    ascending=False
)
features[0:20]

Unnamed: 0,importance
skill,0.020205
sample,0.01691
alexa,0.016517
learn,0.012054
voice,0.010535
ask,0.010429
kit,0.010141
ion,0.009859
amazon,0.009754
tutorial,0.009564


In [170]:
df_gold_train

Unnamed: 0_level_0,node_id,name,full_name,private,owner,html_url,description,fork,url,forks_url,...,open_issues,watchers,default_branch,score,readme,spacy,index,label,label_num,lemmas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
185252023,MDEwOlJlcG9zaXRvcnkxODUyNTIwMjM=,metalearn-leap,amzn/metalearn-leap,False,"{'login': 'amzn', 'id': 8594673, 'node_id': 'M...",https://github.com/amzn/metalearn-leap,Original PyTorch implementation of the Leap me...,False,https://api.github.com/repos/amzn/metalearn-leap,https://api.github.com/repos/amzn/metalearn-le...,...,0,133,master,1,## Transferring Knowledge across Learning Proc...,"(#, #, Transferring, Knowledge, across, Learni...",106,GENERAL,0,"[#, #, transferring, knowledge, across, learni..."
129122096,MDEwOlJlcG9zaXRvcnkxMjkxMjIwOTY=,skill-sample-nodejs-berry-bash,alexa/skill-sample-nodejs-berry-bash,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/skill-sample-nodejs-b...,Demonstrates the use of interactive render tem...,False,https://api.github.com/repos/alexa/skill-sampl...,https://api.github.com/repos/alexa/skill-sampl...,...,3,18,master,1,# Quickly Build A Multi Modal Quiz & Dictionar...,"(#, Quickly, Build, A, Multi, Modal, Quiz, &, ...",46,EDUCATION,2,"[#, quickly, build, a, multi, modal, quiz, &, ..."
164706289,MDEwOlJlcG9zaXRvcnkxNjQ3MDYyODk=,sketch-constructor,amzn/sketch-constructor,False,"{'login': 'amzn', 'id': 8594673, 'node_id': 'M...",https://github.com/amzn/sketch-constructor,Read/write/manipulate Sketch files in Node wit...,False,https://api.github.com/repos/amzn/sketch-const...,https://api.github.com/repos/amzn/sketch-const...,...,14,372,master,1,# Sketch Constructor\n\nThis library provides ...,"(#, Sketch, Constructor, \n\n, This, library, ...",99,GENERAL,0,"[#, sketch, constructor, \n\n, this, library, ..."
190665843,MDEwOlJlcG9zaXRvcnkxOTA2NjU4NDM=,sample-fire-tv-app-video-skill,amzn/sample-fire-tv-app-video-skill,False,"{'login': 'amzn', 'id': 8594673, 'node_id': 'M...",https://github.com/amzn/sample-fire-tv-app-vid...,This sample Fire TV app shows how to integrate...,False,https://api.github.com/repos/amzn/sample-fire-...,https://api.github.com/repos/amzn/sample-fire-...,...,0,2,master,1,# Readme\n\nThis sample-fire-tv-app-video-skil...,"(#, Readme, \n\n, This, sample, -, fire, -, tv...",175,EDUCATION,2,"[#, readme, \n\n, this, sample, -, fire, -, tv..."
145045841,MDEwOlJlcG9zaXRvcnkxNDUwNDU4NDE=,alexa-apis-for-python,alexa/alexa-apis-for-python,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/alexa-apis-for-python,The Alexa APIs for Python consists of python c...,False,https://api.github.com/repos/alexa/alexa-apis-...,https://api.github.com/repos/alexa/alexa-apis-...,...,0,27,master,1,.. raw:: html\n\n <embed>\n <p align...,"(.., raw, :, :, html, \n\n , <, embed, >, \...",34,API,1,"[.., raw, :, :, html, \n\n , <, embed, >, \..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574877,MDEwOlJlcG9zaXRvcnk1NzQ4Nzc=,aws-sdk-java,aws/aws-sdk-java,False,"{'login': 'aws', 'id': 2232217, 'node_id': 'MD...",https://github.com/aws/aws-sdk-java,The official AWS SDK for Java.,False,https://api.github.com/repos/aws/aws-sdk-java,https://api.github.com/repos/aws/aws-sdk-java/...,...,170,3113,master,1,# AWS SDK for Java [![Build Status](https://tr...,"(#, AWS SDK, for, Java, [, !, [, Build, Status...",199,API,1,"[#, aws, for, java, [, !, [, build, status](ht..."
86101701,MDEwOlJlcG9zaXRvcnk4NjEwMTcwMQ==,amazon-instant-access-sdk-ruby,amzn/amazon-instant-access-sdk-ruby,False,"{'login': 'amzn', 'id': 8594673, 'node_id': 'M...",https://github.com/amzn/amazon-instant-access-...,Ruby SDK to aid in third-party integration wit...,False,https://api.github.com/repos/amzn/amazon-insta...,https://api.github.com/repos/amzn/amazon-insta...,...,0,4,master,1,Amazon Instant Access - Ruby SDK\n============...,"(Amazon Instant Access - Ruby, SDK, \n, =, =, ...",167,API,1,"[amazon, sdk, \n, =, =, =, =, =, =, =, =, =, =..."
181562814,MDEwOlJlcG9zaXRvcnkxODE1NjI4MTQ=,skill-sample-java-how-to,alexa/skill-sample-java-how-to,False,"{'login': 'alexa', 'id': 17815977, 'node_id': ...",https://github.com/alexa/skill-sample-java-how-to,This tutorial will walk first-time Alexa skill...,False,https://api.github.com/repos/alexa/skill-sampl...,https://api.github.com/repos/alexa/skill-sampl...,...,0,0,master,1,Build an Alexa How-To Skill in ASK Java SDK\n=...,"(Build, an, Alexa, How, -, To, Skill, in, ASK,...",92,EDUCATION,2,"[build, an, alexa, how, -, to, skill, in, ask,..."
220352818,MDEwOlJlcG9zaXRvcnkyMjAzNTI4MTg=,basis-point-sets,amzn/basis-point-sets,False,"{'login': 'amzn', 'id': 8594673, 'node_id': 'M...",https://github.com/amzn/basis-point-sets,,False,https://api.github.com/repos/amzn/basis-point-...,https://api.github.com/repos/amzn/basis-point-...,...,0,0,master,1,# Efficient Learning on Point Clouds with Basi...,"(#, Efficient, Learning, on, Point, Clouds, wi...",189,GENERAL,0,"[#, efficient, learning, on, point, clouds, wi..."


In [168]:
import lime
import numpy as np
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline

avg = 'weighted'
pred = clf.predict(test_vec)
print(f"Model weighted F1 score: {f1_score(test_labels, pred, average=avg)}")

c = make_pipeline(vectorizer, clf)

Model weighted F1 score: 0.738921205711264


  'precision', 'predicted', average, warn_for)


In [169]:
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=class_names)

prob_rows = []
for idx in df_gold.index[0:20]:
    exp = explainer.explain_instance(
        df_gold['lemmas'].apply(lambda x: ' '.join(x))[idx],
        c.predict_proba,
        num_features=5
    )
    full_name  = df_gold['full_name'][idx]
    probs      = c.predict_proba([df_gold['lemmas'][idx]])
    pred_index = np.argmax(probs[0])
    pred_class = class_names[pred_index]
    true_class = class_names[number_to_index(test_labels[idx])]
    top_terms  = sorted(exp.as_list(), key=lambda x: x[1])
    
#     print('Project full_name:', full_name)
#     print('Probability(ABSTAIN)   =', probs[0,0])
#     print('Probability(GENERAL)   =', probs[0,1])
#     print('Probability(API)       =', probs[0,2])
#     print('Probability(EDUCATION) =', probs[0,3])
#     print('Probability(DATASET)   =', probs[0,4])
#     print('Pred class: %s' % pred_class)
#     print('True class: %s' % true_class)
#     print(top_terms)
    
    row = [
        [full_name] + \
        probs[0].tolist() + \
        [pred_class, true_class] + \
        [x[0] for x in top_terms]
    ]
    prob_rows.append(row[0])

pd.DataFrame(
    prob_rows,
    columns=[
        'Full Name', 'P(ABSTAIN)', 'P(GENERAL)', 'P(API)', 'P(EDUCATION)', 
        'P(DATASET)', 'Pred Class', 'True Class', 'Term 1', 'Term 2',
        'Term 3', 'Term 4', 'Term 5',
    ]
)

IndexError: index 61861755 is out of bounds for axis 0 with size 61

In [23]:
%%html
<style>
table {float:left}
</style>

## Label Schema

The labels for this dataset are:

| Number | Code      | Description                      |
|--------|-----------|----------------------------------|
| -1     | ABSTAIN   | No vote, for Labeling Functions  |
| 0      | GENERAL   | A FOSS project of general appeal |
| 1      | API       | An API library for AWS           |
| 2      | EDUCATION | An educational library for AWS   |
| 3      | DATASET   | An open dataset by Amazon        |

In [26]:
%%html
<style>
table {float:left}
</style>

## Labeling Functions

Labeling functions each weakly label the data and need only be better than random. Snorkel's
unsupervised generative graphical model combines these weak labels into strong labels by 
looking at the overlap, conflict and coverage of each weak label set.

| Logic                         | Fields                               | Label       | 200 Sample Accuracy |
|-------------------------------|--------------------------------------|-------------|---------------------|
| If 'sdk' is in                | `full_name`, `description`, `readme` | `API`       |                     |
| If 'sample' is in             | `full_name`, `description`, `readme` | `EDUCATION` |                     |
| If 'dataset' is in            | `full_name`, `description`, `readme` | `DATASET`   |                     |
| If 'demonstrate'/'demo' is in | `full_name`, `description`, `readme` | `EDUCATION` |                     |
| If 'walkthrough' is in        | `full_name`, `description`, `readme` | `EDUCATION` |                     |
| If 'skill' is in              | `full_name`, `description`           | `EDUCATION` |                     |
| If 'kit' is in                | `full_name`, `description`           | `EDUCATION` |                     |
| If 'toolbox' is in            | `description`                        | `GENERAL`   |                     |
| if 'extension' is in          | `description`                        | `API`       |                     |
