# Introduction

## Purpose 

Here we are experimenting with semi-supervised learning. We have prepared =~ 80 examples for our initial model. We will then use this model to classify more data, and then that model to classify more data, cyclically until we arrive at our goal of a good classifier.

## Classifying Educational Projects

The goal of this project is to classify github projects as primarily educational or not. This classification will then feed into a time series analysis of educational content on Github over time.

## First Model

In [1]:
import sys, os, re
import json, csv
import jsonlines
import numpy as np
import pandas as pd
import random

# =~ 80 examples done by hand... lets see if we can use semi-supervised learning to improve our training data!
raw_training_data = []
with jsonlines.open('../data/html/first_training_enriched_processed.jsonl') as reader:
    raw_training_data = [record for record in reader]
raw_training_data[0]

{'archived': 0,
 'forks': 1,
 'has_downloads': 1,
 'has_issues': 1,
 'has_wiki': 1,
 'is_edu': 0,
 'is_fork': 0,
 'network_count': 1,
 'open_issues': 0,
 'readme_words': ['htf', 'hack', 'future', 'net', 'challenge'],
 'repo': 'Djohnnie/HTF2017',
 'size': 79,
 'stargazers': 1,
 'subscribers': 2,
 'watchers': 1}

In [2]:
# Add the number of words, as it is a strong signal
len_nos = [len(doc['readme_words']) for doc in raw_training_data if doc['is_edu'] == 0]
len_yes = [len(doc['readme_words']) for doc in raw_training_data if doc['is_edu'] == 1]
len_all = [len(doc['readme_words']) for doc in raw_training_data]
max_all = max(len_all)
normalized_all = [x/max_all for x in len_all]
values = [doc['is_edu'] for doc in raw_training_data]

print('Nos average: {:.2f}, median: {}'.format(np.average(len_nos), np.median(len_nos)))
print('Yes average: {:.2f}, median: {}'.format(np.average(len_yes), np.median(len_yes)))

print('README word count/education normalized cross correlation: {0:.2f}'.format(
    np.correlate(values, normalized_all)[0]
))

Nos average: 143.07, median: 77.0
Yes average: 1003.18, median: 231.0
README word count/education normalized cross correlation: 2.25


In [3]:
just_words = [' '.join(doc['readme_words']) for doc in raw_training_data]
just_words[0]

'htf hack future net challenge'

In [4]:
just_values = [doc['is_edu'] for doc in raw_training_data]
just_values = np.array(just_values)
just_values[0]

0

In [5]:
from keras.preprocessing.text import Tokenizer

MAX_WORDS=1000

tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(just_words)

sequences = tokenizer.texts_to_sequences(just_words)
word_index = tokenizer.word_index
vocab_size = len(word_index)

print('Found {:,} unique tokens.'.format(vocab_size))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Found 7,437 unique tokens.


In [6]:
from keras import preprocessing

MAX_WORDS_PER_DOC=1000

padded_sequences = preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=MAX_WORDS_PER_DOC
)
padded_sequences.shape

(76, 1000)

In [21]:
# Prepare a numpy array containing our enrichment data...
CONTINUOUS_KEYS = [
    'forks',
    'network_count',
    'open_issues',
    'size',
    'stargazers',
    'subscribers',
    'watchers',
    'word_count',
]

TOKEN_KEYS = [
    'archived',
    'has_downloads',
    'has_issues',
    'has_wiki',
    'is_fork',
]

enrichment_data = []
for doc in raw_training_data:
    doc['word_count'] = len(doc['readme_words'])
    new_doc = { your_key: doc[your_key] for your_key in CONTINUOUS_KEYS + TOKEN_KEYS }
    enrichment_data.append(new_doc)

df = pd.DataFrame(enrichment_data)
df[0:6]

Unnamed: 0,archived,forks,has_downloads,has_issues,has_wiki,is_fork,network_count,open_issues,size,stargazers,subscribers,watchers,word_count
0,0,1,1,1,1,0,1,0,79,1,2,1,5
1,0,1,1,0,1,1,1378,0,2507,0,1,0,234
2,0,94,1,1,1,0,94,7,58529,133,26,133,73
3,0,2,1,1,1,0,2,0,21,2,2,2,93
4,0,29,1,1,1,0,29,0,9740,25,2,25,2
5,0,1,1,1,1,0,1,0,24,3,2,3,0


In [22]:
# L2 normalize continuous variables
df[CONTINUOUS_KEYS] = (df[CONTINUOUS_KEYS] - df[CONTINUOUS_KEYS].mean()) / df[CONTINUOUS_KEYS].std()
df[0:6]

Unnamed: 0,archived,forks,has_downloads,has_issues,has_wiki,is_fork,network_count,open_issues,size,stargazers,subscribers,watchers,word_count
0,0,-0.337913,1,1,1,0,-0.392798,-0.385463,-0.486493,-0.342495,-0.368881,-0.342495,-0.280836
1,0,-0.337913,1,0,1,1,0.131313,-0.385463,-0.417227,-0.342589,-0.370256,-0.342589,-0.155115
2,0,-0.302495,1,1,1,0,-0.357401,-0.136294,1.180953,-0.330136,-0.335887,-0.330136,-0.243504
3,0,-0.337532,1,1,1,0,-0.392418,-0.385463,-0.488147,-0.342402,-0.368881,-0.342402,-0.232524
4,0,-0.32725,1,1,1,0,-0.382141,-0.385463,-0.210886,-0.340248,-0.368881,-0.340248,-0.282483
5,0,-0.337913,1,1,1,0,-0.392798,-0.385463,-0.488062,-0.342308,-0.368881,-0.342308,-0.283581


In [31]:
# Randomly sort data
indices = np.arange(padded_sequences.shape[0])
np.random.shuffle(indices)

padded_sequences = padded_sequences[indices]
df = df.iloc[indices]
labels = just_values[indices]

In [32]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(
#     data, 
#     labels, 
#     test_size=0.2,
#     random_state=27
# )

from sklearn.model_selection import StratifiedKFold

TRAINING_RUNS = 2

seed = 11
np.random.seed(seed)

kfold = StratifiedKFold(n_splits=TRAINING_RUNS, shuffle=True, random_state=seed)

In [None]:
from keras.models import Sequential
from keras.layers import Input, Flatten, Dense, Embedding, Dropout, LSTM
from keras.layers.merge import Concatenate
from table import Table

accuracies = []
for i, train_test in enumerate(kfold.split(padded_sequences, labels)):
    
    train = train_test[0]
    test = train_test[1]
    
    X_train = data[train]
    X_test  = data[test]
    y_train = labels[train]
    y_test  = labels[test]
    
    main_model = Sequential()
    
    # Readme word embedding
    readme_model = Sequential()
    readme_model.add(
        Embedding(
            vocab_size, 
            64, 
            input_length=MAX_WORDS_PER_DOC
        )
    )
    readme_model.add(Flatten())
    readme_model.add(
        Dense(32, activation='relu')
    )

    # Numeric Github API enrichment features
    api_model = Sequential()
    
    
    main_model.add(
        Dense(1, activation='sigmoid')
    )

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['acc', 'mse', 'mae', 'mape', 'cosine']
    )
    model.summary()

    history = model.fit(
        X_train,
        y_train,
        epochs=10,
        batch_size=64,
        validation_split=0.3,
        verbose=0
    )
    scores = model.evaluate(X_test, y_test, verbose=0)
    
    print(Table(model.metrics_names, [scores]))
    
    accuracy_pct = scores[1] * 100
    accuracies.append(accuracy_pct)

print(
    Table(
        ['Average','Median','Minimum','Maximum'],
        [[np.average(accuracies), np.median(accuracies), np.max(accuracies), np.min(accuracies)]]
    )
)

## Applying the First Model

In [None]:
# # Run me ONCE so we can go collect data from the (rate limited) Github API for our 1,000 record sample
# with open('../data/html/documents.jsonl') as f:
#     second_raw_training = []
#     for line in f:
#         record = json.loads(line)
#         second_raw_training.append(record)

# second_raw_training = random.sample(second_raw_training, 1000)

# with open('../data/html/first_exploit_set.jsonl', 'w') as f:
#     for record in second_raw_training:
#         f.write( json.dumps(record) + '\n' )

# Run me every time thereafter you run this code block
with open('../data/html/first_exploit_set.jsonl') as f:
    second_raw_training = []
    for line in f:
        record = json.loads(line)
        second_raw_training.append(record)
len(second_raw_training)

In [None]:
second_just_words = [' '.join(doc['readme_words']) for doc in second_raw_training]

second_sequences = tokenizer.texts_to_sequences(second_just_words)

In [None]:
second_truncated = preprocessing.sequence.pad_sequences(
    second_sequences,
    maxlen=MAX_WORDS_PER_DOC
)
second_truncated.shape

In [None]:
second_truncated_enriched ...

In [None]:
first_model_predictions = model.predict_classes(second_truncated)
first_predictions_list = list(first_model_predictions.T[0])

first_positive = [{**datum, **{'pred_edu': pred}} for datum, pred in zip(second_raw_training, first_predictions_list) if pred == 1]
first_pos_repos = [(doc['pred_edu'], 'https://github.com/' + doc['repo']) for doc in first_positive]
first_pos_repos