# Introduction

## Purpose 

Here we are experimenting with semi-supervised learning. We have prepared =~ 80 examples for our initial model. We will then use this model to classify more data, and then that model to classify more data, cyclically until we arrive at our goal of a good classifier.

## Classifying Educational Projects

The goal of this project is to classify github projects as primarily educational or not. This classification will then feed into a time series analysis of educational content on Github over time.

## First Model

In [1]:
import sys, os, re
import json, csv
import numpy as np
import random

# =~ 80 examples done by hand... lets see if we can use semi-supervised learning to improve our training data!
raw_training_data = []
with open('../data/html/first_training.jsonl') as f:
    for line in f:
        record = json.loads(line)
        raw_training_data.append(record)
raw_training_data[0]

{'is_edu': 0,
 'readme_words': ['htf', 'hack', 'future', 'net', 'challenge'],
 'repo': 'Djohnnie/HTF2017'}

In [2]:
# Add the number of words, as it is a strong signal
len_nos = [len(doc['readme_words']) for doc in raw_training_data if doc['is_edu'] == 0]
len_yes = [len(doc['readme_words']) for doc in raw_training_data if doc['is_edu'] == 1]
len_all = [len(doc['readme_words']) for doc in raw_training_data]
max_all = max(len_all)
normalized_all = [x/max_all for x in len_all]
values = [doc['is_edu'] for doc in raw_training_data]

print('Nos average: {}, median: {}'.format(np.average(len_nos), np.median(len_nos)))
print('Yes average: {}, median: {}'.format(np.average(len_yes), np.median(len_yes)))

print('README word count/education normalized cross correlation: {0:.2f}'.format(
    np.correlate(values, normalized_all)[0]
))

Nos average: 143.06976744186048, median: 77.0
Yes average: 1003.1818181818181, median: 231.0
README word count/education normalized cross correlation: 2.25


In [3]:
just_words = [' '.join(doc['readme_words']) for doc in raw_training_data]
just_words[0]

'htf hack future net challenge'

In [4]:
just_values = [doc['is_edu'] for doc in raw_training_data]
just_values = np.array(just_values)
just_values[0]

0

In [5]:
from keras.preprocessing.text import Tokenizer

MAX_WORDS=1000

tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(just_words)

sequences = tokenizer.texts_to_sequences(just_words)
word_index = tokenizer.word_index
vocab_size = len(word_index)

print('Found {:,} unique tokens.'.format(vocab_size))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Found 7,437 unique tokens.


In [53]:
from keras import preprocessing

MAX_WORDS_PER_DOC=1000

padded_sequences = preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=MAX_WORDS_PER_DOC
)
padded_sequences.shape

(76, 1000)

In [36]:
# Append word count feature to feature matrix
len_all = [len(doc['readme_words']) for doc in raw_training_data]

len_all = [[x] for x in len_all]
len_all = np.array(len_all)
sequences_and_lengths = np.append(padded_sequences, len_all, axis=1)
assert sequences_and_lengths.shape == (76, 1001)

In [37]:
# Randomly sort data
indices = np.arange(sequences_and_lengths.shape[0])
np.random.shuffle(indices)
data = padded_sequences[indices]
labels = just_values[indices]

In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data, 
    labels, 
    test_size=0.2,
    random_state=27
)

In [72]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, Dropout, LSTM
from table import Table

TRAINING_RUNS = 12

accuracies = []
for i in range(TRAINING_RUNS):
    
    model = Sequential()
    model.add(
        Embedding(
            vocab_size, 
            64, 
            input_length=MAX_WORDS_PER_DOC
        )
    )
    model.add(Flatten())
    model.add(
        Dense(32, activation='relu')
    )
    model.add(
        Dense(32, activation='relu')
    )
    model.add(
        Dense(1, activation='sigmoid')
    )

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['acc', 'mse', 'mae', 'mape', 'cosine']
    )
    #model.summary()

    history = model.fit(
        X_train,
        y_train,
        epochs=10,
        batch_size=64,
        validation_split=0.3,
        verbose=0
    )
    scores = model.evaluate(X_test, y_test, verbose=0)
    
    #print(Table(model.metrics_names, [scores]))
    
    accuracy_pct = scores[1] * 100
    accuracies.append(accuracy_pct)

print(
    Table(
        ['Average','Median','Minimum','Maximum'],
        [[np.average(accuracies), np.median(accuracies), np.max(accuracies), np.min(accuracies)]]
    )
)

+----------+----------+----------+----------+
| Average  | Median   | Minimum  | Maximum  |
+----------+----------+----------+----------+
| 67.1875f | 68.7500f | 75.0000f | 62.5000f |
+----------+----------+----------+----------+


## Applying the First Model

In [11]:
with open('../data/html/documents.jsonl') as f:
    second_raw_training = []
    for line in f:
        record = json.loads(line)
        second_raw_training.append(record)

second_raw_training = random.sample(second_raw_training, 1000)

In [12]:
second_just_words = [' '.join(doc['readme_words']) for doc in second_raw_training]

second_sequences = tokenizer.texts_to_sequences(second_just_words)

In [13]:
second_truncated = preprocessing.sequence.pad_sequences(
    second_sequences,
    maxlen=MAX_WORDS_PER_DOC
)
second_truncated.shape

(1000, 1000)

In [94]:
first_model_predictions = model.predict_classes(second_truncated)
first_predictions_list = list(first_model_predictions.T[0])

first_positive = [{**datum, **{'pred_edu': pred}} for datum, pred in zip(second_raw_training, first_predictions_list) if pred == 1]
first_pos_repos = ['https://github.com/' + doc['repo'] for doc in first_positive]
first_pos_repos

['https://github.com/garyp/sifter',
 'https://github.com/crysisfarcry222/rapidjson',
 'https://github.com/jonsuh/mcgriddle',
 'https://github.com/MattNguyen/exfile-s3',
 'https://github.com/DIKU-EDU/remarks',
 'https://github.com/sourcegraph/syntect_server',
 'https://github.com/sit/handson-strace',
 'https://github.com/flidw55/nfu40341127',
 'https://github.com/pwr/Solaar',
 'https://github.com/UK-MAC/mega-stream',
 'https://github.com/matthewelse/micropython',
 'https://github.com/jeescu/react-firebase',
 'https://github.com/Yogeshkad/gulp',
 'https://github.com/dkruchinin/particles',
 'https://github.com/fraserxu/react-testing-recipes',
 'https://github.com/AzureAD/microsoft-authentication-library-for-dotnet',
 'https://github.com/AlexarJING/awesome-love2d',
 'https://github.com/USGS-WiM/sparrow-eastern-us-js',
 'https://github.com/curlykale/zheng',
 'https://github.com/shanjgit/tensorflow-generative-model-collections',
 'https://github.com/Daniel-Santhanaraj/tripguru',
 'https://gi