In [1]:
# Display matplotlib plots in the output
%matplotlib inline
%load_ext autoreload
%autoreload 2

import json
import os
import pickle
import random
import re
import sys
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix, vstack

from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score,recall_score,precision_score,classification_report,confusion_matrix,accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import *

import gensim
from gensim import corpora, models, similarities


Couldn't import dot_parser, loading of dot files will not be possible.


In [20]:
%load_ext autoreload
%autoreload 2
# Root directory holding scripts_min/, scripts/, models/ and results/.
# Set the DS210_REPO_ROOT environment variable to run the notebook on another
# machine; when it is unset or empty the original author's path is used.
REPO_ROOT = os.environ.get("DS210_REPO_ROOT") or "/Users/samuelkahn/Desktop/Berkeley/DS210"

def tokenize_js(script):
    """Split JavaScript source text into a list of lower-cased tokens.

    Comment handling is regex based and deliberately simple:
      * ``/* ... */`` spans are removed only when the text between the
        delimiters contains no ``*`` character;
      * everything from ``//`` to the end of the line is removed, including
        a ``//`` that sits inside a string literal.

    Tokens are quoted strings, runs of word/slash/backslash/dash/quote
    characters, and the single punctuation marks ``{ } , + * ( ) .``.
    Characters that match none of these (whitespace, ``=``, ``;`` ...) are
    skipped.

    :param script: JavaScript source as a string.
    :return: list of lower-cased token strings, in source order.
    """
    without_block_comments = re.sub(r'(\/\*[^*]+\*\/)', "", script)
    without_comments = re.sub(r'\/\/.+', "", without_block_comments)
    token_pattern = r'(\"[^"]+\"|\'[^\']+\'|[\w\\\/\-_\"\']+|{|}|,|[\+\*]|\(|\)|\.|/\*.+\*\/)'
    # The pattern has a single capturing group wrapping every alternative, so
    # group(1) is the whole token.
    return [found.group(1).lower()
            for found in re.finditer(token_pattern, without_comments)]

def parse_scripts(tbl, train_size=3000):
    """Read the minified script source for every record in ``tbl``.

    :param tbl: either a list of records (dicts with a ``"sha"`` key) or a
        dict mapping training-set size to such a list.
    :param train_size: key to select when ``tbl`` is a dict. Defaults to
        3000, the value that used to be hard-coded here. If ``tbl`` is a dict
        without this key it is iterated directly, exactly as before.
    :return: list of decoded script texts, one per record, in record order.
    :raises IOError: if a ``<sha>.js`` file is missing under
        ``REPO_ROOT/scripts_min``.
    """
    if isinstance(tbl, dict) and train_size in tbl:
        tbl = tbl[train_size]

    scripts = []
    for item in tbl:
        with open("%s/scripts_min/%s.js" % (REPO_ROOT, item["sha"])) as f:
            # Python 2: read() yields a byte string; undecodable bytes are
            # replaced rather than raising.
            scripts.append(f.read().decode(errors='replace'))
    return scripts


def topic_model_train(tokenizer,topics):
    """Fit TF-IDF + LDA on the training scripts and project both splits.

    Reads the module-level ``train_tables`` and ``test_table`` through
    ``parse_scripts``. The dictionary, the TF-IDF model and the LDA model are
    fitted on the training scripts only; test scripts are mapped through the
    same fitted objects.

    :param tokenizer: callable turning one script string into a token list.
    :param topics: number of LDA topics.
    :return: ``(train_vector, test_vector)`` dense matrices with one row per
        script and exactly ``topics`` columns.
    """
    train_tokens = [tokenizer(script) for script in parse_scripts(train_tables)]

    # Corpus-wide token counts over the training scripts.
    frequency = defaultdict(int)
    for text in train_tokens:
        for token in text:
            frequency[token] += 1

    # Drop tokens that occur only once in the whole training corpus.
    train_texts = [[token for token in text if frequency[token] > 1]
                   for text in train_tokens]

    train_dictionary = corpora.Dictionary(train_texts)
    train_corpus = [train_dictionary.doc2bow(text) for text in train_texts]

    # TF-IDF weighting fitted on the training corpus.
    tfidf = models.TfidfModel(train_corpus)
    train_tfidf = tfidf[train_corpus]
    lda = models.ldamodel.LdaModel(train_tfidf, id2word=train_dictionary,
                                   num_topics=topics, passes=20)

    # Test set: filtered with the *training* counts (the original rebuilt the
    # identical table from the training scripts a second time), then mapped
    # through the training dictionary and TF-IDF model.
    test_tokens = [tokenizer(script) for script in parse_scripts(test_table)]
    test_texts = [[token for token in text if frequency.get(token, 0) > 1]
                  for text in test_tokens]
    test_corpus = [train_dictionary.doc2bow(text) for text in test_texts]
    test_tfidf = tfidf[test_corpus]

    # num_terms pins the number of rows to `topics`. Without it corpus2csc
    # sizes the matrix from the largest topic id it happens to see, so train
    # and test could come back with different numbers of columns.
    train_vector = gensim.matutils.corpus2csc(
        lda[train_tfidf], num_terms=topics).todense().transpose()
    test_vector = gensim.matutils.corpus2csc(
        lda[test_tfidf], num_terms=topics).todense().transpose()

    return train_vector,test_vector

def vectorize_table(model_type, tokenizer, parser, table, train_size, test_size):
    """Build TF-IDF train/test matrices from ``table`` and pickle them.

    The vectorizer is fitted on the first ``train_size`` records and applied
    to the last ``test_size`` records. Labels come from each record's
    ``"flag-any"`` field. The result is written to
    ``REPO_ROOT/models/data/dataset_<model_type>_<train_size>.pickle``.

    :param model_type: label used in the output file name.
    :param tokenizer: tokenizer callable handed to ``TfidfVectorizer``.
    :param parser: callable turning a list of records into a list of texts.
    :param table: sequence of records (dicts with a ``"flag-any"`` key).
    :param train_size: number of leading records used for training.
    :param test_size: number of trailing records used for testing.
    :return: None; the dataset is only written to disk.
    """
    vectorizer = TfidfVectorizer(tokenizer=tokenizer)
    train_rows = table[:train_size]
    test_rows = table[-test_size:]

    # fit_transform must run before transform, so the training entry comes first.
    data = {
        "X_train": vectorizer.fit_transform(parser(train_rows)),
        "Y_train": np.array([item["flag-any"] for item in train_rows]),
        "X_test": vectorizer.transform(parser(test_rows)),
        "Y_test": np.array([item["flag-any"] for item in test_rows]),
    }

    out_path = "%s/models/data/dataset_%s_%d.pickle" % (REPO_ROOT, model_type, train_size)
    # Pickle data is binary, so the file is opened in "wb" mode.
    with open(out_path, "wb") as f:
        pickle.dump(data, f)
def test_model(model, model_name, train_vector, test_vector, target_train, target_test):
    """Fit ``model`` on the training data and print its test-set report.

    :param model: estimator exposing ``fit`` and ``predict``.
    :param model_name: label printed above the report.
    :param train_vector: training feature matrix.
    :param test_vector: test feature matrix.
    :param target_train: training labels.
    :param target_test: true test labels.
    :return: None; the classification report is printed between two rules.
    """
    rule = '-' * 50
    model.fit(train_vector, target_train)
    predicted = model.predict(test_vector)
    # Single-argument parenthesised prints emit the same text as the
    # statement form under Python 2.
    print(rule)
    print(model_name)
    print(classification_report(target_test, predicted))
    print(rule)
def write_results():
    """Write the rows of ``output_table`` to ``results/linear_models.csv``.

    Each row becomes one comma-joined line under a fixed header; values are
    converted with ``str`` and are not quoted or escaped.

    NOTE(review): ``output_table`` is read from module scope but is not
    defined in any cell visible here - confirm where it is built.
    """
    header = "Model,Training set,Accuracy,Precision,Recall,F1 score\n"
    lines = []
    for row in output_table:
        lines.append(",".join(str(cell) for cell in row))
    output = header + "\n".join(lines)

    with open("%s/results/linear_models.csv" % REPO_ROOT, "w") as f:
        f.write(output)
        
def write_min_dataset():
    """Export the balanced sample: its metadata as JSON plus a copy of each script.

    Uses the module-level ``table``, ``positive_examples`` and
    ``negative_examples``. Positive records are written first, then negative.

    NOTE(review): the JSON goes under ``scripts_min/min`` while the scripts
    are copied from ``scripts`` to ``scripts/min``; ``parse_scripts`` reads
    from ``scripts_min``. Confirm the differing directories are intended.
    """
    min_table = [table[i] for i in positive_examples]
    min_table.extend(table[i] for i in negative_examples)

    with open("%s/scripts_min/min/table_flag.json" % REPO_ROOT, "w") as f:
        json.dump(min_table, f)

    for item in min_table:
        source_path = "%s/scripts/%s.js" % (REPO_ROOT, item["sha"])
        target_path = "%s/scripts/min/%s.js" % (REPO_ROOT, item["sha"])
        with open(source_path) as f1, open(target_path, "w") as f2:
            f2.write(f1.read())

    print("Wrote %d items" % len(min_table))

# Load the per-occurrence metadata table.
table_path = "%s/scripts_min/table_flag.json" % REPO_ROOT
with open(table_path) as f:
    raw_table = json.load(f)

# Filter out inline scripts for now. The comparison is kept as `== False`
# so that exactly the same rows are selected as before.
raw_table = [row for row in raw_table if row["inline"] == False]

# Collapse occurrences that share a SHA into one record:
#   * the first occurrence becomes the stored record,
#   * any "flag-*" field equal to 1 in a later occurrence is copied onto it,
#   * "count" holds the number of occurrences seen.
scripts_table = {}
for row in raw_table:
    sha = row["sha"]
    if sha not in scripts_table:
        row["count"] = 0
        scripts_table[sha] = row
    merged = scripts_table[sha]

    for key in row:
        if key.startswith("flag-") and row[key] == 1:
            merged[key] = 1

    merged["count"] += 1

# A real list, because later cells index it by position.
table = list(scripts_table.values())

# Positions in `table` of flagged ("flag-any" == 1) and unflagged (== 0) scripts.
positive_examples = []
negative_examples = []
for position, record in enumerate(table):
    if record["flag-any"] == 1:
        positive_examples.append(position)
    elif record["flag-any"] == 0:
        negative_examples.append(position)

# Fixed seed; positives are shuffled before negatives, so the order of
# these two calls determines the sample.
random.seed(1492)
random.shuffle(positive_examples)
random.shuffle(negative_examples)

# Keep at most as many negatives as there are positives.
negative_examples = negative_examples[:len(positive_examples)]
TOTAL_SIZE = len(positive_examples) + len(negative_examples)

print("%d items in raw table." % len(raw_table))
print("%d unique scripts." % len(table))
print("%d positive + %d negative examples = %d total." % (
    len(positive_examples), len(negative_examples), TOTAL_SIZE))

# Training-set sizes double from MIN_TRAIN_SIZE for as long as more than
# MIN_HELD_OUT examples would be left outside the training split.
MIN_TRAIN_SIZE = 375
MIN_HELD_OUT = 1000

TRAIN_SIZES = []
size = MIN_TRAIN_SIZE
while size < TOTAL_SIZE - MIN_HELD_OUT:
    TRAIN_SIZES.append(size)
    size *= 2

# Fail with a clear message instead of an IndexError on TRAIN_SIZES[-1].
if not TRAIN_SIZES:
    raise ValueError(
        "Only %d balanced examples: more than %d are needed to build a training split."
        % (TOTAL_SIZE, MIN_TRAIN_SIZE + MIN_HELD_OUT))

# Everything not used by the largest training split is available for testing.
TEST_SIZE = TOTAL_SIZE - TRAIN_SIZES[-1]

print("Training sizes: %s" % TRAIN_SIZES)
print("Test size: %d" % TEST_SIZE)

# One balanced training table per size: the first half of the shuffled
# positives plus the first half of the shuffled negatives.
# `//` is floor division, the same result `/` gives for ints on Python 2.
train_tables = {}
for train_size in TRAIN_SIZES:
    half = train_size // 2
    positives = [table[i] for i in positive_examples[:half]]
    negatives = [table[i] for i in negative_examples[:half]]
    train_tables[train_size] = positives + negatives

# Test table: taken from the *end* of each shuffled list. The unary minus is
# applied before the floor division, matching the original `-TEST_SIZE/2`.
test_start = -TEST_SIZE // 2
test_table = (
    [table[i] for i in positive_examples[test_start:]] +
    [table[i] for i in negative_examples[test_start:]])

# Placeholders kept as-is; nothing in the visible cells reads them.
X_train = {}
Y_train = {}
X_test = {}
Y_test = {}

# parse_scripts() selects train_tables[3000] when handed the whole dict.
script_list = [tokenize_js(script) for script in parse_scripts(train_tables)]

# Token counts over the training scripts only.
frequency = defaultdict(int)
for text in script_list:
    for token in text:
        frequency[token] += 1

# Every training token has a count of at least 1, so this filter keeps all
# of them; each script is re-joined into one space-separated string.
train_texts = [' '.join([token for token in text if frequency[token] > 0]) for text in script_list]
target_train = np.array([item["flag-any"] for item in train_tables[3000]])
target_test = np.array([item["flag-any"] for item in test_table])

script_list_test = [tokenize_js(script) for script in parse_scripts(test_table)]

# The training counts are reused here: the original rebuilt `frequency` from
# the training scripts a second time, which produced the identical table.
# Test tokens never seen in training have a count of 0 and are dropped.
test_texts = [' '.join([token for token in text if frequency[token] > 0]) for text in script_list_test]
# TODO: grid-search the chi2 percentile (0.1, 0.2, 0.3, 0.4, 0.5) and the
# number of LDA topics (100, 200, 400, 800), printing both with each report.

# Seed NumPy's global generator so repeated runs give the same model.
# scikit-learn draws from it when random_state is None; gensim's LdaModel is
# expected to fall back to it as well (TODO confirm for the installed version).
RANDOM_SEED = 1492
np.random.seed(RANDOM_SEED)

# TF-IDF features, fitted on the training texts only.
vectorizer = TfidfVectorizer(tokenizer=tokenize_js)
train_data = vectorizer.fit_transform(train_texts)
test_data = vectorizer.transform(test_texts)

# Keep the top 1 percent of TF-IDF features ranked by the chi-squared score.
chi_fs = SelectPercentile(chi2, percentile=1)
chi2_train = chi_fs.fit_transform(train_data, target_train).todense()
chi2_test = chi_fs.transform(test_data).todense()

# 100-topic LDA representation of the same scripts.
train_lda, test_lda = topic_model_train(tokenize_js, 100)

# Final feature matrix: selected TF-IDF columns followed by topic weights.
train = np.hstack([chi2_train, train_lda])
test = np.hstack([chi2_test, test_lda])

test_model(RandomForestClassifier(n_estimators=1000, random_state=RANDOM_SEED),
           'RandomForest', train, test, target_train, target_test)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
5942 items in raw table.
5942 unique scripts.
2971 positive + 2971 negative examples = 5942 total.
Training sizes: [375, 750, 1500, 3000]
Test size: 2942
--------------------------------------------------
RandomForest
             precision    recall  f1-score   support

          0       0.83      0.91      0.87      1471
          1       0.90      0.81      0.85      1471

avg / total       0.86      0.86      0.86      2942

--------------------------------------------------


In [10]:
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
import glob


In [11]:

class LabeledLineSentence(object):
    """Iterable wrapping token lists as gensim ``LabeledSentence`` objects.

    Each item of ``data`` is yielded with a single label ``'SENT_<n>'``,
    where ``n`` is the item's zero-based position. Every call to
    ``__iter__`` starts a fresh pass, so the object can be iterated
    repeatedly as long as ``data`` itself can be.
    """

    def __init__(self, data):
        # data: iterable of documents, each passed through unchanged as the
        # first argument of LabeledSentence.
        self.data = data

    def __iter__(self):
        uid = 0
        for line in self.data:
            yield LabeledSentence(line, ['SENT_%s' % uid])
            uid += 1
            


# Doc2Vec consumes token lists, so split the space-joined texts again.
train_doc_text = [joined.split() for joined in train_texts]
test_doc_text = [joined.split() for joined in test_texts]

documents = LabeledLineSentence(train_doc_text)

# Passing the corpus to the constructor trains the model immediately.
# See the gensim Doc2Vec documentation for the meaning of each option.
model = Doc2Vec(
    documents,
    size=200,
    window=15,
    min_count=5,
    workers=2,
    dm=0,
    max_vocab_size=1000000,
    hs=0,
    negative=5,
    iter=20,
    sample=5e-5,
    dbow_words=1,
)

In [14]:
### Array containing the document vector of each training document,
### looked up by integer position 0 .. len(train_doc_text) - 1.
doc2vec_train_array = np.array(
    [model.docvecs[position] for position in range(len(train_doc_text))])

### Project the test documents into the Doc2Vec embedding space so that
### performance can be evaluated on them.
doc2vec_test_array = np.array(
    [model.infer_vector(tokens) for tokens in test_doc_text])

In [19]:
# Rebuild the full feature matrices from their three parts (selected TF-IDF
# columns, LDA topic weights, Doc2Vec vectors) instead of extending `train`
# and `test` in place. The first run gives the same matrices as before, and
# re-running this cell no longer appends the Doc2Vec columns a second time.
train = np.hstack([chi2_train, train_lda, doc2vec_train_array])
test = np.hstack([chi2_test, test_lda, doc2vec_test_array])

# Fixed random_state (same value as the random.seed call earlier in the
# notebook) so the forest is reproducible.
test_model(RandomForestClassifier(n_estimators=1000, random_state=1492),
           'RandomForest', train, test, target_train, target_test)

--------------------------------------------------
RandomForest
             precision    recall  f1-score   support

          0       0.76      0.96      0.85      1471
          1       0.95      0.69      0.80      1471

avg / total       0.85      0.83      0.82      2942

--------------------------------------------------
