## - Combine all data 

In [22]:
import pandas as pd
from os import listdir

path = '../data/'
files = listdir('../data/')
df = pd.DataFrame(columns=["url", "query", "text"])

for f in files:
    temp = pd.read_csv(path + f)
    if 'article-name' in temp.columns:
        temp.rename(columns={'article-name':'name','article-url':'url','content':'text','keyword':'query'}, inplace=True)
    if len(temp) < 1:
        continue
    df = df.append(temp)
df.drop(['Unnamed: 0', 'name'], inplace=True, axis=1)

## - data preprocessing
    1. stop word removal
    2. lower case letters
    3. non ascii character removal

In [23]:
from nltk.corpus import stopwords
import re
stop = stopwords.words('english')

def normalize_text(text):
    norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    norm_text = re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", norm_text)
    return norm_text

def remove_stop_words(text):
    return " ".join([item.lower() for item in text.split() if item not in stop])

def remove_non_ascii(text):
    return ''.join(["" if ord(i) < 32 or ord(i) > 126 else i for i in text])

df['text'] = df['text'].apply(remove_non_ascii)
df['text'] = df['text'].apply(normalize_text)
df['text'] = df['text'].apply(remove_stop_words)
df["text"] = df['text'].str.replace('[^\w\s]','')

## - a simple word2vec model
    In this section we apply simple word to vec model to tokenized data.

In [24]:
from gensim.models import Word2Vec
from nltk import word_tokenize

In [25]:
df['tokenized_text'] = df.apply(lambda row: word_tokenize(row['text']), axis=1)

In [26]:
model = Word2Vec(df['tokenized_text'], size=100)

In [27]:
for num in [1, 3, 5, 10, 12, 16, 17, 18, 19, 28, 29, 30, 32, 33, 34, 37, 38]:
    term = "apt%s"%str(num)
    if term in model.wv.vocab:
        print("Most similar words for %s"%term)
        for t in model.most_similar(term): print(t)
        print('\n')

Most similar words for apt1
('iranian', 0.999222993850708)
('mandiant', 0.9991157054901123)
('north', 0.9987305402755737)
('according', 0.9986847639083862)
('apt10', 0.997677206993103)
('previously', 0.997546374797821)
('launched', 0.9974732398986816)
('discovered', 0.9972922801971436)
('suspected', 0.9971252083778381)
('related', 0.9971103668212891)


Most similar words for apt3
('team', 0.9952455759048462)
('observed', 0.9951759576797485)
('cyberespionage', 0.9946916103363037)
('strontium', 0.9946668148040771)
('spotted', 0.994046151638031)
('cozy', 0.9936408996582031)
('also', 0.993202269077301)
('tracked', 0.9931139945983887)
('aimed', 0.9927853345870972)
('behind', 0.9927506446838379)


Most similar words for apt10
('apt37', 0.9993234276771545)
('tracks', 0.999140739440918)
('iranian', 0.9987781047821045)
('overlap', 0.9986939430236816)
('previously', 0.9986189007759094)
('recently', 0.9984301328659058)
('primarily', 0.9983476400375366)
('statesponsored', 0.9979463815689087)
('chi

  """


### here we got one interesting result for apt17 as apt28
    but for all other word2vec results we observe that we are getting names like malware, attackers, groups, backdoor in the most similar items.  
    It might be the case that the names of attacker groups are ommited because they are phrases instead simple words.

## - word2vec with bigram phrases
    here we try to find bigram phrases from the dataset and apply word2vec model to it

In [28]:
from gensim.models import Phrases
from collections import Counter

In [29]:
bigram = Phrases()

In [30]:
bigram.add_vocab(df['tokenized_text'])

In [31]:
bigram_counter = Counter()
for key in bigram.vocab.keys():
    if len(key.split("_")) > 1:
        bigram_counter[key] += bigram.vocab[key]

for key, counts in bigram_counter.most_common(20):
    print '{0: <20} {1}'.format(key.encode("utf-8"), counts)

cyber_security       353
security_conference  334
ics_cyber            334
document_getelementsbytagname 163
comjsplusone_js      163
conference_singapore 163
google_comjsplusone  163
script_0             163
ciso_forum           163
forum_half           163
document_createelement 163
po_src               163
apis_google          163
textjavascript_po    163
type_textjavascript  163
po_async             163
var_po               163
parentnode_insertbefore 163
async_true           163
po_type              163


In [20]:
bigram_model = Word2Vec(bigram[df['tokenized_text']], size=100)



In [21]:
for num in [1, 3, 5, 10, 12, 16, 17, 18, 19, 28, 29, 30, 32, 33, 34, 37, 38]:
    term = "apt%s"%str(num)
    if term in bigram_model.wv.vocab:
        print("Most similar words for %s"%term)
        for t in bigram_model.most_similar(term): print(t)
        print('\n')

Most similar words for apt1
(u'2017', 0.9999014139175415)
(u'campaigns', 0.9998936057090759)
(u'vulnerabilities', 0.9998923540115356)
(u'however', 0.9998865127563477)
(u'says', 0.9998756647109985)
(u'microsoft', 0.999866247177124)
(u'behind', 0.9998636245727539)
(u'recently', 0.999862551689148)
(u'first', 0.9998561143875122)
(u'actors', 0.9998513460159302)


Most similar words for apt3
(u'various', 0.9997950792312622)
(u'hackers', 0.9997645616531372)
(u'threat_actor', 0.9997585415840149)
(u'infrastructure', 0.999748170375824)
(u'fancy_bear', 0.9997477531433105)
(u'cyber_espionage', 0.9997457265853882)
(u'different', 0.9997384548187256)
(u'operation', 0.9997243881225586)
(u'found', 0.9997132420539856)
(u'pawn_storm', 0.9997034072875977)


Most similar words for apt10
(u'number', 0.999851644039154)
(u'code', 0.9998489618301392)
(u'systems', 0.9998442530632019)
(u'likely', 0.9998415112495422)
(u'even', 0.9998409748077393)
(u'kaspersky', 0.9998403787612915)
(u'least', 0.9998272657394409)
(

  """


### After applying bigram phrases still we cannot see the desired results. 