## - Combine all data 

In [1]:
import pandas as pd
from os import listdir

# Directory holding the per-query CSV dumps that are merged into one frame.
path = '../data/'
# listdir() returns entries in arbitrary order; sort so the row order of the
# combined frame is the same on every run.
files = sorted(listdir(path))

# Collect the non-empty frames first and concatenate once at the end.
# Growing a DataFrame with df.append() inside the loop copies all rows on
# every iteration, and DataFrame.append was removed in pandas 2.0.
frames = []
for f in files:
    temp = pd.read_csv(path + f)
    if len(temp) < 1:
        continue
    frames.append(temp)

if frames:
    # sort=False keeps the columns in order of first appearance and silences
    # the "sort" FutureWarning that the append-based version produced.
    df = pd.concat(frames, sort=False)
else:
    # No usable input files: fall back to an empty frame with the expected columns.
    df = pd.DataFrame(columns=["url", "query", "text"])

# 'Unnamed: 0' is dropped when present; errors='ignore' keeps this cell
# working when none of the inputs carry that column.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


## - data preprocessing
    1. stop word removal
    2. lowercasing
    3. non-ASCII character removal
    4. punctuation removal

In [2]:
from nltk.corpus import stopwords
import re
# English stop-word list from NLTK; read as a module-level global by remove_stop_words.
stop = stopwords.words('english')

def normalize_text(text):
    """Lowercase ``text``, turn HTML line breaks into spaces and pad punctuation.

    The characters . " , ( ) ! ? ; : are each surrounded by a space on both
    sides so that they separate from neighbouring words when the text is
    later split on whitespace.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Lowercasing happens first, so '<BR />' is also caught by the replace.
    lowered = text.lower().replace('<br />', ' ')
    return re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", lowered)

def remove_stop_words(text, stop_words=None):
    """Lowercase every whitespace-separated token and drop the stop words.

    Tokens are lowercased *before* the stop-word check, so capitalised stop
    words such as "The" are removed as well.

    Args:
        text: The string to filter.
        stop_words: Optional iterable of lowercase stop words. When omitted,
            the module-level ``stop`` list is used.

    Returns:
        The remaining lowercase tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = stop
    # A set gives constant-time membership tests instead of scanning a list per token.
    stop_set = set(stop_words)
    lowered_tokens = (token.lower() for token in text.split())
    return " ".join(token for token in lowered_tokens if token not in stop_set)

def remove_non_ascii(text):
    """Return ``text`` restricted to printable ASCII characters.

    Only characters whose code point lies in 32..126 are kept. Everything
    else is deleted without a replacement; this includes non-ASCII characters
    as well as control characters such as newline and tab.

    Args:
        text: The string to filter.

    Returns:
        The filtered string.
    """
    return ''.join(ch for ch in text if 32 <= ord(ch) <= 126)

# Clean the text column in a fixed order: strip non-printable/non-ASCII
# characters, lowercase and pad punctuation, drop stop words, then delete any
# remaining character that is neither a word character nor whitespace.
# The final pattern is a raw string with regex=True spelled out: newer pandas
# treats the pattern as a literal by default, which would leave punctuation
# in place without raising any error.
df['text'] = (
    df['text']
    .apply(remove_non_ascii)
    .apply(normalize_text)
    .apply(remove_stop_words)
    .str.replace(r'[^\w\s]', '', regex=True)
)

## - a simple word2vec model
    In this section we train a simple word2vec model on the tokenized data.

In [3]:
from gensim.models import Word2Vec
from nltk import word_tokenize



In [4]:
# Tokenize the cleaned text of every document. Applying word_tokenize to the
# column directly avoids building a row Series per record, which the
# DataFrame-level apply(axis=1) does, and also works on an empty frame.
df['tokenized_text'] = df['text'].apply(word_tokenize)

In [5]:
# Fit a word2vec model with 100-dimensional vectors on the tokenized documents.
model = Word2Vec(
    sentences=df['tokenized_text'],
    size=100,
)

In [6]:
# Inspect the learned vocabulary: a dict mapping each token to its gensim Vocab entry.
# NOTE(review): this displays the whole mapping; consider a slice or len() to keep the output short.
model.wv.vocab

{'limited': <gensim.models.keyedvectors.Vocab at 0x7f8d6016a2d0>,
 'dynamic': <gensim.models.keyedvectors.Vocab at 0x7f8d6013e610>,
 'four': <gensim.models.keyedvectors.Vocab at 0x7f8da4245d10>,
 'laterally': <gensim.models.keyedvectors.Vocab at 0x7f8d601aeb90>,
 'asian': <gensim.models.keyedvectors.Vocab at 0x7f8d6013e650>,
 'coreshell': <gensim.models.keyedvectors.Vocab at 0x7f8d6013e690>,
 'looking': <gensim.models.keyedvectors.Vocab at 0x7f8d60558ed0>,
 'patches': <gensim.models.keyedvectors.Vocab at 0x7f8d6016a350>,
 'tweet': <gensim.models.keyedvectors.Vocab at 0x7f8d6016a390>,
 'supported': <gensim.models.keyedvectors.Vocab at 0x7f8d601653d0>,
 'patched': <gensim.models.keyedvectors.Vocab at 0x7f8d6016a3d0>,
 'worth': <gensim.models.keyedvectors.Vocab at 0x7f8d60198390>,
 'sogu': <gensim.models.keyedvectors.Vocab at 0x7f8d601983d0>,
 'updated': <gensim.models.keyedvectors.Vocab at 0x7f8d6013e710>,
 'implants': <gensim.models.keyedvectors.Vocab at 0x7f8d6016a310>,
 'regional': <g

In [7]:
# For every APT group number of interest, print the nearest neighbours of the
# token "apt<N>" when the model's vocabulary contains it.
for num in [1, 3, 5, 10, 12, 16, 17, 18, 19, 28, 29, 30, 32, 33, 34, 37, 38]:
    term = "apt%s" % num
    if term in model.wv.vocab:
        print("Most similar words for %s" % term)
        # Query through model.wv: calling most_similar on the model itself is
        # deprecated, and the vocabulary check above already goes through wv.
        for t in model.wv.most_similar(term):
            print(t)
        print('\n')

Most similar words for apt1
('report', 0.999941349029541)
('code', 0.9999402761459351)
('malware', 0.9999359846115112)
('used', 0.9999310970306396)
('one', 0.9999299645423889)
('number', 0.9999285936355591)
('using', 0.9999278783798218)
('security', 0.9999271035194397)
('data', 0.9999256730079651)
('also', 0.9999256730079651)


Most similar words for apt3
('process', 0.9953946471214294)
('persistent', 0.9953526258468628)
('advanced', 0.9953408241271973)
('functionality', 0.9953373670578003)
('earlier', 0.9953318238258362)
('45', 0.9953290820121765)
('calls', 0.9953078627586365)
('unusual', 0.995301365852356)
('would', 0.9952923059463501)
('api', 0.9952799677848816)


Most similar words for apt10
('two', 0.9995896220207214)
('targeted', 0.9995874762535095)
('groups', 0.9995850920677185)
('time', 0.9995764493942261)
('report', 0.9995744228363037)
('2015', 0.9995692372322083)
('attackers', 0.9995688199996948)
('user', 0.9995658993721008)
('including', 0.9995650053024292)
('attacker', 0.99

  """
  if np.issubdtype(vec.dtype, np.int):


### Here we got one interesting result: for apt17, the model returns apt28
    For all the other word2vec results, however, the most similar items are generic words such as malware, attackers, groups, and backdoor.  
    It might be the case that the names of attacker groups are omitted because they are phrases rather than single words.

## - word2vec with bigram phrases
    Here we try to find bigram phrases in the dataset and train a word2vec model on the resulting phrase-level tokens.

In [8]:
from gensim.models import Phrases
from collections import Counter

In [9]:
# Start from an empty gensim Phrases model; the corpus vocabulary is fed in separately via add_vocab.
bigram = Phrases()

In [10]:
# Feed the tokenized documents into the Phrases model so that bigram.vocab holds the counts read below.
bigram.add_vocab(df['tokenized_text'])

In [11]:
# Tally the entries of the Phrases vocabulary whose key contains the "_"
# delimiter, i.e. the candidate bigrams rather than single tokens.
bigram_counter = Counter()
for key, count in bigram.vocab.items():
    # Vocabulary keys may be byte strings depending on the gensim/Python
    # version; normalise them to text so the "_" test and the printed output
    # behave the same everywhere.
    token = key.decode("utf-8") if isinstance(key, bytes) else key
    if "_" in token:
        bigram_counter[token] += count

# Show the 20 most frequent candidates, left-aligned in a 20-character column.
# print() is called as a function, matching the other cells of this notebook.
for token, count in bigram_counter.most_common(20):
    print('{0: <20} {1}'.format(token, count))

ee_ee                49
00_00                34
threat_actors        32
kaspersky_lab        31
middle_east          30
cyber_espionage      30
c2_server            28
threat_actor         25
vn_hanoi             24
apt_groups           24
200_83               24
program_files        23
c_program            22
files_x86            22
figure_1             21
naikon_apt           20
11_01                19
turnedup_201461      19
201461_11            19
central_asia         18


In [12]:
# Fit a second word2vec model with 100-dimensional vectors, this time on the
# tokenized documents passed through the bigram Phrases model.
bigram_model = Word2Vec(
    sentences=bigram[df['tokenized_text']],
    size=100,
)



In [13]:
# For every APT group number of interest, print the nearest neighbours of the
# token "apt<N>" in the bigram-based model when its vocabulary contains it.
for num in [1, 3, 5, 10, 12, 16, 17, 18, 19, 28, 29, 30, 32, 33, 34, 37, 38]:
    term = "apt%s" % num
    if term in bigram_model.wv.vocab:
        print("Most similar words for %s" % term)
        # Query through bigram_model.wv: calling most_similar on the model
        # itself is deprecated, and the vocabulary check above already goes through wv.
        for t in bigram_model.wv.most_similar(term):
            print(t)
        print('\n')

Most similar words for apt1
(u'code', 0.9999025464057922)
(u'used', 0.9998924732208252)
(u'malware', 0.9998922944068909)
(u'report', 0.9998915195465088)
(u'using', 0.9998892545700073)
(u'one', 0.9998818039894104)
(u'information', 0.9998788833618164)
(u'data', 0.9998749494552612)
(u'malicious', 0.9998738765716553)
(u'also', 0.9998722076416016)


Most similar words for apt3
(u'dropper', 0.9883150458335876)
(u'tracked', 0.9882252216339111)
(u'100', 0.9881900548934937)
(u'process', 0.9881615042686462)
(u'functionality', 0.9881383180618286)
(u'unusual', 0.9881379008293152)
(u'would', 0.9881075024604797)
(u'calls', 0.9881061315536499)
(u'earlier', 0.9880630373954773)
(u'built', 0.9880538582801819)


Most similar words for apt10
(u'two', 0.9993767738342285)
(u'report', 0.9993574619293213)
(u'groups', 0.9993534684181213)
(u'time', 0.9993522763252258)
(u'file', 0.9993485808372498)
(u'attackers', 0.9993419647216797)
(u'example', 0.9993366003036499)
(u'targeted', 0.9993312358856201)
(u'mandiant',

  """


### Even after applying bigram phrases, we still do not see the desired results.