In [1]:
import numpy as np
from ml_helpers import ml_helpers
from redis_management import RedisManagement as rmgt
from collections import OrderedDict


In [2]:
from sklearn.cluster import DBSCAN

In [3]:
# Interactive lookup of the DBSCAN docstring (eps / min_samples / metric reference).
# NOTE(review): exploratory helper; not needed for the analysis itself.
help(DBSCAN)

Help on class DBSCAN in module sklearn.cluster.dbscan_:

class DBSCAN(sklearn.base.BaseEstimator, sklearn.base.ClusterMixin)
 |  Perform DBSCAN clustering from vector array or distance matrix.
 |  
 |  DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
 |  Finds core samples of high density and expands clusters from them.
 |  Good for data which contains clusters of similar density.
 |  
 |  Read more in the :ref:`User Guide <dbscan>`.
 |  
 |  Parameters
 |  ----------
 |  eps : float, optional
 |      The maximum distance between two samples for them to be considered
 |      as in the same neighborhood.
 |  
 |  min_samples : int, optional
 |      The number of samples (or total weight) in a neighborhood for a point
 |      to be considered as a core point. This includes the point itself.
 |  
 |  metric : string, or callable
 |      The metric to use when calculating distance between instances in a
 |      feature array. If metric is a string or callable, it must 

In [4]:
# Load the precomputed first-vector feature matrix from a local .npy file.
mat = np.load('matrix_first_vector.npy')
# Create the Redis handle and wrap its client in the ML helper used below.
# NOTE(review): 'malwares' presumably selects the Redis database/namespace — confirm in redis_management.
redis_h = rmgt('malwares')
ml_h= ml_helpers(redis_h.redis_client)
# Last expression: show the matrix dimensions as the cell output.
mat.shape


(724, 5)

# DBSCAN with the first vector

In [156]:
# Configure DBSCAN with a very small neighbourhood radius (eps).
# Per the help text above, min_samples counts the point itself, so with
# min_samples=1 every sample qualifies as a core point on its own.
dbscan = DBSCAN(
    eps=0.001,
    min_samples=1,
    metric="euclidean",
    n_jobs=8,
)

In [157]:
# Fit the estimator on the first-vector matrix; the resulting cluster ids are
# read from dbscan.labels_ in the next cell.
dbscan.fit(mat)

DBSCAN(algorithm='auto', eps=0.001, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=1, n_jobs=8, p=None)

In [158]:
# Convert the fitted cluster ids to a plain list so they can be iterated
# and passed to ml_h.set_label one by one.
labels=dbscan.labels_.tolist()

In [159]:
# Write each sample's DBSCAN cluster id back through ml_h.set_label, tagged
# with the ('DBscan', 'first_vector') identifiers.
# NOTE(review): `get_all_malwares` is read without call parentheses, so it is
# presumably a property returning a sequence — confirm in ml_helpers.
# NOTE(review): labels are paired with malwares purely by position; this
# assumes the sequence is ordered like the rows of the fitted matrix.
all_malwares = ml_h.get_all_malwares
if len(all_malwares) != len(labels):
    # Fail before writing anything: with mismatched lengths some samples
    # would be left unlabelled or the loop would abort half-way through.
    raise ValueError(
        "expected one cluster label per malware, got %d labels for %d malwares"
        % (len(labels), len(all_malwares))
    )
for malware, cluster_label in zip(all_malwares, labels):
    ml_h.set_label(malware, 'DBscan', 'first_vector', cluster_label)

In [160]:
# Group samples by their stored first-vector cluster id:
#   distrib[cluster id] -> list of (malware key, value of its 'label' field)
# Each Redis field is read exactly once per sample, and setdefault creates the
# bucket on first sight instead of relying on a KeyError round-trip.
# NOTE(review): 'DBscan_first_vector' is presumably the hash field written by
# ml_h.set_label(..., 'DBscan', 'first_vector', ...) — confirm in ml_helpers.
# NOTE(review): this uses redis_h.client while the setup cell uses
# redis_h.redis_client — confirm both attributes refer to the same client.
distrib = {}
for malware in all_malwares:
    cluster_key = redis_h.client.hget(malware, 'DBscan_first_vector')
    family = redis_h.client.hget(malware, 'label')
    distrib.setdefault(cluster_key, []).append((malware, family))

In [161]:
# Display the whole cluster -> [(malware key, 'label' value)] mapping.
# NOTE(review): this prints every entry; the per-cluster look-ups further down
# are easier to read.
distrib

{b'0': [(b'0008065861f5b09195e51add72dacd3c4bbce6444711320ad349c7dab5bb97fb',
   b'VolatileCedar.Explosion'),
  (b'0269085bb03ba9030209b0d95f0dd65f80539c27ea376eb2a8f11c68bf60cb8a',
   b'EquationGroup'),
  (b'03641e5632673615f23b2a8325d7355c4499a40f47b6ae094606a73c56e24ad0',
   b'VolatileCedar.Explosion'),
  (b'045f0ecae2362355f06d4fc8fa97e577daad1e01e6f0c0523b5b0f9e15306c74',
   b'EquationGroup'),
  (b'05455efecab4a7931fa53a3c2008d04fc6b539c5e8f451f19b617bd9b3ebcd83',
   b'WMIGhost'),
  (b'0581a38d1dc61e0da50722cb6c4253d603cc7965c87e1e42db548460d4abdcae',
   b'Win32.Lephic'),
  (b'05edcc3e5679ee254c78058c4f446e195544d3ff3374bd141c1895e7ed6a410b',
   b'Dyre'),
  (b'07529fae9e74be81fd302d022603d9f0796b4b9120b0d6131f75d41b979bbca5',
   b'VolatileCedar.Explosion'),
  (b'083c64c404ac1ea6df1a4cb6eafa91ef90b7abacc54547cf008cd74e77195746',
   b'EquationGroup'),
  (b'084a220ba90622cc223b93f32130e9f2d072679f66d1816775bf14832d492b8a',
   b'WMIGhost'),
  (b'09c04206b57bb8582faffb37e4ebb6867a02492

In [162]:
# List the distinct cluster ids found; they are bytes because they come
# straight from Redis hget.
distrib.keys()

dict_keys([b'224', b'166', b'198', b'206', b'242', b'217', b'156', b'90', b'197', b'284', b'264', b'35', b'262', b'23', b'295', b'158', b'308', b'256', b'218', b'200', b'109', b'341', b'185', b'135', b'280', b'51', b'186', b'296', b'254', b'318', b'140', b'58', b'86', b'137', b'210', b'253', b'331', b'39', b'99', b'247', b'144', b'125', b'193', b'163', b'317', b'21', b'78', b'337', b'147', b'244', b'277', b'139', b'287', b'165', b'74', b'167', b'152', b'178', b'0', b'48', b'110', b'340', b'37', b'108', b'334', b'208', b'153', b'225', b'322', b'306', b'143', b'234', b'216', b'240', b'114', b'136', b'44', b'191', b'80', b'50', b'69', b'159', b'243', b'64', b'107', b'321', b'241', b'120', b'257', b'173', b'124', b'3', b'66', b'36', b'283', b'16', b'190', b'96', b'302', b'290', b'10', b'129', b'324', b'20', b'207', b'285', b'170', b'229', b'227', b'100', b'343', b'215', b'196', b'40', b'194', b'344', b'146', b'62', b'126', b'300', b'211', b'112', b'155', b'5', b'128', b'157', b'176', b'55'

In [163]:
# Cluster-size summary: one (cluster id, member count) pair per cluster.
[(cluster_key, len(members)) for cluster_key, members in distrib.items()]

[(b'224', 1),
 (b'166', 1),
 (b'198', 1),
 (b'206', 1),
 (b'242', 1),
 (b'217', 1),
 (b'156', 1),
 (b'90', 1),
 (b'197', 1),
 (b'284', 1),
 (b'264', 1),
 (b'35', 1),
 (b'262', 1),
 (b'23', 1),
 (b'295', 1),
 (b'158', 1),
 (b'308', 1),
 (b'256', 1),
 (b'218', 1),
 (b'200', 1),
 (b'109', 1),
 (b'341', 1),
 (b'185', 1),
 (b'135', 1),
 (b'280', 1),
 (b'51', 1),
 (b'186', 1),
 (b'296', 1),
 (b'254', 1),
 (b'318', 1),
 (b'140', 1),
 (b'58', 2),
 (b'86', 1),
 (b'137', 1),
 (b'210', 1),
 (b'253', 1),
 (b'331', 1),
 (b'39', 1),
 (b'99', 1),
 (b'247', 1),
 (b'144', 1),
 (b'125', 1),
 (b'193', 1),
 (b'163', 1),
 (b'317', 1),
 (b'21', 1),
 (b'78', 1),
 (b'337', 1),
 (b'147', 1),
 (b'244', 1),
 (b'277', 1),
 (b'139', 1),
 (b'287', 1),
 (b'165', 1),
 (b'74', 1),
 (b'167', 1),
 (b'152', 1),
 (b'178', 1),
 (b'0', 299),
 (b'48', 1),
 (b'110', 1),
 (b'340', 1),
 (b'37', 1),
 (b'108', 1),
 (b'334', 1),
 (b'208', 2),
 (b'153', 1),
 (b'225', 1),
 (b'322', 1),
 (b'306', 1),
 (b'143', 1),
 (b'234', 1),
 (b'2

In [164]:
# Inspect the (malware key, 'label' value) pairs grouped under cluster id b'4'.
distrib[b'4']

[(b'022224bfad26bab87cf5f4b17981a4224ef8fa6919520b3bc2946234efda1e11',
  b'EquationGroup'),
 (b'340b09d661a6ac45af53c348a5c1846ad6323d34311e66454e46c1d38d53af8b',
  b'PotaoExpress'),
 (b'3b0520cb529168cad100ab5512fc4c520d28fa239237f01c5a09cc4df7bd8eeb',
  b'EquationGroup'),
 (b'9562c06202c391d33b961887193eb92a55c7ebee9d22d283758a087313ab2565',
  b'EquationGroup'),
 (b'98ed964765ba84e04bbcd771c70ce91835a1af82c7571ac9e63e6ee35c7f7098',
  b'EquationGroup'),
 (b'9c17f267f79597ee01515f5ef925375d8a19844830cc46917a3d1b5bcb0ba4c3',
  b'WMIGhost'),
 (b'bce34c6e3de69a401257ebe9aed5fec7dc659011ce1606eed374b7a0ef71fff3',
  b'EquationGroup'),
 (b'ec5901dd85bdfe3a9e6e10fd45a4583d7d5ce11a79b9a42cc69a7a17b046e76b',
  b'EquationGroup'),
 (b'edccf8c9a7184c80737767f414aad482c2d8fe17f437e2030617deacbaa5e763',
  b'EquationGroup')]

In [165]:
# Inspect the (malware key, 'label' value) pairs grouped under cluster id b'0'.
distrib[b'0']

[(b'0008065861f5b09195e51add72dacd3c4bbce6444711320ad349c7dab5bb97fb',
  b'VolatileCedar.Explosion'),
 (b'0269085bb03ba9030209b0d95f0dd65f80539c27ea376eb2a8f11c68bf60cb8a',
  b'EquationGroup'),
 (b'03641e5632673615f23b2a8325d7355c4499a40f47b6ae094606a73c56e24ad0',
  b'VolatileCedar.Explosion'),
 (b'045f0ecae2362355f06d4fc8fa97e577daad1e01e6f0c0523b5b0f9e15306c74',
  b'EquationGroup'),
 (b'05455efecab4a7931fa53a3c2008d04fc6b539c5e8f451f19b617bd9b3ebcd83',
  b'WMIGhost'),
 (b'0581a38d1dc61e0da50722cb6c4253d603cc7965c87e1e42db548460d4abdcae',
  b'Win32.Lephic'),
 (b'05edcc3e5679ee254c78058c4f446e195544d3ff3374bd141c1895e7ed6a410b',
  b'Dyre'),
 (b'07529fae9e74be81fd302d022603d9f0796b4b9120b0d6131f75d41b979bbca5',
  b'VolatileCedar.Explosion'),
 (b'083c64c404ac1ea6df1a4cb6eafa91ef90b7abacc54547cf008cd74e77195746',
  b'EquationGroup'),
 (b'084a220ba90622cc223b93f32130e9f2d072679f66d1816775bf14832d492b8a',
  b'WMIGhost'),
 (b'09c04206b57bb8582faffb37e4ebb6867a02492ffc08268bcbc717708d1a8919',

In [166]:
# Inspect the (malware key, 'label' value) pairs grouped under cluster id b'1'.
distrib[b'1']

[(b'003315b0aea2fcb9f77d29223dd8947d0e6792b3a0227e054be8eb2a11f443d9',
  b'EquationGroup.Fanny')]

# DBSCAN with the second vector

In [167]:
import numpy as np
from ml_helpers import ml_helpers
from redis_management import RedisManagement as rmgt
from collections import OrderedDict
from sklearn.cluster import DBSCAN

In [175]:
# Load the precomputed second-vector feature matrix from a local .npy file.
mat_second_vector = np.load('matrix_second_vector.npy')
# Re-create the Redis handle and ML helper with the same arguments as in the
# first-vector section.
redis_h = rmgt('malwares')
ml_h= ml_helpers(redis_h.redis_client)
# Last expression: show the matrix dimensions as the cell output.
mat_second_vector.shape


(724, 5)

In [176]:
# Fresh DBSCAN estimator for the second vector, with the same settings as the
# first-vector run (rebinds the `dbscan` name).
dbscan = DBSCAN(
    eps=0.001,
    min_samples=1,
    metric="euclidean",
    n_jobs=8,
)

In [177]:
# Fit the estimator on the second-vector matrix; the resulting cluster ids are
# read from dbscan.labels_ in the next cell.
dbscan.fit(mat_second_vector)

DBSCAN(algorithm='auto', eps=0.001, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=1, n_jobs=8, p=None)

In [178]:
# Convert the fitted cluster ids to a plain list (overwrites the first-vector
# `labels` from the earlier section).
labels=dbscan.labels_.tolist()

In [179]:
# Write each sample's DBSCAN cluster id back through ml_h.set_label, tagged
# with the ('DBscan', 'second_vector') identifiers.
# NOTE(review): `get_all_malwares` is read without call parentheses, so it is
# presumably a property returning a sequence — confirm in ml_helpers.
# NOTE(review): labels are paired with malwares purely by position; this
# assumes the sequence is ordered like the rows of the fitted matrix.
all_malwares = ml_h.get_all_malwares
if len(all_malwares) != len(labels):
    # Fail before writing anything: with mismatched lengths some samples
    # would be left unlabelled or the loop would abort half-way through.
    raise ValueError(
        "expected one cluster label per malware, got %d labels for %d malwares"
        % (len(labels), len(all_malwares))
    )
for malware, cluster_label in zip(all_malwares, labels):
    ml_h.set_label(malware, 'DBscan', 'second_vector', cluster_label)

In [180]:
# Group samples by their stored second-vector cluster id:
#   distrib[cluster id] -> list of (malware key, value of its 'label' field)
# Each Redis field is read exactly once per sample, and setdefault creates the
# bucket on first sight instead of relying on a KeyError round-trip.
# NOTE(review): 'DBscan_second_vector' is presumably the hash field written by
# ml_h.set_label(..., 'DBscan', 'second_vector', ...) — confirm in ml_helpers.
# NOTE(review): this uses redis_h.client while the setup cell uses
# redis_h.redis_client — confirm both attributes refer to the same client.
distrib = {}
for malware in all_malwares:
    cluster_key = redis_h.client.hget(malware, 'DBscan_second_vector')
    family = redis_h.client.hget(malware, 'label')
    distrib.setdefault(cluster_key, []).append((malware, family))

In [181]:
# List the distinct second-vector cluster ids; they are bytes because they
# come straight from Redis hget.
distrib.keys()

dict_keys([b'224', b'283', b'222', b'166', b'1', b'32', b'198', b'206', b'242', b'217', b'57', b'228', b'6', b'156', b'90', b'197', b'189', b'219', b'220', b'269', b'35', b'289', b'262', b'59', b'158', b'84', b'25', b'202', b'19', b'200', b'109', b'164', b'43', b'284', b'185', b'135', b'280', b'51', b'263', b'63', b'186', b'256', b'241', b'254', b'134', b'2', b'268', b'58', b'290', b'201', b'187', b'86', b'137', b'210', b'230', b'14', b'279', b'148', b'253', b'274', b'39', b'209', b'247', b'46', b'99', b'208', b'121', b'144', b'125', b'105', b'111', b'138', b'225', b'180', b'163', b'21', b'118', b'78', b'204', b'175', b'147', b'141', b'246', b'277', b'139', b'287', b'231', b'273', b'177', b'74', b'167', b'178', b'0', b'9', b'48', b'110', b'97', b'229', b'102', b'37', b'23', b'218', b'106', b'160', b'29', b'153', b'168', b'236', b'165', b'152', b'119', b'182', b'94', b'234', b'130', b'240', b'114', b'136', b'67', b'216', b'44', b'237', b'191', b'80', b'133', b'221', b'24', b'250', b'88'

In [182]:
# Cluster-size summary: one (cluster id, member count) pair per cluster.
[(cluster_key, len(members)) for cluster_key, members in distrib.items()]

[(b'224', 1),
 (b'283', 1),
 (b'222', 1),
 (b'166', 1),
 (b'1', 352),
 (b'32', 1),
 (b'198', 1),
 (b'206', 1),
 (b'242', 1),
 (b'217', 1),
 (b'57', 1),
 (b'228', 1),
 (b'6', 2),
 (b'156', 1),
 (b'90', 1),
 (b'197', 1),
 (b'189', 1),
 (b'219', 1),
 (b'220', 1),
 (b'269', 1),
 (b'35', 1),
 (b'289', 1),
 (b'262', 1),
 (b'59', 1),
 (b'158', 1),
 (b'84', 1),
 (b'25', 2),
 (b'202', 1),
 (b'19', 1),
 (b'200', 1),
 (b'109', 1),
 (b'164', 2),
 (b'43', 1),
 (b'284', 1),
 (b'185', 1),
 (b'135', 1),
 (b'280', 1),
 (b'51', 5),
 (b'263', 1),
 (b'63', 1),
 (b'186', 1),
 (b'256', 1),
 (b'241', 1),
 (b'254', 1),
 (b'134', 1),
 (b'2', 11),
 (b'268', 1),
 (b'58', 1),
 (b'290', 1),
 (b'201', 2),
 (b'187', 1),
 (b'86', 2),
 (b'137', 1),
 (b'210', 1),
 (b'230', 1),
 (b'14', 1),
 (b'279', 1),
 (b'148', 1),
 (b'253', 1),
 (b'274', 1),
 (b'39', 1),
 (b'209', 1),
 (b'247', 1),
 (b'46', 1),
 (b'99', 1),
 (b'208', 1),
 (b'121', 1),
 (b'144', 1),
 (b'125', 1),
 (b'105', 1),
 (b'111', 1),
 (b'138', 1),
 (b'225', 1)

In [183]:
# Inspect the (malware key, 'label' value) pairs grouped under cluster id b'1'.
distrib[b'1']

[(b'003315b0aea2fcb9f77d29223dd8947d0e6792b3a0227e054be8eb2a11f443d9',
  b'EquationGroup.Fanny'),
 (b'022224bfad26bab87cf5f4b17981a4224ef8fa6919520b3bc2946234efda1e11',
  b'EquationGroup'),
 (b'0269085bb03ba9030209b0d95f0dd65f80539c27ea376eb2a8f11c68bf60cb8a',
  b'EquationGroup'),
 (b'037bdc95919b1d3d65af6202e8c9c9ca3caba7a863e4e39162b93fa032881feb',
  b'EquationGroup'),
 (b'0404b8957c27de20bebb133d3cf0a28e30700f667f7c2f3fe7fde7e726b691cd',
  b'EquationGroup'),
 (b'043eec3e120c34cd0ac6c954c2ee6dd533a0a647367ee2ca2d5a508a4a8ac75c',
  b'EquationGroup'),
 (b'045f0ecae2362355f06d4fc8fa97e577daad1e01e6f0c0523b5b0f9e15306c74',
  b'EquationGroup'),
 (b'06cd057c035acbff3e9294860053ce9ec237a166e52018b6651250fa99e1a8a0',
  b'EquationGroup'),
 (b'083c64c404ac1ea6df1a4cb6eafa91ef90b7abacc54547cf008cd74e77195746',
  b'EquationGroup'),
 (b'0a39d48ce6fddd2feb5aefb26c3f437cf460dbf8670544ae9f1bd594856ac178',
  b'EquationGroup'),
 (b'0ac533252bd0595e5f983da38c18f89aa8cb6fbba6c85acd19be57b614338bdf',
  b

In [184]:
# Inspect the (malware key, 'label' value) pairs grouped under cluster id b'8'.
distrib[b'8']

[(b'05455efecab4a7931fa53a3c2008d04fc6b539c5e8f451f19b617bd9b3ebcd83',
  b'WMIGhost'),
 (b'0eb038e7e5edd6ac1b4eee8dd1c51b6d94da24d02ba705e7e7f10b41edf701c2',
  b'WMIGhost'),
 (b'6b91fdb0992ca029c913092db7b4fd94c917c1473953d1ec77c74d030776fe9a',
  b'WMIGhost'),
 (b'c7128e2772b4f8c59943028e205d1b23c07f36206c1c61a05645c7bf143b24ee',
  b'WMIGhost')]

In [185]:
# Inspect the (malware key, 'label' value) pairs grouped under cluster id b'93'.
distrib[b'93']

[(b'4e39bc95e35323ab586d740725a1c8cbcde01fe453f7c4cac7cced9a26e42cc9',
  b'Trojan.Regin'),
 (b'5001793790939009355ba841610412e0f8d60ef5461f2ea272ccf4fd4c83b823',
  b'Trojan.Regin'),
 (b'a7493fac96345a989b1a03772444075754a2ef11daa22a7600466adc1f69a669',
  b'Trojan.Regin'),
 (b'c0cf8e008fbfa0cb2c61d968057b4a077d62f64d7320769982d28107db370513',
  b'Trojan.Regin')]

In [186]:
# Inspect the (malware key, 'label' value) pairs grouped under cluster id b'76'.
distrib[b'76']

[(b'42028874fae37ad9dc89eb37149ecb1e6439869918309a07f056924c1b981def',
  b'PotaoExpress'),
 (b'a3a43bbc69e24c0bc3ab06fbf3ccc35cf8687e2862f86fb0d269258b68c710c9',
  b'PotaoExpress'),
 (b'b8844e5b72971fe67d2905e77ddaa3366ae1c3bead92be6effd58691bc1ff8ec',
  b'PotaoExpress')]