# Setup

In [1]:
# ML algorithms
from sklearn.cluster import KMeans

# preprocessing
from sklearn.preprocessing import StandardScaler, Imputer

import numpy as np
from numpy import nan

from pprint import pprint

import math

# Read data

In [2]:
with open("data/dataset_26977", "r", encoding='utf-8') as f:
    data = eval(f.read())

X = np.array([d['feat'] for d in data])
y = np.array([d['target'] for d in data])
feat_labels = data[0]['feat_labels']

print(X.shape)
print(y.shape)
print(feat_labels)
pprint(X[:5])
pprint(y[:5])
print("# compromised: ", len([tar for tar in y if tar==1]))
print("# malicious: ", len([tar for tar in y if tar==0]))
print("# unknown: ", len([tar for tar in y if tar==-1]))
print("% compromised: ", len([tar for tar in y if tar==1])/len(y))
print("% malicious: ", len([tar for tar in y if tar==0])/len(y))
print("% unknown: ", len([tar for tar in y if tar==-1])/len(y))
# print("# archived true: ", len([x for x in Xtrain if x[0]==1]))
# print("# years_active: ", len([x for x in Xtrain if not np.isnan(x[1])]))
# print("# years_inactive: ", len([x for x in Xtrain if x[2]<=1]))
# print("# captures: ", len([x for x in Xtrain if not np.isnan(x[3])]))
# print("# archive_redirected true: ", len([x for x in Xtrain if x[4]==1]))
# print("# freenom tld true: ", len([x for x in Xtrain if x[5]==1]))
# print("# wildcard_subdomain true: ", len([x for x in Xtrain if x[6]==1]))
# print("# reachable true: ", len([i for i, x in enumerate(Xtrain) if x[7]==1]))
# print("# blocked true: ", len([x for x in Xtrain if x[8]==1]))
# print("# alexa_rank true: ", len([x for x in Xtrain if x[9]>-1]))
# print("# brandname >80: ", len([x for x in Xtrain if x[-1]>80]))

(26977, 19)
(26977,)
['archived', 'years_active', 'years_inactive', 'num_captures', 'freenom_tld', 'prev_mal_tld', 'wildcard_subdomain', 'reachable', 'redirected', 'blocked', 'alexa_rank', 'ratio_longest_word', 'contain_digit', 'contain_dash', 'name_len', 'brandname_partialratio', 'prev_mal_domain_ed', 'sub_levels', 'num_sub']
array([[1.00000000e+00, 9.00000000e+00, 1.00000000e+00, 7.42000000e+02,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.14676000e+05, 3.75000000e-01,
        1.00000000e+00, 0.00000000e+00, 4.00000000e+00, 6.70000000e+01,
        2.50000000e-01, 3.00000000e+00, 8.00000000e+00],
       [0.00000000e+00,            nan,            nan,            nan,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
                   nan,            nan,            nan, 2.00000000e-01,
        1.00000000e+00, 0.00000000e+00, 7.00000000e+00, 6.70000000e+01,
        1.42857143e-01, 2.00000000e+00

# Normalize and Impute

In [3]:
imp = Imputer(missing_values=np.nan, strategy='mean')
imp.fit(X)
X = imp.transform(X)

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
print(X[2])

[ 1.0387732   2.34597843 -1.0895996  -0.05839451 -0.31056035 -0.29937868
  2.31279556  0.92100277 -0.40652599 -0.31751894  3.95687224 -1.13001204
  2.93672114 -0.48923731 -0.99644922 -1.51404367  0.83925415 -0.48958764
 -0.04328799]




# Cluster

In [4]:
def getEntropy(pred, pred_label):
    t_total = len(data)
    ent_w = 0
    #print(t_total)
    for pred, labels in pred_label.items():
        #print(pred)
        neg, pos, unknown = labels[0], labels[1], labels[-1]
        total = neg+pos+unknown
        if total > 0:
            #print(total)
            neg_pr = neg/total
            if neg_pr == 0 or neg_pr == 1:
                ent = 0
            else:
                #print(neg_pr)
                pos_pr = 1-neg_pr
                ent = -neg_pr*math.log(neg_pr, 2) - pos_pr*math.log(pos_pr, 2)
            #print(ent)
        else:
            ent = 0
        ent_w += ent*(total/t_total)
    return ent_w

In [9]:
n_clusters = [2,3,4,5,6,7,8,9,10,11,12]
#n_clusters = [8]
#n_c = 5
for n_c in n_clusters:
    print("n_clusters:", n_c)
    clf = KMeans(n_clusters=n_c)
    pred = clf.fit_predict(X)

    pred_label = dict()
    for i, pr in enumerate(pred):
        label = y[i]
        if pr not in pred_label:
            pred_label[pr] = dict()
            pred_label[pr][0] = 0
            pred_label[pr][1] = 0
            pred_label[pr][-1] = 0


        pred_label[pr][label] += 1

    print(pred_label)
    print("entropy:", getEntropy(pred, pred_label))
    print()

n_clusters: 2
{1: {0: 69, 1: 383, -1: 14164}, 0: {0: 420, 1: 103, -1: 11838}}
entropy: 0.12148366331996381

n_clusters: 3
{0: {0: 63, 1: 385, -1: 14294}, 2: {0: 96, 1: 99, -1: 9822}, 1: {0: 330, 1: 2, -1: 1886}}
entropy: 0.10060091378259056

n_clusters: 4
{2: {0: 63, 1: 349, -1: 13496}, 0: {0: 96, 1: 101, -1: 9901}, 1: {0: 330, 1: 2, -1: 1884}, 3: {0: 0, 1: 34, -1: 721}}
entropy: 0.10042858147829653

n_clusters: 5
{2: {0: 7, 1: 45, -1: 2734}, 3: {0: 70, 1: 27, -1: 2267}, 4: {0: 63, 1: 306, -1: 11779}, 1: {0: 349, 1: 74, -1: 8501}, 0: {0: 0, 1: 34, -1: 721}}
entropy: 0.11935883572480038

n_clusters: 6
{4: {0: 2, 1: 101, -1: 3445}, 1: {0: 75, 1: 79, -1: 5742}, 0: {0: 3, 1: 29, -1: 4180}, 3: {0: 308, 1: 2, -1: 1671}, 5: {0: 51, 1: 251, -1: 9844}, 2: {0: 50, 1: 24, -1: 1120}}
entropy: 0.09775612308588759

n_clusters: 7
{3: {0: 6, 1: 34, -1: 1979}, 0: {0: 88, 1: 101, -1: 6619}, 1: {0: 56, 1: 285, -1: 10619}, 4: {0: 8, 1: 30, -1: 4183}, 6: {0: 330, 1: 2, -1: 1879}, 2: {0: 0, 1: 34, -1: 721},

In [10]:
cluster_to_label = dict()

for cluster, labels in pred_label.items():
    neg, pos = labels[0], labels[1]
    
    lable = -1
    if neg > pos:
        label = 0
    elif neg < pos:
        label = 1
    
    cluster_to_label[cluster] = label
print(cluster_to_label)

{7: 1, 2: 0, 6: 1, 11: 1, 3: 1, 4: 0, 8: 1, 1: 1, 0: 0, 5: 1, 10: 1, 9: 0}


In [11]:
data_label = data

for i, pr in enumerate(pred):
    label = y[i]
    if label == -1:
        label = cluster_to_label[pr]
        data_label[i]["target"] = label

In [12]:
X_cluster = np.array([d['feat'] for d in data_label])
y_cluster = np.array([d['target'] for d in data_label])
feat_labels = data[0]['feat_labels']

print(X_cluster.shape)
print(y_cluster.shape)
print(feat_labels)
pprint(X_cluster[:5])
pprint(y_cluster[:5])
print("# compromised: ", len([tar for tar in y_cluster if tar==1]))
print("# malicious: ", len([tar for tar in y_cluster if tar==0]))
print("# unknown: ", len([tar for tar in y_cluster if tar==-1]))
print("% compromised: ", len([tar for tar in y_cluster if tar==1])/len(y))
print("% malicious: ", len([tar for tar in y_cluster if tar==0])/len(y))
print("% unknown: ", len([tar for tar in y_cluster if tar==-1])/len(y))

(26977, 19)
(26977,)
['archived', 'years_active', 'years_inactive', 'num_captures', 'freenom_tld', 'prev_mal_tld', 'wildcard_subdomain', 'reachable', 'redirected', 'blocked', 'alexa_rank', 'ratio_longest_word', 'contain_digit', 'contain_dash', 'name_len', 'brandname_partialratio', 'prev_mal_domain_ed', 'sub_levels', 'num_sub']
array([[1.00000000e+00, 9.00000000e+00, 1.00000000e+00, 7.42000000e+02,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.14676000e+05, 3.75000000e-01,
        1.00000000e+00, 0.00000000e+00, 4.00000000e+00, 6.70000000e+01,
        2.50000000e-01, 3.00000000e+00, 8.00000000e+00],
       [0.00000000e+00,            nan,            nan,            nan,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
                   nan,            nan,            nan, 2.00000000e-01,
        1.00000000e+00, 0.00000000e+00, 7.00000000e+00, 6.70000000e+01,
        1.42857143e-01, 2.00000000e+00

In [13]:
with open('./data/data_cluster_26977', 'w', encoding="utf-8") as fout:
    pprint(data_label, fout)

# Cluster Analysis

In [15]:
#n_clusters = [2,3,4,5,6,7,8,9,10,11,12]
n_clusters = [2]
#n_c = 5
for n_c in n_clusters:
    print("n_clusters:", n_c)
    clf = KMeans(n_clusters=n_c)
    pred = clf.fit_predict(X)

    pred_label = dict()
    pred_data = dict()
    for i, pr in enumerate(pred):
        label = y[i]
        dt = data[i]
        if pr not in pred_label:
            pred_label[pr] = dict()
            pred_label[pr][0] = 0
            pred_label[pr][1] = 0
            pred_label[pr][-1] = 0
            
            pred_data[pr] = dict()
            pred_data[pr][0] = []
            pred_data[pr][1] = []
            pred_data[pr][-1] = []

        pred_label[pr][label] += 1
        pred_data[pr][label].append(dt)

    print(pred_label)
    print("entropy:", getEntropy(pred, pred_label))
    print()

n_clusters: 2
{1: {0: 69, 1: 383, -1: 14191}, 0: {0: 420, 1: 103, -1: 11811}}
entropy: 0.12144051983185408



In [28]:
x = [x['info'][0] for x in pred_data[0][1]]
for xx in x:
    print(xx)

account-services-login.net
ackermanproduction.com
activerecoveryinc.cf
addarpluss.com
administartingserv.online
ahaofunweb.com
amservers.fr
antseducati.com
attvalidated.com
banauws.com
careerx.cf
casa-davinci.mn
caucasiana.ge
caughtoncamera.co.za
cazanele-dunarii.ro
cdnusercontent.men
centrul2sibiu.ro
cleanday34.ru
clearfieldcountydemocrats.com
com-app.eu
compraonline14.com.ar
constcarte-updates.com
contact-apple.com.br
contemporarympreg.net
cryptoga.org
denise-thibault.com
depositeagency.online
edgnuunfit.com
ekoservis.in.ua
enovadese.tk
etyu-mimon.com
fromevalleywindowcleaning.co.uk
fromnowon.eu
ghandinku.com
ginphed.com.ng
godadogixc.com
gottfried-weiss.online
gw-siegen.us
help-mnhbvgyfty.ga
help-robinettegosselin.net
iban-naambevestigen.website
id-icloud-nnsms.com
iflixes.gq
inversoresasociados.com.bo
ios-sign-in.us
jasdbseywion.com
johnson-industrlal.com
justinso.ga
kjcdzi.com
langitterbelahbalik.com
latina.com.es
lawyerparalegalservices.com
linkatel.gt
linked-buyers.com
mafiathre

In [26]:
#pprint([x['info'][0] for x in pred_data[1][0] if x['feat'][7] == 1])
pprint([x['info'][0] for x in pred_data[0][1]])

['3utilities.com',
 'appleidsecuritys.com',
 'appliancesty.date',
 'at0.win',
 'automobilesbuy.pw',
 'beautyitems.xyz',
 'bounceme.net',
 'buycamera.date',
 'buytools.pw',
 'buytoolse.site',
 'camillebirrell.site',
 'canbeauty.date',
 'carmeelbrmeente.site',
 'carpartsnet.men',
 'carpetstore.pw',
 'carpetstores.xyz',
 'catherinewylde.site',
 'cedricbacon.site',
 'cheaptoyget.date',
 'culinarygoodst.trade',
 'ddns.net',
 'ddnsking.com',
 'dekinsore.online',
 'discountmany.site',
 'eprtmanpower.com',
 'euesshop.top',
 'furnituresy.trade',
 'getwood.date',
 'glassesscarf.date',
 'goodshouse.men',
 'goodsshop.win',
 'homemachine.xyz',
 'homeresjob.men',
 'humphreyjulius.site',
 'imcreator.com',
 'inhomething.men',
 'inthebedst.date',
 'itursulogalvan.edu.mx',
 'jewelryshops.trade',
 'joyviolet.site',
 'jpanbest.win',
 'kimonodress.date',
 'kitchentask.xyz',
 'kitchenget.pw',
 'kitchentooly.date',
 'machinelive.xyz',
 'manydiscount.site',
 'menshoesly.trade',
 'modelshop.pw',
 'motormeet.me