# Setup

In [41]:
# ML algorithms
from sklearn.cluster import KMeans, DBSCAN, Birch

# preprocessing
from sklearn.preprocessing import StandardScaler, Imputer

import numpy as np
from numpy import nan

from pprint import pprint

import math
import random

# Functions

In [12]:
# balanced train and test sets
def balance_data(X, y):
    """Split (X, y) into a class-balanced subset and the leftover samples.

    Labels are interpreted as printed below: 1 = compromised, -1 = malicious,
    0 = unlabeled. The first ``mini`` samples of each of the two labeled
    classes (where ``mini`` is the size of the smaller class) go into the
    balanced set, together with every unlabeled sample. Everything else
    (surplus labeled samples and any other label value) goes into the rest set.

    Parameters
    ----------
    X : indexable sequence of feature rows, aligned with ``y``.
    y : sequence of labels.

    Returns
    -------
    list
        ``[X_bal, y_bal, X_rest, y_rest]`` as plain Python lists.

    Notes
    -----
    The summary statistics are printed as a side effect. Fractions of an
    empty partition are reported as 0.0 instead of raising
    ``ZeroDivisionError`` (this happens e.g. when the input is already
    balanced and nothing is left over).
    """
    def share(labels, value):
        # Fraction of `labels` equal to `value`; 0.0 for an empty list so the
        # report never divides by zero.
        if not labels:
            return 0.0
        return sum(1 for tar in labels if tar == value) / len(labels)

    X_bal, y_bal = [], []
    X_rest, y_rest = [], []

    # Size of the smaller labeled class caps how many of each class are kept.
    n_compromised = sum(1 for tar in y if tar == 1)
    n_malicious = sum(1 for tar in y if tar == -1)
    mini = min(n_compromised, n_malicious)

    cnt_1 = 0
    cnt_0 = 0
    print(mini)
    for i, tar in enumerate(y):
        if tar == 1 and cnt_1 < mini:
            cnt_1 += 1
            keep = True
        elif tar == -1 and cnt_0 < mini:
            cnt_0 += 1
            keep = True
        else:
            # Unlabeled samples are always kept; anything else is surplus.
            keep = (tar == 0)

        if keep:
            X_bal.append(X[i])
            y_bal.append(y[i])
        else:
            X_rest.append(X[i])
            y_rest.append(y[i])

    print("# X_bal:", len(X_bal))
    print("# y_bal:", len(y_bal))
    print("% compromised: ", share(y_bal, 1))
    print("% malicious: ", share(y_bal, -1))
    print("% unlabeled: ", share(y_bal, 0))

    print("# X_rest:", len(X_rest))
    print("# y_rest:", len(y_rest))
    print("% compromised: ", share(y_rest, 1))
    print("% malicious: ", share(y_rest, -1))
    print("% unlabeled: ", share(y_rest, 0))

    return [X_bal, y_bal, X_rest, y_rest]

# Read data

In [42]:
# NOTE(review): eval() executes whatever is in this file, so only load files
# you produced yourself. The file appears to hold a Python-literal dump that
# contains bare `nan` tokens (hence `from numpy import nan` in the setup cell),
# which is presumably why a plain literal parser is not used here -- confirm.
# NOTE(review): absolute, user-specific path; consider a configurable data dir.
with open("/home/sophie/SSRG/domainclassification/results/dataset/domain_target_55383", "r", encoding='utf-8') as f:
    data = eval(f.read())

# Distinct source datasets present in the loaded records.
datasets = {d['dataset'] for d in data}
pprint(datasets)

{'comp-deltaphish',
 'comp-phishlabs',
 'mal-apwg',
 'mal-future-apwg',
 'mal-phishlabs',
 'unlabeled-apwg'}


In [43]:
#random.Random(4).shuffle(data)

dataset_train = {'mal-apwg', 'comp-phishlabs', 'unlabeled-apwg'}

# Restrict the records to the training datasets first, then derive the feature
# matrix and target vector from the filtered records so all three stay aligned.
data = [d for d in data if d['dataset'] in dataset_train]
X = np.array([d['feat'] for d in data])
y = np.array([d['target'] for d in data])
feat_labels = data[0]['feat_labels']

print(len(data))
print(X.shape)
print(y.shape)
print(feat_labels)
pprint(X[:5])
pprint(y[:5])

# Label convention used throughout: 1 = compromised, -1 = malicious, 0 = unknown.
n_compromised = sum(1 for tar in y if tar == 1)
n_malicious = sum(1 for tar in y if tar == -1)
n_unknown = sum(1 for tar in y if tar == 0)
print("# compromised: ", n_compromised)
print("# malicious: ", n_malicious)
print("# unknown: ", n_unknown)
print("% compromised: ", n_compromised/len(y))
print("% malicious: ", n_malicious/len(y))
print("% unknown: ", n_unknown/len(y))

51576
(51576, 19)
(51576,)
['archived', 'years_active', 'years_inactive', 'num_captures', 'freenom_tld', 'prev_mal_tld', 'wildcard_subdomain', 'reachable', 'redirected', 'blocked', 'alexa_rank', 'ratio_longest_word', 'contain_digit', 'contain_dash', 'name_len', 'brandname_partialratio', 'prev_mal_domain_ed', 'sub_levels', 'num_sub']
array([[1.00000000e+00, 9.00000000e+00, 1.00000000e+00, 7.42000000e+02,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.14676000e+05, 3.75000000e-01,
        1.00000000e+00, 0.00000000e+00, 4.00000000e+00, 5.00000000e+01,
        2.50000000e-01, 1.00000000e+00, 8.00000000e+00],
       [0.00000000e+00,            nan,            nan,            nan,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
                   nan,            nan,            nan, 2.00000000e-01,
        1.00000000e+00, 0.00000000e+00, 7.00000000e+00, 8.00000000e+01,
        1.42857143e-01, 0.000000

In [27]:
# balance
#X, y, _, _ = balance_data(X, y)

# Normalize and Impute

In [45]:
# Replace missing feature values (NaN) with the per-column mean.
# NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed in
# 0.22; on newer versions the replacement is `sklearn.impute.SimpleImputer`.
# Confirm the installed version before re-running this notebook.
imp = Imputer(missing_values=np.nan, strategy='mean')
X = imp.fit_transform(X)

# Standardize every feature to zero mean and unit variance.
scaler = StandardScaler()
X = scaler.fit_transform(X)
print(X[2])

[ 0.88321685  2.14786531 -1.0150291  -0.02673723 -0.29762989 -0.2200951
  2.13069229  0.92116008 -0.37882407 -0.3178661   3.63590108 -1.2123141
  3.34903141 -0.4211886  -1.06909517 -5.45318381  0.96980288  0.76978709
  0.10094108]


# Cluster

In [46]:
def entropy(prob):
    """Binary (Shannon) entropy, in bits, of an event with probability ``prob``.

    Returns 0 for the degenerate probabilities 0 and 1, where the logarithm
    would otherwise be undefined.
    """
    if prob == 0 or prob == 1:
        return 0
    complement = 1 - prob
    return -prob * math.log(prob, 2) - complement * math.log(complement, 2)

In [47]:
def getEntropy(pred, pred_label):
    """Return the size-weighted average entropy of a clustering.

    Parameters
    ----------
    pred : sized sequence
        Cluster assignment of every sample; only its length is used, as the
        total sample count that normalises the cluster weights.
    pred_label : dict
        Maps cluster id -> ``{-1: n_neg, 0: n_unknown, 1: n_pos}`` counts.

    Returns
    -------
    float
        Sum over clusters of ``entropy(n_neg / cluster_size)`` weighted by
        ``cluster_size / len(pred)``. Note that unknown samples count towards
        the cluster size. Returns 0 when there are no samples.
    """
    # Use the number of predictions as the total instead of reaching for the
    # notebook-global `data`, so the result depends only on the arguments.
    t_total = len(pred)
    if t_total == 0:
        return 0

    ent_w = 0
    for labels in pred_label.values():
        neg, pos, unknown = labels[-1], labels[1], labels[0]
        total = neg + pos + unknown
        # Guard the division: an empty cluster contributes zero entropy.
        neg_pr = neg / total if total > 0 else 0

        ent_w += entropy(neg_pr) * (total / t_total)
    return ent_w

In [55]:
clf_name = [
        #"KMeans",
        #"DBSCAN",
        "Birch"
        ]

# Factory per supported algorithm; each takes the requested number of clusters
# (DBSCAN does not use it).
clf_builders = {
    "KMeans": lambda k: KMeans(n_clusters=k),
    "DBSCAN": lambda k: DBSCAN(min_samples=30),
    "Birch": lambda k: Birch(n_clusters=k, threshold=0.5, branching_factor=50),
}

#n_clusters = [2,3,4,5,6,7,8,9,10,11,12]
n_clusters = [20]
for name in clf_name:
    print("name:", name)
    for n_c in n_clusters:
        print("n_clusters:", n_c)
        build = clf_builders.get(name)
        if build is None:
            # unknown algorithm name: nothing to fit
            continue
        clf = build(n_c)

        pred = clf.fit_predict(X)

        # Per cluster, count how many samples carry each label (0 / 1 / -1).
        pred_label = dict()
        for pr, label in zip(pred, y):
            pred_label.setdefault(pr, {0: 0, 1: 0, -1: 0})[label] += 1

        pprint(pred_label)
        print("entropy:", getEntropy(pred, pred_label))
        print()

name: Birch
n_clusters: 20
{0: {-1: 5, 0: 6676, 1: 76},
 1: {-1: 1, 0: 804, 1: 22},
 2: {-1: 41, 0: 18772, 1: 265},
 3: {-1: 1, 0: 1042, 1: 8},
 4: {-1: 3, 0: 617, 1: 13},
 5: {-1: 44, 0: 1815, 1: 32},
 6: {-1: 70, 0: 3891, 1: 31},
 7: {-1: 0, 0: 2, 1: 0},
 8: {-1: 7, 0: 2862, 1: 41},
 9: {-1: 1, 0: 3086, 1: 54},
 10: {-1: 0, 0: 906, 1: 10},
 11: {-1: 0, 0: 3, 1: 0},
 12: {-1: 0, 0: 1390, 1: 45},
 13: {-1: 0, 0: 172, 1: 3},
 14: {-1: 2, 0: 29, 1: 0},
 15: {-1: 0, 0: 124, 1: 1},
 16: {-1: 7, 0: 4803, 1: 38},
 17: {-1: 310, 0: 1828, 1: 2},
 18: {-1: 0, 0: 1514, 1: 28},
 19: {-1: 3, 0: 76, 1: 0}}
entropy: 0.054446596155741475



# Process clusters

In [37]:
# Map each sufficiently pure cluster to the majority label of its labeled members.
cluster_to_label = {}

for cluster_id, counts in pred_label.items():
    n_malicious, n_compromised = counts[-1], counts[1]

    # skip clusters whose dominant labeled class has fewer than 10 samples
    if max(n_malicious, n_compromised) < 10:
        continue

    # skip clusters whose labeled members are too mixed (entropy above 0.2);
    # the denominator is non-zero because of the size check above
    if entropy(n_malicious / (n_malicious + n_compromised)) > 0.2:
        continue

    if n_malicious > n_compromised:
        majority = -1
    elif n_malicious < n_compromised:
        majority = 1
    else:
        majority = 0

    cluster_to_label[cluster_id] = majority
print(cluster_to_label)

{10: 1, 3: -1, 0: 1}


In [38]:
# Collect the unlabeled samples that fall into a confidently labeled cluster
# and give them that cluster's label.
data_label = []

for i, pr in enumerate(pred):
    if pr in cluster_to_label and y[i] == 0:
        # Copy the record before relabeling: assigning into data[i] directly
        # would silently overwrite the target stored in the shared `data` list.
        relabeled = dict(data[i])
        relabeled["target"] = cluster_to_label[pr]
        data_label.append(relabeled)

len(data_label)

5952

In [39]:
# Feature matrix and targets of the pseudo-labeled samples.
X_cluster = np.array([d['feat'] for d in data_label])
y_cluster = np.array([d['target'] for d in data_label])
feat_labels = data[0]['feat_labels']

print(X_cluster.shape)
print(y_cluster.shape)
print(feat_labels)
pprint(X_cluster[:5])
pprint(y_cluster[:5])

# Class distribution of the pseudo-labels (1 = compromised, -1 = malicious, 0 = unknown).
n_cluster_compromised = sum(1 for tar in y_cluster if tar == 1)
n_cluster_malicious = sum(1 for tar in y_cluster if tar == -1)
n_cluster_unknown = sum(1 for tar in y_cluster if tar == 0)
print("# compromised: ", n_cluster_compromised)
print("# malicious: ", n_cluster_malicious)
print("# unknown: ", n_cluster_unknown)
print("% compromised: ", n_cluster_compromised/len(y_cluster))
print("% malicious: ", n_cluster_malicious/len(y_cluster))
print("% unknown: ", n_cluster_unknown/len(y_cluster))

(5952, 19)
(5952,)
['archived', 'years_active', 'years_inactive', 'num_captures', 'freenom_tld', 'prev_mal_tld', 'wildcard_subdomain', 'reachable', 'redirected', 'blocked', 'alexa_rank', 'ratio_longest_word', 'contain_digit', 'contain_dash', 'name_len', 'brandname_partialratio', 'prev_mal_domain_ed', 'sub_levels', 'num_sub']
array([[1.00000000e+00, 9.00000000e+00, 1.00000000e+00, 7.42000000e+02,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.14676000e+05, 3.75000000e-01,
        1.00000000e+00, 0.00000000e+00, 4.00000000e+00, 5.00000000e+01,
        2.50000000e-01, 1.00000000e+00, 8.00000000e+00],
       [0.00000000e+00,            nan,            nan,            nan,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
                   nan,            nan,            nan, 4.28571429e-01,
        1.00000000e+00, 0.00000000e+00, 3.00000000e+00, 6.70000000e+01,
        3.33333333e-01, 0.00000000e+00, 

In [40]:
# Persist the pseudo-labeled records as a pretty-printed Python literal.
# NOTE(review): the count in the file name is hard-coded; it matches
# len(data_label) only for the run shown above.
with open('./data/data_cluster_5952', 'w', encoding="utf-8") as fout:
    pprint(data_label, stream=fout)

# Cluster Analysis

In [15]:
#n_clusters = [2,3,4,5,6,7,8,9,10,11,12]
n_clusters = [2]
#n_c = 5
for n_c in n_clusters:
    print("n_clusters:", n_c)
    # NOTE(review): no random_state is set, so cluster ids/assignments may
    # differ between runs.
    clf = KMeans(n_clusters=n_c)
    pred = clf.fit_predict(X)

    # pred_label: cluster -> label -> sample count
    # pred_data:  cluster -> label -> list of the original records
    pred_label = dict()
    pred_data = dict()
    for dt, label, pr in zip(data, y, pred):
        pred_label.setdefault(pr, {0: 0, 1: 0, -1: 0})[label] += 1
        pred_data.setdefault(pr, {0: [], 1: [], -1: []})[label].append(dt)

    print(pred_label)
    print("entropy:", getEntropy(pred, pred_label))
    print()

n_clusters: 2
{1: {0: 69, 1: 383, -1: 14191}, 0: {0: 420, 1: 103, -1: 11811}}
entropy: 0.12144051983185408



In [28]:
# First `info` entry of every label-1 (compromised) record in cluster 0,
# printed one per line.
x = [record['info'][0] for record in pred_data[0][1]]
for domain in x:
    print(domain)

account-services-login.net
ackermanproduction.com
activerecoveryinc.cf
addarpluss.com
administartingserv.online
ahaofunweb.com
amservers.fr
antseducati.com
attvalidated.com
banauws.com
careerx.cf
casa-davinci.mn
caucasiana.ge
caughtoncamera.co.za
cazanele-dunarii.ro
cdnusercontent.men
centrul2sibiu.ro
cleanday34.ru
clearfieldcountydemocrats.com
com-app.eu
compraonline14.com.ar
constcarte-updates.com
contact-apple.com.br
contemporarympreg.net
cryptoga.org
denise-thibault.com
depositeagency.online
edgnuunfit.com
ekoservis.in.ua
enovadese.tk
etyu-mimon.com
fromevalleywindowcleaning.co.uk
fromnowon.eu
ghandinku.com
ginphed.com.ng
godadogixc.com
gottfried-weiss.online
gw-siegen.us
help-mnhbvgyfty.ga
help-robinettegosselin.net
iban-naambevestigen.website
id-icloud-nnsms.com
iflixes.gq
inversoresasociados.com.bo
ios-sign-in.us
jasdbseywion.com
johnson-industrlal.com
justinso.ga
kjcdzi.com
langitterbelahbalik.com
latina.com.es
lawyerparalegalservices.com
linkatel.gt
linked-buyers.com
mafiathre

In [26]:
#pprint([x['info'][0] for x in pred_data[1][0] if x['feat'][7] == 1])
# First `info` entry of every label-1 record assigned to cluster 0.
pprint([record['info'][0] for record in pred_data[0][1]])

['3utilities.com',
 'appleidsecuritys.com',
 'appliancesty.date',
 'at0.win',
 'automobilesbuy.pw',
 'beautyitems.xyz',
 'bounceme.net',
 'buycamera.date',
 'buytools.pw',
 'buytoolse.site',
 'camillebirrell.site',
 'canbeauty.date',
 'carmeelbrmeente.site',
 'carpartsnet.men',
 'carpetstore.pw',
 'carpetstores.xyz',
 'catherinewylde.site',
 'cedricbacon.site',
 'cheaptoyget.date',
 'culinarygoodst.trade',
 'ddns.net',
 'ddnsking.com',
 'dekinsore.online',
 'discountmany.site',
 'eprtmanpower.com',
 'euesshop.top',
 'furnituresy.trade',
 'getwood.date',
 'glassesscarf.date',
 'goodshouse.men',
 'goodsshop.win',
 'homemachine.xyz',
 'homeresjob.men',
 'humphreyjulius.site',
 'imcreator.com',
 'inhomething.men',
 'inthebedst.date',
 'itursulogalvan.edu.mx',
 'jewelryshops.trade',
 'joyviolet.site',
 'jpanbest.win',
 'kimonodress.date',
 'kitchentask.xyz',
 'kitchenget.pw',
 'kitchentooly.date',
 'machinelive.xyz',
 'manydiscount.site',
 'menshoesly.trade',
 'modelshop.pw',
 'motormeet.me