# Setup

In [39]:
# ML algorithms
from sklearn.cluster import KMeans, DBSCAN, Birch

# preprocessing
from sklearn.preprocessing import StandardScaler, Imputer

# feature reduction
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

# feature calibration
from sklearn.isotonic import IsotonicRegression

import numpy as np
from numpy import nan

from pprint import pprint

import math
import random

# Functions

In [40]:
# balanced train and test sets
def balance_data(X, y):
    X_bal = []
    y_bal = []
    X_rest = []
    y_rest = []
    min1 = len([tar for tar in y if tar==1])
    min2 = len([tar for tar in y if tar==-1])
    mini = min1
    if min2 < min1:
        mini = min2

    cnt_1 = 0
    cnt_0 = 0
    print(mini)
    for i, tar in enumerate(y):
        if tar == 1 and cnt_1<mini:
            cnt_1+=1
            X_bal.append(X[i])
            y_bal.append(y[i])
        elif tar == -1 and cnt_0<mini:
            cnt_0+=1
            X_bal.append(X[i])
            y_bal.append(y[i])
        elif tar == 0:
            X_bal.append(X[i])
            y_bal.append(y[i])
        else:
            X_rest.append(X[i])
            y_rest.append(y[i])

    print("# X_bal:", len(X_bal))
    print("# y_bal:", len(y_bal))
    print("% compromised: ", len([tar for tar in y_bal if tar==1])/len(y_bal))
    print("% malicious: ", len([tar for tar in y_bal if tar==-1])/len(y_bal))
    print("% unlabeled: ", len([tar for tar in y_bal if tar==0])/len(y_bal))
    
    print("# X_rest:", len(X_rest))
    print("# y_rest:", len(y_rest))
    print("% compromised: ", len([tar for tar in y_rest if tar==1])/len(y_rest))
    print("% malicious: ", len([tar for tar in y_rest if tar==-1])/len(y_rest))
    print("% unlabeled: ", len([tar for tar in y_rest if tar==0])/len(y_rest))
    
    return [X_bal, y_bal, X_rest, y_rest]

# Read data

In [41]:
with open("/home/sophie/SSRG/domainclassification/results/dataset/domain_target_55383", "r", encoding='utf-8') as f:
    data = eval(f.read())
    
datasets = set([d['dataset'] for d in data])
pprint(datasets)

{'comp-deltaphish',
 'comp-phishlabs',
 'mal-apwg',
 'mal-future-apwg',
 'mal-phishlabs',
 'unlabeled-apwg'}


In [42]:
#random.Random(4).shuffle(data)

#dataset_train = {'mal-apwg', 'comp-deltaphish', 'unlabeled-apwg'}
dataset_train = {'mal-apwg', 'comp-phishlabs', 'unlabeled-apwg'}

X = np.array([d['feat'] for d in data if d['dataset'] in dataset_train])
y = np.array([d['target'] for d in data if d['dataset'] in dataset_train])
data = [d for d in data if d['dataset'] in dataset_train]
feat_labels = data[0]['feat_labels']

print(len(data))
print(X.shape)
print(y.shape)
print(feat_labels)
pprint(X[:5])
pprint(y[:5])
print("# compromised: ", len([tar for tar in y if tar==1]))
print("# malicious: ", len([tar for tar in y if tar==-1]))
print("# unknown: ", len([tar for tar in y if tar==0]))
print("% compromised: ", len([tar for tar in y if tar==1])/len(y))
print("% malicious: ", len([tar for tar in y if tar==-1])/len(y))
print("% unknown: ", len([tar for tar in y if tar==0])/len(y))

51576
(51576, 19)
(51576,)
['archived', 'years_active', 'years_inactive', 'num_captures', 'freenom_tld', 'prev_mal_tld', 'wildcard_subdomain', 'reachable', 'redirected', 'blocked', 'alexa_rank', 'ratio_longest_word', 'contain_digit', 'contain_dash', 'name_len', 'brandname_partialratio', 'prev_mal_domain_ed', 'sub_levels', 'num_sub']
array([[1.00000000e+00, 9.00000000e+00, 1.00000000e+00, 7.42000000e+02,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.14676000e+05, 3.75000000e-01,
        1.00000000e+00, 0.00000000e+00, 4.00000000e+00, 5.00000000e+01,
        2.50000000e-01, 1.00000000e+00, 8.00000000e+00],
       [0.00000000e+00,            nan,            nan,            nan,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
                   nan,            nan,            nan, 2.00000000e-01,
        1.00000000e+00, 0.00000000e+00, 7.00000000e+00, 8.00000000e+01,
        1.42857143e-01, 0.000000

In [5]:
# balance
X, y, X_val, y_val = balance_data(X, y)

495
# X_bal: 51402
# y_bal: 51402
% compromised:  0.009629975487335123
% malicious:  0.009629975487335123
% unlabeled:  0.9807400490253297
# X_rest: 194
# y_rest: 194
% compromised:  1.0
% malicious:  0.0
% unlabeled:  0.0


# Normalize and Impute

In [43]:
imp = Imputer(missing_values=np.nan, strategy='mean')
imp.fit(X)
X = imp.transform(X)

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
print(X[2])

[ 0.88321685  2.14786531 -1.0150291  -0.02673723 -0.29762989 -0.2200951
  2.13069229  0.92116008 -0.37882407 -0.3178661   3.63590108 -1.2123141
  3.34903141 -0.4211886  -1.06909517 -5.45318381  0.96980288  0.76978709
  0.10094108]


# Data calibration

In [44]:
def sortAscendingFeature(cX, cy):
    idx_cX = np.argsort(cX)
    cX = np.array(cX)[idx_cX]
    cy = np.array(cy)[idx_cX]
    return cX, idx_cX, cy

In [45]:
cX = [3,4,5,1,2]
cy = [1,1,1,0,0]
sortAscendingFeature(cX,cy)

(array([1, 2, 3, 4, 5]), array([3, 4, 0, 1, 2]), array([0, 0, 1, 1, 1]))

In [46]:
def calibratedFeature(cX, cy):
    cX, idx_cX, _ = sortAscendingFeature(cX, cy)
    print(cX[:-20])
    iso = IsotonicRegression()
    iso.fit(cX, cy)
    cX = iso.transform(cX)
    print(cX[:-20])
    
    # sort back to original indexes
    idx_cX_sort = np.sort(idx_cX)
    cX = np.array(cX)[idx_cX_sort]
    return cX

In [47]:
# calibrated features
cfeatures = {'years_active', 'years_inactive', 'num_captures', 'alexa_rank', 'sub_levels', 'num_sub'}

In [48]:
for i, feat in enumerate(feat_labels):
    if feat in cfeatures:
        print(i)
        cX, cy = X[:,i], y
        #print(cX[:50])
        #print(cy[:50])
        cX = calibratedFeature(cX, cy)
        X[:, i] = cX

1
[-1.32216556 -1.32216556 -1.32216556 ...  3.66600381  3.66600381
  3.66600381]
[0.00045996 0.00045996 0.00045996 ... 0.00227337 0.00227337 0.00227337]
2
[-1.0150291  -1.0150291  -1.0150291  ... 14.69395269 14.69395269
 15.74121815]
[-0.00695207 -0.00695207 -0.00695207 ...  0.          0.
  0.        ]
3
[-0.04083159 -0.04083159 -0.04083159 ...  3.49836931  3.68571581
  3.84942716]
[-0.00085085 -0.00085085 -0.00085085 ...  0.0011532   0.0011532
  0.0011532 ]
10
[-5.33307632 -5.3328625  -5.33266514 ... 10.64074877 10.6548934
 10.65842956]
[0.         0.         0.         ... 0.00382022 0.00382022 0.00382022]
17
[-0.45677348 -0.45677348 -0.45677348 ... 13.0353928  13.0353928
 13.0353928 ]
[-0.04181624 -0.04181624 -0.04181624 ...  0.          0.
  0.        ]
18
[-0.05508077 -0.05508077 -0.05508077 ... 20.85184636 21.47593374
 22.4120648 ]
[-0.03092817 -0.03092817 -0.03092817 ...  0.          0.
  0.        ]


In [49]:
print(X[2])

[ 8.83216849e-01  4.59961219e-04 -6.95206928e-03 -8.50846937e-04
 -2.97629887e-01 -2.20095096e-01  2.13069229e+00  9.21160081e-01
 -3.78824065e-01 -3.17866096e-01  0.00000000e+00 -1.21231410e+00
  3.34903141e+00 -4.21188604e-01 -1.06909517e+00 -5.45318381e+00
  9.69802881e-01 -4.18162373e-02 -3.09281678e-02]


# Data reduction

In [50]:
# To deal with high-dimensional data
def getPCA(X, n_c=10):
    pca = PCA(n_components=n_c, svd_solver='full', random_state=42)
    pca.fit(X) 
    X_pca = pca.transform(X)
    return X_pca
    #print(X_pca[2])

# Cluster

In [51]:
def entropy(prob):
    if prob == 1 or prob == 0:
        return 0
    return -prob*math.log(prob, 2) - (1-prob)*math.log((1-prob), 2)

In [52]:
def getEntropy(pred, pred_label):
    t_total = len(data)
    ent_w = 0

    for pred, labels in pred_label.items():
        neg, pos, unknown = labels[-1], labels[1], labels[0]
        total = neg+pos+unknown
        neg_pr = neg/total
        if total > 0:
            neg_pr = neg/total
        else:
            neg_pr = 0
        
        ent = entropy(neg_pr)
        ent_w += ent*(total/t_total)
    return ent_w

In [53]:
def getCluster(X, n_c=12, name="KMeans"):
    if name == "KMeans":
        clf = KMeans(n_clusters=n_c, random_state=42)
    elif name == "DBSCAN":
        clf = DBSCAN(min_samples=30)
    elif name == "Birch":
        clf = Birch(n_clusters=n_c, threshold=0.5, branching_factor=50)

    pred = clf.fit_predict(X)

    pred_label = dict()
    for i, pr in enumerate(pred):
        label = y[i]
        if pr not in pred_label:
            pred_label[pr] = dict()
            pred_label[pr][0], pred_label[pr][1], pred_label[pr][-1] = 0, 0, 0

        pred_label[pr][label] += 1
        
    return pred_label, pred

In [54]:
def processClusters(pred_label, min_entropy=0.2, min_entity=10):
    cluster_to_label = dict()

    for cluster, labels in pred_label.items():
        neg, pos = labels[-1], labels[1]
        total = neg+pos

        # ignore clusters where max label is less than threshold
        if max(neg, pos) < min_entity:
            continue

        # ignore clusters where entropy is > threshold
        neg_pr = neg/total
        if entropy(neg_pr) > min_entropy:
            continue

        label = 0
        if neg > pos:
            label = -1
        elif neg < pos:
            label = 1

        cluster_to_label[cluster] = label
    return cluster_to_label
    #print(cluster_to_label)

In [55]:
def getNumDomainsAndDomainTypeRatio(pred, cluster_to_label, data, y):

    data_idx_label = []
    for i, pr in enumerate(pred):
        if pr in cluster_to_label:
            label = y[i]
            if label == 0:
                label = cluster_to_label[pr]
                data_idx_label.append([i, label])
                #temp = data[i]
                #temp["target"] = label
                #data_label.append(temp)
    
    #X_cluster = np.array([d['feat'] for d in data_label])
    #y_cluster = np.array([d['target'] for d in data_label])
    
    per_comp, per_mal = 0, 0
    if len(data_idx_label) > 0:
        per_comp = len([x for x in data_idx_label if x[1]==1])/len(data_idx_label)
        per_mal = len([x for x in data_idx_label if x[1]==-1])/len(data_idx_label)
    
    return round(per_comp*100), round(per_mal*100), data_idx_label

In [56]:
# try several hyperparamters
#pca_components = list(range(1,20)) #1-19
#pca_components = list(range(5,11)) #1-19
#pca_components = list(range(12,14)) #1-19
pca_components = [10]
pca_components = [False] + pca_components
trypca = True
if trypca is False:
    pca_components = [0]

#n_clusters = list(range(2,13)) #2-12
#n_clusters = list(range(7,13))
n_clusters = list(range(12,16))

#min_entropy = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
min_entropy = [0.1, 0.15, 0.2]
#min_entropy = [0.2, 0.25, 0.3]

#min_entity = list(range(1,20)) #1-20
min_entity = list(range(7,11)) #1-20

#orig_X, orig_y = X, y

data_idx_label = dict()
cnt = 0
test = False
print("Number of combinations:", len(pca_components)*len(n_clusters)*len(min_entropy)*len(min_entity))
print(["pca_c", "n_c", "min_ent", "min_e", "per_comp", "per_mal", "num_domains"])
for min_e in min_entity:
    for n_c in n_clusters:
        for min_ent in min_entropy:
            for pca_c in pca_components:
            
                # keep track of combinations
                cnt+=1
                if cnt > 10 and test:
                    continue
                else:
                    print(cnt)
                
                # try reducing dimensions 
                if trypca and pca_c != False:
                    X_pca = getPCA(X, pca_c)
                else:
                    X_pca = X
                #print(X_pca[2])
                
                # get predicted cluster labels
                pred_label, pred = getCluster(X_pca, n_c)
                #pprint(pred_label)
                ent = getEntropy(pred, pred_label)
                
                # only keep clusters with certain entropy and minimum number of entities in cluster
                cluster_to_label = processClusters(pred_label, min_ent, min_e)
                #print(cluster_to_label)
                
                # get resulting number of domains and domain type ratio
                per_comp, per_mal, idx_label = getNumDomainsAndDomainTypeRatio(pred, cluster_to_label, data, y)
                
                # ignore results with too little domains or incorrect ratio
                num_domains = len(idx_label)
                entry = [pca_c, n_c, min_ent, min_e, per_comp, per_mal, num_domains, ent]
                print(entry)
#                 if num_domains < 4000:
#                     ignored.append(entry)
#                 elif per_comp > 80 or per_comp < 50:
#                     ignored.append(entry)
#                 else:
#                     candidate.append(entry)
                idx_label = str(idx_label)
                if idx_label not in data_idx_label:
                    data_idx_label[idx_label] = []
                data_idx_label[idx_label].append(entry)
                    
                print()

Number of combinations: 96
['pca_c', 'n_c', 'min_ent', 'min_e', 'per_comp', 'per_mal', 'num_domains']
1
[False, 12, 0.1, 7, 61, 39, 4705, 0.053974795296406725]

2
[10, 12, 0.1, 7, 63, 37, 4945, 0.05399749441506932]

3
[False, 12, 0.15, 7, 61, 39, 4705, 0.053974795296406725]

4
[10, 12, 0.15, 7, 63, 37, 4945, 0.05399749441506932]

5
[False, 12, 0.2, 7, 61, 39, 4705, 0.053974795296406725]

6
[10, 12, 0.2, 7, 63, 37, 4945, 0.05399749441506932]

7
[False, 13, 0.1, 7, 61, 39, 4690, 0.05395600309897802]

8
[10, 13, 0.1, 7, 43, 57, 3231, 0.05401676954026381]

9
[False, 13, 0.15, 7, 61, 39, 4690, 0.05395600309897802]

10
[10, 13, 0.15, 7, 43, 57, 3231, 0.05401676954026381]

11
[False, 13, 0.2, 7, 74, 26, 6951, 0.05395600309897802]

12
[10, 13, 0.2, 7, 43, 57, 3231, 0.05401676954026381]

13
[False, 14, 0.1, 7, 55, 45, 4075, 0.053843925842135314]

14
[10, 14, 0.1, 7, 58, 42, 4328, 0.05395961470263861]

15
[False, 14, 0.15, 7, 83, 17, 10465, 0.053843925842135314]

16
[10, 14, 0.15, 7, 58, 42, 432

In [57]:
print(len(data_idx_label))

13


In [60]:
pprint(list(data_idx_label.values()))

for x in list(data_idx_label.values()):
    num_domains = x[0][6]
    per_mal = x[0][5]
    per_comp = x[0][4]
    
    per_min = min(per_mal, per_comp)
    #diff = per_max-50
    print(num_domains*per_min/100)

[[[False, 12, 0.1, 7, 61, 39, 4705, 0.053974795296406725],
  [False, 12, 0.15, 7, 61, 39, 4705, 0.053974795296406725],
  [False, 12, 0.2, 7, 61, 39, 4705, 0.053974795296406725],
  [False, 12, 0.1, 8, 61, 39, 4705, 0.053974795296406725],
  [False, 12, 0.15, 8, 61, 39, 4705, 0.053974795296406725],
  [False, 12, 0.2, 8, 61, 39, 4705, 0.053974795296406725],
  [False, 12, 0.1, 9, 61, 39, 4705, 0.053974795296406725],
  [False, 12, 0.15, 9, 61, 39, 4705, 0.053974795296406725],
  [False, 12, 0.2, 9, 61, 39, 4705, 0.053974795296406725],
  [False, 12, 0.1, 10, 61, 39, 4705, 0.053974795296406725],
  [False, 12, 0.15, 10, 61, 39, 4705, 0.053974795296406725],
  [False, 12, 0.2, 10, 61, 39, 4705, 0.053974795296406725]],
 [[10, 12, 0.1, 7, 63, 37, 4945, 0.05399749441506932],
  [10, 12, 0.15, 7, 63, 37, 4945, 0.05399749441506932],
  [10, 12, 0.2, 7, 63, 37, 4945, 0.05399749441506932],
  [10, 12, 0.1, 8, 63, 37, 4945, 0.05399749441506932],
  [10, 12, 0.15, 8, 63, 37, 4945, 0.05399749441506932],
  [10, 

# Choose specific hyperparameters after manual checking

In [31]:
def getNumDomainsAndDomainTypeRatioSpecific(pred, cluster_to_label, data, y):

    data_label = []
    for i, pr in enumerate(pred):
        if pr in cluster_to_label:
            label = y[i]
            if label == 0:
                label = cluster_to_label[pr]
                #data_idx_label.append([i, label])
                temp = data[i]
                temp["target"] = label
                data_label.append(temp)
    
    X_cluster = np.array([d['feat'] for d in data_label])
    y_cluster = np.array([d['target'] for d in data_label])
    
    per_comp, per_mal = 0, 0
    if len(data_idx_label) > 0:
        per_comp = len([x for x in y_cluster if x==1])/len(y_cluster)
        per_mal = len([x for x in y_cluster if x==-1])/len(y_cluster)
    
    return data_label, round(per_comp*100), round(per_mal*100)

In [61]:
#X_pca = getPCA(X, n_c=12)
X_pca = X
X_pca, n_c, min_ent, min_e = X_pca, 15, 0.2, 10
#73, 27, 19195, 0.053659451762850106],

# get predicted cluster labels
pred_label, pred = getCluster(X_pca, n_c)
#pprint(pred_label)
ent = getEntropy(pred, pred_label)
print(ent)

# only keep clusters with certain entropy and minimum number of entities in cluster
cluster_to_label = processClusters(pred_label, min_ent, min_e)
#print(cluster_to_label)

# get resulting number of domains and domain type ratio
data_label, per_comp, per_mal = getNumDomainsAndDomainTypeRatioSpecific(pred, cluster_to_label, data, y)
print(len(data_label))
print(per_comp)
print(per_mal)

0.053169777953601664
6409
63
37


In [62]:
with open('./data/data_cluster_6409', 'w', encoding="utf-8") as fout:
    pprint(data_label, fout)

# Cluster Analysis

In [15]:
#n_clusters = [2,3,4,5,6,7,8,9,10,11,12]
n_clusters = [2]
#n_c = 5
for n_c in n_clusters:
    print("n_clusters:", n_c)
    clf = KMeans(n_clusters=n_c)
    pred = clf.fit_predict(X)

    pred_label = dict()
    pred_data = dict()
    for i, pr in enumerate(pred):
        label = y[i]
        dt = data[i]
        if pr not in pred_label:
            pred_label[pr] = dict()
            pred_label[pr][0] = 0
            pred_label[pr][1] = 0
            pred_label[pr][-1] = 0
            
            pred_data[pr] = dict()
            pred_data[pr][0] = []
            pred_data[pr][1] = []
            pred_data[pr][-1] = []

        pred_label[pr][label] += 1
        pred_data[pr][label].append(dt)

    print(pred_label)
    print("entropy:", getEntropy(pred, pred_label))
    print()

n_clusters: 2
{1: {0: 69, 1: 383, -1: 14191}, 0: {0: 420, 1: 103, -1: 11811}}
entropy: 0.12144051983185408



In [28]:
x = [x['info'][0] for x in pred_data[0][1]]
for xx in x:
    print(xx)

account-services-login.net
ackermanproduction.com
activerecoveryinc.cf
addarpluss.com
administartingserv.online
ahaofunweb.com
amservers.fr
antseducati.com
attvalidated.com
banauws.com
careerx.cf
casa-davinci.mn
caucasiana.ge
caughtoncamera.co.za
cazanele-dunarii.ro
cdnusercontent.men
centrul2sibiu.ro
cleanday34.ru
clearfieldcountydemocrats.com
com-app.eu
compraonline14.com.ar
constcarte-updates.com
contact-apple.com.br
contemporarympreg.net
cryptoga.org
denise-thibault.com
depositeagency.online
edgnuunfit.com
ekoservis.in.ua
enovadese.tk
etyu-mimon.com
fromevalleywindowcleaning.co.uk
fromnowon.eu
ghandinku.com
ginphed.com.ng
godadogixc.com
gottfried-weiss.online
gw-siegen.us
help-mnhbvgyfty.ga
help-robinettegosselin.net
iban-naambevestigen.website
id-icloud-nnsms.com
iflixes.gq
inversoresasociados.com.bo
ios-sign-in.us
jasdbseywion.com
johnson-industrlal.com
justinso.ga
kjcdzi.com
langitterbelahbalik.com
latina.com.es
lawyerparalegalservices.com
linkatel.gt
linked-buyers.com
mafiathre

In [26]:
#pprint([x['info'][0] for x in pred_data[1][0] if x['feat'][7] == 1])
pprint([x['info'][0] for x in pred_data[0][1]])

['3utilities.com',
 'appleidsecuritys.com',
 'appliancesty.date',
 'at0.win',
 'automobilesbuy.pw',
 'beautyitems.xyz',
 'bounceme.net',
 'buycamera.date',
 'buytools.pw',
 'buytoolse.site',
 'camillebirrell.site',
 'canbeauty.date',
 'carmeelbrmeente.site',
 'carpartsnet.men',
 'carpetstore.pw',
 'carpetstores.xyz',
 'catherinewylde.site',
 'cedricbacon.site',
 'cheaptoyget.date',
 'culinarygoodst.trade',
 'ddns.net',
 'ddnsking.com',
 'dekinsore.online',
 'discountmany.site',
 'eprtmanpower.com',
 'euesshop.top',
 'furnituresy.trade',
 'getwood.date',
 'glassesscarf.date',
 'goodshouse.men',
 'goodsshop.win',
 'homemachine.xyz',
 'homeresjob.men',
 'humphreyjulius.site',
 'imcreator.com',
 'inhomething.men',
 'inthebedst.date',
 'itursulogalvan.edu.mx',
 'jewelryshops.trade',
 'joyviolet.site',
 'jpanbest.win',
 'kimonodress.date',
 'kitchentask.xyz',
 'kitchenget.pw',
 'kitchentooly.date',
 'machinelive.xyz',
 'manydiscount.site',
 'menshoesly.trade',
 'modelshop.pw',
 'motormeet.me