In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import math
import json
import random
import csv
import multiprocessing
import sys
import pickle
import os
import gc
from annoy import AnnoyIndex
import nmslib
import tensorflow as tf
import bottleneck as bn

import heapq
import time
import operator
from collections import Counter,defaultdict
from outliers import smirnov_grubbs as grubbs

from lightfm import LightFM
from lightfm.evaluation import auc_score,precision_at_k,recall_at_k
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split

from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score,confusion_matrix, roc_curve, auc,classification_report
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix, coo_matrix, hstack,dok_matrix, lil_matrix, vstack
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LassoCV

from gensim.models.doc2vec import Doc2Vec
from gensim import corpora,models,similarities
import nltk
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument


import itertools
import shap
import seaborn as sns

from pandasql import sqldf
# Convenience wrapper: run a SQL query string through pandasql's sqldf,
# resolving table names against this notebook's global variables.
pysqldf = lambda q: sqldf(q, globals())

# Default resolution (dots per inch) for every matplotlib figure in the notebook.
mpl.rcParams['figure.dpi']= 100
#mpl.rcParams['figure.dpi']= 150

In [None]:
# Directory that holds the input CSVs and receives the output files.
# It is string-concatenated with file names (DATAPATH + f), so it must end
# with a path separator. Set the REC_DATAPATH environment variable to point
# the notebook at another location without editing this cell.
DATAPATH = os.environ.get("REC_DATAPATH",
                          "/mnt/projects/continuous_scanning/trackingChange/recData/")
# Thread count passed as num_threads to LightFM.fit in the training cells.
NUM_THREADS=72
def make_cdf( data, label=""):
    """Plot the empirical CDF of ``data`` on the current matplotlib axes.

    The values are sorted and the i-th smallest value is assigned the
    cumulative fraction ``i / (n - 1)``, so the curve runs from 0 to 1.

    Parameters
    ----------
    data : array-like
        Values to plot.
    label : str
        Legend label for the plotted line.

    Returns
    -------
    numpy.ndarray
        The y-values (cumulative fractions) that were plotted.
    """
    sorted_data = np.sort(data)
    n = len(sorted_data)
    # With a single sample n - 1 is 0 and the division would yield NaN;
    # clamp the denominator so that lone point is plotted at 0.0 instead.
    yvals = np.arange(n) / float(max(n - 1, 1))

    plt.plot(sorted_data, yvals, label=label, alpha=0.8)
    return yvals

def cold_train_test_split(interactions,num_users,train_percentage=0.5,random_state=7):
    """Split an interaction matrix by user (row) into train and test parts.

    A random subset of ``int(num_users * train_percentage)`` rows is kept in
    the train matrix; every other row is kept in the test matrix. Both
    results keep the shape of ``interactions``; rows that belong to the other
    part are simply empty.

    Parameters
    ----------
    interactions : scipy.sparse matrix
        User x item interactions. Duplicate (row, col) entries are collapsed,
        keeping the value that appears last.
    num_users : int
        Number of rows of ``interactions``.
    train_percentage : float
        Fraction of users assigned to the train matrix.
    random_state : int
        Seed. Note that it reseeds NumPy's *global* random generator.

    Returns
    -------
    (train, test) : tuple of scipy.sparse.csr_matrix

    Raises
    ------
    ValueError
        If ``num_users`` does not match the number of rows.
    """
    coo = interactions.tocoo()
    if coo.shape[0] != num_users:
        raise ValueError("num_users (%d) does not match interactions rows (%d)"
                         % (num_users, coo.shape[0]))

    # Remove the duplicate (row, col) entries that the interaction builder can
    # introduce. Plain COO->CSR conversion would *sum* duplicates, so pick the
    # last occurrence of each cell explicitly (same result as a dict update).
    flat_keys = coo.row.astype(np.int64) * coo.shape[1] + coo.col.astype(np.int64)
    _, first_in_reversed = np.unique(flat_keys[::-1], return_index=True)
    keep = flat_keys.size - 1 - first_in_reversed
    deduped = csr_matrix((coo.data[keep], (coo.row[keep], coo.col[keep])),
                         shape=coo.shape, dtype=interactions.dtype)

    train_a = deduped.copy()
    test_a = deduped.copy()

    # Choose the training users.
    np.random.seed(random_state)
    train_users = np.random.choice(np.arange(num_users), replace=False,
                                   size=int(num_users * train_percentage))
    is_train_user = np.zeros(num_users, dtype=bool)
    is_train_user[train_users] = True

    # Expand the per-row flag to one flag per stored value, then blank out the
    # rows that belong to the other part.
    nnz_per_row = np.diff(deduped.indptr)
    train_a.data[np.repeat(~is_train_user, nnz_per_row)] = 0
    test_a.data[np.repeat(is_train_user, nnz_per_row)] = 0
    train_a.eliminate_zeros()
    test_a.eliminate_zeros()

    return train_a,test_a



def extractData(f, feat = "slash20", train_percentage=0.8, withItemFeats=True,\
                calcWeights = True,weightCap = 2000):
    """Load an interaction CSV and build the LightFM inputs for a cold-start split.

    Reads ``DATAPATH + f``, registers users (column ``ip``), items (column
    ``p``) and the user-feature column ``feat`` with a LightFM ``Dataset``,
    builds the interaction and weight matrices, and splits the users into
    train and test with ``cold_train_test_split``.

    Parameters
    ----------
    f : str
        File name, appended to ``DATAPATH``.
    feat : str
        Name of the column used as the (single) user feature.
    train_percentage : float
        Fraction of users handed to ``cold_train_test_split`` as train users.
    withItemFeats : bool
        When true, column ``p1`` is read as well and ``p``/``p1`` are used as
        item features.
    calcWeights : bool
        When true, every interaction gets the weight
        ``1 / (100 * relative frequency of its p value)``, capped at
        ``weightCap``. Otherwise no explicit weights are supplied.
    weightCap : number
        Upper bound applied to the computed weights.

    Returns
    -------
    tuple
        ``(df_csv, dataset, num_users, num_items, interactions, weights,
        user_features, userf_train, item_features, train, test, trainw,
        testw, dmap, inv_ip_map, inv_port_map)``. ``item_features`` is
        ``None`` when ``withItemFeats`` is false.
    """
    # "p1" is only used for item features, so only load it when they are
    # requested (it used to be read below without ever being loaded).
    wanted_cols = ["ip", feat, "p"]
    if withItemFeats:
        wanted_cols.append("p1")
    df_csv = pd.read_csv(DATAPATH+f,usecols=list(dict.fromkeys(wanted_cols)))
    df_csv = df_csv.fillna(0)

    # Register every user / item / feature id with the dataset.
    dataset = Dataset()
    if withItemFeats:
        dataset.fit(list(df_csv["ip"]),list(df_csv["p"]),
                    user_features=list(df_csv[feat]),
                    item_features=list(df_csv["p"])+list(df_csv["p1"]))
    else:
        dataset.fit(df_csv["ip"],df_csv["p"],
                    user_features=df_csv[feat])

    num_users, num_items = dataset.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    if calcWeights:
        # Inverse-propensity weights: rarer p values get larger weights.
        normed_counts = (1/(df_csv['p'].value_counts(normalize=True)*100))
        df_weights = pd.DataFrame({'p':normed_counts.index, 'w':normed_counts.values})
        df_csv = pd.merge(df_csv,df_weights,how='left',on="p")
        df_csv['w'] = df_csv['w'].clip(upper=weightCap)

        (interactions, weights) = \
        dataset.build_interactions(list(zip(df_csv['ip'], df_csv['p'],df_csv['w'])))
    else:
        (interactions, weights) = \
        dataset.build_interactions(list(zip(df_csv['ip'], df_csv['p'])))

    print(repr(interactions))

    # One (user, (feature,)) pair per CSV row.
    newFeats = list(zip(df_csv['ip'], \
                        list(zip(list(df_csv[feat])))))
    user_features = dataset.build_user_features(newFeats)

    item_features = None
    if withItemFeats:
        # One (item, (p, p1)) pair per distinct combination.
        newFeats = list(set(list(zip(df_csv['p'], list(zip(list(df_csv["p"]),list(df_csv["p1"])))))))
        item_features = dataset.build_item_features(newFeats)

    # Split users into train and test.
    train,test = cold_train_test_split(interactions,num_users,train_percentage=train_percentage)

    # Weighted counterparts of the two splits.
    trainw = train.multiply(weights)
    testw = test.multiply(weights)
    train.sort_indices()
    trainw.sort_indices()
    test.sort_indices()
    testw.sort_indices()
    trainw = trainw.tocoo()
    testw = testw.tocoo()

    # Training copy of the user features: blank out the feature rows of every
    # user that has no training interaction.
    train_rows = np.unique(trainw.row)
    userf_train_mask = np.array([True]*num_users)
    userf_train_mask[train_rows] = False
    nnz_per_row = np.diff(user_features.indptr)
    userf_train = user_features.copy()
    userf_train.data[np.repeat(userf_train_mask, nnz_per_row)] = 0
    userf_train.eliminate_zeros()

    # Reverse lookups: internal index -> original id, built from the first
    # and third entries of dataset.mapping().
    dmap = dataset.mapping()
    inv_ip_map = {v: k for k, v in dmap[0].items()}
    inv_port_map = {v: k for k, v in dmap[2].items()}

    return (df_csv, dataset, num_users, num_items, interactions, weights,
            user_features, userf_train, item_features, train, test,
            trainw, testw, dmap, inv_ip_map, inv_port_map)
    
    

In [None]:
def sizeof_fmt(num, suffix='B'):
    """Format ``num`` as a human-readable size using binary (1024) prefixes.

    Based on Fred Cirera's answer, https://stackoverflow.com/a/1094933/1870254,
    modified. ``suffix`` is appended after the prefix (default ``'B'``).
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    idx = 0
    while idx < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, prefixes[idx], suffix)
        # Too big for this prefix: scale down and try the next one.
        num = num / 1024.0
        idx += 1
    # Anything left after 'Zi' is reported in 'Yi'.
    return "%.1f %s%s" % (num, 'Yi', suffix)

# Print the ten largest objects in the notebook namespace, biggest first,
# as reported by sys.getsizeof.
for name, size in heapq.nlargest(
        10,
        ((name, sys.getsizeof(value)) for name, value in globals().items()),
        key=operator.itemgetter(1)):
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

# Set up the Data for the Recommender

In [None]:
# Load the small data set and build every matrix/mapping used by the cells
# below (no item features, no interaction weights, 80% of users for training).
f = "ht65k_miniset.iconv" # removed null byte! sed 's/\x0//g' ht65k_1pset.csv > ht65k_1pset.iconv
#f = "ht65k_1pset.iconv"
df, dataset, num_users, num_items, interactions, weights, \
user_features,userf_train, item_features, train,test,trainw,testw,dmap, inv_ip_map,\
inv_port_map = extractData(f,train_percentage = 0.8, withItemFeats=False,calcWeights= False)

#print(str(df[df["ip"]=="164.10.45.170"]["minidata"]))
print(user_features.shape)
#print(item_features.shape)

# Run the Recommender

In [None]:
# Baseline model: WARP loss; the values in trailing comments are earlier settings.
hybrid_model = LightFM(loss='warp', #'warp
                random_state=2016,
                learning_rate=0.01, #0.05
                no_components=4000,
                #learning_schedule='adadelta',
                max_sampled= 175, #50,
                #user_alpha=0.5
                       
                      )

# Train on the train split only, using the user-feature matrix whose
# non-training rows were blanked out by extractData, and the weighted
# train matrix as per-interaction sample weights.
hybrid_model = hybrid_model.fit(train, #newtrain
                  user_features=userf_train,
                  #item_features=item_features,
                sample_weight = trainw,
                  epochs=10, #10
                  num_threads=NUM_THREADS, verbose=True)



In [None]:
# Spot-check a handful of prediction rows: print the known "p" values of the
# user next to the predicted items and the second element of the prediction.
# NOTE(review): `test_rows` and `predictions` are only assigned by later cells
# (the prepTheModel calls), so this cell fails on a fresh top-to-bottom run.
pids = [22003, 41463, 13817, 12794, 44029]
user_ids = test_rows[pids]
for u,i in zip(user_ids,pids):
    print(df[df["ip"] == inv_ip_map[u]][["p"]])
    print([inv_port_map[j] for j in predictions[i][0]])
    print(predictions[i][1])


In [None]:
print("Benchmarking number of Epochs for Training")
# Hand-recorded results of earlier runs, one entry per epoch count
# (training took roughly 35-36 seconds per epoch).
epochs = [2,4,6,8,18,28]
norm_services = [215,276,305,325,477,596]
frac_services = [0.33,0.33,.32,.32,.32,.28]
frac_ips = [0.5,0.51,.51,.51,.5,.48]

plt.plot(epochs,frac_ips, '.-', label="Fraction of IPs")
plt.plot(epochs,frac_services,'.-',label="Fraction of Services")
# The normalized count is divided by 1000 so it shares the 0..1 axis; the
# legend therefore has to say x10^3 (it previously claimed 10^4).
plt.plot(epochs,np.array(norm_services)/1000, '.-',label="Norm Services (x10^3)")
plt.xlabel("Epochs (35 seconds train/epoch)")
plt.legend()


In [None]:
print("Benchmarking number of Guesses")
# Hand-recorded results of earlier runs, one entry per number of guesses.
num_guesses = [10,20,30,40,50,60,70,80,90,100]
norm_services = [253,367,447,513,566,599,626,651,693,716]
frac_services = [0.367,0.412,.438,.453,.463,.467,.468,.478,.471,.483]
frac_ips = [0.635,0.666,.693,.711,.721,.723,.726,.737,.725,.743]

plt.plot(num_guesses,frac_ips, '.-', label="Fraction of IPs")
plt.plot(num_guesses,frac_services,'.-',label="Fraction of Services")
# The normalized count is divided by 1000 so it shares the 0..1 axis; the
# legend therefore has to say x10^3 (it previously claimed 10^4).
plt.plot(num_guesses,np.array(norm_services)/1000, '.-',label="Norm Services (x10^3)")
plt.xlabel("Number of Guesses")
plt.legend()




slash23<br>
Total Normalized Services:  275.60068175444036<br>
Total Fraction Services: 0.1504163978262171<br>
Total Fraction IPs: 0.26026281153043646<br>
slash22<br>
Total Normalized Services:  345.8524479960912<br>
Total Fraction Services: 0.19542860373693768<br>
Total Fraction IPs: 0.31960831110155413<br>
slash20<br>
Total Normalized Services:  470.6145427630729<br>
Total Fraction Services: 0.30577136699548346<br>
Total Fraction IPs: 0.4869636312509895<br>
slash19<br>
Total Normalized Services:  548.2586308052825<br>
Total Fraction Services: 0.3608868741560627<br>
Total Fraction IPs: 0.5565077872271044<br>
slash18<br>
Total Normalized Services:  605.7537606680435<br>
Total Fraction Services: 0.39644980942815145<br>
Total Fraction IPs: 0.595762217238943   <br>
slash17<br>
Total Normalized Services:  657.5500775907003<br>
Total Fraction Services: 0.44045527714401644<br>
Total Fraction IPs: 0.6762068370696113<br>
slash16<br>
Total Normalized Services:  712.1528322787614<br>
Total Fraction Services: 0.4779535310668698<br>
Total Fraction IPs: 0.7342791586817362<br>
ASN<br>
Total Normalized Services:  1031.9949926509712<br>
Total Fraction Services: 0.4221605459733798<br>
Total Fraction IPs: 0.6154804072846195<br>
<br>
Max sampled 500, learning rate 0.05<br>
ASN or slash16 (whichever smaller)<br>
Total Normalized Services:  788.1392639226955<br>
Total Normalized IPs:  47579.61659849398<br>
Total Fraction Services: 0.2771394698093774<br>
Total Fraction IPs: 0.39957797033504805<br>

ASN<br>
Total Normalized Services:  742.5351331088734<br>
Total Normalized IPs:  47807.5798893569<br>
Total Fraction Services: 0.28212160527192265<br>
Total Fraction IPs: 0.4109968737668156<br>
slash16<br>
Total Normalized Services:  579.7953558116823<br>
Total Normalized IPs:  49038.62965705219<br>
Total Fraction Services: 0.2863692242114928<br>
Total Fraction IPs: 0.39370263685836016<br>

In [None]:
print("Learning Rate of 0.05, max sampled 500")
retrains = [False] 
retrain_weights = [False] 
BENCH2_CYCLES = 30

RETRAIN = False
RETRAIN_WEIGHTS = False
WITH_ITEM_FEATS=False
BIASED = True
HIST_BINS=100
EF_CONSTRUCTION=600

total_epochs = 2

# Train and evaluate one model per user-feature column.
for curFeat in ["asn","slash16"]:
    print(curFeat)

    # Free the previous run's large objects before loading the next data set.
    # pop(..., None) instead of `del` so the first pass on a fresh kernel
    # (where the embeddings do not exist yet) does not raise NameError.
    for stale_name in ("hybrid_model", "user_embeddings", "item_embeddings", "df"):
        globals().pop(stale_name, None)
    gc.collect()

    f = "ht65k_1pset.iconv"
    df, dataset, num_users, num_items, interactions, weights, \
    user_features,userf_train, item_features, train,test,trainw,testw,dmap, inv_ip_map, \
    inv_port_map = extractData(f,feat = curFeat, train_percentage = 0.08, withItemFeats=False,calcWeights= False)


    hybrid_model = LightFM(loss='warp', #'warp
        random_state=2016,
        learning_rate=0.05,
        no_components=4000,
        #learning_schedule='adadelta',
        max_sampled= 500, #175,
        #user_alpha=0.5

        )


    print("Initialized Model")
    # Unweighted training on the train split.
    hybrid_model = hybrid_model.fit(train, #newtrain
                      user_features=userf_train,
                      #item_features=item_features,
                    #sample_weight = trainw,
                      epochs=20, #10
                      num_threads=NUM_THREADS, verbose=True)


    print("EF_CONSTRUCTION: ", EF_CONSTRUCTION)
    model_description = constructDescript("PlayingOptimized_", RETRAIN =RETRAIN, RETRAIN_WEIGHTS =  RETRAIN_WEIGHTS, BIASED=BIASED)
    print(model_description)


    # Reset per iteration: prepTheModel hands LIM back and it is reassigned below.
    START = 0
    LIM = 4539900
    NUM_PREDS = 100


    num_test_ips, num_services,num_ips_per_port, test_rows, testcoo,\
        item_biases, item_embeddings, user_biases, user_embeddings, predictions, \
        pred_ranks, testPorts, hist_preds,LIM = \
        prepTheModel(hybrid_model,test,train,LIM,START,HIST_BINS,user_features,\
                     item_features,BIASED = BIASED,WITH_ITEM_FEATS=WITH_ITEM_FEATS,\
                     NUM_PREDS=NUM_PREDS,EF_CONSTRUCTION =EF_CONSTRUCTION )

    all_correctly_predicted, norm_services, frac_service, frac_ip, total_ips_correct \
    = evalSuccess(predictions,test,dmap,num_ips_per_port, num_services,num_test_ips)

    #np.save(DATAPATH + "1p_train20Epoch_100Pred_600EF_" + curFeat+ "_predictions.npy",all_correctly_predicted)
    

    

In [None]:
# Re-run the evaluation on whatever predictions/test data are currently in
# the kernel (left over from the last executed experiment cell).
all_correctly_predicted, norm_services, frac_service, frac_ip, total_ips_correct \
    = evalSuccess(predictions,test,dmap,num_ips_per_port, num_services,num_test_ips)

In [None]:
# Benchmark run without retraining (RETRAIN = False), using the model,
# predictions and model_description already present in the kernel.
print("False Retrain")

BENCH2_CYCLES = 100
BIASED=True


RETRAIN = False
RETRAIN_WEIGHTS = False
WITH_ITEM_FEATS=False
BIASED = True
HIST_BINS=100
EF_CONSTRUCTION=600


START = 0
LIM = 4539900
NUM_PREDS = 100



# NOTE(review): later cells pass more positional arguments to
# bench2_Improvements (RETRAIN_GAP, RETRAIN_ALL, ...); confirm this call still
# matches the current signature.
bench2_results,predictions,all_correctly_predicted =  bench2_Improvements(hybrid_model,predictions,hist_preds,num_services,num_test_ips,\
                                           test,test_rows,train,user_features,item_features, \
                                           NUM_THREADS,RETRAIN,BIASED, LIM,HIST_BINS, BENCH2_CYCLES,\
                                           RETRAIN_WEIGHTS,WITH_ITEM_FEATS,model_description)




In [None]:
# Benchmark run with retraining enabled (RETRAIN = True).
print("True Retrain")


BENCH2_CYCLES = 100
BIASED=True


RETRAIN = True
RETRAIN_WEIGHTS = False
WITH_ITEM_FEATS=False
BIASED = True
HIST_BINS=100
EF_CONSTRUCTION=600

START = 0
LIM = 4539900
NUM_PREDS = 100

# The data loading / training / prepTheModel section below is wrapped in a
# triple-quoted string and therefore NOT executed; the benchmark call at the
# end uses whatever model and predictions are already in the kernel.
'''
curFeat = "slash16"
f = "ht65k_1pset.iconv"
df, dataset, num_users, num_items, interactions, weights, \
user_features,userf_train, item_features, train,test,trainw,testw,dmap, inv_ip_map, \
inv_port_map = extractData(f,feat = curFeat, train_percentage = 0.08, withItemFeats=False,calcWeights= False)



hybrid_model = LightFM(loss='warp', #'warp
    random_state=2016,
    learning_rate=0.05, #0.05
    no_components=4000,
    #learning_schedule='adadelta',
    max_sampled= 2000, #50,
    #user_alpha=0.5

    )


print("Initialized Model")
hybrid_model = hybrid_model.fit(train, #newtrain
                  user_features=userf_train,
                  #item_features=item_features,
                #sample_weight = trainw,
                  epochs=20, #10
                  num_threads=NUM_THREADS, verbose=True)


model_description = constructDescript("Epoch20_maxsample2000_slash16user_1p_model_", RETRAIN =RETRAIN, RETRAIN_WEIGHTS =  RETRAIN_WEIGHTS, BIASED=True)
print(model_description)

num_test_ips, num_services,num_ips_per_port, test_rows, testcoo,\
    item_biases, item_embeddings, user_biases, user_embeddings, predictions, \
    pred_ranks, testPorts, hist_preds,LIM = \
    prepTheModel(hybrid_model,test,train,LIM,START,HIST_BINS,user_features,\
                 item_features,BIASED = BIASED,WITH_ITEM_FEATS=WITH_ITEM_FEATS,\
                 NUM_PREDS=NUM_PREDS,EF_CONSTRUCTION =EF_CONSTRUCTION )

#'''

# NOTE(review): model_description is not reassigned here because the block
# above is disabled; it still carries the value from an earlier cell.
bench2_results,predictions,all_correctly_predicted = bench2_Improvements(hybrid_model,predictions,hist_preds,num_services,num_test_ips,\
                                           test,test_rows,train,user_features,item_features, \
                                           NUM_THREADS,RETRAIN,BIASED, LIM,HIST_BINS, BENCH2_CYCLES,\
                                           RETRAIN_WEIGHTS,WITH_ITEM_FEATS,model_description)



In [None]:
# Drop the data frame and the index->ip lookup, then force a collection.
# Cells that still need `df` / `inv_ip_map` must reload the data first.
del df
del inv_ip_map
#del user_embeddings
gc.collect()

In [None]:
# Retraining benchmark with the extra retrain controls
# (RETRAIN_GAP / RETRAIN_ALL / RETRAIN_EPOCHS).
print("True Retrain,slash16,more filtered dataset")


BENCH2_CYCLES = 100
BIASED=True


RETRAIN = True
RETRAIN_WEIGHTS = False
WITH_ITEM_FEATS=False
BIASED = True
HIST_BINS=100
EF_CONSTRUCTION=600

START = 0
LIM = 4539900
NUM_PREDS = 100

RETRAIN_GAP = 10
RETRAIN_ALL = True
RETRAIN_EPOCHS =10 

# The setup section below is wrapped in a triple-quoted string and is NOT
# executed; the benchmark uses the model/predictions already in the kernel.
'''
curFeat = "slash16"
#f = "ht65k_1pset.iconv"
f = "ht65k_1pset_l10.iconv"
df, dataset, num_users, num_items, interactions, weights, \
user_features,userf_train, item_features, train,test,trainw,testw,dmap, inv_ip_map, \
inv_port_map = extractData(f,feat = curFeat, train_percentage = 0.08, withItemFeats=False,calcWeights= False)



hybrid_model = LightFM(loss='warp', #'warp
    random_state=2016,
    learning_rate=0.05, #0.05
    no_components=4000,
    #learning_schedule='adadelta',
    max_sampled= 175,
    #user_alpha=0.5

    )


print("Initialized Model")
hybrid_model = hybrid_model.fit(train, #newtrain
                  user_features=userf_train,
                  #item_features=item_features,
                #sample_weight = trainw,
                  epochs=20, #10
                  num_threads=NUM_THREADS, verbose=True)




num_test_ips, num_services,num_ips_per_port, test_rows, testcoo, predictions, \
    pred_ranks, testPorts, hist_preds,LIM = \
    prepTheModel(hybrid_model,test,train,LIM,START,HIST_BINS,user_features,\
                 item_features,BIASED = BIASED,WITH_ITEM_FEATS=WITH_ITEM_FEATS,\
                 NUM_PREDS=NUM_PREDS,EF_CONSTRUCTION =EF_CONSTRUCTION )

'''
model_description = constructDescript("PlayingRetrainGap10_6KComp_", RETRAIN =RETRAIN, RETRAIN_WEIGHTS =  RETRAIN_WEIGHTS, BIASED=True)
print(model_description)
# NOTE(review): this call passes RETRAIN_GAP, RETRAIN_ALL, RETRAIN_EPOCHS and
# EF_CONSTRUCTION positionally, unlike the earlier benchmark cells; confirm it
# matches the bench2_Improvements definition in use.
bench2_results,predictions,all_correctly_predicted = bench2_Improvements(hybrid_model,predictions,hist_preds,num_services,num_test_ips,\
                                           test,test_rows,train,user_features,item_features, \
                                           NUM_THREADS,RETRAIN,RETRAIN_GAP, RETRAIN_ALL,RETRAIN_EPOCHS,BIASED, LIM,HIST_BINS, BENCH2_CYCLES,\
                                           RETRAIN_WEIGHTS,WITH_ITEM_FEATS,EF_CONSTRUCTION,model_description)




In [None]:
# Full experiment: reload the filtered data set, train a fresh model,
# prepare predictions and run the retraining benchmark with RETRAIN_ALL off.
print("True Retrain,slash16,more filtered dataset, retrain all false")



BENCH2_CYCLES = 600
BIASED=True


RETRAIN = True
RETRAIN_WEIGHTS = False
WITH_ITEM_FEATS=False
BIASED = True
HIST_BINS=100
EF_CONSTRUCTION=600

START = 0
LIM = 4539900
NUM_PREDS = 100

RETRAIN_GAP = 10
RETRAIN_ALL = False
RETRAIN_EPOCHS =10 

# The "#'''" markers are toggles: as comments they leave the section enabled.
#'''
curFeat = "slash16"
#f = "ht65k_1pset.iconv"
f = "ht65k_1pset_l10.iconv"
df, dataset, num_users, num_items, interactions, weights, \
user_features,userf_train, item_features, train,test,trainw,testw,dmap, inv_ip_map, \
inv_port_map = extractData(f,feat = curFeat, train_percentage = 0.08, withItemFeats=False,calcWeights= False)
#'''


hybrid_model = LightFM(loss='warp', #'warp
    random_state=2016,
    learning_rate=0.05, #0.05
    no_components=4000,
    #learning_schedule='adadelta',
    max_sampled= 175,
    #user_alpha=0.5

    )


print("Initialized Model")
hybrid_model = hybrid_model.fit(train, #newtrain
                  user_features=userf_train,
                  #item_features=item_features,
                #sample_weight = trainw,
                  epochs=20, #10
                  num_threads=NUM_THREADS, verbose=True)




num_test_ips, num_services,num_ips_per_port, test_rows, testcoo, predictions, \
    pred_ranks, testPorts, hist_preds,LIM = \
    prepTheModel(hybrid_model,test,train,LIM,START,HIST_BINS,user_features,\
                 item_features,BIASED = BIASED,WITH_ITEM_FEATS=WITH_ITEM_FEATS,\
                 NUM_PREDS=NUM_PREDS,EF_CONSTRUCTION =EF_CONSTRUCTION )

#'''
# NOTE(review): the description says "6KComp" but the model above is built
# with no_components=4000; confirm which one is intended.
model_description = constructDescript("PlayingRetrainGap10_6KComp_RetrainAllFalse", RETRAIN =RETRAIN, RETRAIN_WEIGHTS =  RETRAIN_WEIGHTS, BIASED=True)
print(model_description)
bench2_results,predictions,all_correctly_predicted = bench2_Improvements(hybrid_model,predictions,hist_preds,num_services,num_test_ips,\
                                           test,test_rows,train,user_features,item_features, \
                                           NUM_THREADS,RETRAIN,RETRAIN_GAP, RETRAIN_ALL,RETRAIN_EPOCHS,BIASED, LIM,HIST_BINS, BENCH2_CYCLES,\
                                           RETRAIN_WEIGHTS,WITH_ITEM_FEATS,EF_CONSTRUCTION,model_description)





In [None]:
# Same experiment as the previous one but with learning_rate=0.01 and
# BENCH2_CYCLES = 300.
# NOTE(review): the printed banner is identical to the previous cell's and
# does not mention the changed learning rate.
print("True Retrain,slash16,more filtered dataset, retrain all false")


BENCH2_CYCLES = 300
BIASED=True


RETRAIN = True
RETRAIN_WEIGHTS = False
WITH_ITEM_FEATS=False
BIASED = True
HIST_BINS=100
EF_CONSTRUCTION=600

START = 0
LIM = 4539900
NUM_PREDS = 100

RETRAIN_GAP = 10
RETRAIN_ALL = False
RETRAIN_EPOCHS =10 

#'''
curFeat = "slash16"
#f = "ht65k_1pset.iconv"
f = "ht65k_1pset_l10.iconv"
df, dataset, num_users, num_items, interactions, weights, \
user_features,userf_train, item_features, train,test,trainw,testw,dmap, inv_ip_map, \
inv_port_map = extractData(f,feat = curFeat, train_percentage = 0.08, withItemFeats=False,calcWeights= False)
#'''


hybrid_model = LightFM(loss='warp', #'warp
    random_state=2016,
    learning_rate=0.01, #0.05
    no_components=4000,
    #learning_schedule='adadelta',
    max_sampled= 175,
    #user_alpha=0.5

    )


print("Initialized Model")
hybrid_model = hybrid_model.fit(train, #newtrain
                  user_features=userf_train,
                  #item_features=item_features,
                #sample_weight = trainw,
                  epochs=20, #10
                  num_threads=NUM_THREADS, verbose=True)




num_test_ips, num_services,num_ips_per_port, test_rows, testcoo, predictions, \
    pred_ranks, testPorts, hist_preds,LIM = \
    prepTheModel(hybrid_model,test,train,LIM,START,HIST_BINS,user_features,\
                 item_features,BIASED = BIASED,WITH_ITEM_FEATS=WITH_ITEM_FEATS,\
                 NUM_PREDS=NUM_PREDS,EF_CONSTRUCTION =EF_CONSTRUCTION )

#'''
model_description = constructDescript("PlayingRetrainGap10_4KComp_0.01LR_RetrainAllFalse", RETRAIN =RETRAIN, RETRAIN_WEIGHTS =  RETRAIN_WEIGHTS, BIASED=True)
print(model_description)
bench2_results,predictions,all_correctly_predicted = bench2_Improvements(hybrid_model,predictions,hist_preds,num_services,num_test_ips,\
                                           test,test_rows,train,user_features,item_features, \
                                           NUM_THREADS,RETRAIN,RETRAIN_GAP, RETRAIN_ALL,RETRAIN_EPOCHS,BIASED, LIM,HIST_BINS, BENCH2_CYCLES,\
                                           RETRAIN_WEIGHTS,WITH_ITEM_FEATS,EF_CONSTRUCTION,model_description)




In [None]:
# Drop the trained model and force a garbage collection before the next run.
del hybrid_model
gc.collect()

In [None]:
# Same experiment as before with EF_CONSTRUCTION raised from 600 to 1000.
print("True Retrain,slash16,more filtered dataset, retrain all false, EF_Construction 1K")


BENCH2_CYCLES = 300
BIASED=True


RETRAIN = True
RETRAIN_WEIGHTS = False
WITH_ITEM_FEATS=False
BIASED = True
HIST_BINS=100
EF_CONSTRUCTION=1000

START = 0
LIM = 4539900
NUM_PREDS = 100

RETRAIN_GAP = 10
RETRAIN_ALL = False
RETRAIN_EPOCHS =10 

#'''
curFeat = "slash16"
#f = "ht65k_1pset.iconv"
f = "ht65k_1pset_l10.iconv"
df, dataset, num_users, num_items, interactions, weights, \
user_features,userf_train, item_features, train,test,trainw,testw,dmap, inv_ip_map, \
inv_port_map = extractData(f,feat = curFeat, train_percentage = 0.08, withItemFeats=False,calcWeights= False)
#'''


hybrid_model = LightFM(loss='warp', #'warp
    random_state=2016,
    learning_rate=0.01, #0.05
    no_components=4000,
    #learning_schedule='adadelta',
    max_sampled= 175,
    #user_alpha=0.5

    )


print("Initialized Model")
hybrid_model = hybrid_model.fit(train, #newtrain
                  user_features=userf_train,
                  #item_features=item_features,
                #sample_weight = trainw,
                  epochs=20, #10
                  num_threads=NUM_THREADS, verbose=True)




num_test_ips, num_services,num_ips_per_port, test_rows, testcoo, predictions, \
    pred_ranks, testPorts, hist_preds,LIM = \
    prepTheModel(hybrid_model,test,train,LIM,START,HIST_BINS,user_features,\
                 item_features,BIASED = BIASED,WITH_ITEM_FEATS=WITH_ITEM_FEATS,\
                 NUM_PREDS=NUM_PREDS,EF_CONSTRUCTION =EF_CONSTRUCTION )

#'''
model_description = constructDescript("PlayingRetrainGap10_4KComp_0.01LR_RetrainAllFalse_EF1K", RETRAIN =RETRAIN, RETRAIN_WEIGHTS =  RETRAIN_WEIGHTS, BIASED=True)
print(model_description)
bench2_results,predictions,all_correctly_predicted = bench2_Improvements(hybrid_model,predictions,hist_preds,num_services,num_test_ips,\
                                           test,test_rows,train,user_features,item_features, \
                                           NUM_THREADS,RETRAIN,RETRAIN_GAP, RETRAIN_ALL,RETRAIN_EPOCHS,BIASED, LIM,HIST_BINS, BENCH2_CYCLES,\
                                           RETRAIN_WEIGHTS,WITH_ITEM_FEATS,EF_CONSTRUCTION,model_description)





In [None]:
# Drop the trained model and force a garbage collection before the next run.
del hybrid_model
gc.collect()

In [None]:
# Same experiment as the EF_CONSTRUCTION=1000 run but with BIASED = False.
# NOTE(review): the printed banner is identical to the previous experiment's
# and does not mention the BIASED change.
print("True Retrain,slash16,more filtered dataset, retrain all false, EF_Construction 1K")


BENCH2_CYCLES = 300


RETRAIN = True
RETRAIN_WEIGHTS = False
WITH_ITEM_FEATS=False
BIASED = False
HIST_BINS=100
EF_CONSTRUCTION=1000

START = 0
LIM = 4539900
NUM_PREDS = 100

RETRAIN_GAP = 10
RETRAIN_ALL = False
RETRAIN_EPOCHS =10 

#'''
curFeat = "slash16"
#f = "ht65k_1pset.iconv"
f = "ht65k_1pset_l10.iconv"
df, dataset, num_users, num_items, interactions, weights, \
user_features,userf_train, item_features, train,test,trainw,testw,dmap, inv_ip_map, \
inv_port_map = extractData(f,feat = curFeat, train_percentage = 0.08, withItemFeats=False,calcWeights= False)
#'''


hybrid_model = LightFM(loss='warp', #'warp
    random_state=2016,
    learning_rate=0.01, #0.05
    no_components=4000,
    #learning_schedule='adadelta',
    max_sampled= 175,
    #user_alpha=0.5

    )


print("Initialized Model")
hybrid_model = hybrid_model.fit(train, #newtrain
                  user_features=userf_train,
                  #item_features=item_features,
                #sample_weight = trainw,
                  epochs=20, #10
                  num_threads=NUM_THREADS, verbose=True)




num_test_ips, num_services,num_ips_per_port, test_rows, testcoo, predictions, \
    pred_ranks, testPorts, hist_preds,LIM = \
    prepTheModel(hybrid_model,test,train,LIM,START,HIST_BINS,user_features,\
                 item_features,BIASED = BIASED,WITH_ITEM_FEATS=WITH_ITEM_FEATS,\
                 NUM_PREDS=NUM_PREDS,EF_CONSTRUCTION =EF_CONSTRUCTION )

#'''
model_description = constructDescript("PlayingRetrainGap10_4KComp_0.01LR_RetrainAllFalse_EF1K", RETRAIN =RETRAIN, RETRAIN_WEIGHTS =  RETRAIN_WEIGHTS, BIASED=BIASED)
print(model_description)
bench2_results,predictions,all_correctly_predicted = bench2_Improvements(hybrid_model,predictions,hist_preds,num_services,num_test_ips,\
                                           test,test_rows,train,user_features,item_features, \
                                           NUM_THREADS,RETRAIN,RETRAIN_GAP, RETRAIN_ALL,RETRAIN_EPOCHS,BIASED, LIM,HIST_BINS, BENCH2_CYCLES,\
                                           RETRAIN_WEIGHTS,WITH_ITEM_FEATS,EF_CONSTRUCTION,model_description)






In [None]:
# Only collect garbage; the model is deliberately kept because the next cell
# reuses it (its own training section is disabled).
#del hybrid_model
gc.collect()

In [None]:
# Retraining benchmark with CALC_WEIGHTS = True, reusing the model and
# predictions already in the kernel.
print("True Retrain,slash16,more filtered dataset, retrain all false, EF_Construction 1K,Calc Weights")


BENCH2_CYCLES = 300
BIASED=True


RETRAIN = True
RETRAIN_WEIGHTS = False
WITH_ITEM_FEATS=False
# NOTE(review): BIASED is set to True a few lines above and overwritten here;
# the effective value for this cell is False.
BIASED = False
HIST_BINS=100
EF_CONSTRUCTION=1000

START = 0
LIM = 4539900
NUM_PREDS = 100

RETRAIN_GAP = 10
RETRAIN_ALL = False
RETRAIN_EPOCHS =10 
      
CALC_WEIGHTS = True

# The setup section below is wrapped in a triple-quoted string and is NOT
# executed, so `testw` and the model come from an earlier cell that was run
# with calcWeights=False unless this block is re-enabled.
'''
curFeat = "slash16"
#f = "ht65k_1pset.iconv"
f = "ht65k_1pset_l10.iconv"
df, dataset, num_users, num_items, interactions, weights, \
user_features,userf_train, item_features, train,test,trainw,testw,dmap, inv_ip_map, \
inv_port_map = extractData(f,feat = curFeat, train_percentage = 0.08, withItemFeats=False,calcWeights= CALC_WEIGHTS)



hybrid_model = LightFM(loss='warp', #'warp
    random_state=2016,
    learning_rate=0.01, #0.05
    no_components=4000,
    #learning_schedule='adadelta',
    max_sampled= 175,
    #user_alpha=0.5

    )


print("Initialized Model")
hybrid_model = hybrid_model.fit(train, #newtrain
                  user_features=userf_train,
                  #item_features=item_features,
                  sample_weight = trainw,
                  epochs=20, #10
                  num_threads=NUM_THREADS, verbose=True)




num_test_ips, num_services,num_ips_per_port, test_rows, testcoo, predictions, \
    pred_ranks, testPorts, hist_preds,LIM = \
    prepTheModel(hybrid_model,test,train,LIM,START,HIST_BINS,user_features,\
                 item_features,BIASED = BIASED,WITH_ITEM_FEATS=WITH_ITEM_FEATS,\
                 NUM_PREDS=NUM_PREDS,EF_CONSTRUCTION =EF_CONSTRUCTION )

'''
model_description = constructDescript("PlayingRetrainGap10_4KComp_0.01LR_RetrainAllFalse_EF1K_CalcWeightsTrue", RETRAIN =RETRAIN, RETRAIN_WEIGHTS =  RETRAIN_WEIGHTS, BIASED=BIASED)
print(model_description)
# NOTE(review): this call additionally passes testw and CALC_WEIGHTS
# positionally, unlike the preceding benchmark cells; confirm it matches the
# bench2_Improvements definition in use.
bench2_results,predictions,all_correctly_predicted = bench2_Improvements(hybrid_model,predictions,hist_preds,num_services,num_test_ips,\
                                           test,testw,test_rows,train,user_features,item_features, \
                                           NUM_THREADS,RETRAIN,RETRAIN_GAP, RETRAIN_ALL,RETRAIN_EPOCHS,BIASED, LIM,HIST_BINS, BENCH2_CYCLES,\
                                           RETRAIN_WEIGHTS,CALC_WEIGHTS,WITH_ITEM_FEATS,EF_CONSTRUCTION,model_description)







In [None]:
# Full experiment on the "uniform40" data set with the "asn" user feature,
# weighted interactions (cap 10000), 10% of users for training and 300
# predictions per user.
# NOTE(review): the printed banner mentions "slash16" and "Oversample0.64
# Train", but the code below uses curFeat = "asn" and train_percentage = 0.1.
print("True Retrain, UNIFORM Sample  and Oversample0.64 Train,slash16,more filtered dataset, retrain all false, EF_Construction 1K,10kCalc Weights")


BENCH2_CYCLES = 300
BIASED=True


RETRAIN = True
RETRAIN_WEIGHTS = False
WITH_ITEM_FEATS=False
# BIASED was set to True above and is overwritten here; effective value: False.
BIASED = False
HIST_BINS=100
EF_CONSTRUCTION=1000

START = 0
LIM = 4539900
NUM_PREDS = 300

RETRAIN_GAP = 10
RETRAIN_ALL = False
RETRAIN_EPOCHS =10 
      
CALC_WEIGHTS = True

#'''
curFeat = "asn"
f = "ht65k_1pset_uniform40.csv"
df, dataset, num_users, num_items, interactions, weights, \
user_features,userf_train, item_features, train,test,trainw,testw,dmap, inv_ip_map, \
inv_port_map = extractData(f,feat = curFeat, train_percentage = 0.1, \
                           withItemFeats=False,calcWeights= CALC_WEIGHTS,\
                          weightCap = 10000)



hybrid_model = LightFM(loss='warp', #'warp
    random_state=2016,
    learning_rate=0.01, #0.05
    no_components=4000,
    #learning_schedule='adadelta',
    max_sampled= 175,
    #user_alpha=0.5

    )


print("Initialized Model")
# Weighted training: the weighted train matrix is passed as sample_weight.
hybrid_model = hybrid_model.fit(train, #newtrain
                  user_features=userf_train,
                  #item_features=item_features,
                  sample_weight = trainw,
                  epochs=20, #10
                  num_threads=NUM_THREADS, verbose=True)



#'''
num_test_ips, num_services,num_ips_per_port, test_rows, testcoo, predictions, \
    pred_ranks, testPorts, hist_preds,LIM = \
    prepTheModel(hybrid_model,test,train,LIM,START,HIST_BINS,user_features,\
                 item_features,BIASED = BIASED,WITH_ITEM_FEATS=WITH_ITEM_FEATS,\
                 NUM_PREDS=NUM_PREDS,EF_CONSTRUCTION =EF_CONSTRUCTION )


model_description = constructDescript("0.1Train_uniform40_PlayingRetrainGap10_4KComp_0.01LR_RetrainAllFalse_EF1K_10kCalcWeightsTrue_ASN_", RETRAIN =RETRAIN, RETRAIN_WEIGHTS =  RETRAIN_WEIGHTS, BIASED=BIASED)
print(model_description)
bench2_results,predictions,all_correctly_predicted = bench2_Improvements(hybrid_model,predictions,hist_preds,num_services,num_test_ips,\
                                           test,testw,test_rows,train,user_features,item_features, \
                                           NUM_THREADS,RETRAIN,RETRAIN_GAP, RETRAIN_ALL,RETRAIN_EPOCHS,BIASED, LIM,HIST_BINS, BENCH2_CYCLES,\
                                           RETRAIN_WEIGHTS,CALC_WEIGHTS,WITH_ITEM_FEATS,EF_CONSTRUCTION,model_description)













In [None]:
from imblearn.over_sampling import RandomOverSampler

# Resample df with the port column df["p"] as the class label (fixed seed for repeatability).
# NOTE(review): X_res / y_res are not referenced anywhere else in the visible notebook -- confirm
# whether the following save cell was meant to write X_res instead of df.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(df, df["p"])

In [None]:
# Writes `df` (not the resampled X_res) to the "oversample" file.
# NOTE(review): verify that `df` is the frame intended for this file name.
df.to_csv(DATAPATH + "ht65k_1pset_uniform200_l10_oversample_fakeIPs.csv",index=False)

In [None]:
# Translate the correctly-predicted (row, col) pairs back to ip/port values and save them.
# `convert` is defined in a later cell ("Analysis (Deep Dive)"), so that cell must be run first.
df_hit = convert(all_correctly_predicted)
df_hit.to_csv(DATAPATH + "0.64Train_uniform200_l10_corr_predictions.csv",index=False)

In [None]:
# Drop the reference to the fitted model and run a garbage-collection pass.
# Cells further down that use `hybrid_model` need the training cell re-run after this.
del hybrid_model
gc.collect()

# Test Bed

In [None]:
def calcNormed(all_correctly_predicted,num_ips_per_port):
    """Return the sum over ports of (correct predictions on the port / num_ips_per_port).

    all_correctly_predicted : sparse matrix whose column sums are the per-port hit counts.
    num_ips_per_port        : sparse matrix with one row, one value per port column.

    NaN ratios (a port with neither hits nor test IPs) are replaced by 0 before summing.
    """
    # Column sums -> one hit count per port.
    hits_per_port = coo_matrix(all_correctly_predicted.sum(axis=0))

    #do some kind of sum of all of these fractions and divide by total number ports?
    per_port_fraction = coo_matrix(hits_per_port / num_ips_per_port)
    per_port_fraction.data = np.nan_to_num(per_port_fraction.data, copy=False)
    return per_port_fraction.sum()

def evalSuccess(predictions,test,dmap,num_ips_per_port, num_services,num_test_ips):
    """Score a set of predicted (ip, port) cells against the test matrix and print a summary.

    Parameters
    ----------
    predictions : scipy sparse matrix with the same shape as ``test``. Every stored entry
        counts as "this cell was predicted"; the stored values themselves are ignored.
    test : scipy sparse matrix that supports column indexing (e.g. CSR).
    dmap : indexable; ``dmap[2][port]`` must give the column index of a port number.
    num_ips_per_port : per-port denominator matrix handed to ``calcNormed``.
    num_services, num_test_ips : denominators of the service / IP fractions.

    Returns
    -------
    (all_correctly_predicted, norm_services, frac_services, frac_ips, total_ips_correct)
    """

    def _ratio(numerator, denominator):
        # The "depopularized" figures are only printed; report NaN instead of aborting
        # the whole evaluation when nothing is left outside the popular ports.
        return numerator / denominator if denominator else float("nan")

    # 0/1 mask of the predicted cells.
    pred_coo = predictions.tocoo()
    predictions_identity = coo_matrix((np.ones(len(pred_coo.row), dtype=int),
                                       (pred_coo.row, pred_coo.col)),
                                      shape=test.shape)

    all_correctly_predicted = (test.multiply(predictions_identity)).tocoo()
    del predictions_identity

    # Distinct rows (IPs) with at least one correct prediction, computed once and reused.
    correct_rows = np.unique(all_correctly_predicted.row)

    # Diagonal selector matrix: the matrix product below keeps the full test row of every
    # IP in correct_rows and zeroes all other rows.
    corPred_identityish = coo_matrix((np.ones(len(correct_rows), dtype=int),
                                      (correct_rows, correct_rows)),
                                     shape=(test.shape[0], test.shape[0]))

    #extracting rows w/ at least 1 prediction, pretending we found all ports
    normalized_found = corPred_identityish * test
    del corPred_identityish

    total_services_correct = all_correctly_predicted.count_nonzero()
    total_ips_correct = len(correct_rows)
    print("total ips correct: ",total_ips_correct)

    norm_services = calcNormed(all_correctly_predicted,num_ips_per_port)
    norm_IPs = calcNormed(normalized_found, num_ips_per_port)

    #Remove top 4 ports 80,443,7547,22
    popPorts = [dmap[2][p] for p in [80,443,7547,22]]

    popCrew = all_correctly_predicted.tocsr()[:,popPorts]
    popCrew_services_correct = popCrew.count_nonzero()
    popCrew_ips_correct = len(np.unique(popCrew.tocoo().row))

    test_pop = test[:,popPorts]
    num_test_ips_depop = len(np.unique(test_pop.tocoo().row))
    num_test_services_depop = test_pop.count_nonzero()

    #if RETRAIN_WEIGHTS:
    #    norm_services = norm_services/10

    frac_services = total_services_correct/num_services
    frac_ips = total_ips_correct/num_test_ips

    frac_services_depop = _ratio(total_services_correct - popCrew_services_correct,
                                 num_services - num_test_services_depop)
    frac_ips_depop = _ratio(total_ips_correct - popCrew_ips_correct,
                            num_test_ips - num_test_ips_depop)

    print("Total Normalized Services: ", norm_services)
    print("Total Normalized IPs: ", norm_IPs)
    print("Total Fraction Services:", frac_services)
    print("Total Fraction IPs:", frac_ips )
    print("Total Fraction Depopularized Services:", frac_services_depop)
    print("Total Fraction Depopularized IPs:", frac_ips_depop )

    return all_correctly_predicted, norm_services, frac_services, frac_ips, total_ips_correct


In [None]:
def groundTruthServices(test,LIM,START):
    """Select the test rows to evaluate and compute their ground-truth totals.

    Parameters
    ----------
    test  : scipy sparse matrix supporting row indexing (e.g. CSR).
    LIM   : end of the slice taken from the sorted distinct row ids.
    START : start of that slice.

    Returns
    -------
    (num_test_ips, num_services, num_ips_per_port, test_rows, testcoo, LIM) where
    num_services is the sum of the stored values in the selected rows, num_ips_per_port
    is their 1 x n_cols column-sum matrix, and LIM is lowered to len(test_rows) when the
    requested limit exceeds the number of selected rows.
    """
    testcoo = test.tocoo()

    # np.unique yields the distinct row ids already sorted, so the START:LIM window is
    # well defined (slicing list(set(...)) depends on the set's internal ordering).
    test_rows = np.unique(testcoo.row)[START:LIM]

    #adjust lim
    if LIM > len(test_rows):
        LIM = len(test_rows)

    # Index once and reuse the sub-matrix for both totals.
    selected = test[test_rows]
    num_test_ips = len(test_rows)
    num_services = selected.tocoo().data.sum()
    num_ips_per_port = coo_matrix(selected.sum(axis=0))
    return num_test_ips, num_services,num_ips_per_port, test_rows, testcoo, LIM

def getNewPredictions(model,user_features,test_rows,item_features,\
                      BIASED,WITH_ITEM_FEATS,OLD=False, NUM_PREDS=30, \
                      EF_CONSTRUCTION = 200,M=20):
    """Score items for the users in ``test_rows``.

    Parameters
    ----------
    model : object exposing ``get_item_representations`` / ``get_user_representations``.
    user_features : row-indexable user feature matrix (indexed with ``test_rows``).
    test_rows : row ids of the users to score.
    item_features : only passed to the model when WITH_ITEM_FEATS is true.
    BIASED : OLD path: add item and user biases to the dot products.
             ANN path: subtract the L2-normalised item bias from each returned distance.
    OLD : True -> dense score matrix; False -> nmslib HNSW nearest-neighbour search.
    NUM_PREDS : neighbours requested per user (ANN path).
    EF_CONSTRUCTION, M : HNSW index-time parameters (ANN path).

    Returns
    -------
    (predictions, top_items). Exactly one of the two is not None:
    OLD=True  -> predictions is a dense (len(test_rows), n_items) array, top_items is None.
    OLD=False -> predictions is None, top_items is a sparse matrix of shape ``test.shape``
                 holding the (optionally bias-adjusted) distances of each user's neighbours.

    The ANN path reads the notebook globals ``test`` (output shape) and ``NUM_THREADS``.
    """
    if WITH_ITEM_FEATS:
        item_biases, item_embeddings = model.get_item_representations(item_features)
    else:
        item_biases, item_embeddings = model.get_item_representations()

    print("Got Item Representations")
    user_biases, user_embeddings = model.get_user_representations(user_features[test_rows])
    print("Got User Representations")

    predictions = None
    # Bound on every path; previously the OLD branch reached the return statement with
    # top_items undefined and raised UnboundLocalError.
    top_items = None

    if OLD:
        predictions = user_embeddings.dot(item_embeddings.T)
        if BIASED:
            predictions = predictions + item_biases.reshape(1, -1) + user_biases.reshape(-1, 1)
    else:
        # Append sqrt(max_norm^2 - norm^2) to every item vector so that all augmented
        # item vectors have the same length.
        norms = np.linalg.norm(item_embeddings, axis=1)
        max_norm = norms.max()
        extra_dimension = np.sqrt(max_norm ** 2 - norms ** 2)
        norm_data = np.append(item_embeddings, extra_dimension.reshape(norms.shape[0], 1), axis=1)

        #first an nmslib
        nms_member_idx = nmslib.init(method='hnsw', space='cosinesimil')
        nms_member_idx.addDataPointBatch(norm_data)

        #https://github.com/nmslib/nmslib/blob/master/manual/methods.md
        indexTimeParams = {'efConstruction':  EF_CONSTRUCTION, 'M':M }
        nms_member_idx.createIndex(indexTimeParams,print_progress=True)
        print("Made Index")

        # NOTE(review): the queries are NOT padded with the extra zero dimension added to
        # the items (the padded variant was `np.c_[user_embeddings,[0]*len(test_rows)]`);
        # confirm nmslib treats the shorter query vectors as intended.
        neighbours = nms_member_idx.knnQueryBatch(user_embeddings, k=NUM_PREDS, num_threads=NUM_THREADS)
        print("Got Top Items")
        del item_embeddings
        del user_embeddings
        gc.collect()

        # knnQueryBatch returns one (ids, distances) pair per query. Handling the pairs
        # individually (rather than via np.array(neighbours)) also works when some queries
        # return fewer than NUM_PREDS neighbours.
        num_r = [len(ids) for ids, _ in neighbours]
        rows = np.repeat(test_rows, num_r)
        cols = np.concatenate([ids for ids, _ in neighbours])
        dists = np.concatenate([dist for _, dist in neighbours]).astype(np.float64)

        top_items_coo = coo_matrix((dists, (rows, cols)), shape=test.shape)
        if BIASED:
            norm = np.linalg.norm(item_biases)
            item_biases_norm = (item_biases/norm) #*4
            # 1 x n_items row vector of normalised biases.
            item_biases_coo = coo_matrix((item_biases_norm,\
                                         ([0]*len(item_biases_norm),np.arange(len(item_biases_norm)))))

            # 0/1 mask of the returned neighbours; multiplying by the bias row places each
            # item's bias on exactly the cells that hold a distance.
            top_items_identity = coo_matrix((np.ones(len(dists), dtype=int), (rows, cols)),\
                                            shape=test.shape)
            item_biases_identity = top_items_identity.multiply(item_biases_coo)
            top_items = top_items_coo - item_biases_identity
        else:
            top_items = top_items_coo
    print("Got Predictions/Top Items")

    return predictions, top_items
    

def prepTheModel(model,test,train,LIM,START,HIST_BINS, user_features,item_features,\
                 BIASED = True,WITH_ITEM_FEATS=False,NUM_PREDS=30,EF_CONSTRUCTION=200, M=20 ):
    """Gather the ground truth for the test rows and the model's predictions for them.

    Returns (num_test_ips, num_services, num_ips_per_port, test_rows, testcoo,
    predictions, pred_ranks, testPorts, hist_preds, LIM).

    getNewPredictions is always called with OLD=False. When it hands back no dense
    prediction matrix, ``predictions`` is its sparse top-items matrix and both
    ``pred_ranks`` and ``hist_preds`` are None.
    """
    ground_truth = groundTruthServices(test,LIM,START)
    num_test_ips, num_services, num_ips_per_port, test_rows, testcoo, LIM = ground_truth
    testPorts = list(set(testcoo.col))

    print("...Grabbed Representations")
    predictions, top_items = getNewPredictions(
        model, user_features, test_rows, item_features, BIASED, WITH_ITEM_FEATS,
        NUM_PREDS=NUM_PREDS, EF_CONSTRUCTION=EF_CONSTRUCTION, M=M, OLD=False)
    print("...Calculated Predictions")

    if predictions is None:
        # Sparse path: the neighbour matrix stands in for the predictions.
        predictions = top_items
        pred_ranks = None
        hist_preds = None
    else:
        #get ranking order of all ports per ip
        #pred_ranks = predictions.argsort(axis=-1)[:,::-1]
        descending_order = tf.argsort(predictions,axis=-1,direction='DESCENDING',stable=False,name=None)
        pred_ranks = tf.keras.backend.eval(descending_order)
        print("...Sorted Rankings")

        hist_preds = hist_scores_per_user(predictions,HIST_BINS)
        print("...Histogramed the Predictions")

    return (num_test_ips, num_services, num_ips_per_port, test_rows, testcoo,
            predictions,
            pred_ranks, testPorts, hist_preds, LIM)

def constructDescript(prefix,LR=0.01,N_COMP=4000,EPOCHS=10,MAX_SAMPLE=175,BIASED=False,\
                 RETRAIN=False,RETRAIN_WEIGHTS=False,WITH_ITEM_FEATS=False,\
                      CALC_WEIGHTS=False):
    """Build the run description used in output file names.

    The result is ``prefix`` followed by each setting rendered with ``str()`` and
    immediately followed by its tag, in a fixed order.
    """
    tagged_settings = (
        (LR, "LR_"),
        (N_COMP, "NCOMP_"),
        (EPOCHS, "EPOCHS_"),
        (MAX_SAMPLE, "MAXSAMPLE_"),
        (BIASED, "BIASED_"),
        (RETRAIN, "RETRAIN_"),
        (RETRAIN_WEIGHTS, "RETRAINWEIGHTS_"),
        (WITH_ITEM_FEATS, "WITHITEMFEATS_"),
        (CALC_WEIGHTS, "CALCWEIGHTS"),
    )
    return prefix + "".join(str(value) + tag for value, tag in tagged_settings)

In [None]:
#MAIN!!
#testing biased. w/item feats, calc weights, no retrain or retrain weights
# The three lists are zipped below, so position k of each list describes run k:
#   run 0 -> BIASED=True,  WITH_ITEM_FEATS=False, CALC_WEIGHTS=False
#   run 1 -> BIASED=False, WITH_ITEM_FEATS=False, CALC_WEIGHTS=True
# NOTE: completeTest is defined in a later cell ("Scraps"); that cell has to be run before this one.
biases = [True,False]
item_feats = [False,False]
calc_weights = [False,True]
train_percentage = 0.08 #0.8 of a 0.1% scan equivalent


for b,i,c in zip(biases,item_feats,calc_weights):
    model_description = constructDescript(prefix = "1p_", BIASED = b, WITH_ITEM_FEATS=i,CALC_WEIGHTS=c,\
                                           RETRAIN = False,RETRAIN_WEIGHTS=False)
    print(model_description)
    completeTest("ht65k_1pset.iconv",train_percentage, model_description,BIASED = b, WITH_ITEM_FEATS=i,CALC_WEIGHTS=c,\
                                           RETRAIN = False,RETRAIN_WEIGHTS=False)
    # Collect between runs; each run builds its own model and matrices.
    gc.collect()

## Bench 2 Implementation: Coverage

In [None]:
def bench2_Improvements(model,predictions,hist_preds,num_services,num_test_ips,\
                        test,testw,test_rows,train,user_features,\
                        item_features, NUM_THREADS,RETRAIN,RETRAIN_GAP, RETRAIN_ALL,RETRAIN_EPOCHS, BIASED,LIM,HIST_BINS, \
                        CYCLES,RETRAIN_WEIGHTS, CALC_WEIGHTS, WITH_ITEM_FEATS,EF_CONSTRUCTION,model_description):
    """Run the coverage benchmark, save its outputs and draw the four summary plots.

    All benchmark arguments are forwarded unchanged to calcCoverage_optimized. Two .npy
    files are written under DATAPATH, both prefixed with ``model_description``: the
    results dict ("_coverageB2.npy") and the correct-prediction matrix
    ("_correctpredictions.npy").

    Returns (bench2_results, predictions, all_correctly_predicted).
    """
    print("-------")
    print("...Beginning Bench2: Compare Different Metrics of Coverage and Hitrate")

    # (The earlier, non-optimized calcCoverage call that used to sit here is superseded.)
    coverage_outcome = calcCoverage_optimized(
        model, predictions, hist_preds, num_services, num_test_ips,
        test, testw, test_rows, train, user_features, item_features, NUM_THREADS,
        RETRAIN, RETRAIN_GAP, RETRAIN_ALL, RETRAIN_EPOCHS, BIASED, LIM, HIST_BINS, CYCLES, RETRAIN_WEIGHTS,
        CALC_WEIGHTS, WITH_ITEM_FEATS, EF_CONSTRUCTION)
    bench2_results, predictions, all_correctly_predicted = coverage_outcome
    print("...calculations complete")

    output_prefix = DATAPATH + model_description
    np.save(output_prefix + "_coverageB2.npy", bench2_results)
    np.save(output_prefix + "_correctpredictions.npy", all_correctly_predicted)

    plotDiscovery(DATAPATH, bench2_results["frac_services"], bench2_results["frac_ips"])
    plotHitrate(bench2_results["hitrate"], LIM)
    plotNormServiceDiscovery(bench2_results["scanned_num"], bench2_results["normed_services"], LIM)
    plotGradientNormServiceDiscovery(bench2_results["scanned_num"], bench2_results["normed_services"], LIM)

    print("...Finished Bench2")
    print("-------")
    return bench2_results, predictions, all_correctly_predicted

In [None]:
#https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix
def top_n_idx_sparse(matrix, n):
    '''Return the column indices of the top n stored values in each row of a CSR matrix.

    matrix : CSR matrix (its indptr / indices / data arrays are read directly).
    n      : number of entries wanted per row.

    Returns a list with one index array per row. A row with fewer than n stored entries
    contributes all of them; an empty row, or n <= 0, contributes an empty array. The
    order of the indices within a row is unspecified.
    '''
    top_n_idx = []
    for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        n_row_pick = min(n, ri - le)
        if n_row_pick <= 0:
            # Guard the "-0" pitfall: arr[-0:] is the whole array, not an empty one.
            top_n_idx.append(matrix.indices[le:le])
            continue
        row_top = np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]
        top_n_idx.append(matrix.indices[le + row_top])
    return top_n_idx


#OPTIMIZED
def calcCoverage_optimized(model,predictions,hist_preds,num_services,num_test_ips,\
                 test, testw,test_rows,train,user_features, item_features, NUM_THREADS,RETRAIN,\
                 RETRAIN_GAP, RETRAIN_ALL,RETRAIN_EPOCHS, BIASED,LIM,HIST_BINS,\
                 CYCLES,RETRAIN_WEIGHTS,CALC_WEIGHTS,WITH_ITEM_FEATS,EF_CONSTRUCTION):
    """Iteratively "scan" the top-ranked ports per test IP and track coverage metrics.

    Each cycle picks RETRAIN_GAP ports per IP (getRekd_v3Optimized), folds the correct
    picks into the running training matrix, scores it (evalSuccess) and, when RETRAIN is
    set, partially refits the model and recomputes the predictions. The cycle counter
    advances by RETRAIN_GAP until it reaches CYCLES.

    Retraining options
    ------------------
    RETRAIN_ALL     : refit on the accumulated matrix instead of only this cycle's hits.
    RETRAIN_WEIGHTS : scale the refit interactions by 10.
    CALC_WEIGHTS    : use ``training * testw`` (element-wise) as sample weights instead
                      of the interaction values themselves.

    Returns
    -------
    (bench2_results, predictions, all_correctly_predicted); bench2_results maps
    "num_unique_rec", "frac_services", "frac_ips", "hitrate", "normed_services" and
    "scanned_num" to per-cycle lists. all_correctly_predicted is None if no cycle ran.

    Relies on the notebook globals ``dmap``, ``num_ips_per_port`` and ``NUM_PREDS``.

    Raises
    ------
    ValueError if cycles are requested with a non-positive RETRAIN_GAP (the loop counter
    would never advance).
    """
    if CYCLES > 0 and RETRAIN_GAP <= 0:
        raise ValueError("RETRAIN_GAP must be a positive integer")

    bench2_results = {}
    portsScanned = {}
    normed_services = []
    frac_services = []
    frac_ips = []
    hitrate = []
    scanned_num =  []
    num_unique_rec = []
    pred_all_past = None
    start_choices = 1
    newtrain = train.tocoo()
    # Bound up front so the return statement is valid even when the loop body never runs.
    all_correctly_predicted = None

    i = 0
    while i < CYCLES:
        print("cycle: ",i)

        #TODO:figure out how to handle this
        #if RETRAIN:

        pred_all_past,  pred_correct, newtrain,minitrain,cur_hitrate,correct, \
        portsScanned,start_choices,scanned_num,predictions = \
        getRekd_v3Optimized(model,portsScanned, predictions, hist_preds, test,test_rows,\
                   newtrain,start_choices,scanned_num,RETRAIN_GAP=RETRAIN_GAP,n=i,LIM=LIM,\
                            HIST_BINS=HIST_BINS,pred_past=pred_all_past)

        print("At Bin: ", i)
        print("Hitrate: ", cur_hitrate)
        i = i + RETRAIN_GAP

        # newtrain = original train + every correct pick so far; evalSuccess intersects
        # it with the test matrix.
        all_correctly_predicted, norm_services, frac_service, frac_ip, total_ips_correct \
        = evalSuccess(newtrain,test,dmap,num_ips_per_port, num_services,num_test_ips)

        normed_services.append(norm_services)
        frac_services.append(frac_service)
        frac_ips.append(frac_ip)
        hitrate.append(cur_hitrate)
        num_unique_rec.append(len(portsScanned))

        if RETRAIN:
            training = newtrain if RETRAIN_ALL else minitrain

            if RETRAIN_WEIGHTS:
                # multiply() returns a new matrix; the result has to be kept, otherwise
                # the flag has no effect. Only the local name is rebound, newtrain is untouched.
                training = training.multiply(10)

            if CALC_WEIGHTS:
                trainingw = training.multiply(testw).tocoo()
            else:
                trainingw = training.tocoo()

            # item_features=None is fit_partial's own default, so one call covers both cases.
            model = model.fit_partial(training,
                sample_weight = trainingw, #what to do about the weight???
                user_features=user_features,
                item_features=item_features if WITH_ITEM_FEATS else None,
                epochs=RETRAIN_EPOCHS,
                num_threads=NUM_THREADS, verbose=True)

            print("...Grabbed Representations")
            _,predictions = \
            getNewPredictions(model,user_features,test_rows,item_features,BIASED,\
                              WITH_ITEM_FEATS,NUM_PREDS=NUM_PREDS, OLD=False,\
                              EF_CONSTRUCTION=EF_CONSTRUCTION)

            print("...Calculated Predictions")

    bench2_results["num_unique_rec"] = num_unique_rec
    bench2_results["frac_services"] = frac_services
    bench2_results["frac_ips"] = frac_ips
    bench2_results["hitrate"] = hitrate
    bench2_results["normed_services"] = normed_services
    bench2_results["scanned_num"] = scanned_num

    return bench2_results,predictions, all_correctly_predicted
    
def getRekd_v3Optimized(model, res, predictions, hist_preds, test,test_rows,train,start_choices, \
               scanned_num = None,RETRAIN_GAP=1, n=0,START=0,LIM=1000,\
            HIST_BINS=100,NUM_THREADS=NUM_THREADS,pred_past = None):
    """Pick the RETRAIN_GAP best-ranked ports per row and check them against ``test``.

    Parameters
    ----------
    res : dict port -> number of times the port has been chosen; updated in place.
    predictions : sparse matrix (shape of ``test``) where LOWER stored values rank higher
        (the matrix is negated before the top-n selection).
    test, train : sparse interaction matrices.
    scanned_num : list that receives the number of cells chosen this round; a new list
        is created when omitted.
    RETRAIN_GAP : number of ports chosen per row.
    n : cycle counter; from the second cycle on (n > 0) the cells in ``pred_past`` are
        penalised by +10 so they rank last.
    pred_past : sparse matrix of cells chosen in earlier cycles, or None.
    model, hist_preds, test_rows, START, LIM, HIST_BINS, NUM_THREADS : accepted for
        signature compatibility; not used by the selection below.

    Returns
    -------
    (pred_all, pred_correct_coo, newtrain, minitrain, hitrate, correct, res,
     start_choices, scanned_num, predictions)

    Reads the notebook global ``inv_port_map`` to translate column indices to ports.
    """
    if scanned_num is None:
        # A fresh list per call; a list literal as default would be shared across calls.
        scanned_num = []

    testcoo = test.tocoo()
    traincoo = train.tocoo()

    # need this to make sure old predictions dont show up again
    # even when predictions have been updated
    if n > 0 and pred_past is not None:
        predictions = predictions + pred_past.multiply(10)

    #get top per row
    picked_per_row = top_n_idx_sparse(predictions.tocsr().multiply(-1), RETRAIN_GAP)
    chosen_indexes = np.concatenate(picked_per_row)

    # Row id of every pick, derived from what each matrix row actually yielded. This
    # stays aligned with chosen_indexes even when a row has fewer than RETRAIN_GAP
    # candidates left (np.repeat(test_rows, RETRAIN_GAP) would then have the wrong length).
    chosen_test_rows = np.repeat(np.arange(len(picked_per_row)),
                                 [len(picks) for picks in picked_per_row])

    scanned_num.append(len(chosen_indexes))
    #make matrix of predictions
    pred_all = coo_matrix((np.ones(len(chosen_indexes), dtype=int),
                           (chosen_test_rows, chosen_indexes)), shape=test.shape)

    # filter for only the correctly predicted this time
    pred_correct_coo = testcoo.multiply(pred_all)

    #find how many were correct
    correct = pred_correct_coo.count_nonzero()

    #log chosen port (one dict update per distinct port instead of one per pick)
    port_indexes, port_counts = np.unique(chosen_indexes, return_counts=True)
    for port_index, port_count in zip(port_indexes, port_counts):
        chosen_port = inv_port_map[port_index]
        res[chosen_port] = res.get(chosen_port, 0) + int(port_count)

    print("correct:",correct)
    print("Num Scanned: ",len(chosen_indexes) )
    hitrate = correct/len(chosen_indexes)
    print("Number unique ports Rec'd: ", len(res))

    newtrain = pred_correct_coo + traincoo #+ pred_false_coo
    minitrain = pred_correct_coo

    # Accumulate everything chosen so far for the next cycle's penalty.
    if pred_past is not None:
        pred_all = pred_all + pred_past

    return pred_all,  pred_correct_coo, newtrain,minitrain,hitrate,correct,\
        res,start_choices, scanned_num,predictions

In [None]:

def getMppHitRate(num_ips_per_port,LIM):
    """Return the stored per-port counts sorted in descending order and divided by LIM.

    num_ips_per_port : sparse matrix; only its ``.data`` array is read.
    LIM              : divisor applied to every count.

    The input is left untouched: sorting ``.data`` in place would detach the values from
    their row/column indices and corrupt the matrix for every later user.
    """
    return np.sort(num_ips_per_port.data)[::-1] / LIM
    


def hist_along_axis(all_args):
    """Histogram one array; takes a single ``(values, bins)`` tuple so it can be used with Pool.map.

    Returns only the bin counts (the bin edges are discarded).
    """
    values, bins = all_args
    counts = np.histogram(values, bins)[0]
    return np.array(counts)

def hist_scores_per_user(arr,bins):
    """Histogram every element of ``arr`` in parallel.

    arr  : iterable of 1-D arrays (e.g. the rows of a 2-D array).
    bins : bin specification forwarded to np.histogram for every element.

    Returns a list with one array of bin counts per element, in input order.
    """
    # One work item per element of arr.
    chunks = [(sub_arr, bins) for sub_arr in arr]

    pool = multiprocessing.Pool()
    try:
        individual_results = pool.map(hist_along_axis, chunks)
    finally:
        # Freeing the workers, also when map() raises:
        pool.close()
        pool.join()
    return individual_results


def unpack_index(all_args):
    """Return the positions of the ``i`` largest values of ``arr`` (unordered).

    Takes a single ``(arr, i)`` tuple so it can be used with Pool.map; ``i == 0`` yields
    an empty list.
    """
    arr, i = all_args
    return [] if i == 0 else np.argpartition(arr, -i)[-i:]


def parallel_maxy(arr,elems):
    """
    Apply unpack_index to every (row, count) pair in parallel: for the k-th element of
    ``arr``, return the positions of its ``elems[k]`` largest values.

    Returns a list with one result per pair, in input order (pairing stops at the
    shorter of ``arr`` and ``elems``).
    """

    # One work item per (row, count) pair.
    chunks = list(zip(arr, elems))

    pool = multiprocessing.Pool()
    try:
        individual_results = pool.map(unpack_index, chunks)
    finally:
        # Freeing the workers, also when map() raises:
        pool.close()
        pool.join()
    return individual_results



##PLOTTING MODULES##

def plotDiscovery(DATAPATH,frac_services, frac_ips):
    """Plot the fraction of IPs/services discovered per scan: ML curves vs. the curves from CSV.

    DATAPATH      : directory prefix holding lzr01_host_discovery.csv and
                    lzr01_service_discovery.csv (each with a "rolling_sum" column).
    frac_services : per-cycle fraction of services found by the model.
    frac_ips      : per-cycle fraction of IPs found by the model.

    Every curve is drawn with a leading 0 so that position k on the x axis means
    "after k scans" for all four lines.
    """

    def _from_origin(values):
        # `[0] + values` only prepends for Python lists; for a pandas Series or ndarray
        # it is an element-wise add, which left the CSV curves one scan to the left of
        # the ML curves. Concatenating prepends for every input type.
        return np.concatenate(([0], np.asarray(values)))

    df = pd.read_csv(DATAPATH+"lzr01_host_discovery.csv")
    plt.plot(_from_origin(df["rolling_sum"]),label="ips - mpp (lzr 65k)")

    df = pd.read_csv(DATAPATH+"lzr01_service_discovery.csv")
    plt.plot(_from_origin(df["rolling_sum"]),label="services - mpp (lzr 65k)")

    plt.plot(_from_origin(frac_services),label="services - ML")
    plt.plot(_from_origin(frac_ips),label="ips -ML")
    plt.ylabel("Fraction of IPs/Services Discovered")
    plt.xlabel("Number of Scans")
    # x range follows the service-discovery CSV (the last frame read).
    plt.xlim((1,len(df["rolling_sum"])))
    plt.title("IPv4 Discovery-Dynamic Choice")
    plt.xscale('log')
    #plt.xlim((-1,20))
    plt.legend()
    plt.show()
    
def plotHitrate(hitrate,LIM):
    """Plot the model's per-cycle hit rate against the per-port baseline from getMppHitRate.

    hitrate : per-cycle hit rates of the model.
    LIM     : divisor forwarded to getMppHitRate.

    Reads the notebook global ``num_ips_per_port``. Both curves are drawn with a leading
    0 so that position k on the x axis means "after k scans" for each of them.
    """

    mpp_hitrate =  getMppHitRate(num_ips_per_port,LIM)
    #df = pd.read_csv(DATAPATH+"lzr001_hitrate.csv")
    # np.concatenate prepends the 0 for arrays as well as lists; `[0] + ndarray` is an
    # element-wise add and left the baseline shifted by one scan.
    plt.plot(np.concatenate(([0], np.asarray(mpp_hitrate))),label="hitrate- mpp, (lzr 65k)")
    plt.plot(np.concatenate(([0], np.asarray(hitrate))),label="hitrate- ML")
    plt.legend()
    plt.xlim((1,len(mpp_hitrate)))
    plt.xscale('log')
    plt.xlabel("Number of Scans")
    plt.ylabel("Hitrate %")
    plt.title("Bandwidth Saved")

    plt.show()

def plotNormServiceDiscovery(scanned_num,normed_services,LIM):
    """Plot normalized services discovered against the cumulative value of scanned_num / LIM.

    The "mpp" reference line is 1, 2, 3, ... up to the integer part of the largest
    cumulative x value.
    """
    scans_done = np.cumsum(np.array(scanned_num)/LIM)
    reference = np.cumsum([1]*int(max(scans_done)))

    plt.plot(reference,label="mpp, (lzr 65k)")
    plt.plot(scans_done,normed_services,label="ML")
    plt.legend()
    plt.xlabel("Number of Scans")
    plt.ylabel("# Normalized")
    plt.title("Normalized Services Discovered")
    plt.show()
    
def plotGradientNormServiceDiscovery(scanned_num,normed_services,LIM):
    """Plot the gradient of normalized services with respect to cumulative scanned_num / LIM.

    The "mpp" reference line is a constant 1 with as many points as the integer part of
    the largest cumulative x value.
    """
    scans_done = np.cumsum(np.array(scanned_num)/LIM)
    reference = [1]*int(max(scans_done))
    gain_per_scan = np.gradient(normed_services,scans_done)

    plt.plot(reference,label="mpp, (lzr 65k)")
    plt.plot(scans_done,gain_per_scan,label="ML")
    plt.legend()
    plt.xlabel("Number of Scans")
    plt.ylabel("# Normalized Per Scan")
    plt.title("Derivative of Normalized Services Discovered")
    plt.show()

## Play with Model

In [None]:
# Show the asn / port columns of every df row belonging to the IP that inv_ip_map gives
# for index 0, then print dmap[2][...] for each of that IP's "p" values.
print(df[df["ip"] == inv_ip_map[0]][["asn","p","p1","p2","p3"]])
for i in list(df[df["ip"] == inv_ip_map[0]]["p"]):
    print(dmap[2][i])
#print(df)

In [None]:
# Look up the dmap[0] entry for one IP and print that row of the test matrix.
ipc = dmap[0]["181.138.17.42"]

print(test[ipc])

In [None]:
# Stored values of the same row in `predictions`; needs `ipc` from the previous cell.
print(predictions[ipc].data)

In [None]:
#retrain with mini

def trick(hybrid_model, port, weight, user_ip="2.59.42.8"):
    """Nudge the model with one weighted (ip, port) interaction via a single fit_partial epoch.

    hybrid_model : fitted model exposing ``fit_partial``.
    port         : port number, translated to a column index through ``dmap[2]``.
    weight       : sample weight given to the interaction.
    user_ip      : IP whose row receives the interaction (defaults to the IP this
                   experiment was written for), translated through ``dmap[0]``.

    Returns the object returned by ``fit_partial``.
    Reads the notebook globals ``dmap``, ``train`` (matrix shape), ``user_features`` and
    ``NUM_THREADS``.
    """
    user = dmap[0][user_ip]
    port = [dmap[2][port] ] #8894
    rows = [user]*len(port)

    #make matrix of predictions
    mini_choice = coo_matrix(([1]*len(port),(rows,port)),shape=train.shape)
    weights =  coo_matrix(([weight]*len(port),(rows,port)),shape=train.shape)

    hybrid_model = hybrid_model.fit_partial(mini_choice, #newtrain
           sample_weight = weights,
          user_features=user_features,
          #item_features=item_features,
          epochs=1,
          num_threads=NUM_THREADS, verbose=True)
    return hybrid_model

# One partial-fit step on port 5223 with sample weight 10.
trick(hybrid_model,5223,10)

In [None]:
# Manual inspection of the scores of one user.
# USERS holds row indices into user_features; PORTS holds item (column) indices.
USERS = [0]
PORTS = [37707,37717,37699,533,386,1271,0,23290]
#print(PORTS)
NUM_PREDICT = 20

user_biases, user_embeddings = hybrid_model.get_user_representations(user_features[USERS])
item_biases, item_embeddings = hybrid_model.get_item_representations()#item_features)

print(user_biases)

print("User Embeddings: ")
print(user_embeddings)
# Raw dot products (no biases) for the selected items.
print(user_embeddings.dot(item_embeddings.T)[0][PORTS])

# Full score: dot product + item bias + user bias.
# NOTE: this rebinds the notebook-level name `predictions` to a dense array.
predictions = (
    user_embeddings.dot(item_embeddings.T) 
    +item_biases.reshape(1, -1) + user_biases.reshape(-1, 1)
) 

print("Predictions:")

print(predictions[0][PORTS])
print("----")
# The NUM_PREDICT highest-scoring items, best first, translated through inv_port_map.
recd = predictions[0].argsort()[::-1][:NUM_PREDICT]
for pid  in recd:
    print(inv_port_map[pid])

print("Item Biases: ")
print(item_biases[PORTS])
print(max(item_biases))

print("----")
print("scores subtracted against item biased")
# Same ranking with the item bias left out (the user bias is still added).
unbiased = user_embeddings.dot(item_embeddings.T)+ user_biases.reshape(-1, 1) 
print(unbiased[0][PORTS])
unb_recd = unbiased[0].argsort()[::-1][:NUM_PREDICT]
for pid  in unb_recd:
    print(inv_port_map[pid])

In [None]:
def similar_items(item_id, item_features, model, N=10):
    """Return the N items most cosine-similar to ``item_id`` as (item_index, similarity) pairs.

    The pairs are ordered from most to least similar. ``item_features`` is accepted but
    not passed to the model. The chosen indices are printed before returning.
    """
    _, item_vectors = model.get_item_representations()#item_features)
    vector_lengths = np.linalg.norm(item_vectors, axis=1)

    # Cosine similarity: dot product with the anchor, divided by each item's length here
    # and by the anchor's length when the pairs are built.
    anchor = item_vectors[item_id, :]
    scores = item_vectors.dot(anchor)
    scores /= vector_lengths

    best = np.argpartition(scores, -N)[-N:]
    print(best)

    ranked = list(zip(best, scores[best] / vector_lengths[item_id]))
    ranked.sort(key=lambda pair: -pair[1])
    return ranked


# Items most similar to port 8893 (default N=10), printed as "<port> <similarity>".
port_id = dmap[2][8893]
items = similar_items(port_id,item_features,hybrid_model)

for pid, sim in items:
    print(inv_port_map[pid], sim)

# Analysis (Deep Dive)

In [None]:
def convert(coo):
    """Turn the stored cells of a sparse matrix into a DataFrame with 'ip' and 'port' columns.

    Row indices are translated through the notebook global ``inv_ip_map`` and column
    indices through ``inv_port_map``; one output row per stored entry.
    """
    entries = coo.tocoo(copy=False)
    df_pred = pd.DataFrame({'ip': entries.row, 'port': entries.col})
    for column, lookup in (("ip", inv_ip_map), ("port", inv_port_map)):
        df_pred[column] = df_pred[column].apply(lambda i, lookup=lookup: lookup[i])
    return df_pred

#all_correctly_predicted = (newtrain - train)
#testcoo = test.tocoo()
#test_rows = list(set(testcoo.row))[START:LIM]

#df_pred = convert(pred_all_past)
#df_hit = convert(train)
#df_hit.to_csv(DATAPATH + "lzr1p_analysis/slash16ReTrainTrain.csv")
#df_missed = convert(test - all_correctly_predicted)

In [None]:
# Distinct IPs, then total (ip, port) rows, in the hit and missed frames.
# NOTE(review): in the visible notebook `df_missed` is only assigned in a commented-out line
# of the `convert` cell -- it has to be created before this cell can run.
print(pysqldf("SELECT COUNT(distinct ip) FROM df_hit"))
print(pysqldf("SELECT COUNT(distinct ip) FROM df_missed"))

print(pysqldf("SELECT COUNT(*) FROM df_hit"))
print(pysqldf("SELECT COUNT(*) FROM df_missed"))

In [None]:
# Display every df row whose "p" column equals 37443.
df[df["p"]==37443]

In [None]:
def ecdf(a):
    """Draw the empirical CDF of ``a`` as a step plot on a log-scaled x axis.

    The curve starts at height 0 at the smallest value and steps up at each distinct value.
    """
    distinct, multiplicity = np.unique(a, return_counts=True)
    running_total = np.cumsum(multiplicity)

    # Duplicate the first x and start y at 0 so the first step is visible.
    xs = np.insert(distinct, 0, distinct[0])
    ys = np.insert(running_total/running_total[-1], 0, 0.)

    plt.plot(xs, ys, drawstyle='steps-post')
    plt.xscale('log')

    
#For the IPs we missed on port 993 (the port filtered in the query below): on how many
#distinct ports did we hit each of them? IPs without any hit get c = 0 through the LEFT OUTER JOIN.
num_missed = pysqldf(" SELECT t1.ip, IFNULL(t2.c, 0) c FROM  \
(SELECT distinct ip FROM df_missed where port = 993) as t1 \
LEFT OUTER JOIN \
(SELECT ip, COUNT(distinct port)c FROM df_hit GROUP BY ip) as t2 \
ON t1.ip = t2.ip ")



print(num_missed)
ecdf(num_missed["c"])#.hist(cumulative=True,histtype='step')

In [None]:
#top ports we hit and top ports we miss

#The 20 ports with the most distinct missed IPs ...
num_missed = pysqldf("SELECT port, COUNT(distinct ip)c FROM df_missed GROUP BY port \
ORDER BY c DESC \
LIMIT 20 ")
print(num_missed)

#... and the 20 ports with the most distinct hit IPs.
num_hit = pysqldf("SELECT port, COUNT(distinct ip)c FROM df_hit GROUP BY port \
ORDER BY c DESC \
LIMIT 20")
print(num_hit)

In [None]:
# Number of distinct ASNs in df.
print( pysqldf("SELECT COUNT(distinct asn) FROM df"))


# Feature Engineering
https://stackoverflow.com/questions/37037450/multi-label-feature-selection-using-sklearn

Check this out for using latent features in lightfm<br>
https://github.com/lyst/lightfm/issues/486

In [None]:
def find_pops(ports,df):
    """Chi-squared score of every feature column against "row is on port p", for each p.

    ports : iterable of port values compared against df["p"].
    df    : DataFrame with an "ip" column, a "p" column and the feature columns.

    Returns a list with one entry per port: the list of chi2 scores of the feature
    columns (all columns except "ip" and "p", in DataFrame order).
    """
    # The feature matrix does not depend on the port, so build it once.
    features = df.drop(["ip", "p"], axis=1)
    port_column = df["p"]

    selected_features = []
    for p in ports:
        selector = SelectKBest(chi2, k='all')
        selector.fit(features, port_column == p)
        selected_features.append(list(selector.scores_))

    return selected_features
# NOTE: this cell rebinds the notebook-level names `f` and `df`.
f = "ht65k_miniset.csv"
df = pd.read_csv(DATAPATH+f)
df = df.drop(["minidata"],axis=1)
cols = ['slash20', 'slash19', 'slash18', 'slash17', 'slash16',\
        's1', 's2', 's3', 's4', 'asn'] #,'minidata']
    
# Replace each listed column by its integer category codes.
df[cols] = df[cols].astype('category')
df[cols] = df[cols].apply(lambda x: x.cat.codes)

pop_ports = [80,443,7547,22,30005,21,8080,5060,4567,25,3306,110,3389,143,8089,8443,8008,587,8085,2000]

# Up to 50 distinct ports drawn from a seeded sample of 100 rows.
rand_ports = list(set(df['p'].sample(n=100, random_state=1)))[:50]

print(df.columns) 

# Mean chi2 score per feature, averaged over the ports of each group.
print('popular ports')
sf = find_pops(pop_ports, df)
selected_features = np.mean(sf, axis=0)
print(selected_features)


print('random ports')
sf = find_pops(rand_ports, df)
selected_features = np.mean(sf, axis=0)
print(selected_features)
   


# Scraps

In [None]:
def completeTest(f,tp,model_description, LR=0.01,N_COMP=4000,MAX_SAMPLE=175,EPOCHS=10,BIASED=False,\
                 RETRAIN=True,RETRAIN_WEIGHTS=False,WITH_ITEM_FEATS=False,CALC_WEIGHTS=False):
    """Load dataset ``f``, fit a LightFM model, pickle it and run the Bench 2 coverage benchmark.

    f                 : data file name handed to extractData.
    tp                : train_percentage handed to extractData.
    model_description : prefix of the pickle written under DATAPATH.
    Remaining keywords configure the model (LR, N_COMP, MAX_SAMPLE, EPOCHS) and the benchmark.

    NOTE(review): this cell sits under "Scraps" and no longer matches the helper
    definitions elsewhere in the notebook -- see the NOTE(review) comments below.
    """

    gc.collect()

    # These names are published as notebook globals (calcImprov reads them from there).
    global test,pred_ranks,num_ips_per_port,test_rows,testcoo,inv_port_map
    #f = "ht65k_miniset.iconv" # removed null byte! sed 's/\x0//g' ht65k_1pset.csv > ht65k_1pset.iconv
    #f = "ht65k_1pset.iconv"
    
    NUM_THREADS=72
    
    #Get the data
    # NOTE(review): the training cell unpacks one more value (userf_train) from extractData
    # than this call does -- confirm which return shape is current.
    df, dataset, num_users, num_items, interactions, weights, \
    user_features,item_features, train,test,trainw,testw,dmap, inv_ip_map,\
    inv_port_map = extractData(f, train_percentage = tp, withItemFeats=WITH_ITEM_FEATS,calcWeights= CALC_WEIGHTS)


    # Make the model
    hybrid_model = LightFM(loss='warp', #'warp
                    random_state=2016,
                    learning_rate=LR, #0.05
                    no_components=N_COMP,
                    #learning_schedule='adadelta',
                    max_sampled= MAX_SAMPLE, #50,
                    #user_alpha=0.5   
                          )

    # Item features are only passed to fit() when WITH_ITEM_FEATS is set.
    if WITH_ITEM_FEATS:
        hybrid_model = hybrid_model.fit(train, 
                      user_features=user_features,
                      item_features=item_features,
                      sample_weight = trainw,
                      epochs=EPOCHS, #10
                      num_threads=NUM_THREADS, verbose=True)
    else:
        hybrid_model = hybrid_model.fit(train,
                      user_features=user_features,
                      sample_weight = trainw,
                      epochs=EPOCHS, #10
                      num_threads=NUM_THREADS, verbose=True)


        
    # Persist the fitted model next to the data files.
    with open(DATAPATH+model_description+'_model.pickle', 'wb') as fle:
        pickle.dump(hybrid_model, fle, protocol=pickle.HIGHEST_PROTOCOL)
        
    #CALL THE BENCHES
    HIST_BINS = 100
    START = 0
    LIM = 200000  
    BENCH1_PORTS = 100
    BENCH2_CYCLES =25


    # NOTE(review): prepTheModel as defined in this notebook returns 10 values (ending in
    # hist_preds, LIM); unpacking it into these 13 names raises ValueError.
    num_test_ips, num_services,num_ips_per_port, test_rows, testcoo,\
        item_biases, item_embeddings, user_biases, user_embeddings, predictions, \
        pred_ranks, testPorts, hist_preds = \
        prepTheModel(hybrid_model,test,train,LIM,START,HIST_BINS,user_features,\
                     item_features,BIASED = BIASED,WITH_ITEM_FEATS=WITH_ITEM_FEATS)
 
   

    
    # Seeded choice of BENCH1_PORTS distinct positions in testPorts (only used by the
    # disabled Bench 1 call below).
    np.random.seed(7)
    randomPorts = np.random.choice(range(0,len(testPorts)), BENCH1_PORTS,replace=False)

    TARGET_FRAC = 0.9
    """ 
    bench1_Imrov_results = bench1_Improvements(np.array(testPorts)[randomPorts],TARGET_FRAC, LIM,\
                                              model_description, DATAPATH,\
                                              test,pred_ranks,num_ips_per_port,test_rows,testcoo,inv_port_map)
    """

    
    # NOTE(review): this call omits testw, test_rows, RETRAIN_GAP, RETRAIN_ALL, RETRAIN_EPOCHS,
    # CALC_WEIGHTS and EF_CONSTRUCTION, all of which the bench2_Improvements defined in this
    # notebook requires positionally -- it raises TypeError as written.
    bench2_Imrov_results = bench2_Improvements(hybrid_model,predictions,hist_preds,num_services,num_test_ips,\
                                               test,train,user_features,item_features, \
                                               NUM_THREADS,RETRAIN,BIASED, LIM,HIST_BINS, BENCH2_CYCLES,\
                                               RETRAIN_WEIGHTS,WITH_ITEM_FEATS,model_description)

        

## Bench 1 Implementation:  Top Users Per Port to predict hitrate increase per port

In [None]:
def plotImpr(r, label = ""):
    """Plot the CDF of the per-port "improvement" values in ``r``, floored at 1.

    r     : mapping port -> dict containing an "improvement" entry.
    label : legend label forwarded to make_cdf.
    """
    improvements = [
        1 if r[port]["improvement"] < 1 else r[port]["improvement"]
        for port in r
    ]
    make_cdf(improvements, label)

    
def plotHitrates(DATAPATH, npy_fs, TARGET_FRAC):
    """Overlay the improvement CDFs stored in several .npy result files.

    DATAPATH    : directory prefix of the files.
    npy_fs      : file names; each file holds a pickled dict accepted by plotImpr and is
                  labelled with its file name.
    TARGET_FRAC : coverage target, only used in the plot title.
    """
    for fname in npy_fs:
        stored_results = np.load(DATAPATH + fname, allow_pickle=True).item()
        plotImpr(stored_results, fname)

    #plt.legend()
    plt.xscale('log')
    plt.xlabel("Hitrate Ratio Improvement")
    plt.ylabel("Fraction of Ports")
    title_text = "Hitrate(ML) / Hitrate(Random) to achieve "+ str(TARGET_FRAC)+" coverage"
    plt.title(title_text)
    plt.show()

def initializer():
    """Declare the shared benchmark names as globals; the body performs no other work."""
    global test,pred_ranks,num_ips_per_port,test_rows,testcoo,inv_port_map
    
# Predict the hitrate increase per port compared to randomly probing.
def calcImprov(all_args):
    """Measure how much better ranked probing of one port is than a baseline hitrate.

    Args:
        all_args: tuple ``(TPORT, TARGET_FRAC, LIM)`` -- the port's column
            index, the fraction of the port's test entries that must be found,
            and the divisor used for the baseline hitrate.

    Returns:
        ``{}`` when the port has no stored test entries among ``test_rows`` or
        when the target fraction is never reached at a checkpoint; otherwise a
        one-entry dict keyed by ``inv_port_map[TPORT]`` holding rank, scanned,
        correct, hitrate, fracFound, improvement, target_total and target_hit.

    Reads the module-level names inv_port_map, test, test_rows,
    num_ips_per_port, pred_ranks and testcoo (none are passed in).
    """
    (TPORT, TARGET_FRAC,LIM) = all_args
    
    res = {}
    p = inv_port_map[TPORT]

    # Number of stored test entries for this port among the evaluated rows.
    target_total = len(test[test_rows,TPORT].data)
    if target_total == 0:
        return res
    # Baseline: first stored value of this port's column divided by LIM.
    # presumably "IPs with this port / IPs probed", i.e. the random-probing hitrate -- TODO confirm
    target_hit = num_ips_per_port.tocsr()[:,TPORT].data[0]/ LIM
    #print("target port is: ", p)
    #print("Target Total is: ", target_total)
    #print("Target Hitrate is: ", target_hit)
    
    #all users where ranking = 1


    scanned = []
    old_proposed = 0
    
    # Walk the rank positions in order; at each rank, add every row whose
    # entry at that rank equals TPORT to the scanned set.
    # assumes pred_ranks has at least 65535 columns -- TODO confirm
    for rank in range(65535):
        scores = np.array(list(pred_ranks[:,rank]))
        ii = np.where(scores == TPORT)[0]
        if len(ii) == 0:
            continue
        # assumes test_rows is a numpy array (it is indexed with an index array) -- TODO confirm
        actual_test_rows = test_rows[ii]
        scanned.extend(actual_test_rows)
        
        # Only start checking correctness once enough rows were scanned that
        # the target fraction could possibly have been reached.
        if len(scanned) >= TARGET_FRAC*target_total:
        #if True:
        
            # Re-evaluate only after more than 10000 newly scanned rows since
            # the previous evaluation (the matrix work below is expensive).
            # NOTE(review): a final batch of <= 10000 rows is never evaluated,
            # so a port can end with an empty result even if it reached the target.
            if len(scanned) - old_proposed > 10000:
                old_proposed = len(scanned)
            else:
                continue

            num_users = len(scanned)
            chosen_indexes = [TPORT]* num_users

            # Indicator matrix with a 1 at (row, TPORT) for every scanned row.
            pred_all = coo_matrix(([1]*num_users,(scanned, chosen_indexes)),shape=test.shape)

            # filter for only the correctly predicted this time
            #pred_correct_coo = testcoo.multiply(pred_all)

            combo = testcoo+pred_all
            # Sum of (value - 1) over the stored entries of the summed matrix.
            # assumes test entries are all 1, so only overlapping cells (value 2) contribute -- TODO confirm
            #correct = list(combo.data).count(2)
            correct = sum(np.array(combo.data)-1)
            fracFound = correct/target_total
            hitrate = correct/num_users
            improvement = hitrate/target_hit
            
            print("Improvement: ", improvement)
            print("At rank: ",rank)
            print("Scanned: ", len(scanned))
            print("Correct: ", correct)
            print("Hitrate: ",hitrate)
            print("FracFound: ",fracFound)

            
            # Stop at the first checkpoint where the coverage target is met.
            if fracFound >= TARGET_FRAC:
                
                res[p] = {"rank":rank, "scanned":len(scanned),\
                         "correct":correct, "hitrate":hitrate,\
                         "fracFound":fracFound,"improvement":improvement,
                         "target_total":target_total, "target_hit":target_hit}
                print("Improvement: ", improvement)
                #print("At rank: ",rank)
                #print("Scanned: ", len(scanned))
                #print("Correct: ", correct)
                #print("Hitrate: ",hitrate)
                #print("FracFound: ",fracFound)
                break
            
    return res

    
def bench1_Improvements(testPorts,TARGET_FRAC, LIM,model_description, DATAPATH,\
                       test,pred_ranks,num_ips_per_port,test_rows,testcoo,inv_port_map):
    """Bench 1: per-port hitrate improvement over the baseline, computed in parallel.

    Runs ``calcImprov`` once per port in a 30-process pool, merges the
    per-port dicts, saves the merged dict under
    ``DATAPATH + model_description + "_TARGETFRAC_<frac>_hitrateCompB1.npy"``
    and plots its CDF via ``plotHitrates``.

    Args:
        testPorts: iterable of port column indices to evaluate.
        TARGET_FRAC: coverage fraction each port must reach.
        LIM: forwarded to ``calcImprov``.
        model_description: prefix of the output file name.
        DATAPATH: directory prefix for the output file.
        test, pred_ranks, num_ips_per_port, test_rows, testcoo, inv_port_map:
            accepted for interface compatibility but NOT forwarded to the
            workers; ``calcImprov`` reads the module-level globals of the
            same names. NOTE(review): the globals must therefore already hold
            the same objects when this is called -- confirm at the call site.

    Returns:
        dict mapping port -> result dict (ports that never reached the target
        are absent).
    """
    print("-------")
    print("...Beginning Bench1: Compare Hitrate To Random")
    # One task per port.
    chunks = [(p,TARGET_FRAC,LIM) for p in testPorts]

    # The context manager guarantees the workers are torn down even if a task
    # raises; previously an exception in pool.map leaked all 30 processes.
    with multiprocessing.Pool(30, initializer=initializer, initargs=()) as pool:
        print("...pooled out calc")
        individual_results = pool.map(calcImprov, chunks)

    print("...calculations complete")
    result = {}
    for d in individual_results:
        result.update(d)

    # Build the file name once so the save and the plot cannot drift apart.
    results_fname = model_description + "_TARGETFRAC_" + str(TARGET_FRAC) + "_hitrateCompB1.npy"
    np.save(DATAPATH + results_fname, result)

    npy_fs = [#"hitrate_comp_item_user_feat.npy",\
          #"hitrate_comp_item_user_feat_weights_1kcap.npy",\
             results_fname]

    plotHitrates(DATAPATH, npy_fs, TARGET_FRAC)

    print("...Finished Bench1")
    print("-------")
    return result

        

In [None]:
def calcCoverage(model,predictions,hist_preds,num_services,num_test_ips,\
                 test,train,user_features, item_features, NUM_THREADS,RETRAIN,\
                 BIASED,LIM,HIST_BINS,\
                 CYCLES,RETRAIN_WEIGHTS,WITH_ITEM_FEATS):
    """Run the iterative recommend/scan benchmark and collect per-cycle metrics.

    Each cycle asks ``getRekd_v2`` for the next batch of (row, port)
    recommendations, accumulates coverage statistics and, when ``RETRAIN`` is
    set, partially refits the model on the newly confirmed interactions and
    regenerates the predictions.

    Args:
        model: fitted model exposing ``fit_partial``.
        predictions, hist_preds: current scores and their per-row histograms,
            as consumed by ``getRekd_v2``.
        num_services, num_test_ips: denominators for the reported fractions.
        test, train: sparse interaction matrices.
        user_features, item_features: feature matrices for refitting.
        NUM_THREADS: threads used by ``fit_partial``.
        RETRAIN: refit after every cycle; bin selection then restarts at 1.
        BIASED, WITH_ITEM_FEATS: forwarded to ``getNewPredictions``;
            ``WITH_ITEM_FEATS`` also decides whether item features are used
            for refitting.
        LIM, HIST_BINS: forwarded to ``getRekd_v2``.
        CYCLES: the loop runs cycles ``1 .. CYCLES-1``.
        RETRAIN_WEIGHTS: weight the confirmed interactions 10x when refitting.

    Returns:
        dict with the per-cycle lists num_unique_rec, frac_services, frac_ips,
        hitrate, normed_services and scanned_num.

    Also reads the module-level names ``test_rows`` and ``num_ips_per_port``.
    """
    bench2_results = {}
    portsScanned = {}
    normed_services = []
    frac_services = []
    frac_ips = []
    hitrate = []
    scanned_num =  []
    num_unique_rec = []
    pred_all_past = None
    start_choices = 1
    newtrain = train.tocoo()
    total_services_correct = 0

    for i in range(1,CYCLES):
        print("cycle: ",i)

        # With retraining the scores are regenerated every cycle, so bin
        # selection restarts at bin 1; otherwise continue where we stopped.
        bin_start = 1 if RETRAIN else start_choices
        pred_all_past,  pred_correct, newtrain,minitrain,cur_hitrate,correct, \
        portsScanned,start_choices,scanned_num = \
        getRekd_v2(model,portsScanned, predictions, hist_preds, test,test_rows,\
                   newtrain,bin_start,scanned_num,n=i,LIM=LIM,HIST_BINS=HIST_BINS,pred_past=pred_all_past)

        total_services_correct += correct

        print("At Bin: ", start_choices-1)
        all_correctly_predicted = (newtrain - train).tocoo()
        total_ips_correct = len(set(all_correctly_predicted.row))
        print("total ips correct: ",total_ips_correct)

        num_correctly_pred_per_port = coo_matrix(all_correctly_predicted.sum(axis=0))

        #do some kind of sum of all of these fractions and divide by total number ports?
        norm_services = coo_matrix(num_correctly_pred_per_port/num_ips_per_port)
        # Ports with no ground-truth IPs produce 0/0 = NaN; count them as 0.
        norm_services.data = np.nan_to_num(norm_services.data, copy=False)
        norm_services = norm_services.sum()

        print("Total Normalized Services: ", norm_services)
        print("Total Fraction Services:", total_services_correct/num_services)
        print("Total Fraction IPs:", total_ips_correct/num_test_ips)
        print("Hitrate: ", cur_hitrate)
        normed_services.append(norm_services)
        frac_services.append(total_services_correct/num_services)
        frac_ips.append(total_ips_correct/num_test_ips)
        hitrate.append(cur_hitrate)
        num_unique_rec.append(len(portsScanned))

        if RETRAIN:
            # Sparse .multiply() returns a NEW matrix; the original discarded
            # the result, so RETRAIN_WEIGHTS silently had no effect. The
            # interactions stay as they are and only the weights are scaled.
            if RETRAIN_WEIGHTS:
                minitrainw = minitrain.multiply(10).tocoo()
            else:
                minitrainw = minitrain.tocoo()

            model = model.fit_partial(minitrain,
                sample_weight = minitrainw,
                user_features=user_features,
                item_features=item_features if WITH_ITEM_FEATS else None,
                epochs=1,
                num_threads=NUM_THREADS, verbose=False)

            # (This line was mis-indented in the original, which made the
            # whole cell fail with an IndentationError.)
            _ ,predictions= getNewPredictions(model,\
                    user_features,test_rows,item_features,BIASED,WITH_ITEM_FEATS)
            # NOTE(review): 100 is hard-coded here while HIST_BINS is passed
            # everywhere else -- confirm whether this should be HIST_BINS.
            hist_preds = hist_scores_per_user(predictions,100)


    bench2_results["num_unique_rec"] = num_unique_rec
    bench2_results["frac_services"] = frac_services
    bench2_results["frac_ips"] = frac_ips
    bench2_results["hitrate"] = hitrate
    bench2_results["normed_services"] = normed_services
    bench2_results["scanned_num"] = scanned_num

    return bench2_results
    
def getRekd_v2(model, res, predictions, hist_preds, test,test_rows,train,start_choices, \
               scanned_num = None, n=1,START=0,LIM=1000,\
            HIST_BINS=100,NUM_THREADS=NUM_THREADS,pred_past = None):
    """Pick the next batch of (row, port) recommendations and score them against ``test``.

    Histogram bins are consumed from the top (column ``-start_choices``
    downwards) until more than ``LIM`` recommendations are collected; the
    per-row counts are then turned into concrete ports by ``parallel_maxy``.

    Args:
        model, START, NUM_THREADS: unused here; kept for interface compatibility.
        res: dict port -> number of times recommended; updated in place.
        predictions: score matrix, one row per entry of ``test_rows``.
        hist_preds: per-row score histograms (row-major, bins as columns).
        test, train: sparse interaction matrices.
        test_rows: row indices of ``test`` that ``predictions`` refers to.
        start_choices: first histogram bin (counted from the top) to consume.
        scanned_num: list that receives this cycle's batch size; a fresh list
            is created when omitted.
        n: cycle number; for ``n > 1`` ``pred_past`` must be given.
        LIM: batch-size threshold that ends bin consumption.
        HIST_BINS: exclusive upper bound for the bin counter.
        pred_past: sparse matrix of everything recommended in earlier cycles.

    Returns:
        ``(pred_all, pred_correct_coo, newtrain, minitrain, hitrate, correct,
        res, start_choices, scanned_num)``.

    Also reads the module-level ``inv_port_map`` and calls ``parallel_maxy``.
    """
    # A literal [] default would be shared (and keep growing) across calls.
    if scanned_num is None:
        scanned_num = []

    testcoo = test.tocoo()
    traincoo = train.tocoo()

    # Make sure earlier recommendations are not picked again, even if the
    # scores were regenerated: push every past (row, port) cell down by 100.
    # presumably scores are far smaller than 100 in magnitude -- TODO confirm
    if n > 1:
        pp = np.array(pred_past.todense()[test_rows])
        pp = pp*100
        predictions = predictions - pp

    # Convert once instead of on every loop iteration.
    hist_matrix = np.array(hist_preds)

    temp_sum = 0
    num_takers = [0]*len(hist_preds)
    for bin_idx in range(start_choices,HIST_BINS):

        cur_num_takers = hist_matrix[:,-bin_idx]

        temp_sum += sum(cur_num_takers)
        # list += ndarray dispatches to numpy and yields an element-wise sum
        # in a new array (it does not extend the list).
        num_takers += cur_num_takers

        if temp_sum > LIM:
            start_choices = bin_idx+1
            break

    scanned_num.append(temp_sum)

    chosen = parallel_maxy(predictions,num_takers)

    # Flatten the per-row port lists into parallel (row, port) index lists.
    found_test_rows = []
    found_chosen_indexes = []
    for trows, ports in zip(test_rows,chosen):
        found_test_rows.extend([trows]*len(ports))
        found_chosen_indexes.extend(ports)

    #make matrix of predictions
    pred_all = coo_matrix(([1]*len(found_test_rows),(found_test_rows,found_chosen_indexes)),shape=test.shape)

    # filter for only the correctly predicted this time
    pred_correct_coo = testcoo.multiply(pred_all)

    # Cells present in both matrices sum to 2; count them without building a
    # Python list of every stored value.
    combo = testcoo+pred_all
    correct = int(np.count_nonzero(combo.data == 2))

    #log chosen port
    for port_idx in found_chosen_indexes:
        chosen_port = inv_port_map[port_idx]
        res[chosen_port] = res.get(chosen_port, 0) + 1

    num_scanned = len(found_chosen_indexes)
    print("correct:",correct)
    print("Num Scanned: ",num_scanned )
    # An empty batch would otherwise raise ZeroDivisionError.
    hitrate = correct/num_scanned if num_scanned else 0.0
    print("Number unique ports Rec'd: ", len(res))

    newtrain = pred_correct_coo + traincoo #+ pred_false_coo
    minitrain = pred_correct_coo

    if pred_past is not None:
        pred_all = pred_all + pred_past

    return pred_all,  pred_correct_coo, newtrain,minitrain,hitrate,correct,\
        res,start_choices, scanned_num


In [None]:
# Predict the most likely port per user and scan all of them.
# This does not handle stopping early for IPs that likely won't respond on anything
# (i.e., wasted bandwidth).
# Deprecated.


# https://github.com/inpefess/lightfm/blob/predict_comparison/
# examples/batch_predict/predicting_with_matrix_multiplication.ipynb
def getRekd(model,user_features, test,train,n=1,START=0,LIM=1000,\
            NUM_THREADS=NUM_THREADS,item_features=None,pred_past = None):
    """Deprecated: recommend the single top-scoring port for each test row.

    Scores are computed by plain matrix multiplication of the model's user
    and item representations plus both bias vectors.

    Args:
        model: fitted model exposing ``get_item_representations`` and
            ``get_user_representations``.
        user_features: user feature matrix; indexed by the selected test rows.
        test, train: sparse interaction matrices.
        n: cycle number; for ``n > 1`` ``pred_past`` must be given.
        START, LIM: slice bounds applied to the distinct rows of ``test``.
        NUM_THREADS: unused; kept for interface compatibility.
        item_features: optional item feature matrix.
        pred_past: sparse matrix of everything recommended in earlier cycles.

    Returns:
        ``(pred_all, pred_correct_coo, newtrain, newtest, accuracy, correct)``
        where ``newtest`` is always ``None`` and ``accuracy`` is
        ``correct / (n * LIM)``.

    Also reads the module-level ``inv_port_map``.
    """
    testcoo = test.tocoo()
    traincoo = train.tocoo()

    test_rows = list(set(testcoo.row))[START:LIM]
    #test_rows.sort()

    if item_features is not None:
        item_biases, item_embeddings = model.get_item_representations(item_features)
    else:
        item_biases, item_embeddings = model.get_item_representations()

    user_biases, user_embeddings = model.get_user_representations(user_features[test_rows])

    predictions = (
        user_embeddings.dot(item_embeddings.T) +
        item_biases.reshape(1, -1) + user_biases.reshape(-1, 1)
    )

    print(predictions[3])

    res = {}
    correct = 0

    # Push previously recommended (row, port) cells down by 100 so they are
    # not chosen again.
    if n > 1:
        pp = np.array(pred_past.todense()[test_rows])
        pp = pp*100
        predictions = predictions - pp

    # Best-scoring port index for every selected row.
    chosen_indexes = np.ravel(np.argmax(predictions,1))

    # Size the data vector from the rows actually selected: the slice above
    # yields fewer than LIM rows whenever START != 0 or the test set is small,
    # and [1]*LIM then no longer matches the index vectors.
    pred_all = coo_matrix(([1]*len(test_rows),(test_rows,chosen_indexes)),shape=test.shape)

    # filter for only the correctly predicted this time
    pred_correct_coo = testcoo.multiply(pred_all)

    #filter for only incorrect and make them -1
    pred_false_coo = (pred_all - pred_correct_coo).multiply(-1)

    combo = testcoo+pred_all
    #find how many were correct
    correct = list(combo.data).count(2)

    #log chosen port
    for i in chosen_indexes:
        chosen_port = inv_port_map[i]
        if chosen_port not in res:
            res[chosen_port] = 0

        res[chosen_port] +=1

    print("correct:",correct)
    accuracy = correct/(n*(LIM))
    print("Number unique ports Rec'd: ", len(res))

    #how to add more info when predicting...
    newtest =  None
    newtrain = pred_correct_coo + traincoo + pred_false_coo

    if pred_past is not None:
        pred_all = pred_all + pred_past

    return pred_all,  pred_correct_coo, newtrain,newtest, accuracy,correct

In [None]:
# Example driver for the deprecated getRekd method above (with item features).
LIM = 10000
START = 0

num_test_ips, num_services,num_ips_per_port, test_rows = groundTruthServices(LIM,START)
total_services_correct = 0

# Never ask for more rows than the ground truth provides.
LIM = min(LIM, len(test_rows))

normed_services, frac_services, frac_ips, hitrate = [], [], [], []

for i in range(1,40):
    print("cycle: ",i)

    # The first cycle starts from the original train matrix with no history;
    # later cycles feed back the previous cycle's outputs.
    pred_all_past,  pred_correct, newtrain,newtest, accuracy,correct = getRekd(
        hybrid_model, user_features, test,
        train if i == 1 else newtrain,
        n=i,
        LIM=LIM,
        item_features=item_features,
        pred_past=None if i == 1 else pred_all_past,
    )

    total_services_correct += correct

    # Retraining is disabled in this example:
    # hybrid_model = hybrid_model.fit_partial(newtrain, #newtrain
    #   user_features=user_features,
    #   item_features=item_features,
    #   epochs=1,
    #   num_threads=NUM_THREADS, verbose=False)

    all_correctly_predicted = (newtrain - train).tocoo()
    total_ips_correct = len(set(all_correctly_predicted.row))
    print("total ips correct: ",total_ips_correct)

    # Per-port count of correct predictions, kept as a matrix so it can be
    # divided element-wise by num_ips_per_port.
    num_correctly_pred_per_port = coo_matrix(all_correctly_predicted.sum(axis=0))

    #do some kind of sum of all of these fractions and divide by total number ports?
    norm_services = coo_matrix(num_correctly_pred_per_port/num_ips_per_port)
    norm_services.data = np.nan_to_num(norm_services.data, copy=False)
    norm_services = norm_services.sum()
    cur_hitrate = correct/LIM

    print("Total Normalized Services: ", norm_services)
    print("Total Fraction Services:", total_services_correct/num_services)
    print("Total Fraction IPs:", total_ips_correct/num_test_ips)
    print("Hitrate: ", cur_hitrate)

    normed_services.append(norm_services)
    frac_services.append(total_services_correct/num_services)
    frac_ips.append(total_ips_correct/num_test_ips)
    hitrate.append(cur_hitrate)


#TODO: accuracy for finding at least one service on an IP

In [None]:
# Swap the roles of IPs and ports (and vice versa) so that similar IPs can be recommended.
# Doesn't seem to work because there are too many items and the data is too sparse.
def extractData_reverse(f, withItemFeats=True): 
    """Load a CSV and build a LightFM dataset with ports as users and IPs as items.

    Args:
        f: CSV file name, appended to the module-level ``DATAPATH``. The file
            must provide the columns "p", "ip" and "slash20".
        withItemFeats: currently unused (the pruning code it guarded is disabled).

    Returns:
        ``(df_csv, dataset, num_users, num_items, interactions, None,
        item_features, train, test, dmap, inv_ip_map, inv_port_map)``; the
        ``None`` sits in the user-features slot.
    """
    # Missing values are replaced by 0 right after loading.
    df_csv = pd.read_csv(DATAPATH+f).fillna(0)

    port_col = list(df_csv["p"])
    ip_col = list(df_csv["ip"])
    slash20_col = list(df_csv["slash20"])

    # Users are ports, items are IPs, and each IP's feature is its "slash20" value.
    dataset = Dataset()
    dataset.fit(port_col, ip_col, item_features=slash20_col)

    num_users, num_items = dataset.interactions_shape()
    print('Num users: {}, num_items {}.'.format(num_users, num_items))

    # One interaction per (port, ip) row of the CSV.
    interactions, weights = dataset.build_interactions(list(zip(port_col, ip_col)))
    print(repr(interactions))

    # Each item feature record is (ip, (slash20,)).
    ip_feature_records = [(ip, (prefix,)) for ip, prefix in zip(ip_col, slash20_col)]
    item_features = dataset.build_item_features(ip_feature_records)

    train,test = cold_train_test_split_reverse(interactions,num_items,train_percentage=0.8)

    # Reverse lookups: internal index -> original port / IP.
    dmap = dataset.mapping()
    inv_port_map = {idx: port for port, idx in dmap[0].items()}
    inv_ip_map = {idx: ip for ip, idx in dmap[2].items()}

    return (df_csv, dataset, num_users, num_items, interactions, None,
            item_features, train, test, dmap, inv_ip_map, inv_port_map)





def cold_train_test_split_reverse(interactions,num_items,train_percentage=0.5,random_state=7):
    """Split an interaction matrix column-wise into disjoint train and test item sets.

    Args:
        interactions: COO sparse matrix (its ``row``/``col``/``data`` arrays
            are read directly). Repeated (row, col) cells are collapsed, the
            last stored value winning.
        num_items: number of columns to split.
        train_percentage: fraction of the columns that go to the train matrix
            (rounded down).
        random_state: seed for the column shuffle; pass ``None`` for a
            non-reproducible split.

    Returns:
        ``(X_train, X_test)`` as CSR matrices. Each keeps all rows but only
        its own columns, so column indices are renumbered within each part.
    """
    # Collapse duplicate cells with a plain dict instead of the private
    # dok_matrix._update hook, which is not part of scipy's public API.
    cell_values = dict(zip(zip(interactions.row, interactions.col), interactions.data))
    cell_rows = np.array([r for r, _ in cell_values], dtype=np.int64)
    cell_cols = np.array([c for _, c in cell_values], dtype=np.int64)
    cell_data = np.array(list(cell_values.values()), dtype=interactions.dtype)
    deduped = coo_matrix((cell_data, (cell_rows, cell_cols)),
                         shape=interactions.shape).tocsr()

    # Honor random_state: previously it was accepted but ignored, so the
    # split depended on whatever the global numpy RNG state happened to be.
    rng = np.random.RandomState(random_state)
    all_items = np.arange(num_items)
    indices_train = rng.choice(all_items, replace=False,
                               size=int(num_items * train_percentage))
    # Every column not drawn for training, in ascending order.
    indices_test = np.setdiff1d(all_items, indices_train)

    X_train = deduped[:, indices_train].tocsr()
    X_test = deduped[:, indices_test].tocsr()

    return X_train, X_test

In [None]:
# Canonical ways to evaluate models... but these don't really mean much in our setting.
''' 
# Compute and print the AUC score
train_auc = auc_score(hybrid_model,train,user_features=user_features,num_threads=NUM_THREADS).mean()
print('Hybrid train AUC: %s' % train_auc)

train_prec= precision_at_k(hybrid_model,train,k=1,user_features=user_features,num_threads=NUM_THREADS).mean()
print(train_prec)


# Compute and print the AUC score
test_auc = auc_score(hybrid_model,test,train_interactions=train, user_features=user_features,num_threads=NUM_THREADS).mean()
print('Hybrid test AUC: %s' % test_auc)


# Other Eval metrics
train_prec= precision_at_k(hybrid_model,train,k=1,user_features=user_features,num_threads=NUM_THREADS).mean()
print(train_prec)
test_prec= precision_at_k(hybrid_model,test,k=1,user_features=user_features,num_threads=NUM_THREADS).mean()
print(test_prec)
'''

#print("k=1")
#train_prec= recall_at_k(hybrid_model,train,k=1,user_features=user_features,num_threads=NUM_THREADS).mean()
#print(train_prec)
#test_prec= recall_at_k(hybrid_model,test,k=1,user_features=user_features,num_threads=NUM_THREADS).mean()
#print(test_prec)

In [None]:
# How to efficiently one-hot encode... but we don't really need this for this model:
# it one-hot encodes its inputs itself.
f = "ht65k_miniset.json"
df = pd.read_json(DATAPATH+f, lines=True)
# Convert every column to the pandas 'category' dtype.
for col in df.columns:
    df[col] = df[col].astype('category')
print("df ready")

# TODO: somehow make this one-hot frame automatically carry the IP labels too.
# One indicator column per distinct label found in the entries of df["p"].
mlb = MultiLabelBinarizer()
onehot = pd.DataFrame(mlb.fit_transform(df["p"]),
                   columns=mlb.classes_)


print("onehot ready")
#onehot_csr = csr_matrix(onehot)
df_ip = df["ip"].astype('category')
# NOTE(review): df_ip is already the "ip" column, so `df_ip.ip` looks like it
# would raise AttributeError; likewise onehot's columns are mlb.classes_, so
# `onehot.p` is unlikely to exist -- confirm the intended row/col codes.
base_interactions = csr_matrix((np.ones(df_ip.shape[0]),(df_ip.ip.cat.codes,onehot.p.cat.codes)))