In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import math
import tarfile
import pickle
from transformers import BertTokenizer, BertForMaskedLM, BertModel, AdamW, BertConfig
from transformers import RobertaConfig, RobertaModel
from tokenizers import ByteLevelBPETokenizer, BertWordPieceTokenizer
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import glob
import os
pd.set_option('display.max_rows', 1000)
import warnings
warnings.filterwarnings('ignore')
from torch.utils.tensorboard import SummaryWriter
summary = SummaryWriter()
%load_ext tensorboard
# tensorboard --logdir runs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import copy

# Fix the random seeds of NumPy and PyTorch so that runs are repeatable
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Release GPU memory that PyTorch's caching allocator is holding but not using
torch.cuda.empty_cache()

In [2]:
# # 데이터 불러오기

# df1 = pd.read_csv(tarfile_url + "\\" + tarfile_name + "\\device.csv")
# df2 = pd.read_csv(tarfile_url + "\\" + tarfile_name + "\\logon.csv")
# df3 = pd.read_csv(tarfile_url + "\\" + tarfile_name + "\\email.csv")
# df4 = pd.read_csv(tarfile_url + "\\" + tarfile_name + "\\file.csv")
# df5 = pd.read_csv(tarfile_url + "\\" + tarfile_name + "\\http.csv")

In [3]:
# # 전처리1 date컬럼을 datetime 으로 변환, date컬럼을 인덱스로 변환

# change_index_before_preprocess(df1)
# change_index_before_preprocess(df2)
# change_index_before_preprocess(df3)
# change_index_before_preprocess(df4)
# change_index_before_preprocess(df5)

In [4]:
# # 전처리2 메일, 파일열람을 activity에 추가

# df1['activity_bak'] = df1['activity']
# df2['activity_bak'] = df2['activity']
# df3['activity'] = 'Mail'
# df3['activity_bak'] = df3['activity']
# df4['activity'] = 'File'
# df4['activity_bak'] = df4['activity']
# df5['activity'] = 'Http'
# df5['activity_bak'] = df5['activity']

In [5]:
# # 전처리3 불러온 데이터를 하나의 테이블로 통합

# con_df = pd.concat([df1,df2,df3,df4,df5],axis=0)
# con_df.sort_values(by='date',inplace=True)

In [6]:
# # 전처리4 행동값(activity)을 치환

# con_df['activity'] = con_df['activity'].map({'Logon':1,
#                                              'Mail':2,
#                                              'Http':3,
#                                              'Connect':4,
#                                              'Disconnect':5,
#                                              'File':6,
#                                              'Logoff':7})

In [7]:
# # 전처리4 행동값(activity)을 역으로 치환

# con_df['activity'] = con_df['activity'].map({1:'Logon',
#                                              2:'Mail',
#                                              3:'Http',
#                                              4:'Connect',
#                                              5:'Disconnect',
#                                              6:'File',
#                                              7:'Logoff'})
# con_df['date_bak'] = con_df.index

In [8]:
# # 전처리된 데이터를 폴더에 저장

# con_df = con_df[['user','pc','activity']]
# con_df.to_csv(tarfile_url + "\\" + tarfile_name + "\\preprocessed_user_pc_activity.csv",index=True)

In [9]:
# Extract a tar archive

def unzip(url, name):
    """Extract the archive ``name + ".tar"`` found in ``url`` into ``url``.

    Args:
        url: Directory that contains the archive; also the extraction target.
        name: Archive file name without the ``.tar`` extension.

    Raises:
        OSError, tarfile.TarError: propagated from ``tarfile`` when the
            archive cannot be opened or extracted.
    """
    # The path is joined with a literal backslash, like every other path
    # built in this notebook.
    archive_path = url + "\\" + name + ".tar"
    # NOTE(review): extractall() writes whatever member paths the archive
    # contains, so only use this on archives from a trusted source.
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(archive_path) as archive:
        archive.extractall(url)

In [10]:
# Softmax function

def softmax(self):
    """Return the softmax of ``self`` taken over all of its elements.

    Args:
        self: Array-like of numbers (the parameter name is kept for
            backward compatibility; this is a plain function, not a method).

    Returns:
        NumPy array of the same shape whose entries sum to 1. An empty
        input yields an empty array.
    """
    values = np.asarray(self)
    if values.size == 0:
        return np.exp(values)
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps exp() from overflowing to inf (and the quotient from becoming nan).
    exp_a = np.exp(values - np.max(values))
    return exp_a / np.sum(exp_a)

In [11]:
# Load a preprocessed pickle file
# ex) attention_masks = load_pickle('attention_masks', tarfile_url, tarfile_name)

def load_pickle(self, tarfile_url, tarfile_name):
    """Load ``<tarfile_url>\\<tarfile_name>\\preprocessed_<self>.pickle``.

    Args:
        self: Name part of the pickle file (a string, despite the name).
        tarfile_url: Base directory.
        tarfile_name: Sub-directory that holds the preprocessed files.

    Returns:
        The object stored in the pickle file.
    """
    pickle_path = "".join(
        [tarfile_url, "\\", tarfile_name, "\\preprocessed_", self, ".pickle"]
    )
    # NOTE(review): pickle.load can execute code embedded in the file; only
    # load files produced by this notebook.
    with open(pickle_path, "rb") as source:
        return pickle.load(source)

# Save an object as a preprocessed pickle file
# ex) save_pickle(behavior, 'behavior', tarfile_url, tarfile_name)

def save_pickle(self, name, tarfile_url, tarfile_name):
    """Pickle ``self`` to ``<tarfile_url>\\<tarfile_name>\\preprocessed_<name>.pickle``.

    Args:
        self: Object to store (the parameter name is historical).
        name: Name part of the pickle file.
        tarfile_url: Base directory.
        tarfile_name: Sub-directory that holds the preprocessed files.
    """
    pickle_path = "".join(
        [tarfile_url, "\\", tarfile_name, "\\preprocessed_", name, ".pickle"]
    )
    with open(pickle_path, "wb") as sink:
        pickle.dump(self, sink)

In [12]:
# Convert the 'date' column to datetime and make it the index (in place)
# ex) change_index_before_preprocess(df1)

def change_index_before_preprocess(self):
    """Parse the ``date`` column of a DataFrame and use it as the index.

    The frame is modified in place:
      * ``date_bak`` receives a copy of the original ``date`` values,
      * ``date`` is parsed with the format ``%m/%d/%Y %H:%M:%S``,
      * if the index dtype is ``int64``, ``date`` becomes the index.

    A frame without a ``date`` column is left untouched. Previously such a
    frame with an ``int64`` index raised ``KeyError`` from ``set_index``.

    Args:
        self: DataFrame to modify (the parameter name is historical).

    Returns:
        None.
    """
    if 'date' not in self.columns:
        # Nothing to parse, e.g. the cell was re-run after 'date' had already
        # been moved into the index.
        return
    self['date_bak'] = self['date']
    self['date'] = pd.to_datetime(self['date'], format="%m/%d/%Y %H:%M:%S")
    if self.index.dtype == 'int64':
        self.set_index('date', inplace=True)
    return

In [13]:
# Time-based behavior conversion (not used)
# ex) behavior = behavior_convert_based_time(con_df)

def behavior_convert_based_time(con_df):
    """Group each user's activities into buckets of consecutive same-hour events.

    Rows are processed per user, users in order of first appearance and rows
    in frame order. A new bucket starts at every user's first row and whenever
    the hour of a row differs from the hour of the previous row of that user.

    Earlier versions only seeded a bucket for the very first user, so the first
    event of every later user was dropped and that user's events were appended
    to the previous user's last bucket.

    NOTE(review): only the hour of day is compared, not the date, so two
    consecutive rows on different days with the same hour share a bucket.

    Args:
        con_df: DataFrame with ``user`` and ``activity`` columns and a
            DatetimeIndex (``.hour`` is read from the index).

    Returns:
        List of buckets, each a list of activity values. ``[[]]`` when
        ``con_df`` has no rows.
    """
    print('[+] START : make behavior tokens')
    behavior = []

    # groupby(sort=False) visits users in order of first appearance and is a
    # single pass, instead of recomputing unique() for every user.
    for _, user_rows in con_df.groupby('user', sort=False):
        activities = user_rows['activity'].tolist()
        hours = list(user_rows.index.hour)

        bucket = [activities[0]]
        behavior.append(bucket)
        for pos in range(1, len(activities)):
            if hours[pos - 1] != hours[pos]:
                bucket = []
                behavior.append(bucket)
            bucket.append(activities[pos])

    # Keep the historical return value for an empty frame.
    return behavior if behavior else [[]]

In [14]:
# Session-based behavior conversion (as text)
# ex) behavior = behavior_convert_based_session(con_df)

def behavior_convert_based_session(con_df):
    """Build one space-separated activity sentence per session.

    Rows are processed per user (users in order of first appearance, rows in
    frame order). Activities are collected until a ``"Logoff"`` row, which
    closes the sentence. Activities after a user's last ``"Logoff"`` are
    discarded.

    Changes from the previous version:
      * rows are read positionally, so the function no longer depends on the
        deprecated integer fallback of ``Series[int]``;
      * users are grouped once instead of recomputing ``unique()`` per user;
      * a session that consists only of ``"Logoff"`` yields ``"Logoff"``
        rather than ``" Logoff"`` with a leading space.

    Args:
        con_df: DataFrame with ``user`` and ``activity`` columns.

    Returns:
        List of single-element lists, ``[[sentence], [sentence], ...]``.
    """
    print('[+] START : make behavior tokens')
    behavior = []

    for _, user_rows in con_df.groupby('user', sort=False):
        words = []
        for activity in user_rows['activity'].tolist():
            words.append(str(activity))
            if activity == "Logoff":
                behavior.append([" ".join(words)])
                words = []

    return behavior

In [15]:
# Preprocess the answer (ground-truth) file
# ex) answer = answer_preprocess(answer_url, tarfile_name)

def answer_preprocess(answer_master_url, tarfile_name):
    """Load the answer CSV and keep the rows of one dataset.

    The ``start`` and ``end`` columns are converted to datetime, rows are
    filtered to those whose ``dataset`` equals ``float(tarfile_name[1:])``
    (e.g. ``'r4.2'`` -> ``4.2``), and the result is re-indexed 0..n-1 under
    the index name ``index_bak``.

    Args:
        answer_master_url: Path of the answer CSV file.
        tarfile_name: Dataset name; everything after its first character must
            parse as a float.

    Returns:
        The filtered DataFrame.
    """
    answer = pd.read_csv(answer_master_url)
    for time_column in ('start', 'end'):
        change_object_to_datetime(answer, time_column)

    dataset_id = float(tarfile_name[1:])
    answer = answer.loc[answer['dataset'] == dataset_id]

    # Replace the original row labels with a fresh 0..n-1 index.
    answer['index_bak'] = list(range(len(answer)))
    answer.set_index('index_bak', inplace=True)

    return answer

In [16]:
# Load the preprocessed con_df
# ex) con_df = con_df_preprocess(tarfile_name, tarfile_url)

def con_df_preprocess(tarfile_name, tarfile_url):
    """Read ``preprocessed_user_pc_activity.csv`` and index it by timestamp.

    Args:
        tarfile_name: Sub-directory that holds the preprocessed CSV.
        tarfile_url: Base directory.

    Returns:
        DataFrame indexed by the parsed ``date`` column, with an extra
        ``date_bak`` column holding the same timestamps.
    """
    csv_path = "\\".join([tarfile_url, tarfile_name, "preprocessed_user_pc_activity.csv"])
    frame = pd.read_csv(csv_path, index_col='date')

    timestamps = pd.to_datetime(frame.index)
    frame.index = timestamps
    # (Disabled in the original) mapping of numeric 'activity' codes to labels:
    # {1:'Logon', 2:'Mail', 3:'Http', 4:'Connect', 5:'Disconnect', 6:'File', 7:'Logoff'}
    frame['date_bak'] = frame.index

    return frame

In [17]:
# Build the token vocabulary
# min_slice = 15
# token2 = make_token(min_slice)
# token2
def make_token(min_slice, pc_names=None):
    """Build the token -> id vocabulary for activity/time-slot and PC tokens.

    Ids 0-4 are the special tokens ``[PAD]``, ``[UNK]``, ``[CLS]``, ``[SEP]``
    and ``[MASK]``. From id 5 onwards come the ``'<activity>-<HH>:<MM>'``
    tokens, one per activity, hour and ``min_slice``-minute slot, followed by
    one id per PC name.

    ``min_slice == 1`` used to be special-cased and produced only ``':00'``
    tokens, although ``minute_slice(..., 1)`` yields every minute from ``00``
    to ``59``. It now goes through the general path and gets all 60 slots.
    Ids for every other ``min_slice`` are unchanged.

    Args:
        min_slice: Slot width in minutes (e.g. 15, or 60 for whole hours).
        pc_names: Optional iterable of PC names to append. When omitted, the
            names are read, as before, from the ``con_df`` pickle located via
            the notebook globals ``tarfile_url`` and ``tarfile_name``, ordered
            by ``value_counts()``.

    Returns:
        dict mapping token string to integer id.
    """
    token = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4}

    slots_per_hour = int(60 / min_slice)
    activities = ['Logon', 'Http', 'Mail', 'Disconnect', 'Connect', 'File', 'Logoff']
    next_id = 5
    for activity in activities:
        # The hour range deliberately stays 0..24 (25 values) so that ids
        # assigned by earlier runs remain valid.
        for hour in range(0, 25):
            hour_text = '0' + str(hour) if hour < 10 else str(hour)
            for slot in range(slots_per_hour):
                minute = min_slice * slot
                minute_text = '0' + str(minute) if minute < 10 else str(minute)
                token[activity + '-' + hour_text + ':' + minute_text] = next_id
                next_id += 1

    if pc_names is None:
        con_dfs = load_pickle('con_df', tarfile_url, tarfile_name)
        pc_names = con_dfs['pc'].value_counts().index.to_list()
    for pc_name in pc_names:
        token[pc_name] = next_id
        next_id += 1
    return token

In [18]:
# Convert one column of a DataFrame to datetime (in place)
# ex) change_object_to_datetime(answer, 'start')

def change_object_to_datetime(answer_df, column_name):
    """Parse ``answer_df[column_name]`` as ``%m/%d/%Y %H:%M:%S`` in place.

    Args:
        answer_df: DataFrame to modify.
        column_name: Name of the column holding the date strings.

    Returns:
        None; the column is overwritten with the parsed values.
    """
    parsed = pd.to_datetime(answer_df[column_name], format="%m/%d/%Y %H:%M:%S")
    answer_df[column_name] = parsed

In [19]:
def minute_slice(minute, min_slice):
    """Return the start minute of the ``min_slice``-wide slot holding ``minute``.

    Args:
        minute: Minute of the hour, as an int or a numeric string.
        min_slice: Slot width in minutes.

    Returns:
        The slot's start minute as a string, zero-padded to two characters
        (``'00'`` whenever ``min_slice`` is 60). ``None`` when ``minute + 1``
        is larger than every slot boundary, which can happen when
        ``min_slice`` does not divide 60.
    """
    shifted = int(minute) + 1
    if min_slice == 60:
        return '00'

    # Slot boundaries 0, min_slice, 2 * min_slice, ...
    boundaries = [0] + [min_slice * step for step in range(1, int(60 / min_slice) + 1)]
    for position, upper in enumerate(boundaries):
        if shifted > upper:
            continue
        # The slot starts at the boundary just before the first one reached.
        lower = boundaries[position - 1]
        return ('0' if lower < 10 else '') + str(lower)
    return None
                    

In [20]:
# Session-based behavior conversion (time-slot tokens)
# ex) behavior, mal_behavior, norm_https, mal_https, norm_connects, mal_connects, norm_mails, mal_mails = \
#         behavior_convert_based_session2(con_df, answer, token, scenario_class, answer_url, min_slice, loveuser)

def behavior_convert_based_session2(con_df, answer ,token, scenario_class, answer_url, min_slice, loveuser):
    """Tokenize the activity on the PCs listed in ``loveuser``'s answer file.

    Steps:
      1. If ``loveuser`` appears in ``answer['user']``, read the first file
         matching ``answer_url + "\\*" + loveuser + "*"`` and collect the PC
         names found in its fifth comma-separated field.
      2. Restrict ``con_df`` to those PCs.
      3. For every (user, pc) pair that has rows: if the user is listed in
         ``answer['user']``, split the rows into malicious and normal ones
         with ``answer_timeline_seperate``; then turn each row into the id of
         the token ``'<activity>-<HH>:<MM slot>'``. A session starts with the
         PC token and is closed by a ``"Logoff"`` row.

    (user, pc) pairs without rows are skipped. They used to raise
    ``IndexError`` at ``a['user'][0]``.

    Args:
        con_df: DataFrame with ``user``, ``pc``, ``activity`` and ``date_bak``
            columns. ``str(date_bak)`` must carry the hour at ``[11:13]`` and
            the minute at ``[14:16]``.
        answer: DataFrame with at least a ``user`` column; passed on to
            ``answer_timeline_seperate``.
        token: dict mapping token strings (including PC names) to ids.
        scenario_class: Passed on to ``answer_timeline_seperate``, which
            updates it in place.
        answer_url: Directory of the per-user answer files.
        min_slice: Slot width in minutes, passed to ``minute_slice``.
        loveuser: User whose answer file selects the PCs to process.

    Returns:
        Tuple ``(behavior, mal_behavior, norm_https, mal_https,
        norm_connects, mal_connects, norm_mails, mal_mails)``. The first two
        are lists of token-id lists; the list still open at the end is dropped
        by ``[:-1]``. The other six hold, per closed session, how many Http /
        Connect / Mail rows were seen since the previous close. The first row
        of each (user, pc) group is not counted.
    """

    def _time_token(rows, pos):
        # Id of '<activity>-<HH>:<MM slot>' for the row at position ``pos``.
        # token.get returns None when the token is not in the vocabulary.
        stamp = str(rows['date_bak'].iloc[pos])
        return token.get(str(rows['activity'].iloc[pos] + '-' + stamp[11:13] + ':' + minute_slice(stamp[14:16], min_slice)))

    # Read loveuser's answer file and parse its contents
    all_answer=[]
    pc = []
    if loveuser in answer['user'].unique().tolist():
        answer_file = glob.glob(answer_url + "\\*" + loveuser +"*")
        with open(answer_file[0], "r") as tf:
            line = tf.readline()
            while line:
                line = line.replace('\n','')
                lines = line.split(',')
                # For these record types the sixth field is replaced by the
                # activity label used in con_df.
                if lines[0]=='http':
                    lines[5]='Http'
                elif lines[0]=='email':
                    lines[5]='Mail'
                elif lines[0]=='file':
                    lines[5]='File'
                cap_lines = lines[:6]
                all_answer.append(cap_lines)
                line = tf.readline()
        # Distinct PC names, in order of first appearance
        for answer_row in all_answer:
            if answer_row[4] not in pc:
                pc.append(answer_row[4])

    normal_start_index = 0
    mal_start_index = 0
    behavior = [[]]
    mal_user = answer['user'].unique().tolist()
    mal_behavior = [[]]
    con_df = con_df.loc[con_df.pc.isin(pc)]

    norm_https = []
    mal_https = []
    norm_http = 0
    mal_http = 0

    norm_connects = []
    mal_connects = []
    norm_connect = 0
    mal_connect = 0

    norm_mails = []
    mal_mails = []
    norm_mail = 0
    mal_mail = 0

    # con_df is not modified below, so the distinct values are computed once.
    user_names = con_df['user'].unique()
    pc_names = con_df['pc'].unique()

    for user_name in user_names:

        for pc_name in pc_names:

            a = con_df.loc[(con_df['user']==user_name) & (con_df['pc']==pc_name)]

            # This user never touched this PC: nothing to tokenize.
            if len(a) == 0:
                continue

            if user_name in mal_user:
                b, a = answer_timeline_seperate(user_name, answer, a, con_df, pc_name, scenario_class, answer_url)

                # Tokenize the malicious rows
                if len(b) >= 1:
                    for x in range(len(b)):
                        activity = b['activity'].iloc[x]
                        if x == 0:
                            mal_behavior[mal_start_index].append(token[pc_name])
                            mal_behavior[mal_start_index].append(_time_token(b, x))
                        elif activity != "Logoff":
                            if activity == "Http":
                                mal_http += 1
                            if activity == "Connect":
                                mal_connect += 1
                            if activity == "Mail":
                                mal_mail += 1
                            mal_behavior[mal_start_index].append(_time_token(b, x))
                        else:
                            # Logoff closes the session and opens the next one
                            mal_behavior[mal_start_index].append(_time_token(b, x))
                            mal_start_index += 1
                            mal_behavior.append([])
                            if x != len(b)-1:
                                mal_behavior[mal_start_index].append(token[pc_name])

                            mal_https.append(mal_http)
                            mal_http = 0

                            mal_connects.append(mal_connect)
                            mal_connect = 0

                            mal_mails.append(mal_mail)
                            mal_mail = 0

            # Tokenize the normal rows
            if len(a) >= 1:
                for x in range(len(a)):
                    activity = a['activity'].iloc[x]
                    if x == 0:
                        behavior[normal_start_index].append(token[pc_name])
                        behavior[normal_start_index].append(_time_token(a, x))
                    elif activity != "Logoff":
                        if activity == "Http":
                            norm_http += 1
                        if activity == "Connect":
                            norm_connect += 1
                        if activity == "Mail":
                            norm_mail += 1
                        behavior[normal_start_index].append(_time_token(a, x))
                    else:
                        # Logoff closes the session and opens the next one
                        behavior[normal_start_index].append(_time_token(a, x))
                        normal_start_index += 1
                        behavior.append([])
                        if x != len(a)-1:
                            behavior[normal_start_index].append(token[pc_name])

                        norm_https.append(norm_http)
                        norm_http = 0

                        norm_connects.append(norm_connect)
                        norm_connect = 0

                        norm_mails.append(norm_mail)
                        norm_mail = 0

    return behavior[:-1], mal_behavior[:-1], norm_https, mal_https, norm_connects, mal_connects, norm_mails, mal_mails

In [21]:
# Used while building session tokens: splits one user's rows on one PC into
# the sessions that contain answer-file events and everything else.
# ex) b, a = answer_timeline_seperate(user_name, answer, a, con_df, pc_name, scenario_class, answer_url)

def answer_timeline_seperate(attacker_name, answer, a, con_df, pc_name, scenario_class, answer_url):
    """Separate malicious sessions from normal activity for one user and PC.

    The first file matching ``answer_url + "\\*" + attacker_name + "*"`` is
    read. Every event in it whose PC field equals ``pc_name`` is mapped to the
    session around it: from the last ``Logon`` row of ``a`` at or before the
    event to the first ``Logoff`` row at or after it. Rows of ``a`` inside
    such a session are returned as malicious, the rest as normal. Events
    without a surrounding Logon or Logoff are skipped.

    Changes from the previous version:
      * ``DataFrame.append`` (removed in pandas 2.0) is replaced by
        ``pd.concat``;
      * the duplicate-session test ``session_start and session_end in y``,
        which only checked ``session_end``, now checks both ends;
      * the scenario number is read with ``.iloc[0]`` instead of
        ``int(Series)``;
      * blank lines in the answer file are ignored;
      * the bare ``except:`` clauses no longer swallow ``KeyboardInterrupt``
        or ``SystemExit``.

    Args:
        attacker_name: User id; selects the answer file and the row of
            ``answer``.
        answer: DataFrame with ``user`` and ``scenario`` columns.
        a: Rows of this user on ``pc_name``. Needs ``activity`` and ``pc``
            columns and a DatetimeIndex.
        con_df: Unused; kept for interface compatibility.
        pc_name: PC whose answer events are considered.
        scenario_class: Mutable sequence of counters. The entry at
            ``scenario - 1`` is increased by the number of matching answer
            events.
        answer_url: Directory of the per-user answer files.

    Returns:
        Tuple ``(malicious_rows, normal_rows)`` of DataFrames.
    """
    # Parse the answer file, keeping only the events recorded on pc_name
    all_answer = []

    answer_file = glob.glob(answer_url + "\\*" + attacker_name + "*")
    with open(answer_file[0], "r") as tf:
        for line in tf:
            line = line.replace('\n', '')
            if not line:
                continue
            lines = line.split(',')
            # For these record types the sixth field is replaced by the
            # activity label used in con_df.
            if lines[0] == 'http':
                lines[5] = 'Http'
            elif lines[0] == 'email':
                lines[5] = 'Mail'
            elif lines[0] == 'file':
                lines[5] = 'File'
            if lines[4] == pc_name:
                all_answer.append(lines[:6])

    scenario_index = int(answer.loc[answer.user == attacker_name, 'scenario'].iloc[0]) - 1
    scenario_class[scenario_index] = scenario_class[scenario_index] + len(all_answer)

    # Extract the timelines: one [logon time, logoff time] pair per session
    # that contains at least one answer event
    time_lines = []
    if len(all_answer) >= 1:
        df = pd.DataFrame(all_answer)
        df.set_index(2, inplace=True)           # column 2 holds the event timestamp
        df.index = pd.to_datetime(df.index)

        # Every row of all_answer is on pc_name, so these filters are the same
        # for every event and can be built once.
        logons = a.loc[(a['activity'] == 'Logon') & (a['pc'] == pc_name)]
        logoffs = a.loc[(a['activity'] == 'Logoff') & (a['pc'] == pc_name)]

        for event_time in df.index:
            try:
                session_start = logons.loc[:event_time].index[-1]
            except Exception:
                # No logon on this PC before the event: skip it (best effort)
                print('')
                continue

            try:
                session_end = logoffs.loc[event_time:].index[0]
            except Exception:
                # No logoff on this PC after the event: skip it (best effort)
                print('')
                continue

            session_start = session_start.strftime("%Y-%m-%d %H:%M:%S")
            session_end = session_end.strftime("%Y-%m-%d %H:%M:%S")
            already_known = any(
                session_start in pair and session_end in pair for pair in time_lines
            )
            if not already_known:
                time_lines.append([session_start, session_end])

    # Slice normal and malicious rows using the extracted timelines
    c = a.copy()
    # b starts with one placeholder row (removed again by b[1:] below).
    b = a[:1].copy()
    for start_date, end_date in time_lines:
        session_rows = c[start_date:end_date]
        c = pd.concat([c.loc[c.index < start_date], c.loc[c.index > end_date]])
        b = pd.concat([b, session_rows])

    return b[1:], c

In [22]:
# Build the behavior-token tensors
# Takes a DataFrame of token ids and converts it into tensors
# ex) input_ids, attention_masks = make_behavior_token(con_df)

def make_behavior_token(self):
    """Convert a DataFrame of token ids into ``input_ids`` and ``attention_masks``.

    Cells are read by label: column labels ``0..n_cols-1`` and row labels
    ``0..n_rows-1`` must exist.

    Args:
        self: DataFrame of token ids (the parameter name is historical).

    Returns:
        Tuple ``(input_ids, attention_masks)`` of ``torch.long`` tensors.
        ``attention_masks`` is 1 where ``input_ids`` is non-zero and 0
        elsewhere.
    """
    n_rows = len(self)
    n_cols = len(self.columns)

    # One list per row, one element per column
    behavior_token = [
        [self[col][row] for col in range(n_cols)]
        for row in range(n_rows)
    ]

    input_ids = torch.tensor(behavior_token).to(torch.long)
    attention_masks = torch.tensor(np.where(input_ids != 0, 1, 0)).to(torch.long)

    return input_ids, attention_masks

In [23]:
# Takes encoded input_ids and creates the labels and masked_input_ids for MaskedLM
# ex) masked_input_ids, labels = create_masking(input_ids)

def create_masking(self, mask_prob=0.15, mask_token_id=4, special_token_ids=(2, 3, 0)):
    """Randomly replace tokens by the mask token for masked-LM training.

    Each position is selected independently with probability ``mask_prob``.
    Positions holding one of ``special_token_ids`` are never selected.
    Selected positions are set to ``mask_token_id``.

    The defaults reproduce the previously hard-coded behavior: 15 % masking,
    mask id 4, and ids 2, 3 and 0 left untouched.

    Args:
        self: Tensor of token ids (the parameter name is historical).
        mask_prob: Probability of masking an eligible position.
        mask_token_id: Id written into masked positions.
        special_token_ids: Ids that must never be masked.

    Returns:
        Tuple ``(masked_input_ids, labels)``. ``labels`` is an unmodified
        copy of the input.
    """
    masked_input_ids = self.detach().clone()

    # Labels for MaskedLM: the original token at every position
    labels = self.detach().clone()

    # One uniform random number per position decides whether it is masked
    rand = torch.rand(masked_input_ids.shape)
    mask_arr = rand < mask_prob
    for special_id in special_token_ids:
        mask_arr = mask_arr & (masked_input_ids != special_id)

    # Boolean-mask assignment replaces the former per-row index loop
    masked_input_ids[mask_arr] = mask_token_id

    return masked_input_ids, labels

In [24]:
# Custom dataset for MaskedLM training
# ex) train_dataset = Train_Dataset(masked_input_ids, token_type_ids, attention_masks, labels)
#     train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1)

class Train_Dataset(torch.utils.data.Dataset):
    """Dataset yielding ``(masked_input_ids, token_type_ids, attention_masks, labels)``.

    The four constructor arguments are indexable sequences of equal length;
    item ``idx`` is the tuple of their ``idx``-th elements.
    """

    def __init__(self, masked_input_ids, token_type_ids, attention_masks, labels):
        self.masked_input_ids = masked_input_ids
        self.token_type_ids = token_type_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __getitem__(self, idx):
        return self.masked_input_ids[idx], self.token_type_ids[idx], self.attention_masks[idx], self.labels[idx]

    def __len__(self):
        # Use the data stored on the instance. The previous version read a
        # notebook-level variable named masked_input_ids instead.
        return len(self.masked_input_ids)

In [25]:
# Custom dataset for MaskedLM validation
# ex) val_dataset = Val_Dataset(val_masked_input_ids, val_token_type_ids, val_attention_masks, val_labels)
#     val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1)

class Val_Dataset(torch.utils.data.Dataset):
    """Dataset yielding ``(masked_input_ids, token_type_ids, attention_masks, labels)`` for validation.

    The four constructor arguments are indexable sequences of equal length;
    item ``idx`` is the tuple of their ``idx``-th elements.
    """

    def __init__(self, val_masked_input_ids, val_token_type_ids, val_attention_masks, val_labels):
        self.val_masked_input_ids = val_masked_input_ids
        self.val_token_type_ids = val_token_type_ids
        self.val_attention_masks = val_attention_masks
        self.val_labels = val_labels

    def __getitem__(self, idx):
        return self.val_masked_input_ids[idx], self.val_token_type_ids[idx], self.val_attention_masks[idx], self.val_labels[idx]

    def __len__(self):
        # Use the data stored on the instance. The previous version read a
        # notebook-level variable named val_masked_input_ids instead.
        return len(self.val_masked_input_ids)

In [26]:
# Custom dataset for embedding extraction
# ex) embedding_dataset = Embedding_Dataset(input_ids, attention_masks)
#     embedding_loader = torch.utils.data.DataLoader(embedding_dataset, batch_size=64)

class Embedding_Dataset(torch.utils.data.Dataset):
    """Dataset yielding ``(input_ids, attention_masks)`` pairs.

    Both constructor arguments are indexable sequences of equal length.
    """

    def __init__(self, input_ids, attention_masks):
        self.input_ids = input_ids
        self.attention_masks = attention_masks

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_masks[idx]

    def __len__(self):
        # Use the data stored on the instance. The previous version read a
        # notebook-level variable named input_ids instead.
        return len(self.input_ids)

In [27]:
# Custom dataset for AutoEncoder training
# ex) ae_dataset = ae_Dataset(all_features)
#     ae_loader = torch.utils.data.DataLoader(ae_dataset, batch_size=1)

class ae_Dataset(torch.utils.data.Dataset):
    """Dataset yielding one element of ``all_features`` per index."""

    def __init__(self, all_features):
        self.all_features = all_features

    def __getitem__(self, idx):
        return self.all_features[idx]

    def __len__(self):
        # Use the data stored on the instance. The previous version read a
        # notebook-level variable named all_features instead.
        return len(self.all_features)

In [28]:
# 0 : min_slice
# 1 : tr_epochs
# 2 : avg_precision
# 3 : avg_recall
# 4 : f1_score
# 5 : true-positive (detection) rate
# 6 : false-positive rate
# 7 : accuracy
# 8 : weight


def _first_best_index(rows, column):
    """Return the index of the first row holding the largest ``row[column]`` above 0.

    Falls back to index 0 when no value is greater than 0, matching the
    strict ``>`` comparison against an initial best of 0.
    """
    best_value = 0
    best_index = 0
    for index, row in enumerate(rows):
        if row[column] > best_value:
            best_value = row[column]
            best_index = index
    return best_index


def _print_combination(title, row):
    """Print one report section for ``row``: settings, per-scenario rates, overall metrics."""
    print(title)
    print('min_slice : ',row[0])
    print('train_epoch : ',row[1])
    print('http weight : ',row[8][0], '  connect weight : ',row[8][1], '  mail weight : ',row[8][2])
    print('')
    for scenario in range(1, 6):
        # Scenario n keeps its rate at 7+2n, its false-positive rate at 8+2n
        # and the value printed before the arrow at 18+n.
        print(f' 시나리오 {scenario} 정탐율 :',row[18 + scenario], ' -> ',row[7 + 2 * scenario])
        print(f' 시나리오 {scenario} 오탐율 :',row[8 + 2 * scenario])
        print('')
    print('전체 정탐율 : ',row[5])
    print('전체 오탐율 : ',row[6])
    print('accuracy : ',row[7])
    print('precision : ',row[2])
    print('recall : ',row[3])
    print('f1 score : ',row[4])
    print('')


def result_explotiation(score_list):
    """Print the best-scoring parameter combinations found in ``score_list``.

    Six sections are printed: the row with the best f1 score (index 4),
    followed by the rows with the best per-scenario rate for scenarios 1-5
    (indices 9, 11, 13, 15, 17). Ties keep the earliest row; if no value in
    a column exceeds 0, row 0 is reported.

    Args:
        score_list: List of result rows. The LAST element is always
            discarded before evaluation (presumably an empty placeholder
            appended by the caller - confirm against the code that fills
            it). Each remaining row must be indexable up to 23, with
            ``row[8]`` holding the three weights (http, connect, mail).

    Returns:
        None. The report is written to stdout.
    """
    score_list = score_list[:-1]

    # Robustness: with nothing left to report the original crashed with an
    # IndexError after printing the first header.
    if not score_list:
        print('result_explotiation: no completed runs to report')
        return

    # Resolve every winner before printing so a malformed row fails up front,
    # not halfway through the report.
    best_f1_index = _first_best_index(score_list, 4)
    best_scenario_indices = [
        _first_best_index(score_list, 7 + 2 * scenario) for scenario in range(1, 6)
    ]

    _print_combination(
        '===============================best_f1 combination=========================',
        score_list[best_f1_index],
    )
    for scenario, row_index in enumerate(best_scenario_indices, start=1):
        _print_combination(
            '=========================best_시나리오' + str(scenario) + ' combination=========================',
            score_list[row_index],
        )
    
    # print('=========================result==========================')
    # for y, x in enumerate(score_list):
    #     print('min_slice : ', x[0]) 
    #     print('train_epochs : ', x[1])
    #     print('http weight : ',x[8][0], '  connect weight : ',x[8][1], '  mail weight : ',x[8][2])
    #     print('')
    #     print(' 시나리오 1 정탐율 :',x[19], ' -> ',x[9],"%")
    #     print(' 시나리오 1 오탐율 :',x[10],"%")
    #     print('')
    #     print(' 시나리오 2 정탐율 :',x[20], ' -> ',x[11],"%")
    #     print(' 시나리오 2 오탐율 :',x[12],"%")
    #     print('')
    #     print(' 시나리오 3 정탐율 :',x[21], ' -> ',x[13],"%")
    #     print(' 시나리오 3 오탐율 :',x[14],"%")
    #     print('')
    #     print(' 시나리오 4 정탐율 :',x[22], ' -> ',x[15],"%")
    #     print(' 시나리오 4 오탐율 :',x[16],"%")
    #     print('')
    #     print(' 시나리오 5 정탐율 :',x[23], ' -> ',x[17],"%")
    #     print(' 시나리오 5 오탐율 :',x[18],"%")
    #     print('')
    #     print('전체 정탐율 : ', x[5])
    #     print('전체 오탐율 : ',score_list[best_f1_index][6])
    #     print('accuracy : ', x[7])
    #     print('avg_precision : ', x[2])
    #     print('avg_recall : ', x[3])
    #     print('f1_score : ', x[4])
    #     print('')

    # print('====best_정탐율 combination====')
    # print('min_slice : ',score_list[best_TP_index][0])
    # print('train_epoch : ',score_list[best_TP_index][1])
    # print('weight : ',score_list[best_TP_index][8])
    # print('')
    # print('정탐율 : ',score_list[best_TP_index][5]) 
    # print('오탐율 : ',score_list[best_TP_index][6])
    # print('accuracy : ',score_list[best_TP_index][7])
    # print('precision : ',score_list[best_TP_index][2])
    # print('recall : ',score_list[best_TP_index][3])
    # print('f1 score : ',score_list[best_TP_index][4])
    # print('')
    # print('====best_오탐율 combination====')
    # print('min_slice : ',score_list[best_TN_index][0])
    # print('train_epoch : ',score_list[best_TN_index][1])
    # print('weight : ',score_list[best_TN_index][8])
    # print('')
    # print('정탐율 : ',score_list[best_TN_index][5]) 
    # print('오탐율 : ',score_list[best_TN_index][6])
    # print('accuracy : ',score_list[best_TN_index][7])
    # print('precision : ',score_list[best_TN_index][2])
    # print('recall : ',score_list[best_TN_index][3])
    # print('f1 score : ',score_list[best_TN_index][4])
    # print('')
    # print('====best_accuracy combination====')
    # print('min_slice : ',score_list[best_accuracy_index][0])
    # print('train_epoch : ',score_list[best_accuracy_index][1])
    # print('weight : ',score_list[best_accuracy_index][8])
    # print('')
    # print('정탐율 : ',score_list[best_accuracy_index][5]) 
    # print('오탐율 : ',score_list[best_accuracy_index][6])
    # print('accuracy : ',score_list[best_accuracy_index][7])
    # print('precision : ',score_list[best_accuracy_index][2])
    # print('recall : ',score_list[best_accuracy_index][3])
    # print('f1 score : ',score_list[best_accuracy_index][4])

In [29]:
def calc_standardize(list1, list2,weight):
    """Z-score two lists against their pooled statistics and keep only positive scores.

    The mean and standard deviation are computed over ``list1`` and ``list2``
    together. The variance and the standard deviation are each rounded to
    4 decimals (kept from the original so existing results are unchanged).
    Every value above the pooled mean becomes ``z * weight``; every other
    value becomes ``0``.

    Args:
        list1: First sequence of numbers.
        list2: Second sequence of numbers.
        weight: Multiplier applied to each positive z-score.

    Returns:
        Tuple ``(scores1, scores2)`` of lists with the same lengths as the inputs.
    """
    pooled = list(list1) + list(list2)
    if not pooled:
        return [], []

    mean = np.mean(pooled)
    variance = round(np.var(pooled),4)
    std_variance = round(math.sqrt(variance),4)

    # Fixed: when the pooled spread rounds to 0 the original divided by zero
    # and emitted inf/nan scores. With no measurable spread nothing stands
    # out, so every score is 0. `not (> 0)` also covers a NaN deviation.
    if not std_variance > 0:
        return [0] * len(list1), [0] * len(list2)

    def _positive_weighted_z(values):
        scored = []
        for value in values:
            z_score = (value - mean) / std_variance
            scored.append(z_score * weight if z_score > 0 else 0)
        return scored

    return _positive_weighted_z(list1), _positive_weighted_z(list2)

In [30]:
# 토큰 디코딩
# ex) decode_token(mal_behavior[0],token)

def decode_token(tokenized, token):
    """Print the vocabulary keys that correspond to a sequence of token ids.

    For every id in ``tokenized`` (in order), each key of ``token`` whose
    value equals that id is emitted, preceded by a single space. Ids with no
    matching value contribute nothing. The joined text is printed; nothing is
    returned.

    Args:
        tokenized: Iterable of token ids.
        token: Mapping from key to token id.
    """
    pieces = [
        ' ' + str(name)
        for token_id in tokenized
        for name, mapped_id in token.items()
        if token_id == mapped_id
    ]
    print("".join(pieces))

In [31]:
# Build the paths for one dataset release and load its answer table for inspection.
# NOTE(review): these are absolute, machine-specific Windows paths; the cell
# only runs where this directory layout exists.
tarfile_name = "r4.2"
tarfile_url = "C:\\Users\\son\\Downloads\\12841247\\" + tarfile_name
answer_root_url = "C:\\Users\\son\\Downloads\\12841247\\answers\\"
# Master answer file shared by all releases.
answer_master_url = answer_root_url + "insiders.csv"
# Answer path prefix for this release (not used further in this cell).
answer_url = answer_root_url + tarfile_name
# answer_preprocess is defined outside this excerpt; presumably it keeps only
# the rows for `tarfile_name` (the output below shows dataset 4.2 only) - confirm.
answer =  answer_preprocess(answer_master_url,tarfile_name)
# Bare expression so the notebook renders the table.
answer

Unnamed: 0_level_0,dataset,scenario,details,user,start,end
index_bak,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,4.2,1,r4.2-1-AAM0658.csv,AAM0658,2010-10-23 01:34:19,2010-10-29 05:23:28
1,4.2,1,r4.2-1-AJR0932.csv,AJR0932,2010-09-10 19:12:01,2010-09-18 02:02:51
2,4.2,1,r4.2-1-BDV0168.csv,BDV0168,2010-07-30 19:56:44,2010-08-10 05:16:41
3,4.2,1,r4.2-1-BIH0745.csv,BIH0745,2010-07-13 20:15:23,2010-07-13 21:20:44
4,4.2,1,r4.2-1-BLS0678.csv,BLS0678,2010-09-21 01:16:22,2010-09-30 04:48:19
5,4.2,1,r4.2-1-BTL0226.csv,BTL0226,2010-10-06 22:25:52,2010-10-14 06:43:29
6,4.2,1,r4.2-1-CAH0936.csv,CAH0936,2010-08-11 04:00:08,2010-08-12 23:56:19
7,4.2,1,r4.2-1-DCH0843.csv,DCH0843,2011-02-04 07:08:00,2011-02-04 07:36:05
8,4.2,1,r4.2-1-EHB0824.csv,EHB0824,2010-07-22 21:48:43,2010-07-29 01:08:41
9,4.2,1,r4.2-1-EHD0584.csv,EHD0584,2010-10-02 03:46:16,2010-10-08 22:26:26


In [None]:
# Grid-search space: pre-training epoch counts and session slice lengths.
tr_epochss = list(range(2, 11))
min_slices = [5, 10, 15, 20, 30, 60]

# Candidate weights 0.00 .. 0.49 (step 0.01) for the http / connect / mail scores.
# For a quick run, replace each of the three lists below with [0].
https_weight = [step * 0.01 for step in range(50)]
connects_weight = [step * 0.01 for step in range(50)]
mails_weight = [step * 0.01 for step in range(50)]

# Every [http, connect, mail] combination, with the mail weight varying fastest.
weights = []
for x in https_weight:
    for y in connects_weight:
        for z in mails_weight:
            weights.append([x, y, z])

# Number of generated combinations.
start_weights_index = len(weights)

# Dataset releases processed by the grid search. Narrow the list
# (e.g. ['r4.2'], ['r5.2'], ['r4.2','r5.2'] or ['r6.1','r6.2']) for a shorter run.
tarfile_names = ['r4.2', 'r5.2', 'r6.1', 'r6.2']

# Verbosity switch: 1 draws the graphs, 0 suppresses them.
verbose = 0

# Result accumulator, seeded with one empty row, and the index of the row being filled.
score_list = [[]]
start_score_index = 0

# Use the GPU when one is available, otherwise fall back to the CPU
# (swap in torch.device('cpu') to force CPU execution).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device : ',device)

# ================================================================================================
# 1. Base data directory setup
# ================================================================================================
# Fix the random seeds for NumPy and PyTorch.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

for min_slice in tqdm(min_slices):
    for tr_epochs in tqdm(tr_epochss):
        
        love=[[]]
        losses_sum=[[]]
        start_fuck_index = 0

        norm_love=[[]]
        norm_losses_sum=[[]]
        norm_start_fuck_index = 0

        test_love=[[]]
        test_losses_sum=[[]]
        test_start_fuck_index = 0
        
        user_counts = [[]]
        user_counts_index = 0
        
        # Mal 행동 개수 측정
        scenario_class=[0,0,0,0,0]
        # Mal 행동 세션 개수 측정
        scenario_session=[0,0,0,0,0]
        # normal 행동 세션 개수 측정
        scenario_norm_session=[0,0,0,0,0]
        # test 행동 세션 개수 측정
        scenario_test_session=[0,0,0,0,0]
        
        for tarfile_name in tarfile_names:
            tarfile_url = "C:\\Users\\son\\Downloads\\12841247\\" + tarfile_name
            answer_root_url = "C:\\Users\\son\\Downloads\\12841247\\answers\\"
            answer_master_url = answer_root_url + "insiders.csv"
            answer_url = answer_root_url + tarfile_name
            token = make_token(min_slice)
            real_tr_epochs = tr_epochs
            # score_list[start_score_index].append(min_slice)
            # score_list[start_score_index].append(tr_epochs)

            # ================================================================================================
            # 2.(전처리 1/2) 정상데이터, 비정상데이터 분리
            # ================================================================================================
            answer =  answer_preprocess(answer_master_url,tarfile_name)

            if tarfile_name == "r5.2":
                answer =  answer.loc[answer.scenario==4]
                # answer =  answer.loc[answer.user=='GCD0194']
                print('')
            elif tarfile_name == "r4.2":
                # answer =  answer.loc[answer.scenario==2]
                # answer =  answer.loc[answer.user=='FSC0601']
                print('')
            elif tarfile_name == "r6.1":
                answer =  answer.loc[answer.scenario==5]
                # answer =  answer.loc[answer.user=='ADC1257']
                print('')
            elif tarfile_name == "r6.2":
                answer =  answer.loc[answer.scenario==5]
                # answer =  answer.loc[answer.user=='MBG3183']
                print('') 

            for loveuser in answer['user'].unique():
                # print('====================================')
                # print(loveuser)
                con_df = load_pickle('con_df', tarfile_url, tarfile_name)
                con_df = con_df.loc[con_df.user==loveuser]

                # ================================================================================================
                # 3.(전처리 2/2) BERT 학습용 토큰 만들기
                # ================================================================================================
                # 정상 행위, 비정상 행위 토큰 생성하기
                behavior, mal_behavior, norm_https, mal_https, norm_connects, mal_connects, norm_mails, mal_mails = behavior_convert_based_session2(con_df, answer, token, scenario_class, answer_url, min_slice, loveuser)
                
                user_counts[user_counts_index].append(norm_https)
                user_counts[user_counts_index].append(mal_https)
                user_counts[user_counts_index].append(norm_connects)
                user_counts[user_counts_index].append(mal_connects)
                user_counts[user_counts_index].append(norm_mails)
                user_counts[user_counts_index].append(mal_mails)
                user_counts_index += 1
                user_counts.append([])
                
                behavior, test_behavior = train_test_split(behavior, test_size=0.2, shuffle=True, random_state=34)
                behavior, val_behavior = train_test_split(behavior, test_size=0.1, shuffle=True, random_state=34)
                
                scenario_norm_session[int(answer.loc[answer.user==loveuser].scenario)-1] = scenario_norm_session[int(answer.loc[answer.user==loveuser].scenario)-1] + len(behavior)
                scenario_session[int(answer.loc[answer.user==loveuser].scenario)-1] = scenario_session[int(answer.loc[answer.user==loveuser].scenario)-1] + len(mal_behavior)
                scenario_test_session[int(answer.loc[answer.user==loveuser].scenario)-1] = scenario_test_session[int(answer.loc[answer.user==loveuser].scenario)-1] + len(test_behavior)
                
                save_pickle(behavior, 'behavior', tarfile_url, tarfile_name)
                save_pickle(val_behavior, 'val_behavior', tarfile_url, tarfile_name)
                save_pickle(mal_behavior, 'mal_behavior', tarfile_url, tarfile_name)
                save_pickle(test_behavior, 'test_behavior', tarfile_url, tarfile_name)

                # 만들어둔 정상 행위, 비정상 행위 토큰 불러오기
                behavior = load_pickle('behavior', tarfile_url, tarfile_name)
                mal_behavior = load_pickle('mal_behavior', tarfile_url, tarfile_name)
                val_behavior = load_pickle('val_behavior', tarfile_url, tarfile_name)
                test_behavior = load_pickle('test_behavior', tarfile_url, tarfile_name)

                # 트레인용 정상행위 attention_mask, masking, lables 획득
                con_df = pd.DataFrame(behavior)
                con_df = con_df.fillna(0)
                input_ids, attention_masks = make_behavior_token(con_df)
                masked_input_ids, labels = create_masking(input_ids)

                # 검증용 정상행위 attention_mask, masking, lables 획득
                val_con_df = pd.DataFrame(val_behavior)
                val_con_df = val_con_df.fillna(0)
                val_input_ids, val_attention_masks = make_behavior_token(val_con_df)
                val_masked_input_ids, val_labels = create_masking(val_input_ids)

                # 비정상행위 attention_mask, masking, lables 획득
                mal_con_df = pd.DataFrame(mal_behavior)
                mal_con_df = mal_con_df.fillna(0)
                mal_input_ids, mal_attention_masks = make_behavior_token(mal_con_df)
                mal_masked_input_ids, mal_labels = create_masking(mal_input_ids)

                # 테스트용 행위 attention_mask, masking, lables 획득
                test_con_df = pd.DataFrame(test_behavior)
                test_con_df = test_con_df.fillna(0)
                test_input_ids, test_attention_masks = make_behavior_token(test_con_df)
                test_masked_input_ids, test_labels = create_masking(test_input_ids)

                # 생성된 데이터를 Pickle로 저장
                # 학습용 정상데이터 저장
                save_pickle(input_ids, 'input_ids', tarfile_url, tarfile_name)
                save_pickle(attention_masks, 'attention_masks', tarfile_url, tarfile_name)
                save_pickle(labels, 'labels', tarfile_url, tarfile_name)
                save_pickle(masked_input_ids, 'masked_input_ids', tarfile_url, tarfile_name)

                # 검증요 정상데이터 저장
                save_pickle(val_input_ids, 'val_input_ids', tarfile_url, tarfile_name)
                save_pickle(val_attention_masks, 'val_attention_masks', tarfile_url, tarfile_name)
                save_pickle(val_labels, 'val_labels', tarfile_url, tarfile_name)
                save_pickle(val_masked_input_ids, 'val_masked_input_ids', tarfile_url, tarfile_name)

                # 비정상데이터 저장
                save_pickle(mal_input_ids, 'mal_input_ids', tarfile_url, tarfile_name)
                save_pickle(mal_attention_masks, 'mal_attention_masks', tarfile_url, tarfile_name)
                save_pickle(mal_labels, 'mal_labels', tarfile_url, tarfile_name)
                save_pickle(mal_masked_input_ids, 'mal_masked_input_ids', tarfile_url, tarfile_name)

                # 테스트 데이터 저장
                save_pickle(test_input_ids, 'test_input_ids', tarfile_url, tarfile_name)
                save_pickle(test_attention_masks, 'test_attention_masks', tarfile_url, tarfile_name)
                save_pickle(test_labels, 'test_labels', tarfile_url, tarfile_name)
                save_pickle(test_masked_input_ids, 'test_masked_input_ids', tarfile_url, tarfile_name)

                # 만들어둔 데이터를 Pickle로 불러오기
                # 학습용 정상데이터 불러오기
                input_ids = load_pickle('input_ids', tarfile_url, tarfile_name)
                masked_input_ids = load_pickle('masked_input_ids', tarfile_url, tarfile_name)
                attention_masks = load_pickle('attention_masks', tarfile_url, tarfile_name)
                labels = load_pickle('labels', tarfile_url, tarfile_name)
                token_type_ids = masked_input_ids * 0

                # 검증용 정상데이터 불러오기
                val_input_ids = load_pickle('val_input_ids', tarfile_url, tarfile_name)
                val_masked_input_ids = load_pickle('val_masked_input_ids', tarfile_url, tarfile_name)
                val_attention_masks = load_pickle('val_attention_masks', tarfile_url, tarfile_name)
                val_labels = load_pickle('val_labels', tarfile_url, tarfile_name)
                val_token_type_ids = mal_masked_input_ids * 0

                # 비정상데이터 불러오기
                mal_input_ids = load_pickle('mal_input_ids', tarfile_url, tarfile_name)
                mal_masked_input_ids = load_pickle('mal_masked_input_ids', tarfile_url, tarfile_name)
                mal_attention_masks = load_pickle('mal_attention_masks', tarfile_url, tarfile_name)
                mal_labels = load_pickle('mal_labels', tarfile_url, tarfile_name)
                mal_token_type_ids = mal_masked_input_ids * 0

                # 테스트 데이터 불러오기
                test_input_ids = load_pickle('test_input_ids', tarfile_url, tarfile_name)
                test_masked_input_ids = load_pickle('test_masked_input_ids', tarfile_url, tarfile_name)
                test_attention_masks = load_pickle('test_attention_masks', tarfile_url, tarfile_name)
                test_labels = load_pickle('test_labels', tarfile_url, tarfile_name)
                test_token_type_ids = test_masked_input_ids * 0

                # ================================================================================================
                # 6.Train용 Dataset설정, DataLoader 설정
                # ================================================================================================
                input_ids = load_pickle('input_ids', tarfile_url, tarfile_name)
                masked_input_ids = load_pickle('masked_input_ids', tarfile_url, tarfile_name)
                attention_masks = load_pickle('attention_masks', tarfile_url, tarfile_name)
                labels = load_pickle('labels', tarfile_url, tarfile_name)
                token_type_ids = masked_input_ids * 0

                val_input_ids = load_pickle('val_input_ids', tarfile_url, tarfile_name)
                val_masked_input_ids = load_pickle('val_masked_input_ids', tarfile_url, tarfile_name)
                val_attention_masks = load_pickle('val_attention_masks', tarfile_url, tarfile_name)
                val_labels = load_pickle('val_labels', tarfile_url, tarfile_name)
                val_token_type_ids = mal_masked_input_ids * 0

                train_dataset =Train_Dataset(masked_input_ids, token_type_ids, attention_masks, labels)
                val_dataset =Val_Dataset(val_masked_input_ids, val_token_type_ids, val_attention_masks, val_labels)
                train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16)
                val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16)

                # ================================================================================================
                # 7.BertMaskedLM 모델설정
                # ================================================================================================
                config = BertConfig(
                    vocab_size=len(token)+1,
                    max_position_embeddings=512,
                    hidden_size=768,
                    num_attention_heads=12,
                    num_hidden_layers=6,
                    pad_token_ids=0
                )
                model = BertForMaskedLM(config).to(device)

                # ================================================================================================
                # 8.옵티마이저 설정
                # ================================================================================================
                tr_optim = AdamW(model.parameters(), lr=1e-4 ,no_deprecation_warning=True)

                # ================================================================================================
                # 9.BERT MaskedLM 모델 사전학습 (custom pre-train)
                # ================================================================================================
                tr_losses=np.empty((0))
                val_losses=np.empty((0))
                output=[]
                for tr_epoch in range(tr_epochs):

                    # Training Phase
                    loop = train_loader
                    for x,batch in enumerate(loop):
                        tr_optim.zero_grad()

                        masked_input_id = batch[0].to(device)
                        token_type_id = batch[1].to(device)
                        attention_mask = batch[2].to(device)
                        label = batch[3].to(device)

                        outputs = model(masked_input_id, token_type_ids=token_type_id, attention_mask=attention_mask, labels=label, output_hidden_states=True)
                        loss = outputs.loss
                        loss.backward()
                        tr_optim.step()
                        tr_losses = np.append(tr_losses, loss.item())

                    # Validation Phase
                    loop = val_loader
                    with torch.no_grad():
                        for x,batch in enumerate(loop):

                            val_masked_input_id = batch[0].to(device)
                            val_token_type_id = batch[1].to(device)
                            val_attention_mask = batch[2].to(device)
                            val_label = batch[3].to(device)

                            outputs = model(val_masked_input_id, token_type_ids=val_token_type_id, attention_mask=val_attention_mask, labels=val_label)
                            loss = outputs.loss
                            val_losses = np.append(val_losses, loss.item())

                model.save_pretrained(tarfile_url + "\\ITDBERT")
                torch.cuda.empty_cache()
                
                
#                 # 학습결과 시각화 한번 해봄
#                 if verbose == 1:
#                     ns.set(style='darkgrid')

#                     sns.set(font_scale=1.5)
#                     plt.rcParams["figure.figsize"] = (60,10)

#                     plt.plot(tr_losses, label='train_loss')
#                     plt.plot(val_losses, label='validation_loss')

#                     plt.title("Train & Validation loss")
#                     plt.xlabel("session_index")
#                     plt.xticks([x for x in range(len(tr_losses))])
#                     plt.xticks(rotation = 90)
#                     plt.ylabel("session_loss")
#                     plt.legend()
#                     plt.show()
        
                # ================================================================================================
                # Anomaly detection with the masked LM - normal sessions
                # ================================================================================================
                # Load the pickled tensors for the normal sessions of the current user.
                input_ids = load_pickle('input_ids', tarfile_url, tarfile_name)
                masked_input_ids = load_pickle('masked_input_ids', tarfile_url, tarfile_name)
                attention_masks = load_pickle('attention_masks', tarfile_url, tarfile_name)
                labels = load_pickle('labels', tarfile_url, tarfile_name)
                # Same shape as the masked ids, filled with zeros.
                token_type_ids = masked_input_ids * 0

                # batch_size=1, so every batch loss below is the loss of one dataset item.
                train_dataset = Train_Dataset(masked_input_ids, token_type_ids, attention_masks, labels)
                train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1)

                # Reload the checkpoint written by the pre-training section.
                model = BertForMaskedLM.from_pretrained(tarfile_url + "\\ITDBERT").to(device)

                norm_epochs = 1
                per_session_loss = []
                with torch.no_grad():
                    for norm_epoch in range(norm_epochs):
                        loop = train_loader
                        for batch in loop:

                            masked_input_ids = batch[0].to(device)
                            token_type_ids = batch[1].to(device)
                            attention_masks = batch[2].to(device)
                            labels = batch[3].to(device)

                            outputs = model(masked_input_ids, token_type_ids=token_type_ids, attention_mask=attention_masks, labels=labels)
                            loss = outputs.loss
                            per_session_loss.append(loss.item())
                losses = np.array(per_session_loss, dtype=float)

                # The largest loss seen on normal sessions is the initial cut-off for this user.
                cut_off = losses.max()
                cut_off_line = [cut_off] * len(losses)
                norm_loss_fu = losses.tolist()
                norm_losses_sum[norm_start_fuck_index].extend(norm_loss_fu)

                # Record [user, cut_off] and open fresh placeholder slots for the next user.
                norm_love[norm_start_fuck_index].extend([loveuser, cut_off])
                norm_love.append([])
                norm_losses_sum.append([])
                norm_start_fuck_index += 1

                # if verbose == 1:
                #     # print('https 표준편차')
                #     # print(std_norm_https)
                #     # print('')
                #     # print('connects 표준편차')
                #     # print(std_norm_connects)
                #     # print('')
                #     # print('mails 표준편차')
                #     # print(std_norm_mails)
                #     sns.set(font_scale=1.5)
                #     plt.rcParams["figure.figsize"] = (60,10)
                #     plt.plot(losses, label='loss')
                #     plt.title("Normal_session")
                #     plt.xlabel("session_index")
                #     plt.plot(cut_off_line, 'r-', label = 'cut_off')
                #     plt.xticks([x for x in range(len(losses))])
                #     plt.xticks(rotation = 90)
                #     plt.ylabel("session_loss")
                #     plt.legend()
                #     plt.show()

                torch.cuda.empty_cache()

                # ================================================================================================
                # Anomaly detection with the masked LM - test sessions
                # ================================================================================================
                # Load the pickled tensors for the test sessions of the current user.
                input_ids = load_pickle('test_input_ids', tarfile_url, tarfile_name)
                masked_input_ids = load_pickle('test_masked_input_ids', tarfile_url, tarfile_name)
                attention_masks = load_pickle('test_attention_masks', tarfile_url, tarfile_name)
                labels = load_pickle('test_labels', tarfile_url, tarfile_name)
                # Same shape as the masked ids, filled with zeros.
                token_type_ids = masked_input_ids * 0

                # batch_size=1, so every batch loss below is the loss of one dataset item.
                train_dataset =Train_Dataset(masked_input_ids, token_type_ids, attention_masks, labels)
                train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1)

                # `model` is still the "\\ITDBERT" checkpoint loaded for the normal-session
                # pass above; it is only used under torch.no_grad() in between, so it is
                # reused here instead of being read from disk again.

                test_epochs = 1
                per_session_loss = []
                with torch.no_grad():
                    for test_epoch in range(test_epochs):
                        loop = train_loader
                        for batch in loop:

                            masked_input_ids = batch[0].to(device)
                            token_type_ids = batch[1].to(device)
                            attention_masks = batch[2].to(device)
                            labels = batch[3].to(device)

                            outputs = model(masked_input_ids, token_type_ids=token_type_ids, attention_mask=attention_masks, labels=labels)
                            loss = outputs.loss
                            # .item() yields a plain float, matching the pre-training loop.
                            per_session_loss.append(loss.item())
                losses = np.array(per_session_loss, dtype=float)

                test_loss_fu = losses.tolist()
                test_losses_sum[test_start_fuck_index].extend(test_loss_fu)

                # Index the test accumulators with their own counter. The previous code used
                # start_fuck_index for test_love, which only worked because the three
                # counters are advanced in lock-step within this section of the loop.
                test_love[test_start_fuck_index].append(loveuser)
                test_love[test_start_fuck_index].append(cut_off)
                test_love.append([])
                test_losses_sum.append([])
                test_start_fuck_index += 1

#                 # 세션나온거 시각화 한번 해봄
#                 if verbose == 1:
#                     sns.set(style='darkgrid')

#                     sns.set(font_scale=1.5)
#                     plt.rcParams["figure.figsize"] = (60,10)

#                     plt.plot(losses, label='losses')
#                     cut_off_line = [cut_off] * len(losses)
#                     plt.plot(cut_off_line, 'r-', label = 'cut_off')

#                     plt.title("Test_session")
#                     plt.xlabel("session_index")
#                     plt.ylabel("session_loss")
#                     plt.xticks([x for x in range(len(losses))])
#                     plt.xticks(rotation = 90)
#                     plt.legend()
#                     plt.show()

                    # print('https 표준편차')
                    # print(std_mal_https)
                    # print('')
                    # print('connects 표준편차')
                    # print(std_mal_connects)
                    # print('')
                    # print('mails 표준편차')
                    # print(std_mal_mails)

                torch.cuda.empty_cache()

                # ================================================================================================
                # Anomaly detection with the masked LM - abnormal (malicious) sessions
                # ================================================================================================
                # Load the pickled tensors for the malicious sessions of the current user.
                input_ids = load_pickle('mal_input_ids', tarfile_url, tarfile_name)
                masked_input_ids = load_pickle('mal_masked_input_ids', tarfile_url, tarfile_name)
                attention_masks = load_pickle('mal_attention_masks', tarfile_url, tarfile_name)
                labels = load_pickle('mal_labels', tarfile_url, tarfile_name)
                # Same shape as the masked ids, filled with zeros.
                token_type_ids = masked_input_ids * 0

                # batch_size=1, so every batch loss below is the loss of one dataset item.
                train_dataset =Train_Dataset(masked_input_ids, token_type_ids, attention_masks, labels)
                train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1)

                # `model` is still the "\\ITDBERT" checkpoint loaded for the normal-session
                # pass earlier in this iteration; it is only used under torch.no_grad() in
                # between, so it is reused here instead of being read from disk again.

                abnormal_epochs = 1
                per_session_loss = []
                with torch.no_grad():
                    for abnormal_epoch in range(abnormal_epochs):
                        loop = train_loader
                        for batch in loop:

                            masked_input_ids = batch[0].to(device)
                            token_type_ids = batch[1].to(device)
                            attention_masks = batch[2].to(device)
                            labels = batch[3].to(device)

                            outputs = model(masked_input_ids, token_type_ids=token_type_ids, attention_mask=attention_masks, labels=labels)
                            loss = outputs.loss
                            # .item() yields a plain float, matching the pre-training loop.
                            per_session_loss.append(loss.item())
                losses = np.array(per_session_loss, dtype=float)

                loss_fu = losses.tolist()
                losses_sum[start_fuck_index].extend(loss_fu)

                # Record [user, cut_off] (cut_off comes from the normal-session pass) and
                # open fresh placeholder slots for the next user.
                love[start_fuck_index].append(loveuser)
                love[start_fuck_index].append(cut_off)
                love.append([])
                losses_sum.append([])
                start_fuck_index += 1
                
                # 세션나온거 시각화 한번 해봄
#                 if verbose == 1:
#                     sns.set(style='darkgrid')

#                     sns.set(font_scale=1.5)
#                     plt.rcParams["figure.figsize"] = (60,10)

#                     plt.plot(losses, label='losses')
#                     cut_off_line = [cut_off] * len(losses)
#                     plt.plot(cut_off_line, 'r-', label = 'cut_off')

#                     plt.title("Abnormal_session")
#                     plt.xlabel("session_index")
#                     plt.ylabel("session_loss")

#                     plt.xticks([x for x in range(len(losses))])
#                     plt.xticks(rotation = 90)
#                     plt.legend()
#                     plt.show()

                # Ask PyTorch to release its unused cached GPU memory now that the
                # abnormal-session scoring above is finished.
                torch.cuda.empty_cache()
        
        # Each accumulator carries one trailing empty placeholder (a fresh [] is appended
        # after every processed user), so drop the last element before aggregating.
        # NOTE(review): user_counts is filled outside this view; presumably it follows
        # the same placeholder convention - confirm.
        love, losses_sum = love[:-1], losses_sum[:-1]
        norm_love, norm_losses_sum = norm_love[:-1], norm_losses_sum[:-1]
        test_love, test_losses_sum = test_love[:-1], test_losses_sum[:-1]
        user_counts = user_counts[:-1]

        # Pristine copies of the raw per-session losses; the weight sweep below rebuilds
        # the working lists from these for every weight combination.
        norm_losses_sum_bak, test_losses_sum_bak, losses_sum_bak = (
            copy.deepcopy(raw_losses)
            for raw_losses in (norm_losses_sum, test_losses_sum, losses_sum)
        )

        # Layout of one user_counts entry, according to the original author's notes:
        #   0: norm_https      1: mal_https
        #   2: norm_connects   3: mal_connects
        #   4: norm_mails      5: mal_mails

        for weight in tqdm(weights):
            # Per-scenario counters for this weight combination (slot = scenario number - 1):
            # malicious sessions that stayed under the cut-off, and test sessions above it.
            scenario_thres_session = [0] * 5
            test_scenario_thres_session = [0] * 5

            for user_idx, user_count in enumerate(user_counts):
                # Weighted, standardised activity counts; weight = (http, connect, mail).
                std_mal_https, std_norm_https = calc_standardize(user_count[1], user_count[0], weight[0])
                std_mal_connects, std_norm_connects = calc_standardize(user_count[3], user_count[2], weight[1])
                std_mal_mails, std_norm_mails = calc_standardize(user_count[5], user_count[4], weight[2])

                # Carve a 20% test part off the normal data, then a 10% validation part off
                # the remainder. The same seed is used for every split.
                test_split = dict(test_size=0.2, shuffle=True, random_state=34)
                val_split = dict(test_size=0.1, shuffle=True, random_state=34)

                std_norm_https, std_test_norm_https = train_test_split(std_norm_https, **test_split)
                std_norm_https, std_val_norm_https = train_test_split(std_norm_https, **val_split)

                std_norm_connects, std_test_norm_connects = train_test_split(std_norm_connects, **test_split)
                std_norm_connects, std_val_norm_connects = train_test_split(std_norm_connects, **val_split)

                std_norm_mails, std_test_norm_mails = train_test_split(std_norm_mails, **test_split)
                std_norm_mails, std_val_norm_mails = train_test_split(std_norm_mails, **val_split)

                # Convert every feature vector to a fresh ndarray and zero out NaN entries.
                cleaned = [
                    np.array(values)
                    for values in (
                        std_norm_https, std_test_norm_https, std_val_norm_https, std_mal_https,
                        std_norm_connects, std_test_norm_connects, std_val_norm_connects, std_mal_connects,
                        std_norm_mails, std_test_norm_mails, std_val_norm_mails, std_mal_mails,
                    )
                ]
                for feature in cleaned:
                    feature[np.isnan(feature)] = 0
                (
                    std_norm_https, std_test_norm_https, std_val_norm_https, std_mal_https,
                    std_norm_connects, std_test_norm_connects, std_val_norm_connects, std_mal_connects,
                    std_norm_mails, std_test_norm_mails, std_val_norm_mails, std_mal_mails,
                ) = cleaned

                #========== refresh the cut-off for the current weights =======================
                # Start again from the untouched model losses and add the three weighted
                # activity terms session by session (index-based on purpose: a feature
                # vector shorter than the loss list must still raise IndexError).
                norm_raw = norm_losses_sum_bak[user_idx]
                norm_losses_sum[user_idx] = [
                    norm_raw[i] + std_norm_https[i] + std_norm_connects[i] + std_norm_mails[i]
                    for i in range(len(norm_raw))
                ]

                test_raw = test_losses_sum_bak[user_idx]
                test_losses_sum[user_idx] = [
                    test_raw[i] + std_test_norm_https[i] + std_test_norm_connects[i] + std_test_norm_mails[i]
                    for i in range(len(test_raw))
                ]

                mal_raw = losses_sum_bak[user_idx]
                losses_sum[user_idx] = [
                    mal_raw[i] + std_mal_https[i] + std_mal_connects[i] + std_mal_mails[i]
                    for i in range(len(mal_raw))
                ]

                # New cut-off: the largest adjusted normal-session score, floored at 0.
                threshold_imsi = max([0, *norm_losses_sum[user_idx]])
                love[user_idx][1] = threshold_imsi
                test_love[user_idx][1] = threshold_imsi

                if verbose == 1:
                    # One figure per session group: scores plus the cut-off as a red line.
                    panels = (
                        ("Normal_session", norm_losses_sum[user_idx], 'loss'),
                        ("Test_session", test_losses_sum[user_idx], 'losses'),
                        ("Abnormal_session", losses_sum[user_idx], 'losses'),
                    )
                    for panel_title, scores, scores_label in panels:
                        cut_off_line = [love[user_idx][1]] * len(scores)
                        sns.set(style='darkgrid')
                        sns.set(font_scale=1.5)
                        plt.rcParams["figure.figsize"] = (60,10)
                        plt.plot(scores, label=scores_label)
                        plt.plot(cut_off_line, 'r-', label = 'cut_off')
                        plt.title(panel_title)
                        plt.xlabel("session_index")
                        plt.ylabel("session_loss")
                        plt.xticks(list(range(len(scores))))
                        plt.xticks(rotation = 90)
                        plt.legend()
                        plt.show()

            #=================================================================================================================
            # Malicious sessions: per user, count the sessions whose score is strictly below
            # that user's cut-off, and add the count to the user's scenario slot.
            all_sum = 0
            under_cutoff = 0
            under_founds = []          # [user, count] pairs for users with at least one such session
            # Answer tables are tried in this order until one contains the user.
            answer_releases = ('r4.2', 'r5.2', 'r6.1', 'r6.2')

            for x in range(len(love)):
                under_found = sum(1 for session_score in losses_sum[x] if session_score < love[x][1])
                all_sum += len(losses_sum[x])
                if under_found >= 1:
                    # NOTE(review): answer_preprocess is re-run for every such user on every
                    # weight; if it is a pure loader, caching its result per release outside
                    # the weight loop would avoid a lot of repeated work - confirm.
                    for release in answer_releases:
                        try:
                            answer = answer_preprocess(answer_master_url, release)
                            # .item() raises unless exactly one row matches the user, which
                            # sends the lookup on to the next release.
                            scenario_no = int(answer.loc[answer.user == love[x][0]].scenario.item())
                            scenario_thres_session[scenario_no - 1] += under_found
                            break
                        except Exception:
                            # A failure in the last release is a real error: let it surface.
                            # Unlike a bare `except:`, Ctrl-C is never swallowed here.
                            if release == answer_releases[-1]:
                                raise

                    under_founds.append([love[x][0], under_found])
                    under_cutoff += under_found

            # Kept for compatibility: number of users recorded in under_founds.
            under_index = len(under_founds)
            #=================================================================================================================
            # Test sessions: per user, count the sessions whose score is strictly above that
            # user's cut-off, and add the count to the user's scenario slot.
            test_all_sum = 0
            test_under_cutoff = 0
            test_under_founds = []     # [user, count] pairs for users with at least one such session
            # Answer tables are tried in this order until one contains the user.
            answer_releases = ('r4.2', 'r5.2', 'r6.1', 'r6.2')

            for x in range(len(test_love)):
                test_under_found = sum(1 for session_score in test_losses_sum[x] if session_score > test_love[x][1])
                test_all_sum += len(test_losses_sum[x])
                if test_under_found >= 1:
                    # Look the user up through test_love, the list this loop iterates over
                    # (the previous code read love[x][0] here while recording test_love[x][0]).
                    test_user = test_love[x][0]
                    # NOTE(review): answer_preprocess is re-run for every such user on every
                    # weight; if it is a pure loader, caching its result per release outside
                    # the weight loop would avoid a lot of repeated work - confirm.
                    for release in answer_releases:
                        try:
                            answer = answer_preprocess(answer_master_url, release)
                            # .item() raises unless exactly one row matches the user, which
                            # sends the lookup on to the next release.
                            scenario_no = int(answer.loc[answer.user == test_user].scenario.item())
                            test_scenario_thres_session[scenario_no - 1] += test_under_found
                            break
                        except Exception:
                            # A failure in the last release is a real error: let it surface.
                            # Unlike a bare `except:`, Ctrl-C is never swallowed here.
                            if release == answer_releases[-1]:
                                raise

                    test_under_founds.append([test_user, test_under_found])
                    test_under_cutoff += test_under_found

            # Kept for compatibility: number of users recorded in test_under_founds.
            test_under_index = len(test_under_founds)

            #=================================================================================================================================================================================================================    
        #     print('=========================== 정보 =======================')

        #     print('')
        #     print('tr_epochs : ',real_tr_epochs,  'min_slice : ', min_slice, 'http weight : ',weight[0], 'connect weight : ',weight[1], 'mail weight : ',weight[2])
        #     print('')

        #     for x in range(len(scenario_class)):
        #         print(' 시나리오 ', x+1, '비정상 행동 수 :',scenario_class[x])
        #     print(' ')
        #     for x in range(len(scenario_session)):
        #         print(' 시나리오 ', x+1, '정상 세션 수 :',scenario_norm_session[x])
        #     print(' ')
        #     for x in range(len(scenario_session)):
        #         print(' 시나리오 ', x+1, '비정상 세션 수 :',scenario_session[x])
        #     print(' ')
        #     for x in range(len(scenario_session)):
        #         print(' 시나리오 ', x+1, '비정상 정탐 수 :',scenario_session[x]-scenario_thres_session[x])    
        #     print(' ')
        #     for x in range(len(scenario_session)):
        #         print(' 시나리오 ', x+1, '비정상 미탐 수 :',scenario_thres_session[x])
        #     print(' ')
        #     for x in range(len(scenario_session)):
        #         print(' 시나리오 ', x+1, '테스트 세션 수 :',scenario_test_session[x])
        #     print(' ')
        #     for x in range(len(scenario_session)):
        #         print(' 시나리오 ', x+1, '테스트 오탐 수 :',test_scenario_thres_session[x])
        #     print(' ')

        #     print('all_mal_count : ',all_sum)
        #     print('under_cutoff_count : ',under_cutoff)
        #     print('under_found list\n', under_founds)

        #     print(' ')
        #     print('all_test_count : ',test_all_sum)
        #     print('test_upper_cutoff_count : ',test_under_cutoff)
        #     print('test_upper_found list\n', test_under_founds)

            # Precision, Recall, F1 score
            detected_mal = all_sum - under_cutoff            # malicious sessions at or above the cut-off
            passed_test = test_all_sum - test_under_cutoff   # test sessions at or below the cut-off

            # Macro average over the two classes (malicious / test).
            try:
                avg_precision = (detected_mal/(detected_mal + test_under_cutoff) + passed_test/(passed_test + under_cutoff))/2
            except ZeroDivisionError:
                avg_precision = 0
                print('zero found')

            avg_recall = (detected_mal/all_sum + passed_test/test_all_sum)/2

            # Precision and recall can both be 0 (e.g. after the fallback above); report an
            # F1 of 0 in that case instead of dividing by zero.
            pr_total = avg_precision + avg_recall
            f1_score = 2*((avg_precision * avg_recall)/pr_total) if pr_total else 0
            accuracy = (detected_mal + passed_test)/(all_sum + test_all_sum)

            # One result row per weight combination; the column order must stay fixed.
            score_row = score_list[start_score_index]
            score_row.extend([
                min_slice,
                tr_epochs,
                avg_precision,
                avg_recall,
                f1_score,
                detected_mal/all_sum*100,               # overall detection rate (%)
                test_under_cutoff/test_all_sum*100,     # overall false-alarm rate (%)
                accuracy,
                weight,
            ])

            # Per-scenario detection / false-alarm rates. Both values are computed before
            # anything is appended, so a failure in the second one can no longer leave an
            # extra value behind and shift every following column.
            for x in range(len(scenario_session)):
                try:
                    detect_rate = (scenario_session[x] - scenario_thres_session[x])/scenario_session[x] *100
                    false_alarm_rate = test_scenario_thres_session[x]/scenario_test_session[x] *100
                except (ZeroDivisionError, IndexError):
                    detect_rate, false_alarm_rate = 0, 0
                score_row.append(detect_rate)
                score_row.append(false_alarm_rate)

            # Per-scenario "detected/total" strings.
            for x in range(len(scenario_session)):
                score_row.append(str(scenario_session[x]-scenario_thres_session[x]) + "/" + str(scenario_session[x]))

            # Open the placeholder row for the next weight combination.
            score_list.append([])
            start_score_index += 1

device :  cuda


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]





















































  0%|          | 0/125000 [00:00<?, ?it/s]





















































  0%|          | 0/125000 [00:00<?, ?it/s]

In [None]:
# Hand the score rows collected by the weight sweep to result_explotiation, which is
# defined elsewhere in this notebook (presumably it tabulates/ranks them - confirm).
result_explotiation(score_list)