# Mount Drive

In [None]:
# Mount Google Drive inside the Colab VM; every data path used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Imports


In [None]:
import os
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import statistics
from collections import Counter
import nltk

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid

# Constants

In [None]:
# Shared-drive project folder.
SAVED_MODEL_PATH = '/content/drive/Shared drives/Methods for Detecting Attacks/'
# Folder holding one file per user, named 'User0', 'User1', ...
DATA_PATH = '/content/drive/Shared drives/Methods for Detecting Attacks/FraudedRawData'
# Each user file is read as 50 + 100 consecutive segments of WORDS_PER_SEQUENCE lines.
NUM_OF_TRAIN_SEQUENCE_PER_USER = 50
NUM_OF_TEST_SEQUENCE_PER_USER = 100
WORDS_PER_SEQUENCE = 100
# Number of user files that are loaded (User0 .. User39).
NUM_USERS = 40

# Load Data

In [None]:
def get_vocab():
  """Return the set of distinct lines (words) found across all user files."""
  corpus_words = set()
  for user_idx in range(NUM_USERS):
    user_file = os.path.join(DATA_PATH, 'User' + str(user_idx))
    with open(user_file, 'r') as handle:
      lines = handle.read().splitlines()
    corpus_words.update(lines)
  return corpus_words

def create_index_dicts(vocab):
  """Build word->index and index->word dicts.

  Words are numbered from 1 in sorted order, so the mapping is deterministic
  for a given vocabulary.
  """
  words_to_index = {}
  index_to_words = {}
  for position, word in enumerate(sorted(vocab), start=1):
    words_to_index[word] = position
    index_to_words[position] = word
  return words_to_index, index_to_words

def get_user_sequence(user_num, type):
  """Read one user's file as a list of fixed-length segments.

  Args:
    user_num: number of the user; the file read is DATA_PATH/'User<user_num>'.
    type: 'str' to keep the raw words, 'int' to map every word through the
      module-level `words_to_index` dict.

  Returns:
    A list of NUM_OF_TRAIN_SEQUENCE_PER_USER + NUM_OF_TEST_SEQUENCE_PER_USER
    segments, each a list of WORDS_PER_SEQUENCE items.

  Raises:
    ValueError: if `type` is neither 'int' nor 'str'.
  """
  # Fail fast: an unknown type used to leave the segment variable unbound.
  if type not in ('int', 'str'):
    raise ValueError("type must be 'int' or 'str', got " + repr(type))

  file_path = os.path.join(DATA_PATH, 'User' + str(user_num))
  num_segments = NUM_OF_TRAIN_SEQUENCE_PER_USER + NUM_OF_TEST_SEQUENCE_PER_USER
  user_seqs = []
  with open(file_path, 'r') as file:
    for _ in range(num_segments):
      words = [file.readline().rstrip() for _ in range(WORDS_PER_SEQUENCE)]
      if type == 'int':
        words = [words_to_index[word] for word in words]
      user_seqs.append(words)
  return user_seqs

In [None]:
# Build the vocabulary first: get_user_sequence(..., 'int') looks words up in words_to_index.
vocab = get_vocab()
words_to_index, index_to_words = create_index_dicts(vocab)
# The same files are read twice: once as integer codes, once as raw strings.
user_seqs_ints = [get_user_sequence(i,'int') for i in range(NUM_USERS)] # load entire corpus
user_seqs_strs = [get_user_sequence(i,'str') for i in range(NUM_USERS)]

# Create Data Frames

In [None]:
# One row per user, one column per segment; every cell holds a whole segment (a list).
full_df_ints = pd.DataFrame(user_seqs_ints)
# Positional slicing: the first NUM_OF_TRAIN_SEQUENCE_PER_USER columns are training segments.
train_df_ints = full_df_ints.iloc[:, :NUM_OF_TRAIN_SEQUENCE_PER_USER]  # [40 users, 50 training segments]
# The remaining columns are split by user: the first 10 users are validation, the rest are test.
val_df_ints = full_df_ints.iloc[:10, NUM_OF_TRAIN_SEQUENCE_PER_USER:]  # [10 users, 100 validation segments]
test_df_ints = full_df_ints.iloc[10:, NUM_OF_TRAIN_SEQUENCE_PER_USER:]  # [30 users, 100 test segments]

In [None]:
# Same layout as the integer frame, but every cell holds the segment's raw strings.
full_df_strings = pd.DataFrame(user_seqs_strs)
# Positional slicing: the first NUM_OF_TRAIN_SEQUENCE_PER_USER columns are training segments.
train_df_strings = full_df_strings.iloc[:, :NUM_OF_TRAIN_SEQUENCE_PER_USER]  # [40 users, 50 training segments]
# The remaining columns are split by user: the first 10 users are validation, the rest are test.
val_df_strings = full_df_strings.iloc[:10, NUM_OF_TRAIN_SEQUENCE_PER_USER:]  # [10 users, 100 validation segments]
test_df_strings = full_df_strings.iloc[10:, NUM_OF_TRAIN_SEQUENCE_PER_USER:]  # [30 users, 100 test segments]

In [None]:
def _segments_frame(ints_df, strings_df):
  """Flatten two user-by-segment grids into one frame with a row per segment.

  Rows come out in user-major order (all segments of the first user, then the next).
  """
  return pd.DataFrame({'data_int': ints_df.to_numpy().flatten(),
                       'data_string': strings_df.to_numpy().flatten()})

train_df = _segments_frame(train_df_ints, train_df_strings)
val_df = _segments_frame(val_df_ints, val_df_strings)
test_df = _segments_frame(test_df_ints, test_df_strings)

# Feature Extraction

In [None]:
def get_most_common(df):
  """Add a 'most_common' column: the first mode of each row's 'data_int' segment."""
  segments = pd.DataFrame(df['data_int'].values.tolist())
  # mode(axis=1) yields one column per tied mode; column 0 is the first of them.
  row_modes = segments.mode(axis=1)
  df['most_common'] = row_modes[0].astype(int)

def get_rarest(df):
  """Add a 'rarest' column: the least frequent value of each row's 'data_int' segment.

  The value is the last entry of `value_counts()`, which is ordered by
  descending frequency. The segments are materialised once instead of on
  every iteration, and rows are visited by position so the frame's index
  labels do not matter.
  """
  df['rarest'] = [pd.Series(segment).value_counts().index[-1]
                  for segment in df['data_int'].tolist()]

def get_most_common_count(df):
  """Add a 'most_common_count' column: how often each segment's most frequent value occurs.

  The segments are materialised once instead of on every iteration, and rows
  are visited by position so the frame's index labels do not matter.
  """
  # value_counts() is sorted by descending frequency, so entry 0 is the top count.
  df['most_common_count'] = [pd.Series(segment).value_counts().values[0]
                             for segment in df['data_int'].tolist()]

def get_unique_count(df):
  """Add a 'unique_count' column: the number of distinct values in each 'data_int' segment."""
  segments = pd.DataFrame(df['data_int'].values.tolist())
  distinct_counts = []
  for row in segments.values:
    # Missing cells (padding of shorter segments) are dropped before counting.
    present = row[pd.notna(row)]
    distinct_counts.append(len(set(present.tolist())))
  df['unique_count'] = distinct_counts

def create_ngrams_feature(df, num_ngrams):
  """Add one occurrence-count column per selected n-gram to `df`, in place.

  For num_ngrams <= 2 the n-grams are collected from every segment of every
  user. For larger sizes they are collected only from the segments labelled 1
  in challengeToFill.csv for the first 10 users.

  Args:
    df: frame with a 'data_string' column whose cells are lists of words.
    num_ngrams: size of the n-grams to extract and count.

  Each new column is named 'ngram' + str(ngram_tuple) and holds, per row, how
  many times that n-gram occurs in the row's segment. Columns are added in
  sorted n-gram order so the feature layout is the same on every run.
  """
  total_segments = NUM_OF_TRAIN_SEQUENCE_PER_USER + NUM_OF_TEST_SEQUENCE_PER_USER

  def get_ones_in_val():
    """Return (user, segment) index arrays of the segments labelled 1 for users 0-9."""
    val_col_indexes = np.arange(NUM_OF_TRAIN_SEQUENCE_PER_USER + 1,
                                NUM_OF_TRAIN_SEQUENCE_PER_USER + 1 + NUM_OF_TEST_SEQUENCE_PER_USER)
    val_df_true = pd.read_csv('/content/drive/Shared drives/Methods for Detecting Attacks/challengeToFill.csv', usecols=val_col_indexes)
    val_df_true = val_df_true.loc[:9, :].astype(int)
    anomaly_positions = np.flatnonzero(val_df_true.to_numpy().flatten() == 1)
    # Flat position -> (user row, segment number within the user's file).
    row_index = anomaly_positions // NUM_OF_TEST_SEQUENCE_PER_USER
    col_index = anomaly_positions % NUM_OF_TEST_SEQUENCE_PER_USER + NUM_OF_TRAIN_SEQUENCE_PER_USER
    return row_index, col_index

  def extract_ngrams_ones(ngram_size, row_index, col_index):
    """Collect the n-grams of every (user, segment) pair listed in the indices."""
    wanted = {(int(r), int(c)) for r, c in zip(row_index, col_index)}
    found = set()
    for user_num in range(NUM_USERS):
      file_path = os.path.join(DATA_PATH, 'User' + str(user_num))
      with open(file_path, 'r') as file:
        # Walk all train + test segments: the requested segment numbers go up
        # to total_segments - 1, so stopping after the first 100 missed them.
        for segment_num in range(total_segments):
          user_seq = [next(file).rstrip() for _ in range(WORDS_PER_SEQUENCE)]
          if (user_num, segment_num) in wanted:
            found.update(nltk.ngrams(user_seq, ngram_size))
    return found

  def x_in_y(ngram, sequences):
    """Count the occurrences of `ngram` (a list) in each sequence."""
    size = len(ngram)
    counts = []
    for sequence in sequences:
      # + 1 so that the window ending on the last word is checked as well.
      last_start = len(sequence) - size + 1
      counts.append(sum(1 for start in range(last_start)
                        if sequence[start:start + size] == ngram))
    return counts

  if num_ngrams <= 2:
    # Every segment of every user.
    row_index = np.repeat(np.arange(NUM_USERS), total_segments)
    col_index = np.tile(np.arange(total_segments), NUM_USERS)
  else:
    row_index, col_index = get_ones_in_val()

  ngram_set = extract_ngrams_ones(num_ngrams, row_index, col_index)

  sequences = df['data_string'].values
  # Set iteration order can vary between kernels; sort for a stable column order.
  for ngram in sorted(ngram_set):
    df['ngram'+str(ngram)] = x_in_y(list(ngram), sequences)

def get_longest_command(df):
  """Add a 'longest_command' column: the words_to_index code of each segment's longest word."""
  codes = []
  for words in df['data_string'].values.tolist():
    # max(..., key=len) keeps the first word among those of maximal length.
    longest_word = max(words, key=len)
    codes.append(words_to_index[longest_word])
  df['longest_command'] = codes

def get_longest_command_length(df):
  """Add a 'longest_command_length' column: the length of each segment's longest word."""
  lengths = []
  for words in df['data_string'].values.tolist():
    longest_word = max(words, key=len)
    lengths.append(len(longest_word))
  df['longest_command_length'] = lengths

def command_exists(df, command):
  """Add a '<command>_exists' column: 1 if `command` occurs in the row's segment, else 0.

  Args:
    df: frame with a 'data_string' column whose cells are lists of words.
    command: the word to look for; also used to name the new column.

  The segments are materialised once and tested by plain membership, rather
  than rebuilding the whole list and a value_counts table for every row.
  """
  df[command+'_exists'] = [int(command in segment)
                           for segment in df['data_string'].tolist()]

In [None]:
# Per-segment summary features, applied in this order so the column layout is fixed.
summary_features = (
  get_most_common,
  get_most_common_count,
  get_rarest,
  get_unique_count,
  get_longest_command,
  get_longest_command_length,
)

for df in (train_df, val_df, test_df):
  for add_feature in summary_features:
    add_feature(df)
  # n-gram count columns, from trigrams down to unigrams.
  for ngram_size in (3, 2, 1):
    create_ngrams_feature(df, num_ngrams=ngram_size)

In [None]:
# The raw segments (lists) were only inputs to feature extraction; keep the numeric features.
raw_columns = ['data_int', 'data_string']
train_df = train_df.drop(columns=raw_columns)
val_df = val_df.drop(columns=raw_columns)
test_df = test_df.drop(columns=raw_columns)

# Preprocessing

In [None]:
# Stack train, validation and test rows (in that order) so they can be scaled together.
all_dfs_concated = pd.DataFrame(pd.concat([train_df,val_df,test_df]))

In [None]:
# Rescale every feature column with MinMaxScaler.
# NOTE(review): the scaler is fitted on train, validation and test rows together, so the
# held-out rows influence the scaling of the training features — confirm this is intended.
sc = MinMaxScaler()
all_dfs_concated[all_dfs_concated.columns.values] = sc.fit_transform(all_dfs_concated[all_dfs_concated.columns.values])

In [None]:
# Despite the "_col_num" names these are ROW counts of the three unscaled frames.
train_col_num, val_col_num, test_col_num = (frame.shape[0] for frame in (train_df, val_df, test_df))

# Row boundaries of the validation and test parts inside the stacked frame.
val_end = train_col_num + val_col_num
test_end = val_end + test_col_num

# Cut the scaled frame back into its parts, in the order they were concatenated.
train_df = pd.DataFrame(all_dfs_concated.iloc[:train_col_num, :])
val_df = pd.DataFrame(all_dfs_concated.iloc[train_col_num:val_end, :])
test_df = pd.DataFrame(all_dfs_concated.iloc[val_end:test_end, :])

## Add target class

In [None]:
# Training rows are still in user-major order here (all training segments of user 0,
# then user 1, ...), so each row's label is the user number repeated once per segment.
# This replaces slice assignment through `.at`, which is a scalar accessor.
assert len(train_df) == NUM_USERS * NUM_OF_TRAIN_SEQUENCE_PER_USER, "unexpected number of training rows"
train_df['target'] = np.repeat(np.arange(NUM_USERS), NUM_OF_TRAIN_SEQUENCE_PER_USER)

In [None]:
# Shuffle the training rows. The seed is fixed so that the seeded train/validation
# split downstream sees the same row order on every run.
train_df = train_df.sample(frac=1, random_state=21).reset_index(drop=True)

## Train test split

In [None]:
# Hold out 10% of the labelled training rows as an internal validation set.
training_set, validation_set = train_test_split(train_df, test_size=0.1, random_state=21)

# 'target' is the last column; every column before it is a feature.
X_train = training_set.iloc[:, :-1].to_numpy()
Y_train = training_set.iloc[:, -1].to_numpy().astype(int)
X_val = validation_set.iloc[:, :-1].to_numpy()
y_val = validation_set.iloc[:, -1].to_numpy().astype(int)

In [None]:
# Feature matrices of the challenge validation users and of the test users (no target column).
val_realX = val_df.to_numpy()
test_realX = test_df.to_numpy()

# Model Training

In [None]:
# Single-hidden-layer MLP (70 units) that predicts which user produced a segment.
# random_state is fixed so weight initialisation and batch shuffling are repeatable.
classifier = MLPClassifier(hidden_layer_sizes=(70,),
                           learning_rate_init=0.002,
                           max_iter=500, 
                           random_state=1,
                           shuffle=True,
                           verbose=True)

In [None]:
#Fitting the training data to the network
# verbose=True on the classifier makes this print the loss after every iteration.
classifier.fit(X_train, Y_train)

Iteration 1, loss = 3.45623979
Iteration 2, loss = 2.64608612
Iteration 3, loss = 1.83302776
Iteration 4, loss = 1.19201629
Iteration 5, loss = 0.78395157
Iteration 6, loss = 0.54916340
Iteration 7, loss = 0.40971823
Iteration 8, loss = 0.32230114
Iteration 9, loss = 0.26437035
Iteration 10, loss = 0.22185964
Iteration 11, loss = 0.19133841
Iteration 12, loss = 0.16719549
Iteration 13, loss = 0.14950371
Iteration 14, loss = 0.13345663
Iteration 15, loss = 0.12136446
Iteration 16, loss = 0.11154287
Iteration 17, loss = 0.10182175
Iteration 18, loss = 0.09350482
Iteration 19, loss = 0.08728840
Iteration 20, loss = 0.07988665
Iteration 21, loss = 0.07623545
Iteration 22, loss = 0.07088373
Iteration 23, loss = 0.06603113
Iteration 24, loss = 0.06239923
Iteration 25, loss = 0.06013484
Iteration 26, loss = 0.05915202
Iteration 27, loss = 0.05333807
Iteration 28, loss = 0.05080224
Iteration 29, loss = 0.04818025
Iteration 30, loss = 0.04629218
Iteration 31, loss = 0.04380563
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(70,), learning_rate='constant',
              learning_rate_init=0.002, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=True,
              warm_start=False)

## Measure accuracy on the validation set (held out from the training data, not the real validation data from the file)

In [None]:
#Predicting y for X_val
# X_val is the 10% slice held out of the labelled training rows.
y_pred = classifier.predict(X_val) 

In [None]:
#Importing Confusion Matrix
from sklearn.metrics import confusion_matrix
#Comparing the predictions against the actual observations in y_val
# The signature is confusion_matrix(y_true, y_pred): rows are the true users and
# columns the predicted users. Passing the arguments swapped transposes the matrix.
cm = confusion_matrix(y_val, y_pred)

In [None]:
def accuracy(confusion_matrix):
   """Return the share of correct predictions: the matrix diagonal over the sum of all cells."""
   return confusion_matrix.trace() / confusion_matrix.sum()

In [None]:
#Printing the accuracy
# This is user-identification accuracy on the internal hold-out, not the challenge score.
print("Accuracy of MLPClassifier : ", accuracy(cm))

Accuracy of MLPClassifier :  0.89


# Predict

## Predict and calculate challenge validation

In [None]:
# Read CSV columns 51..150 by position.
# presumably column 0 is a row label and columns 1..50 are the training segments — TODO confirm.
val_col_indexes = np.arange(NUM_OF_TRAIN_SEQUENCE_PER_USER + 1,
                            NUM_OF_TRAIN_SEQUENCE_PER_USER + 1 + NUM_OF_TEST_SEQUENCE_PER_USER)
val_df_true = pd.read_csv('/content/drive/Shared drives/Methods for Detecting Attacks/challengeToFill.csv', usecols=val_col_indexes)
# The column labels are reused by predict_final_results for its output frames.
cols = val_df_true.columns
# Keep only the first 10 rows (the validation users) as integer labels.
val_df_true = val_df_true.loc[:9, :].astype(int)

In [None]:
def predict_final_results(X_check, check_type, max_flagged=30):
  """Flag, per user, the segments the classifier attributes to someone else.

  X_check holds NUM_OF_TEST_SEQUENCE_PER_USER consecutive rows per user. A
  segment is flagged (1) when the predicted user differs from the user it
  belongs to. If more than `max_flagged` segments of a user are flagged, only
  those whose top class probability reaches the `max_flagged`-th highest value
  among the flagged ones stay flagged (ties at the cutoff are all kept).

  Args:
    X_check: 2-D feature array, rows grouped by user in user order.
    check_type: 'test' when the rows belong to users 10 and up; any other
      value means the rows start at user 0.
    max_flagged: cap on flagged segments per user (ties may exceed it).

  Returns:
    A list with one single-row DataFrame of 0/1 ints per user, using the
    module-level `cols` as column labels.
  """
  segments_per_user = NUM_OF_TEST_SEQUENCE_PER_USER
  # Test rows start at user 10; validation rows start at user 0.
  first_user = 10 if check_type == 'test' else 0

  val_df_preds = []
  for user_num in range(X_check.shape[0] // segments_per_user):
    user_rows = X_check[user_num * segments_per_user:(user_num + 1) * segments_per_user]
    predicted_user = classifier.predict(user_rows)
    # Confidence of a prediction = probability of its most likely class.
    confidence = classifier.predict_proba(user_rows).max(axis=1)

    flagged = predicted_user != (user_num + first_user)
    # Boolean masks handle zero or one flagged segment, which the previous
    # Index.get_loc lookup could not.
    if max_flagged <= 0:
      flagged = np.zeros_like(flagged)
    elif flagged.sum() > max_flagged:
      cutoff = np.sort(confidence[flagged])[-max_flagged]
      flagged = flagged & (confidence >= cutoff)

    val_df_preds.append(pd.DataFrame([flagged.astype(int)], columns=cols))
  return val_df_preds

In [None]:
# One single-row prediction frame per validation user.
val_df_preds = predict_final_results(val_realX, 'val')

In [None]:
def val_score(val_df_pred, val_df_true):
  """Print and return one user's score: 9 per correctly flagged 1 plus 1 per correct 0.

  Both arguments hold 0/1 labels. The printed denominators (10 and 90) are
  fixed text and are not computed from the inputs.
  """
  flip = {0:1, 1:0}
  # A product of 1 means both prediction and truth are 1.
  hits_on_ones = val_df_pred.mul(val_df_true).values.sum()
  # After flipping both, a product of 1 means both were 0.
  flipped_pred = val_df_pred.replace(flip)
  flipped_true = val_df_true.replace(flip)
  hits_on_zeros = flipped_pred.mul(flipped_true).values.sum()
  score = 9*hits_on_ones + hits_on_zeros
  print(f'num of correct ones: {hits_on_ones}/{10}, num of correct zeros: {hits_on_zeros}/{90}')
  print(f'Our Score: {9*hits_on_ones}+{hits_on_zeros}={score}')
  print(f'Max Score: {9*10}+{1*90}={90+90}')
  return score

In [None]:
# Score every validation user against that user's row of true labels and add the scores up.
total_score = 0
for user, val_df_pred in enumerate(val_df_preds):
  print('User'+str(user))
  total_score += val_score(val_df_pred, val_df_true.loc[user, :])
print("Total Score: " + str(total_score))
# 180 is the per-user maximum printed by val_score (9*10 + 90).
print("Max Total Score: " + str(len(val_df_preds)*180))

User0
num of correct ones: 10/10, num of correct zeros: 76/90
Our Score: 90+76=166
Max Score: 90+90=180
User1
num of correct ones: 10/10, num of correct zeros: 70/90
Our Score: 90+70=160
Max Score: 90+90=180
User2
num of correct ones: 10/10, num of correct zeros: 78/90
Our Score: 90+78=168
Max Score: 90+90=180
User3
num of correct ones: 10/10, num of correct zeros: 70/90
Our Score: 90+70=160
Max Score: 90+90=180
User4
num of correct ones: 10/10, num of correct zeros: 78/90
Our Score: 90+78=168
Max Score: 90+90=180
User5
num of correct ones: 9/10, num of correct zeros: 69/90
Our Score: 81+69=150
Max Score: 90+90=180
User6
num of correct ones: 10/10, num of correct zeros: 85/90
Our Score: 90+85=175
Max Score: 90+90=180
User7
num of correct ones: 5/10, num of correct zeros: 65/90
Our Score: 45+65=110
Max Score: 90+90=180
User8
num of correct ones: 10/10, num of correct zeros: 70/90
Our Score: 90+70=160
Max Score: 90+90=180
User9
num of correct ones: 10/10, num of correct zeros: 70/90
Our 

## Predict test set

In [None]:
# 'test' makes predict_final_results number the users from 10 upwards.
test_df_preds = predict_final_results(test_realX, 'test')
# Stack the per-user single-row frames into one frame with a fresh 0..n-1 index.
results_df = pd.concat(test_df_preds , ignore_index=True)

In [None]:
# Write the test predictions to the current working directory, overwriting any existing file.
results_df.to_csv('test_results.csv', mode='w', header=True)