#### Threat prediction
This notebook trains a classification algorithm to detect different types of attacks.   
Classes include valid requests, SQL injection, XSS, command injection and path traversal attacks.  
Features are calculated using TF-IDF on character n-grams, and an SVM is used for classification.  

[reference]   
https://github.com/vladan-stojnic/ML-based-WAF/blob/master/Classifier/TheratPrediction.ipynb

#### Import Lib

In [2]:
import sklearn # Check Sklearn version
sklearn.__version__

'1.0.1'

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the dataset; the path is relative, so it resolves against the
# directory the notebook kernel is running in.
df = pd.read_csv('../dataset/A-Track-clean.csv')

In [5]:
# Preview the first two rows of the freshly loaded frame.
df.head(2)

Unnamed: 0.1,Unnamed: 0,payload,label_action
0,0,"get /forum1_professionnel.asp?n=/.\\\""./.\\\""....",System_Cmd_Execution
1,1,"post /owa/auth/logon.aspx?replacecurrent=1"") a...",System_Cmd_Execution


In [6]:
# Remove the leftover 'Unnamed: 0' column that came in with the CSV.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op rather than a
# KeyError for a column that is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [7]:
# Show which columns remain after the drop.
df.columns

Index(['payload', 'label_action'], dtype='object')

In [8]:
# Preview the first two rows again, now without the dropped column.
df.head(2)

Unnamed: 0,payload,label_action
0,"get /forum1_professionnel.asp?n=/.\\\""./.\\\""....",System_Cmd_Execution
1,"post /owa/auth/logon.aspx?replacecurrent=1"") a...",System_Cmd_Execution


In [9]:
# Number of samples per attack class.
# value_counts has to be *called*: without the parentheses `counts` would hold
# the bound method object instead of the per-label frequency Series.
counts = df['label_action'].value_counts()

Train/test split: separate the data into a training set and a test set.

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

In [11]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['payload'],
    df['label_action'],
    random_state=42,
    test_size=0.2,
)


In [12]:
# Disabled sanity check: rebuilds train/test frames with their labels attached
# so the split can be inspected by eye. Uncomment to use.
# trainX = pd.DataFrame(X_train)
# trainX['label_action'] = y_train

# testX = pd.DataFrame(X_test)
# testX['label_action'] = y_test

# trainX.head() # check that the data was split correctly

In [13]:
# Character-level TF-IDF features (vocabulary capped at 1024 terms) feeding
# an SVM classifier.
pipe = make_pipeline(
    TfidfVectorizer(
        analyzer = 'char',
        max_features = 1024,
        lowercase = True,
        input = 'content',
    ),
    SVC(),
)

# Keys are "<step name>__<parameter>", where the step names are the ones
# make_pipeline derives from the estimator classes.
param_grid = {
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 4)],
    'svc__C': [1, 10],
    'svc__kernel': ['linear', 'rbf'],
}

# 3 n-gram ranges x 2 C values x 2 kernels = 12 candidates, each scored with
# 2-fold cross-validation; verbose=4 logs every individual fit.
grid = GridSearchCV(estimator = pipe, param_grid = param_grid, verbose = 4, cv = 2)

grid.fit(X_train, y_train)

Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV 1/2] END svc__C=1, svc__kernel=linear, tfidfvectorizer__ngram_range=(1, 1);, score=0.717 total time=  48.3s
[CV 2/2] END svc__C=1, svc__kernel=linear, tfidfvectorizer__ngram_range=(1, 1);, score=0.718 total time=  47.8s
[CV 1/2] END svc__C=1, svc__kernel=linear, tfidfvectorizer__ngram_range=(1, 2);, score=0.779 total time= 9.7min
[CV 2/2] END svc__C=1, svc__kernel=linear, tfidfvectorizer__ngram_range=(1, 2);, score=0.767 total time= 9.7min
[CV 1/2] END svc__C=1, svc__kernel=linear, tfidfvectorizer__ngram_range=(1, 4);, score=0.753 total time=12.6min
[CV 2/2] END svc__C=1, svc__kernel=linear, tfidfvectorizer__ngram_range=(1, 4);, score=0.748 total time=12.6min
[CV 1/2] END svc__C=1, svc__kernel=rbf, tfidfvectorizer__ngram_range=(1, 1);, score=0.760 total time=  47.7s
[CV 2/2] END svc__C=1, svc__kernel=rbf, tfidfvectorizer__ngram_range=(1, 1);, score=0.772 total time=  48.1s
[CV 1/2] END svc__C=1, svc__kernel=rbf, tfidfvect

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('tfidfvectorizer',
                                        TfidfVectorizer(analyzer='char',
                                                        max_features=1024)),
                                       ('svc', SVC())]),
             param_grid={'svc__C': [1, 10], 'svc__kernel': ['linear', 'rbf'],
                         'tfidfvectorizer__ngram_range': [(1, 1), (1, 2),
                                                          (1, 4)]},
             verbose=4)

In [14]:
# Score the best pipeline found by the search on the held-out test split.
grid.score(X_test, y_test)

0.8741111111111111

In [16]:
# The name list is wrapped in parentheses so it can span several lines; a bare
# continuation line starting with a comma is a SyntaxError.
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    roc_curve,
)

# Predict the held-out payloads with the best pipeline from the grid search.
preds = grid.predict(X_test)

# Per-class precision / recall / F1 plus the overall averages.
print(classification_report(y_test, preds))

                               precision    recall  f1-score   support

Automatically_Searching_Infor       0.99      1.00      1.00       226
         Cross_Site_Scripting       0.89      0.83      0.86       267
           Directory_Indexing       0.97      0.95      0.96        76
                    HOST_Scan       0.87      0.87      0.87      1310
           Leakage_Through_NW       1.00      1.00      1.00       198
              Path_Disclosure       0.90      0.88      0.89       965
                SQL_Injection       0.89      0.86      0.87       632
         System_Cmd_Execution       0.82      0.82      0.82      1945
           Vulnerability_Scan       0.88      0.89      0.88      3381

                     accuracy                           0.87      9000
                    macro avg       0.91      0.90      0.91      9000
                 weighted avg       0.87      0.87      0.87      9000



In [19]:
# Disabled: confusion-matrix plot for the grid-search model.
# NOTE(review): plot_confusion_matrix is deprecated in recent scikit-learn
# releases; if this is re-enabled, ConfusionMatrixDisplay.from_estimator is
# the likely replacement -- confirm against the installed version.
# from sklearn.metrics import plot_confusion_matrix

# plot_confusion_matrix(grid, X_test, y_test)

In [20]:
# Hyper-parameter combination selected by the grid search.
# Result recorded from a previous run, kept for reference:
# {'svc__C': 10, 'svc__kernel': 'rbf', 'tfidfvectorizer__ngram_range': (1, 2)}
grid.best_params_

{'svc__C': 10, 'svc__kernel': 'rbf', 'tfidfvectorizer__ngram_range': (1, 2)}

In [None]:
# Rebuild the pipeline with the grid search's best parameters written out
# explicitly (C=10, RBF kernel, character 1- to 2-grams) and train it on the
# training split.
model = make_pipeline(
    TfidfVectorizer(
        analyzer = 'char',
        ngram_range = (1, 2),
        max_features = 1024,
        lowercase = True,
        input = 'content',
    ),
    SVC(kernel = 'rbf', C = 10),
)

model.fit(X_train, y_train)

In [None]:
# Score the retrained pipeline on the held-out test split.
model.score(X_test, y_test)

In [None]:
# Evaluate the retrained pipeline on the held-out split.
y_pred_test = model.predict(X_test)
test_rep = classification_report(y_test, y_pred_test)
test_acc = accuracy_score(y_test, y_pred_test)

# Blank line, banner, blank line -- emitted with a single call.
print("", "---- METRICS RESULTS FOR TESTING DATA ----", "", sep="\n")
print("Total Rows are: ", len(X_test))
print('[TESTING] Model Accuracy is: ', test_acc)
# Heading and report body on separate lines.
print('[TESTING] Testing Report: ', test_rep, sep="\n")

In [None]:
# plot_confusion_matrix(model, X_test, y_test)

In [None]:
# Persist the fitted pipeline to disk using the standalone joblib package
# (the older `from sklearn.externals import joblib` path is not used).
# NOTE(review): the file name reads 'tdidf' (probably meant 'tfidf'); it is
# left unchanged because other code may load the model by this exact name.
import joblib
joblib.dump(model, 'tdidf-svc.joblib')