# Import necessary libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE
import time
import joblib

In [None]:
import glob

# Collect every CSV matching the dataset pattern, stack them into a single
# frame with a fresh 0..n-1 index, and cache the result as full.csv
# (to_csv also writes the row index as the first column).
csv_pattern = "../input/cicids2017/MachineLearningCSV/MachineLearningCVE/*.csv"
all_files = glob.glob(csv_pattern)
df_from_each_file = (pd.read_csv(csv_path) for csv_path in all_files)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
concatenated_df.to_csv('full.csv')


# Loading data

In [None]:
# Path of the merged CSV that the following cells load.
# (The raw per-file location is "../input/cicids2017/MachineLearningCSV/MachineLearningCVE/*.csv".)
file = './full.csv'

In [None]:
# Load the merged CSV; reset_index() moves the row numbers into a regular
# column in addition to the default integer index.
df = pd.read_csv(file).reset_index()

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

# Some data diagnosis

In [None]:
# Normalise the header names: trim surrounding whitespace, turn spaces into
# underscores, then lower-case every name.
trimmed_names = df.columns.str.strip()
underscored_names = trimmed_names.str.replace(' ', '_')
df.columns = [name.lower() for name in underscored_names]

In [None]:
# Shuffle all rows and renumber the index from zero.  The shuffle is seeded
# (like the sampler and the forests in this notebook, which use
# random_state=0) so that a re-run produces the same row order.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

In [None]:
# Columns that contain at least one missing value, followed by the number of
# missing entries in each of them.
has_missing = df.isna().any()
null_columns = df.columns[has_missing]
df[null_columns].isna().sum()

In [None]:
# Preview the frame again after the header clean-up and the shuffle.
df.head()

In [None]:
# Old column name -> new column name.  Columns not listed here keep their
# current name.
column_renames = {
    'fwd_avg_packets/bulk': 'fwd_packet/bulk_avg',
    'bwd_avg_bulk_rate': 'bwd_bulk_rate_avg',
    'fwd_avg_bulk_rate': 'fwd_bulk_rate_avg',
    'bwd_avg_packets/bulk': 'bwd_packet/bulk_avg',
    'fwd_avg_bytes/bulk': 'fwd_bytes/bulk_avg',
    'avg_bwd_segment_size': 'bwd_segment_size_avg',
    'avg_fwd_segment_size': 'fwd_segment_size_avg',
    'cwe_flag_count': 'cwr_flag_count',
    'total_length_of_bwd_packets': 'total_length_of_bwd_packet',
    'total_length_of_fwd_packets': 'total_length_of_fwd_packet',
    'total_fwd_packets': 'total_fwd_packet',
    'total_backward_packets': 'total_bwd_packets',
    'init_win_bytes_forward': 'fwd_init_win_bytes',
    'init_win_bytes_backward': 'bwd_init_win_bytes',
    'act_data_pkt_fwd': 'fwd_act_data_pkts',
    'min_seg_size_forward': 'fwd_seg_size_min',
}
df.rename(columns=column_renames, inplace=True)

# Basic preprocessing

In [None]:
# Number of rows per raw label value, before any re-encoding.
df['label'].value_counts()

In [None]:
# Collapse the traffic labels into a binary target: every attack name listed
# below becomes 1 and 'BENIGN' becomes 0.  Values that are not listed are left
# untouched.
attack_names = [
    'DDoS',
    'DoS Hulk',
    'PortScan',
    'DoS GoldenEye',
    'DoS slowloris',
    'DoS Slowhttptest',
    'Web Attack � Brute Force',
    'Web Attack � XSS',
    'Infiltration',
    'Web Attack � Sql Injection',
    'Heartbleed',
    'FTP-Patator',
    'SSH-Patator',
    'Bot',
]
label_codes = dict.fromkeys(attack_names, 1)
label_codes['BENIGN'] = 0
df['label'] = df['label'].replace(label_codes)

In [None]:
# Label distribution after the binary re-encoding.
df['label'].value_counts()

### Fix some issues with NaN and inf values in our dataset

#### Check the `NaN` values

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')


In [None]:
# Label distribution after dropping the rows with missing values.
df['label'].value_counts()

In [None]:
# Show only the cells at the intersection of rows and columns that still
# contain a missing value (an empty result means nothing is missing).
missing = df.isna()
col_mask = missing.any(axis=0)
row_mask = missing.any(axis=1)
df.loc[row_mask, col_mask]

#### As we can see, there are NaN values in the Flow_Bytes column, so we are going to deal with them

In [None]:
# (rows, columns) of the frame at this point.
df.shape

In [None]:
# Drop rows that contain any missing value (dropna defaults).
df = df.dropna()

In [None]:
# Coerce every column to a numeric dtype (entries that cannot be parsed become
# NaN), discard the rows that now hold a NaN, and renumber the index.
df = (
    df.apply(pd.to_numeric, errors='coerce')
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# (rows, columns) after the numeric coercion and NaN removal.
df.shape

In [None]:
# Re-check for leftovers: select the rows/columns that still hold a missing
# value (an empty result means none remain).
missing = df.isna()
col_mask = missing.any(axis=0)
row_mask = missing.any(axis=1)
df.loc[row_mask, col_mask]

In [None]:
#np.any(np.isnan(df))

#### We have removed the NaN values in our dataset

#### Check `infinity` values

In [None]:
#np.all(np.isfinite(df))

In [None]:
def clean_dataset(df):
    """Return a float64 copy of ``df`` without NaN or infinite rows.

    Rows containing at least one NaN, ``inf`` or ``-inf`` are discarded and
    the remaining values are cast to ``np.float64``.  The caller's frame is
    left unmodified.

    Args:
        df: ``pd.DataFrame`` whose surviving values can be cast to float.

    Returns:
        A new ``pd.DataFrame`` of dtype ``float64`` that keeps the index
        labels of the surviving rows.

    Raises:
        AssertionError: if ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() returns a new frame, so the argument is not mutated.
    without_nan = df.dropna()
    # axis must be passed by keyword: pandas 2.x rejects the positional any(1).
    has_infinite = without_nan.isin([np.inf, -np.inf]).any(axis=1)
    return without_nan[~has_infinite].astype(np.float64)

In [None]:
# Replace df with the cleaned frame returned by clean_dataset.
df = clean_dataset(df)

In [None]:
# (rows, columns) after clean_dataset.
df.shape

In [None]:
# Columns that still contain a missing value, with the count of missing
# entries per column (an empty result means none remain).
has_missing = df.isna().any()
null_columns = df.columns[has_missing]
df[null_columns].isna().sum()

### We removed 1509 rows with invalid data

In [None]:
# Evaluates to True only when every value in df is finite.
np.all(np.isfinite(df))

#### Here we removed the infinity values

In [None]:
# Summary statistics for every column of the cleaned frame.
df.describe()

# Specifying features and targets 

In [None]:
# Columns kept out of the feature matrix: the target itself plus the columns
# listed below.
excluded_columns = [
    'index',
    'unnamed:_0',
    'label',
    'destination_port',
    'min_packet_length',
    'max_packet_length',
    'fwd_header_length.1',
    'bwd_avg_bytes/bulk',
]
X = df.drop(columns=excluded_columns)
y = df['label']

In [None]:
# Target number of rows to keep per class (class label -> row count), and a
# seeded random under-sampler configured with those targets.
down_dataset = {0: 600000, 1: 556556}
down_df = RandomUnderSampler(random_state=0, sampling_strategy=down_dataset)

In [None]:
from imblearn import under_sampling

# Second under-sampler with the same per-class targets but no random_state.
# NOTE(review): `rus` is not referenced again in the visible source; the
# resampling cell uses `down_df` instead.
rus = under_sampling.RandomUnderSampler(sampling_strategy=down_dataset)

In [None]:
# Hold out a quarter of the rows for testing.  The split is seeded so the
# reported metrics are reproducible, and stratified on y so both partitions
# keep the same benign/attack ratio.
test_percentage = 0.25
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_percentage,
    random_state=0,
    stratify=y,
)

In [None]:
# Class balance of the training split.
y_train.value_counts()

In [None]:
# Under-sample X/y to the per-class targets.  `fit_sample` was removed from
# imbalanced-learn; `fit_resample` is the supported name for the same call.
# NOTE(review): X_train/X_test were split from X before this resampling, so
# they are not affected by it -- confirm that this is intended.
X, y = down_df.fit_resample(X, y)

In [None]:
# Class balance after under-sampling.
y.value_counts()

### Feature Analysis

In [None]:
# Summary statistics of the feature columns whose every value equals zero.
X.loc[:, (X == 0).all()].describe()

In [None]:
# Feature columns to remove from X, and how many of them there are.
drop_list = [
    "bwd_psh_flags",
    "bwd_urg_flags",
    "fwd_bytes/bulk_avg",
    "fwd_packet/bulk_avg",
    "fwd_bulk_rate_avg",
    "bwd_packet/bulk_avg",
    "bwd_bulk_rate_avg",
]
len(drop_list)

In [None]:
# Remove the columns named in drop_list from the feature matrix.
X = X.drop(columns=drop_list)

In [None]:
# (rows, columns) of X after dropping the drop_list columns.
X.shape

### Feature Reduction

In [None]:
# Import from the public package: the private `sklearn.ensemble.forest`
# module path no longer exists in current scikit-learn releases.
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Report the shapes of the resampled feature matrix and of the two splits.
print(f"Total dataset: {X.shape}")
print(f"Training dataset: {X_train.shape}:")
print(f"Testing dataset: {X_test.shape}:")

In [None]:
# (rows, columns) of the feature matrix used for the importance analysis.
X.shape

In [None]:
# Fit a seeded random forest on X/y using all CPU cores; fit() returns the
# estimator itself, so `rf` and `rfModel` refer to the same fitted object.
rf = RandomForestClassifier(n_jobs=-1, random_state=0)
rfModel = rf.fit(X=X, y=y)


In [None]:
# Keep the fitted forest's per-feature importance scores for the cells below.
importance = rfModel.feature_importances_

In [None]:
# Rank the features by importance, highest first.  Each rounded score is
# paired with its column name; zipping the scores alone would only produce
# anonymous 1-tuples.
sorted(
    zip((round(score, 4) for score in rfModel.feature_importances_), X.columns),
    reverse=True,
)

In [None]:
# Print every feature position with its importance score.
for position, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (position, score))
# Plot feature importance.  `pyplot` is never imported in this notebook, so
# the bar chart is drawn with seaborn (already imported as `sns`); as the
# last expression of the cell the figure is rendered by the notebook.
sns.barplot(x=list(range(len(importance))), y=importance)

In [None]:
# Display the importance scores (`important` was an undefined name).
importance

# Splitting the data set 

In [None]:
# Report the shapes of the cleaned frame and of the train/test partitions.
print(f"Total dataset: {df.shape}")
print(f"Training dataset: {X_train.shape}:")
print(f"Testing dataset: {X_test.shape}:")

# Training the model

### LogisticRegression Algorithm

In [None]:
# Section banner, then record the wall-clock start of the training run.
banner = "------------------ LogisticRegression -----------------"
print(banner)
start = time.time()

In [None]:
# Logistic regression with the newton-cg solver, fitted on the training split.
clf_lr = LogisticRegression(solver="newton-cg")
clf_lr.fit(X=X_train, y=y_train)

In [None]:
# Seconds elapsed since `start` was recorded.
elapsed = time.time() - start
print("Total time take {}".format(elapsed))

In [None]:
## Evaluating the model

In [None]:
# Predict once and reuse the predictions for the accuracy figure instead of
# letting score() run a second prediction pass over X_test.
y_pred = clf_lr.predict(X_test)
print("Model accuracy on test dataset")
metrics.accuracy_score(y_test, y_pred)

In [None]:
# Pin the label order so the matrix is always 2x2 and unpacks as
# (tn, fp, fn, tp), even when one class is absent from y_test / y_pred.
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

In [None]:
# The four confusion-matrix cells are raw counts, so they are labelled as
# counts rather than rates.  Recall and precision come from sklearn.metrics,
# the same source as the F1 score.
print('true positives count {}'.format(tp))
print('false positives count {}'.format(fp))
print('true negatives count {}'.format(tn))
print('false negatives count {}'.format(fn))
print("F1 Score = {}".format(metrics.f1_score(y_test, y_pred)))
print("Recall {}".format(metrics.recall_score(y_test, y_pred)))
print("Precision {}".format(metrics.precision_score(y_test, y_pred)))

In [None]:
# Persist the fitted logistic-regression model to disk.
file_name = 'LogisticRegression.sav'
joblib.dump(value=clf_lr, filename=file_name)

## RandomForest Algorithm

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Section banner for the random-forest run.
print('------- RandomForest------------------\n')

In [None]:
# Record the wall-clock start of the random-forest training run.
start = time.time()

In [None]:
# Seeded random forest limited to depth-2 trees, fitted on the training split.
clf_rf = RandomForestClassifier(random_state=0, max_depth=2)
clf_rf.fit(X=X_train, y=y_train)

In [None]:
# Seconds elapsed since `start` was recorded.
elapsed = time.time() - start
print("Total time take for {}".format(elapsed))

In [None]:
# Predict once and reuse the predictions for the accuracy figure instead of
# letting score() run a second prediction pass over X_test.
y_pred = clf_rf.predict(X_test)
print("Model accuracy on test dataset")
metrics.accuracy_score(y_test, y_pred)

In [None]:
# Pin the label order so the matrix is always 2x2 and unpacks as
# (tn, fp, fn, tp), even when one class is absent from y_test / y_pred.
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

In [None]:
# The four confusion-matrix cells are raw counts, so they are labelled as
# counts rather than rates.  Recall and precision come from sklearn.metrics,
# the same source as the F1 score.
print('true positives count {}'.format(tp))
print('false positives count {}'.format(fp))
print('true negatives count {}'.format(tn))
print('false negatives count {}'.format(fn))
print("F1 Score = {}".format(metrics.f1_score(y_test, y_pred)))
print("Recall {}".format(metrics.recall_score(y_test, y_pred)))
print("Precision {}".format(metrics.precision_score(y_test, y_pred)))

In [None]:
# Persist the fitted random-forest model to disk.
file_name = 'RF.sav'
joblib.dump(value=clf_rf, filename=file_name)

## Naive_bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Section banner for the naive Bayes run.
print('------- Naive_Bayes------------------\n')

In [None]:
# Record the wall-clock start of the naive Bayes training run.
start = time.time()

In [None]:
# Gaussian naive Bayes classifier with default settings.
gnb = GaussianNB()

In [None]:
# Fit the classifier in place.  fit() returns the estimator, not predictions,
# so its result must not be stored in `y_pred`.
gnb.fit(X_train, y_train)

In [None]:
# Seconds elapsed since `start`; the message names the model that was actually
# timed here (Naive Bayes, not RF).
print("Total time take for Naive_Bayes {}".format(time.time() - start))

In [None]:
# Predict once and reuse the predictions for the accuracy figure instead of
# letting score() run a second prediction pass over X_test.
y_pred = gnb.predict(X_test)
print("Model accuracy on test dataset")
metrics.accuracy_score(y_test, y_pred)

In [None]:
# Pin the label order so the matrix is always 2x2 and unpacks as
# (tn, fp, fn, tp), even when one class is absent from y_test / y_pred.
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

In [None]:
# The four confusion-matrix cells are raw counts, so they are labelled as
# counts rather than rates.  Recall and precision come from sklearn.metrics,
# the same source as the F1 score.
print('true positives count {}'.format(tp))
print('false positives count {}'.format(fp))
print('true negatives count {}'.format(tn))
print('false negatives count {}'.format(fn))
print("F1 Score = {}".format(metrics.f1_score(y_test, y_pred)))
print("Recall {}".format(metrics.recall_score(y_test, y_pred)))
print("Precision {}".format(metrics.precision_score(y_test, y_pred)))

In [None]:
# Persist the fitted naive Bayes model to disk.
file_name = 'gnb.sav'
joblib.dump(value=gnb, filename=file_name)