### Import Relevant Libraries

In [None]:
# Data manipulation libraries
import pandas as pd
import numpy as np

# Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Pin scikit-learn to 0.23.2 for sklearn-deap support.
# NOTE(review): presumably this install has to finish before sklearn is first
# imported in the kernel, which is why it precedes the imports below - confirm.
!pip install scikit-learn==0.23.2
# Preprocessing and data-splitting utilities
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split

from sklearn.ensemble import RandomForestClassifier

# Model evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report, f1_score

# Hyperparameter tuning with a genetic algorithm: copy the sklearn-deap
# sources into the working directory and install them before importing.
!cp -r ../input/sklearndeap024/* ./
!python setup.py install
import evolutionary_search

# Silence all warnings.
# NOTE(review): this also hides deprecation warnings raised by the libraries above.
import warnings
warnings.filterwarnings('ignore')

# Timing utilities (used to time model training)
import timeit
# Seed passed as random_state so stochastic steps are reproducible
SEED = 42

### Load the dataset

In [None]:
# Read the training data CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../input/network-intrusion-detection/Train_data.csv"
)

In [None]:
# Report the dataset dimensions, then preview the first rows.
print("Training data has {} rows & {} columns".format(*df.shape))
df.head()

### Data Analysis and Preprocessing

In [None]:
# Summarize column dtypes and non-null counts to spot missing values.
df.info()

In [None]:
# Name of the label column.
target_col = "class"

# Feature columns grouped by dtype. The label is excluded from BOTH groups via
# the `columns=` keyword (passing the axis positionally, as in drop("class", 1),
# is rejected by newer pandas). Excluding it from the numeric group matters once
# the label has been encoded to integers: otherwise a re-run of this cell would
# treat the label as a feature to be imputed and scaled.
cate_cols = df.drop(columns=[target_col]).select_dtypes("object").columns
num_cols = df.drop(columns=[target_col]).select_dtypes("number").columns

In [None]:
# Impute numerical features with the column mean.
# NOTE(review): the statistics are computed on the full dataset, i.e. before the
# train/test split performed later in the notebook, so held-out rows influence them.
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# Impute categorical features with each column's most frequent value.
df[cate_cols] = df[cate_cols].apply(lambda x: x.fillna(x.value_counts().index[0]))

# Rows without a label cannot be used for supervised training. Drop them BEFORE
# encoding: the comparison below maps everything that is not "anomaly"
# (including NaN) to 0, which would silently turn unlabeled rows into class 0.
df = df.dropna(subset=[target_col])

# Encode target column: 1 for "anomaly", 0 for any other label.
# The dtype guard keeps the cell safe to re-run; encoding the already-encoded
# integers a second time would turn every label into 0.
if not pd.api.types.is_numeric_dtype(df[target_col]):
    df[target_col] = (df[target_col] == "anomaly").astype("int64")

# Filter any remaining missing values and renumber the rows 0..n-1 so that
# frames later rebuilt from NumPy arrays line up with `df` row for row.
df = df.dropna().reset_index(drop=True)

**We encode the target class as 1 (`anomaly`) or 0 (any other label) so that it can be used for further analysis and training.**

#### Encoding Categorical Data and Scaling Numerical Data

In [None]:
# One-hot encode the categorical columns.
# `sparse=False` and `get_feature_names` match the scikit-learn version pinned
# in the setup cell (0.23.2).
encoder = OneHotEncoder(sparse=False)
encoded = encoder.fit_transform(df[cate_cols])
feat_names = encoder.get_feature_names(cate_cols)
# Reuse df's index: the transformer returns a bare array, and a default
# RangeIndex would not match df's row labels if rows were dropped earlier.
encoded_df = pd.DataFrame(data=encoded, columns=feat_names, index=df.index)

# Standardize the numerical columns.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in the notebook.
scaler = StandardScaler()
scaled = scaler.fit_transform(df[num_cols])
scaled_df = pd.DataFrame(scaled, columns=num_cols, index=df.index)

# Combine encoded features, scaled features and the label column.
# concat(axis=1) aligns on the index, so all three parts must share df's index;
# otherwise rows are paired with the wrong label or padded with NaN.
df_processed = pd.concat([encoded_df, scaled_df, df[target_col]], axis=1)

In [None]:
# Inspect dtypes and non-null counts of the combined, processed frame.
df_processed.info()

#### Correlation between the encoded and normalized features

In [None]:
# Heatmap of the absolute pairwise correlations between all processed columns
plt.figure(figsize=(14, 10))
sns.heatmap(
    df_processed.corr().abs()
)

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_processed.drop(columns="class"),
    df_processed["class"],
    test_size=0.25,
    random_state=SEED,
)

### Model Selection

In [None]:
# Count the rows per encoded class in the training split (class-balance check).
y_train.value_counts()

In [None]:
# Random Forest model. The forest is randomized (bootstrap samples, feature
# subsets), so pass the notebook-wide SEED to make results reproducible across
# runs, consistent with the seeded train/test split.
rf = RandomForestClassifier(random_state=SEED)

In [None]:
# Candidate hyperparameter values explored by the search
grid = {
    'max_depth': [None, 3, 5, 7, 9],
    'n_estimators': [50, 100, 200],
    'min_samples_split': [3, 5, 7, 9],
    'max_features': ['auto', 'sqrt', 'log2'],
}

# Genetic-algorithm search over the grid, fitted on the training split
eascv = evolutionary_search.EvolutionaryAlgorithmSearchCV(
    rf,
    grid,
    verbose=1,
)
eascv.fit(X_train, y_train)

In [None]:
# Keep the hyperparameter combination reported as best by the fitted search.
bestParams = eascv.best_params_

In [None]:
# Apply the selected hyperparameters to the forest; this only configures the
# estimator - it is fitted in a later cell.
rf.set_params(**bestParams)

In [None]:
# Time a single fit of the tuned forest on the training split.
# timeit.default_timer() reads a clock. timeit.timeit() with no arguments is
# not a clock: it benchmarks a no-op statement and returns that duration, so
# subtracting two such calls says nothing about the training time.
start_time = timeit.default_timer()
rf.fit(X_train, y_train)
print("Training complete.")
# Elapsed wall-clock time in seconds
print("Time taken =", timeit.default_timer() - start_time)

In [None]:
# Per-class precision/recall/F1 on the data the model was trained on
train_preds = rf.predict(X_train)
print(
    classification_report(y_true=y_train, y_pred=train_preds)
)

In [None]:
# Per-class precision/recall/F1 on the held-out test split
test_preds = rf.predict(X_test)
print(
    classification_report(y_true=y_test, y_pred=test_preds)
)