In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cicids2017/Benign-Monday-no-metadata.parquet
/kaggle/input/cicids2017/Bruteforce-Tuesday-no-metadata.parquet
/kaggle/input/cicids2017/Portscan-Friday-no-metadata.parquet
/kaggle/input/cicids2017/WebAttacks-Thursday-no-metadata.parquet
/kaggle/input/cicids2017/DoS-Wednesday-no-metadata.parquet
/kaggle/input/cicids2017/DDoS-Friday-no-metadata.parquet
/kaggle/input/cicids2017/Infiltration-Thursday-no-metadata.parquet
/kaggle/input/cicids2017/Botnet-Friday-no-metadata.parquet



---

# Predictive Analysis for Threat Intelligence on CIC-IDS2017

### Time-Series Forecasting  
**Predicting future network attacks by analyzing historical traffic patterns to enhance cybersecurity measures.**

---


### Step 1: Define the Predictive Analysis Task - Time-Series Forecasting

In this project, we aim to **predict future network attacks** based on historical traffic patterns from the CIC-IDS-2017 dataset. The goal is to leverage past network traffic data to forecast potential attacks and help in proactive threat detection.

The steps involved in the time-series forecasting task include:

1. **Data Preprocessing:**
   - Clean and prepare the data to convert it into a time-series format.
   - Aggregate the data over a time window (e.g., hourly, daily).

2. **Feature Engineering:**
   - Extract time-related features (e.g., day of the week, hour of the day).
   - Generate statistical features like moving averages, rolling windows, and lags.

3. **Model Selection:**
   - Train time-series forecasting models (e.g., ARIMA, LSTM, Prophet) to predict future attack occurrences.

4. **Model Evaluation:**
   - Evaluate the model's performance using metrics like Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and other suitable time-series metrics.

5. **Prediction:**
   - Use the trained model to forecast future attacks and detect patterns in the network traffic.

By analyzing the historical traffic data, we aim to predict the likelihood of attacks occurring in the future, enhancing security measures and reducing potential risks.

---



In [2]:
# Importing necessary libraries for Linear Algebra operations
import numpy as np

# Importing pandas for data processing and handling file I/O operations
import pandas as pd

# Importing matplotlib and seaborn for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Importing time library for measuring latency and execution time
import time

# Importing LabelEncoder to convert categorical target variable into numerical labels
from sklearn.preprocessing import LabelEncoder

# Importing train_test_split for splitting the dataset into training and testing sets
from sklearn.model_selection import train_test_split

# Importing Normalizer and QuantileTransformer for feature scaling and transformation
from sklearn.preprocessing import Normalizer, QuantileTransformer

# Importing mutual_info_classif for feature selection based on mutual information
from sklearn.feature_selection import mutual_info_classif

# Importing evaluation metrics for model performance measurement
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Importing SMOTE for handling class imbalance by oversampling the minority class
from imblearn.over_sampling import SMOTE

# Importing os for file path handling and other system-level operations
import os

# Importing IPython's display and HTML to create and show a report in a Jupyter notebook
from IPython.display import display, HTML


---

## Data Preprocessing


The **CIC-IDS-2017** dataset can be accessed on [Kaggle](https://www.kaggle.com/) or through the **Canadian Institute for Cybersecurity** website: [CIC-IDS-2017 Dataset](https://www.unb.ca/cic/datasets/ids-2017.html).

This dataset includes various features such as **flow duration**, **packet size**, **protocol type**, and **attack labels**, which are essential for detecting and analyzing network security threats.


---

### Loading the Data

In [3]:
# Load the CIC-IDS-2017 datasets
# Directory that holds the parquet files. Change this single line to run the
# notebook against a copy of the data stored somewhere else.
DATA_DIR = '/kaggle/input/cicids2017'

# Capture names. The order matters: it is the insertion order of `datasets`,
# and therefore the order in which the frames are iterated downstream.
DATASET_NAMES = [
    'Benign-Monday',
    'Botnet-Friday',
    'Bruteforce-Tuesday',
    'DDoS-Friday',
    'DoS-Wednesday',
    'Infiltration-Thursday',
    'Portscan-Friday',
    'WebAttacks-Thursday',
]

# Map each capture name to its file path: <DATA_DIR>/<name>-no-metadata.parquet
datasets = {
    name: f'{DATA_DIR}/{name}-no-metadata.parquet'
    for name in DATASET_NAMES
}

# Read every parquet file into a DataFrame, keyed by the same name used in `datasets`
df_data = dict(zip(datasets, map(pd.read_parquet, datasets.values())))


In [4]:
# Print a heading for each loaded DataFrame. The row preview itself is
# switched off (commented-out line below), so only the headings are printed;
# uncomment that line to also show the first 5 rows of each frame.
for name in df_data:
    df = df_data[name]
    print(f"Head of {name}:")
    # print(df.head(), "\n")


Head of Benign-Monday:
Head of Botnet-Friday:
Head of Bruteforce-Tuesday:
Head of DDoS-Friday:
Head of DoS-Wednesday:
Head of Infiltration-Thursday:
Head of Portscan-Friday:
Head of WebAttacks-Thursday:


In [5]:
# Stack the per-capture DataFrames into one frame; ignore_index=True discards
# the original row labels and renumbers the combined rows
all_frames = list(df_data.values())
df_all = pd.concat(all_frames, ignore_index=True)

# Preview the first 5 rows of the combined frame
print(df_all.head(5))


   Protocol  Flow Duration  Total Fwd Packets  Total Backward Packets  \
0         6              4                  2                       0   
1         6              1                  2                       0   
2         6              3                  2                       0   
3         6              1                  2                       0   
4         6            609                  7                       4   

   Fwd Packets Length Total  Bwd Packets Length Total  Fwd Packet Length Max  \
0                        12                         0                      6   
1                        12                         0                      6   
2                        12                         0                      6   
3                        12                         0                      6   
4                       484                       414                    233   

   Fwd Packet Length Min  Fwd Packet Length Mean  Fwd Packet Length Std  ...  \
