In [2]:
# Mount Google Drive at /content/drive; the CSV paths read later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import joblib

# Load the training and validation CSVs from the mounted Drive folder
train_df = pd.read_csv('/content/drive/MyDrive/Group5/Dataset2/train_70pct.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Group5/Dataset2/validation_15pct.csv')

# Fraction of missing values above which a column is discarded.
# NOTE: the threshold actually applied is 60%; the comments and the printed
# message used to say 90%, which did not match the comparison in the code.
MISSING_THRESHOLD = 0.6

# Drop the 'type' column if it exists (as per instructions)
if 'type' in train_df.columns:
    train_df.drop(columns=['type'], inplace=True)
if 'type' in val_df.columns:
    val_df.drop(columns=['type'], inplace=True)

# Replace "-" with np.nan to mark meaningless values as missing
train_df.replace("-", np.nan, inplace=True)
val_df.replace("-", np.nan, inplace=True)

# Print initial shape of the datasets
print("Initial train dataset shape:", train_df.shape)
print("Initial validation dataset shape:", val_df.shape)

# Calculate missing data ratio for each column in the training set
missing_ratio = train_df.isna().sum() / len(train_df)
print("\nMissing ratio per column in the training set:")
print(missing_ratio)

# Identify columns whose missing ratio exceeds MISSING_THRESHOLD.
# The decision is made on the training set only and then applied to both splits.
cols_to_drop = missing_ratio[missing_ratio > MISSING_THRESHOLD].index.tolist()
print(f"\nColumns to drop (more than {MISSING_THRESHOLD:.0%} missing):", cols_to_drop)

# Before dropping columns, print number of rows (the row count remains the same)
print("\nNumber of rows in training set before dropping columns:", train_df.shape[0])
print("Number of rows in validation set before dropping columns:", val_df.shape[0])

# Drop the sparse columns from both train and validation sets
train_df.drop(columns=cols_to_drop, inplace=True)
val_df.drop(columns=cols_to_drop, inplace=True)

# Print shapes after dropping columns to verify changes in number of columns
print("\nTrain dataset shape after dropping columns:", train_df.shape)
print("Validation dataset shape after dropping columns:", val_df.shape)


Initial train dataset shape: (139995, 45)
Initial validation dataset shape: (30005, 45)

Missing ratio per column in the training set:
ts                        0.000000
src_ip                    0.000000
src_port                  0.000000
dst_ip                    0.000000
dst_port                  0.000000
proto                     0.000000
service                   0.666360
duration                  0.000000
src_bytes                 0.000000
dst_bytes                 0.000000
conn_state                0.000000
missed_bytes              0.000000
src_pkts                  0.000000
src_ip_bytes              0.000000
dst_pkts                  0.000000
dst_ip_bytes              0.000000
dns_query                 0.832023
dns_qclass                0.000000
dns_qtype                 0.000000
dns_rcode                 0.000000
dns_AA                    0.830673
dns_RD                    0.830673
dns_RA                    0.830673
dns_rejected              0.830673
ssl_version              

  train_df.replace("-", np.nan, inplace=True)
  val_df.replace("-", np.nan, inplace=True)


In [4]:
# Per-column NaN counts for the training set
missing_counts = train_df.isna().sum()

# Keep only the columns that still contain at least one NaN
has_missing = missing_counts > 0
cols_with_missing = missing_counts[has_missing]

# Same counts expressed as a fraction of the number of training rows
n_train_rows = len(train_df)
missing_ratio = cols_with_missing / n_train_rows

print("Columns with missing values and their counts:")
print(cols_with_missing)
print("\nMissing ratio per column:")
print(missing_ratio)

Columns with missing values and their counts:
Series([], dtype: int64)

Missing ratio per column:
Series([], dtype: float64)


In [5]:
# Show the dtype pandas holds for every training column
print("Data types in the training set:")
print(train_df.dtypes)

# Object-dtype columns are treated as the categorical features
object_frame = train_df.select_dtypes(include=['object'])
categorical_columns = object_frame.columns.tolist()
print("\nCategorical features in the training set:")
print(categorical_columns)

# Ten most frequent values (NaN counted as a value) for each categorical column
for col in categorical_columns:
    top_values = train_df[col].value_counts(dropna=False).head(10)
    print(f"\nTop unique values in '{col}':")
    print(top_values)


Data types in the training set:
ts                          int64
src_ip                     object
src_port                    int64
dst_ip                     object
dst_port                    int64
proto                      object
duration                  float64
src_bytes                  object
dst_bytes                   int64
conn_state                 object
missed_bytes                int64
src_pkts                    int64
src_ip_bytes                int64
dst_pkts                    int64
dst_ip_bytes                int64
dns_qclass                  int64
dns_qtype                   int64
dns_rcode                   int64
http_request_body_len       int64
http_response_body_len      int64
http_status_code            int64
label                       int64
dtype: object

Categorical features in the training set:
['src_ip', 'dst_ip', 'proto', 'src_bytes', 'conn_state']

Top unique values in 'src_ip':
src_ip
192.168.1.195    22356
192.168.1.190    22163
192.168.1.152    1933

In [6]:
# High-cardinality columns to be label encoded: 'src_ip', 'dst_ip'
high_card_cols = ['src_ip', 'dst_ip']

for col in high_card_cols:
    # Convert column to string (if not already) so train and validation keys compare equal
    train_df[col] = train_df[col].astype(str)
    val_df[col] = val_df[col].astype(str)

    # Integer code per distinct value, assigned in order of first appearance in the training set
    unique_vals = train_df[col].unique()
    mapping = {val: idx for idx, val in enumerate(unique_vals)}

    # Apply mapping to the training set
    train_df[col] = train_df[col].map(mapping)

    # For the validation set, map the values and assign -1 for values never seen in training
    val_df[col] = val_df[col].map(mapping).fillna(-1).astype(int)

# One-hot encode low-cardinality columns using pd.get_dummies
low_card_cols = ['proto', 'conn_state']

# The training set defines the reference layout: its first level per column is dropped.
train_df = pd.get_dummies(train_df, columns=low_card_cols, drop_first=True)

# Encode every level present in the validation set (no drop_first here).
# drop_first removes the first level *of the frame being encoded*; if the
# validation split lacks the level that was dropped in training, a different
# level would be removed and later zero-filled by reindex, silently erasing a
# real category. Aligning to the training columns below discards the reference
# level and any level unseen in training instead.
val_df = pd.get_dummies(val_df, columns=low_card_cols)

# Ensure both train and validation have exactly the same columns, in the same order;
# dummy columns absent from the validation set are filled with 0
val_df = val_df.reindex(columns=train_df.columns, fill_value=0)

# Display the new shape and a few rows to confirm encoding
print("Train dataset shape after encoding:", train_df.shape)
print("Validation dataset shape after encoding:", val_df.shape)
print("\nSample of processed training data:")
print(train_df.head())


Train dataset shape after encoding: (139995, 34)
Validation dataset shape after encoding: (30005, 34)

Sample of processed training data:
           ts  src_ip  src_port  dst_ip  dst_port  duration src_bytes  \
0  1556027889       0     15544       0     64609  0.000130         0   
1  1556311612       1     43674       1        21  0.000149         0   
2  1556327860       2     50463       2      1900  0.618052       375   
3  1556548159       3     49754       3     41952  0.015973       201   
4  1556426221       4     49189       4      1880  0.000000         0   

   dst_bytes  missed_bytes  src_pkts  ...  conn_state_RSTOS0  conn_state_RSTR  \
0          0             0         1  ...              False            False   
1          0             0         1  ...              False            False   
2          0             0         3  ...              False            False   
3          0             0        22  ...              False            False   
4          0      

In [7]:
# Numeric features that are scaled later in the notebook
continuous_cols = [
    # connection-level fields
    'ts', 'src_port', 'dst_port', 'duration',
    # byte / packet counters
    'src_bytes', 'dst_bytes', 'missed_bytes',
    'src_pkts', 'src_ip_bytes', 'dst_pkts', 'dst_ip_bytes',
    # DNS fields
    'dns_qclass', 'dns_qtype', 'dns_rcode',
    # HTTP fields
    'http_request_body_len', 'http_response_body_len', 'http_status_code',
]

# Force every continuous column to a numeric dtype in both splits;
# values that cannot be parsed become NaN (errors='coerce')
for col in continuous_cols:
    for frame in (train_df, val_df):
        frame[col] = pd.to_numeric(frame[col], errors='coerce')

# Re-run outlier analysis
def calculate_outliers(df, column):
    """Count IQR-rule outliers in one column.

    The fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR, where Q1/Q3 are the 25th and
    75th percentiles of ``df[column]``. A value is an outlier when it lies
    strictly outside the fences; NaN values never compare as outliers.

    Returns:
        (lower_bound, upper_bound, number_of_outlier_rows, total_rows_in_df)
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = q3 - q1
    lower_bound = q1 - 1.5 * spread
    upper_bound = q3 + 1.5 * spread
    # Boolean mask of rows beyond either fence; its sum is the outlier count
    beyond_fences = (values < lower_bound) | (values > upper_bound)
    return lower_bound, upper_bound, int(beyond_fences.sum()), len(df)

# Report the IQR fences and the outlier share for every continuous training column
print("Outlier Analysis using IQR method:")
for col in continuous_cols:
    lower, upper, outlier_count, total_count = calculate_outliers(train_df, col)
    percent_outliers = (outlier_count / total_count) * 100
    print(
        f"Column: {col}",
        f"  Lower Bound: {lower:.2f}, Upper Bound: {upper:.2f}",
        f"  Outliers: {outlier_count} out of {total_count} ({percent_outliers:.2f}%)\n",
        sep="\n",
    )


Outlier Analysis using IQR method:
Column: ts
  Lower Bound: 1555436379.75, Upper Bound: 1557015997.75
  Outliers: 28186 out of 139995 (20.13%)

Column: src_port
  Lower Bound: -55197.25, Upper Bound: 116520.75
  Outliers: 0 out of 139995 (0.00%)

Column: dst_port
  Lower Bound: -15553.00, Upper Bound: 26135.00
  Outliers: 24015 out of 139995 (17.15%)

Column: duration
  Lower Bound: -0.07, Upper Bound: 0.12
  Outliers: 32410 out of 139995 (23.15%)

Column: src_bytes
  Lower Bound: -64.50, Upper Bound: 107.50
  Outliers: 23580 out of 139995 (16.84%)

Column: dst_bytes
  Lower Bound: -111.00, Upper Bound: 185.00
  Outliers: 27327 out of 139995 (19.52%)

Column: missed_bytes
  Lower Bound: 0.00, Upper Bound: 0.00
  Outliers: 650 out of 139995 (0.46%)

Column: src_pkts
  Lower Bound: -2.00, Upper Bound: 6.00
  Outliers: 19708 out of 139995 (14.08%)

Column: src_ip_bytes
  Lower Bound: -480.50, Upper Bound: 907.50
  Outliers: 20585 out of 139995 (14.70%)

Column: dst_pkts
  Lower Bound: -1

In [8]:
from sklearn.preprocessing import RobustScaler

# Scale the continuous columns with RobustScaler.
# The scaler statistics are learned from the training split only and then
# reused for the validation split. Both frames are overwritten in place, so
# re-running this cell would scale already-scaled data.
robust_scaler = RobustScaler()
robust_scaler.fit(train_df[continuous_cols])

train_df[continuous_cols] = robust_scaler.transform(train_df[continuous_cols])
val_df[continuous_cols] = robust_scaler.transform(val_df[continuous_cols])

# Summary statistics of the scaled training features
scaled_summary = train_df[continuous_cols].describe()
print("Summary statistics for scaled continuous features (training set) using RobustScaler:")
print(scaled_summary)


Summary statistics for scaled continuous features (training set) using RobustScaler:
                  ts       src_port       dst_port      duration  \
count  139995.000000  139995.000000  139995.000000  1.399950e+05   
mean       -0.844077      -0.186926       0.883975  1.262646e+02   
std         2.045427       0.498449       1.685476  5.441389e+03   
min        -5.089552      -1.013988      -0.182307 -4.522646e-04   
25%        -0.454562      -0.799753      -0.174631 -4.522646e-04   
50%         0.000000       0.000000       0.000000  0.000000e+00   
75%         0.545438       0.200247       0.825369  9.995477e-01   
max         0.863275       0.512561       6.105258  1.734747e+06   

          src_bytes     dst_bytes  missed_bytes       src_pkts   src_ip_bytes  \
count  1.398890e+05  1.399950e+05  1.399950e+05  139995.000000  139995.000000   
mean   4.101174e+03  2.454458e+03  7.712455e+03      10.895193       7.031493   
std    3.079419e+05  2.216291e+05  1.406378e+06     878.318

In [9]:
# Count NaN values per column in both splits
missing_counts_train = train_df.isna().sum()
missing_counts_val = val_df.isna().sum()

# Training split
print("Missing values in the training set:")
print(missing_counts_train)

# Validation split
print("\nMissing values in the validation set:")
print(missing_counts_val)


Missing values in the training set:
ts                          0
src_ip                      0
src_port                    0
dst_ip                      0
dst_port                    0
duration                    0
src_bytes                 106
dst_bytes                   0
missed_bytes                0
src_pkts                    0
src_ip_bytes                0
dst_pkts                    0
dst_ip_bytes                0
dns_qclass                  0
dns_qtype                   0
dns_rcode                   0
http_request_body_len       0
http_response_body_len      0
http_status_code            0
label                       0
proto_tcp                   0
proto_udp                   0
conn_state_REJ              0
conn_state_RSTO             0
conn_state_RSTOS0           0
conn_state_RSTR             0
conn_state_RSTRH            0
conn_state_S0               0
conn_state_S1               0
conn_state_S2               0
conn_state_S3               0
conn_state_SF               0
conn

In [11]:
# Re-read the untouched training CSV and mark "-" placeholders as NaN, as in the first loading cell
orig_train_df = pd.read_csv('/content/drive/MyDrive/Group5/Dataset2/train_70pct.csv')
orig_train_df.replace("-", np.nan, inplace=True)

# Numeric view of 'src_bytes'; anything unparseable turns into NaN
raw_src_bytes = orig_train_df['src_bytes']
converted_src_bytes = pd.to_numeric(raw_src_bytes, errors='coerce')

# Rows that had a value originally but lost it during conversion, i.e. non-numeric text
lost_in_conversion = converted_src_bytes.isna() & raw_src_bytes.notna()
problem_rows = orig_train_df[lost_in_conversion]

# Show the offending raw values
print("Rows with problematic 'src_bytes' values:")
print(problem_rows[['src_bytes']])


Rows with problematic 'src_bytes' values:
       src_bytes
2010     0.0.0.0
2043     0.0.0.0
7552     0.0.0.0
11137    0.0.0.0
11919    0.0.0.0
...          ...
127332   0.0.0.0
131910   0.0.0.0
135663   0.0.0.0
136459   0.0.0.0
136600   0.0.0.0

[106 rows x 1 columns]


  orig_train_df.replace("-", np.nan, inplace=True)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report

# Split each frame into the 'label' target and the remaining feature columns
y_train = train_df['label']
X_train = train_df.drop(columns=['label'])

y_val = val_df['label']
X_val = val_df.drop(columns=['label'])

# Median imputation (for remaining NaNs) followed by logistic regression
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('lr', LogisticRegression(max_iter=1000)),
])

# Hyper-parameter grid: 5 C values x 2 class_weight settings = 10 candidates
param_grid = dict(
    lr__C=[0.01, 0.1, 1, 10, 100],
    lr__penalty=['l2'],
    lr__solver=['lbfgs'],  # lbfgs supports the l2 penalty
    lr__class_weight=[None, 'balanced'],
)

# Shuffled, stratified 5-fold split with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=89)

# Grid search scored by F1, using all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Best configuration and its mean cross-validated F1
print("Best parameters:", grid_search.best_params_)
print("Best CV F1 score:", grid_search.best_score_)

# Hold-out evaluation of the refitted best pipeline
y_pred = grid_search.predict(X_val)
print("\nValidation set classification report:")
print(classification_report(y_val, y_pred))

# Persist the best pipeline (imputer + classifier) to Drive
joblib.dump(grid_search.best_estimator_, '/content/drive/MyDrive/Group5/Dataset2/logistic_regression_model.pkl')
print("Logistic Regression model saved as logistic_regression_model.pkl")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:

# Re-save the fitted logistic-regression pipeline to Drive; this repeats the
# joblib.dump call at the end of the logistic-regression training cell.
joblib.dump(grid_search.best_estimator_, '/content/drive/MyDrive/Group5/Dataset2/logistic_regression_model.pkl')

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report

# Median imputation followed by a LightGBM classifier (fixed seed)
pipeline_lgbm = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('lgbm', LGBMClassifier(random_state=89)),
])

# Shuffled, stratified 5-fold split with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=89)

# Hyper-parameter grid: 2 x 2 x 2 x 2 = 16 candidates
param_grid_lgbm = dict(
    lgbm__num_leaves=[31, 50],
    lgbm__learning_rate=[0.1, 0.05],
    lgbm__n_estimators=[100, 200],
    lgbm__class_weight=[None, 'balanced'],
)

# Grid search scored by F1, using all available cores
grid_search_lgbm = GridSearchCV(
    estimator=pipeline_lgbm,
    param_grid=param_grid_lgbm,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
    verbose=1,
)
grid_search_lgbm.fit(X_train, y_train)

# Best configuration and its mean cross-validated F1
print("Best parameters for LightGBM:", grid_search_lgbm.best_params_)
print("Best CV F1 score for LightGBM:", grid_search_lgbm.best_score_)

# Hold-out evaluation of the refitted best pipeline
y_pred_lgbm = grid_search_lgbm.predict(X_val)
print("\nValidation set classification report for LightGBM:")
print(classification_report(y_val, y_pred_lgbm))

# Persist the best pipeline to Drive
joblib.dump(grid_search_lgbm.best_estimator_, '/content/drive/MyDrive/Group5/Dataset2/lightgbm_model.pkl')
print("LightGBM model saved as lightgbm_model.pkl")


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
import joblib

# Median imputation followed by Gaussian Naive Bayes
pipeline_nb = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('nb', GaussianNB()),
])

# Shuffled, stratified 5-fold split with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=89)

# Only var_smoothing is tuned: 3 candidates
param_grid_nb = dict(nb__var_smoothing=[1e-9, 1e-8, 1e-7])

# Grid search scored by F1, using all available cores
grid_search_nb = GridSearchCV(
    estimator=pipeline_nb,
    param_grid=param_grid_nb,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
    verbose=1,
)
grid_search_nb.fit(X_train, y_train)

# Best configuration and its mean cross-validated F1
print("Best parameters for Gaussian Naive Bayes:", grid_search_nb.best_params_)
print("Best CV F1 score for Gaussian Naive Bayes:", grid_search_nb.best_score_)

# Hold-out evaluation of the refitted best pipeline
y_pred_nb = grid_search_nb.predict(X_val)
print("\nValidation set classification report for Gaussian Naive Bayes:")
print(classification_report(y_val, y_pred_nb))

# Persist the best pipeline to Drive
joblib.dump(grid_search_nb.best_estimator_, '/content/drive/MyDrive/Group5/Dataset2/gaussian_nb_model.pkl')
print("Gaussian Naive Bayes model saved as gaussian_nb_model.pkl")


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
import joblib

# Median imputation followed by a multi-layer perceptron (fixed seed, up to 500 iterations)
pipeline_mlp = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('mlp', MLPClassifier(max_iter=500, random_state=89)),
])

# Shuffled, stratified 5-fold split with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=89)

# Hyper-parameter grid: 3 architectures x 3 alphas x 2 activations = 18 candidates
param_grid_mlp = dict(
    mlp__hidden_layer_sizes=[(50,), (100,), (50, 50)],
    mlp__alpha=[0.0001, 0.001, 0.01],
    mlp__activation=['relu', 'tanh'],
    mlp__solver=['adam'],
)

# Grid search scored by F1, using all available cores
grid_search_mlp = GridSearchCV(
    estimator=pipeline_mlp,
    param_grid=param_grid_mlp,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
    verbose=1,
)
grid_search_mlp.fit(X_train, y_train)

# Best configuration and its mean cross-validated F1
print("Best parameters for MLP:", grid_search_mlp.best_params_)
print("Best CV F1 score for MLP:", grid_search_mlp.best_score_)

# Hold-out evaluation of the refitted best pipeline
y_pred_mlp = grid_search_mlp.predict(X_val)
print("\nValidation set classification report for MLP:")
print(classification_report(y_val, y_pred_mlp))

# Persist the best pipeline to Drive
joblib.dump(grid_search_mlp.best_estimator_, '/content/drive/MyDrive/Group5/Dataset2/mlp_model.pkl')
print("MLP model saved as mlp_model.pkl")


In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
import joblib

# Median imputation followed by a support-vector classifier (fixed seed)
pipeline_svm = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('svm', SVC(random_state=89)),
])

# Shuffled, stratified 5-fold split with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=89)

# Hyper-parameter grid (RBF kernel only): 3 C values x 2 gammas x 2 class weights = 12 candidates
param_grid_svm = dict(
    svm__C=[0.1, 1, 10],
    svm__kernel=['rbf'],
    svm__gamma=['scale', 'auto'],
    svm__class_weight=[None, 'balanced'],
)

# Grid search scored by F1, using all available cores
grid_search_svm = GridSearchCV(
    estimator=pipeline_svm,
    param_grid=param_grid_svm,
    scoring='f1',
    cv=skf,
    n_jobs=-1,
    verbose=1,
)
grid_search_svm.fit(X_train, y_train)

# Best configuration and its mean cross-validated F1
print("Best parameters for SVM:", grid_search_svm.best_params_)
print("Best CV F1 score for SVM:", grid_search_svm.best_score_)

# Hold-out evaluation of the refitted best pipeline
y_pred_svm = grid_search_svm.predict(X_val)
print("\nValidation set classification report for SVM:")
print(classification_report(y_val, y_pred_svm))

# Persist the best pipeline to Drive
joblib.dump(grid_search_svm.best_estimator_, '/content/drive/MyDrive/Group5/Dataset2/svm_model.pkl')
print("SVM model saved as svm_model.pkl")


In [None]:
import pandas as pd
from sklearn.metrics import f1_score

# Fitted grid-search object for every model trained above, keyed by display name
models = dict([
    ('Logistic Regression', grid_search),
    ('LightGBM', grid_search_lgbm),
    ('Gaussian Naive Bayes', grid_search_nb),
    ('MLP', grid_search_mlp),
    ('SVM', grid_search_svm),
])

# One summary record per model
results = {}
for name, grid in models.items():
    # Mean cross-validated F1 of the winning configuration
    best_cv_f1 = grid.best_score_

    # F1 on the hold-out validation split (binary averaging)
    y_pred = grid.predict(X_val)
    val_f1 = f1_score(y_val, y_pred, average='binary')

    results[name] = {
        'Best CV F1': best_cv_f1,
        'Validation F1': val_f1,
        'Best Params': grid.best_params_,
    }

# Tabulate the records, best validation F1 first
results_df = (
    pd.DataFrame.from_dict(results, orient='index')
    .sort_values(by='Validation F1', ascending=False)
)

print("Model Ranking based on Validation F1 Score:")
print(results_df)
