In [2]:
#!pip install xgboost

In [3]:
import pandas as pd

data_filename = "kddcup.data_10_percent.txt"
names_filename = "kddcup.names"
num_columns = 41

# Read the kddcup.names file to extract the column names
with open(names_filename, "r") as names_file:
    lines = names_file.readlines()
    column_lines = [line.strip() for line in lines if ":" in line]
    column_names = [line.split(":")[0] for line in column_lines]

data = []

with open(data_filename, "r") as file:
    for line in file:
        try:
            # Split the line into individual values
            values = line.strip().split(",")
            
            # Append only the first 42 values to the data list
            data.append(values[:num_columns])
        except Exception as e:
            # Skip the error line and continue to the next line
            print(f"Error: {e}. Skipping line: {line}")
            continue

# Create a DataFrame from the data list with only the first 42 columns
df = pd.DataFrame(data)
df.columns = column_names[:num_columns]

# Print the resulting DataFrame
#df

In [4]:
# Create an empty list to store the inferred numeric column names
numeric_columns = []

# Iterate over each column in the DataFrame
for column in df.columns:
    # Check if the column can be converted to numeric
    if pd.to_numeric(df[column], errors='coerce').notnull().all():
        # Add the column to the list of numeric columns
        numeric_columns.append(column)

# Print the inferred numeric column names
print(numeric_columns)

['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']


In [5]:
# Convert numeric columns to their appropriate data types
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column])

# Print the updated data types
#print(df.dtypes)

In [6]:
# Handling missing values
df.dropna(inplace=True)

# Removing duplicates
df.drop_duplicates(inplace=True)

# Handling outliers (example: removing rows with very large values in 'src_bytes')
df['src_bytes'] = pd.to_numeric(df['src_bytes'], errors='coerce')
threshold = 1e6
df = df.loc[df['src_bytes'] < threshold, :]

# Print the cleaned DataFrame
#df

In [7]:
# Select only categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Perform one-hot encoding on categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_columns)

df = df_encoded.copy()
# Print the encoded DataFrame
#df_encoded

In [8]:
print(df.dtypes)

duration                   int64
src_bytes                  int64
dst_bytes                  int64
land                       int64
wrong_fragment             int64
                           ...  
num_file_creations_http     bool
num_shells_0                bool
num_shells_1                bool
num_shells_2                bool
num_shells_SF               bool
Length: 159, dtype: object


In [9]:
# Select the numeric columns from the DataFrame
numeric_columns = df.select_dtypes(include=[float, int]).columns

# Compute the correlation matrix
corr_matrix = df[numeric_columns].corr()

# Reset the index of the correlation matrix
corr_matrix = corr_matrix.reset_index()

# Melt the correlation matrix dataframe into two columns
melted_corr = pd.melt(corr_matrix, id_vars='index', var_name='column', value_name='correlation')

# Rename the index column
melted_corr = melted_corr.rename(columns={'index': 'variable1'})

# Sort the melted correlation dataframe
sorted_corr = melted_corr.sort_values('correlation')

# Save correlation results to an excel file
sorted_corr.to_excel("correlation_table.xlsx")


In [10]:
import numpy as np
import pandas as pd

# Select the numeric columns from the DataFrame
numeric_columns = df.select_dtypes(include=[np.number]).columns

# Compute the correlation matrix
corr_matrix = df[numeric_columns].corr()

# Create a DataFrame to store the correlated variables and correlation value
drop_columns_df = pd.DataFrame(columns=['column_to_drop', 'correlated_column', 'correlation'])

# Set correlation threshold
correlation_threshold = 0.85

# Traverse the correlation matrix
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if abs(corr_matrix.iloc[i, j]) > correlation_threshold:
            colname = corr_matrix.columns[i]  # Getting the name of column
            correlated_column = corr_matrix.columns[j]
            correlation_value = abs(corr_matrix.iloc[i, j])
            # Add correlated column pair and value to drop_columns_df
            new_row = pd.DataFrame({'column_to_drop': [colname], 'correlated_column': [correlated_column], 'correlation': [correlation_value]})
            drop_columns_df = pd.concat([drop_columns_df, new_row], ignore_index=True)
            if colname in df.columns:
                del df[colname]  # Deleting the column from the original dataset

# Print the dropped columns along with their correlated pairs
print(drop_columns_df)

# Save correlation results to an excel file
drop_columns_df.to_excel("dropped_correlation_table.xlsx")


              column_to_drop     correlated_column  correlation
0          num_outbound_cmds      num_access_files     0.997076
1             is_guest_login                   hot     0.850921
2            srv_serror_rate           serror_rate     0.996346
3            srv_rerror_rate           rerror_rate     0.991316
4     dst_host_same_srv_rate         same_srv_rate     0.858563
5     dst_host_same_srv_rate    dst_host_srv_count     0.937125
6       dst_host_serror_rate           serror_rate     0.995520
7       dst_host_serror_rate       srv_serror_rate     0.994084
8   dst_host_srv_serror_rate           serror_rate     0.995151
9   dst_host_srv_serror_rate       srv_serror_rate     0.998334
10  dst_host_srv_serror_rate  dst_host_serror_rate     0.994574
11      dst_host_rerror_rate           rerror_rate     0.975529
12      dst_host_rerror_rate       srv_rerror_rate     0.967391
13  dst_host_srv_rerror_rate           rerror_rate     0.972647
14  dst_host_srv_rerror_rate       srv_r

In [13]:
df1

NameError: name 'df1' is not defined

In [15]:
# Get list of columns to drop
#columns_to_drop_list = drop_columns_df['column_to_drop'].tolist()

# Drop columns from original DataFrame
#df1 = df.drop(columns=columns_to_drop_list)

In [16]:
import pandas as pd
import numpy as np

# Compute the correlation matrix
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find index of feature columns with correlation greater than a threshold (for example, 0.85)
to_drop = [column for column in upper.columns if any(upper[column] > 0.85)]

# Drop the highly correlated features 
df_reduced = df.drop(df[to_drop], axis=1)


In [17]:
df = df_reduced.copy()

In [20]:
from itertools import chain, combinations
from pyod.models.iforest import IForest
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

def all_subsets(features, min_features, max_features):
    # Generate all subsets of the feature set from size min_features to size max_features
    return chain(*map(lambda x: combinations(features, x), range(min_features, min(len(features), max_features) + 1)))

def calculate_score(model, data):
    # Calculate a score for the fitted model
    # This could be a function of model.decision_scores_, or any other metric
    # In this example, we use the mean anomaly score
    return np.mean(model.decision_function(data))

best_score = -np.inf
best_features = None
min_features = 1  # Minimum number of features in a subset
max_features = 1  # Maximum number of features in a subset

for feature_subset in all_subsets(df.columns, min_features, max_features):
    subset = df[list(feature_subset)]
    model = IForest()
    model.fit(subset)
    score = calculate_score(model, subset)

    if score > best_score:
        best_score = score
        best_features = feature_subset

print("Best features:", best_features)


Best features: ('duration',)


In [22]:
data

[['0',
  'tcp',
  'http',
  'SF',
  '181',
  '5450',
  '0',
  '0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '8',
  '8',
  '0.00',
  '0.00',
  '0.00',
  '0.00',
  '1.00',
  '0.00',
  '0.00',
  '9',
  '9',
  '1.00',
  '0.00',
  '0.11',
  '0.00',
  '0.00',
  '0.00',
  '0.00',
  '0.00'],
 ['0',
  'tcp',
  'http',
  'SF',
  '239',
  '486',
  '0',
  '0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '8',
  '8',
  '0.00',
  '0.00',
  '0.00',
  '0.00',
  '1.00',
  '0.00',
  '0.00',
  '19',
  '19',
  '1.00',
  '0.00',
  '0.05',
  '0.00',
  '0.00',
  '0.00',
  '0.00',
  '0.00'],
 ['0',
  'tcp',
  'http',
  'SF',
  '235',
  '1337',
  '0',
  '0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '8',
  '8',
  '0.00',
  '0.00',
  '0.00',
  '0.00',
  '1.00',
  '0.00',
  '0.00',
  '29',
  '29',
  '1.00',
  '0.00',
  '0.03',
  '0.00',
  '0.0