In [None]:
import pandas as pd

# Source CSV to read and destination CSV to write
file_path = 'kddcup_origin.csv'
output_file_path = 'kddcup_filtered.csv'

# Read the source file and keep only the rows whose label is neither
# 'smurf.' nor 'neptune.' (the "big sized" attacks being removed)
df = pd.read_csv(file_path).loc[
    lambda frame: ~frame['label'].isin(['smurf.', 'neptune.'])
]

# Write the remaining rows out, omitting the DataFrame index column
df.to_csv(output_file_path, index=False)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the data
data = pd.read_csv('kddcup_filtered.csv')

# Define features and target
X = data.drop('label', axis=1)
y = data['label']

# Identify rare classes: labels with fewer than 3 rows (threshold can be adjusted)
label_counts = y.value_counts()
rare_classes = label_counts[label_counts < 3].index

# Separate rare classes from the rest
is_rare = y.isin(rare_classes)
rare_indices = y[is_rare].index
common_indices = y[~is_rare].index

X_rare = X.loc[rare_indices]
y_rare = y.loc[rare_indices]
X_common = X.loc[common_indices]
y_common = y.loc[common_indices]

# Stratified split for common data (80% train / 20% test).
# n_splits=1, so the generator yields exactly one (train, test) pair.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(X_common, y_common))
X_train_common, X_test_common = X_common.iloc[train_index], X_common.iloc[test_index]
y_train_common, y_test_common = y_common.iloc[train_index], y_common.iloc[test_index]

# Split rare data so that each rare class is represented in both sets whenever
# it has at least two rows. A plain (unstratified) train_test_split cannot
# guarantee that, and it raises when there are zero or one rare rows.
# Instead: shuffle reproducibly, number the rows within each class, and send
# even-numbered rows to train and odd-numbered rows to test. A class with a
# single row therefore ends up in the training set only.
y_rare_shuffled = y_rare.sample(frac=1, random_state=42)
rank_in_class = y_rare_shuffled.groupby(y_rare_shuffled).cumcount()
goes_to_train = rank_in_class % 2 == 0

y_train_rare = y_rare_shuffled[goes_to_train]
y_test_rare = y_rare_shuffled[~goes_to_train]
# Select the feature rows by the same index labels so X and y stay aligned
X_train_rare = X_rare.loc[y_train_rare.index]
X_test_rare = X_rare.loc[y_test_rare.index]

# Combine common and rare data
X_train = pd.concat([X_train_common, X_train_rare])
y_train = pd.concat([y_train_common, y_train_rare])
X_test = pd.concat([X_test_common, X_test_rare])
y_test = pd.concat([y_test_common, y_test_rare])

# Define numerical and categorical features
numeric_features = [
    'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
]
categorical_features = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login']

# Preprocessing pipeline for numerical features: standardize each column
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Preprocessing pipeline for categorical features: one-hot encode; categories
# that appear only in the test data are ignored rather than raising an error
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing pipelines.
# sparse_threshold=0 forces a dense array as output. With the default (0.3)
# the result silently becomes a scipy sparse matrix once the one-hot columns
# push the overall density below the threshold, and the DataFrame construction
# below expects a dense 2-D array.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)],
    sparse_threshold=0)

# Fit on the training data only, then apply the fitted transform to the test data
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Convert transformed arrays back to DataFrames (column names come from the
# fitted preprocessor and are the same for both sets)
feature_names = preprocessor.get_feature_names_out()
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test_transformed_df = pd.DataFrame(X_test_transformed, columns=feature_names)

# Add the target column back to the transformed data. The transformed frames
# have a fresh 0..n-1 index, so the target index is reset to line up by position.
train_df = pd.concat([X_train_transformed_df, y_train.reset_index(drop=True)], axis=1)
test_df = pd.concat([X_test_transformed_df, y_test.reset_index(drop=True)], axis=1)

# Save the preprocessed data to CSV files
train_df.to_csv('kdd_train.csv', index=False)
test_df.to_csv('kdd_test.csv', index=False)