In [3]:
import pandas as pd

# Locations of the two input CSV files. Both are relative paths, so they are
# resolved against the kernel's current working directory; replace them with
# the actual paths to your datasets.
csic_2010_path = 'csic_final.csv'
pkdd_2007_path = 'ecml_final.csv'

# Load each CSV into its own DataFrame using pandas' default parsing options.
csic_2010_df = pd.read_csv(csic_2010_path)
pkdd_2007_df = pd.read_csv(pkdd_2007_path)


In [6]:
# List the column labels of the CSIC 2010 frame to see which request fields are available.
print(csic_2010_df.columns)

Index(['Class', 'Method', 'URI', 'Host-Header', 'Host', 'Connection', 'Accept',
       'Accept-Charset', 'Accept-Language', 'Cache-control', 'Cookie',
       'Pragma', 'User-Agent', 'Content-Length', 'Content-Type', 'POST-Data',
       'GET-Query'],
      dtype='object')


In [7]:
# List the column labels of the PKDD 2007 frame for comparison with the CSIC 2010 frame.
print(pkdd_2007_df.columns)


Index(['Class', 'Method', 'URI', 'Host-Header', 'Host', 'Connection', 'Accept',
       'Accept-Charset', 'Accept-Language', 'Cache-control', 'Cookie',
       'Pragma', 'User-Agent', 'Content-Length', 'Content-Type', 'POST-Data',
       'GET-Query'],
      dtype='object')


In [8]:
def combine_request_columns(df):
    """
    Combine relevant columns into a single HTTP request string.

    The Method, URI, POST-Data and GET-Query columns are converted to text
    and joined with single spaces, in that order. Missing values in any of
    the four columns become empty strings, so a missing field never shows up
    as a literal "nan"/"None" token in the combined text. The separators are
    always emitted, so an empty field leaves consecutive or trailing spaces.

    Args:
        df (pd.DataFrame): The dataset containing the HTTP request components.
            Must provide the 'Method', 'URI', 'POST-Data' and 'GET-Query' columns.

    Returns:
        pd.Series: Series with combined HTTP request strings, one per row of ``df``.

    Raises:
        KeyError: If one of the four required columns is absent from ``df``.
    """
    request_columns = ['Method', 'URI', 'POST-Data', 'GET-Query']

    # Blank out missing values *before* the string cast; casting first would
    # turn them into the text "nan"/"None", which the tokenizer would count.
    parts = [df[column].fillna('').astype(str) for column in request_columns]

    combined_requests = parts[0]
    for part in parts[1:]:
        combined_requests = combined_requests + ' ' + part

    return combined_requests



In [9]:
# Build one combined request string per row for each dataset; these Series are
# the text input for the vectorizer cells that follow.
csic_2010_combined_requests = combine_request_columns(csic_2010_df)
pkdd_2007_combined_requests = combine_request_columns(pkdd_2007_df)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

def tokenize_and_extract_features_from_combined(df, combined_requests):
    """
    Turn combined HTTP request strings into a bag-of-words count matrix.

    A new CountVectorizer is fitted on ``combined_requests`` on every call,
    so the vocabulary (and therefore the column layout) is specific to the
    strings passed in. The result is left sparse.

    Args:
        df (pd.DataFrame): The original dataset. Not referenced by the body;
            present only as part of the call signature.
        combined_requests (pd.Series): The combined HTTP request strings.

    Returns:
        tuple: ``(matrix, names)`` where ``matrix`` is the sparse token-count
        matrix and ``names`` holds the vocabulary terms in column order.
    """
    # Custom token pattern: any run of word characters between word boundaries.
    bag_of_words = CountVectorizer(token_pattern=r'\b\w+\b')
    count_matrix = bag_of_words.fit_transform(combined_requests)

    # The matrix stays sparse; only the vocabulary is materialised.
    return count_matrix, bag_of_words.get_feature_names_out()

# Apply to each dataset. Every call fits its own vectorizer, so the two
# matrices have independent vocabularies and their columns are not comparable.
csic_2010_features, csic_2010_feature_names = tokenize_and_extract_features_from_combined(csic_2010_df, csic_2010_combined_requests)
pkdd_2007_features, pkdd_2007_feature_names = tokenize_and_extract_features_from_combined(pkdd_2007_df, pkdd_2007_combined_requests)

# Display sparse matrix shapes for both datasets and the first ten CSIC 2010
# vocabulary terms for debugging.
print("CSIC 2010 features shape:", csic_2010_features.shape)
print("PKDD 2007 features shape:", pkdd_2007_features.shape)
print("Feature names (first 10):", csic_2010_feature_names[:10])


CSIC 2010 features shape: (61065, 32746)
PKDD 2007 features shape: (23892, 435297)
Feature names (first 10): ['0' '00041295x' '0004533796646501' '0007950776968836' '00098872h' '001'
 '00101492q' '0010509413049118' '00151522k' '0017018534732457']


In [12]:
def tokenize_and_extract_features_with_limit(df, combined_requests, max_features=10000):
    """
    Build a bag-of-words count matrix with a capped vocabulary size.

    A new CountVectorizer is fitted on ``combined_requests`` on every call,
    with ``max_features`` forwarded to it unchanged.

    Args:
        df (pd.DataFrame): The original dataset. Not referenced by the body;
            present only as part of the call signature.
        combined_requests (pd.Series): The combined HTTP request strings.
        max_features (int): Maximum number of features to consider.

    Returns:
        tuple: ``(matrix, names)`` where ``matrix`` is the sparse token-count
        matrix and ``names`` holds the retained vocabulary terms in column order.
    """
    # Same token pattern as the unlimited variant; only the vocabulary cap differs.
    capped_vectorizer = CountVectorizer(
        token_pattern=r'\b\w+\b',
        max_features=max_features,
    )
    count_matrix = capped_vectorizer.fit_transform(combined_requests)

    return count_matrix, capped_vectorizer.get_feature_names_out()

# Apply with reduced vocabulary size (the function's default max_features).
# NOTE: this re-assigns the same *_features / *_feature_names variables that the
# unlimited extraction cell assigns, so whichever of the two cells ran last
# determines what later cells see.
csic_2010_features, csic_2010_feature_names = tokenize_and_extract_features_with_limit(csic_2010_df, csic_2010_combined_requests)
pkdd_2007_features, pkdd_2007_feature_names = tokenize_and_extract_features_with_limit(pkdd_2007_df, pkdd_2007_combined_requests)

# Display sparse matrix shapes for both datasets and the first ten CSIC 2010
# vocabulary terms for debugging.
print("CSIC 2010 features shape:", csic_2010_features.shape)
print("PKDD 2007 features shape:", pkdd_2007_features.shape)
print("Feature names (first 10):", csic_2010_feature_names[:10])


CSIC 2010 features shape: (61065, 10000)
PKDD 2007 features shape: (23892, 10000)
Feature names (first 10): ['0' '0010509413049118' '0034994508490764' '0040408141488994' '00423300p'
 '0070151205403865' '00776957v' '0088492463086794' '0093197595343977'
 '01039']


In [13]:
def combine_request_columns(df):
    """
    Combine relevant columns into a single HTTP request string.

    NOTE: a function with this name is also defined in an earlier cell; when
    the cells run in order, this later definition is the one that stays bound.

    The Method, URI, POST-Data and GET-Query columns are converted to text
    and joined with single spaces, in that order. Missing values in any of
    the four columns become empty strings, so a missing field never shows up
    as a literal "nan"/"None" token in the combined text. The separators are
    always emitted, so an empty field leaves consecutive or trailing spaces.

    Args:
        df (pd.DataFrame): The dataset containing the HTTP request components.
            Must provide the 'Method', 'URI', 'POST-Data' and 'GET-Query' columns.

    Returns:
        pd.Series: Series with combined HTTP request strings, one per row of ``df``.

    Raises:
        KeyError: If one of the four required columns is absent from ``df``.
    """
    request_columns = ['Method', 'URI', 'POST-Data', 'GET-Query']

    # Blank out missing values *before* the string cast; casting first would
    # turn them into the text "nan"/"None", which the tokenizer would count.
    parts = [df[column].fillna('').astype(str) for column in request_columns]

    combined_requests = parts[0]
    for part in parts[1:]:
        combined_requests = combined_requests + ' ' + part

    return combined_requests

# Rebuild the combined request strings with the definition above. This
# re-assigns the *_combined_requests variables that an earlier cell also assigns.
csic_2010_combined_requests = combine_request_columns(csic_2010_df)
pkdd_2007_combined_requests = combine_request_columns(pkdd_2007_df)



In [15]:
from sklearn.feature_selection import mutual_info_classif
import numpy as np
import pandas as pd


In [19]:
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

def calculate_information_gain(features, target, feature_names):
    """
    Calculate the information gain (mutual information) for each feature.

    The column count of ``features`` is compared with the number of feature
    names before any computation starts, so a matrix and a vocabulary that
    come from different vectorizer fits are rejected immediately instead of
    after the mutual-information pass.

    Args:
        features (scipy.sparse matrix): The feature matrix.
        target (pd.Series): The target variable (Class labels).
        feature_names (list): List of feature names corresponding to the columns in the feature matrix.

    Returns:
        pd.Series: A series with feature names as the index and their corresponding information gain.

    Raises:
        ValueError: If the number of columns in ``features`` differs from
            ``len(feature_names)``.
    """
    # Fail fast on a matrix/vocabulary mismatch. Inputs without a 2-D
    # ``shape`` attribute skip this check and are handled as before.
    shape = getattr(features, "shape", None)
    if shape is not None and len(shape) == 2 and shape[1] != len(feature_names):
        raise ValueError(
            f"features has {shape[1]} columns but {len(feature_names)} feature names were given; "
            "both must come from the same fitted vectorizer."
        )

    # Counts are treated as discrete variables.
    info_gain = mutual_info_classif(features, target, discrete_features=True)

    # Return the information gain with the feature names as the index
    return pd.Series(info_gain, index=feature_names)


# Before calculating mutual information, check that each feature matrix lines
# up with its vocabulary. NOTE(review): the recorded output of this cell shows a
# 32746-column matrix next to 10000 names, which suggests the two variables were
# produced by different vectorizer cells (out-of-order execution) - confirm by
# re-running the notebook top to bottom.
print(f"Number of features in matrix: {csic_2010_features.shape[1]}")
print(f"Length of feature names: {len(csic_2010_feature_names)}")

# Ensure lengths match before proceeding - for both datasets, since both are
# passed to the information-gain calculation below.
assert csic_2010_features.shape[1] == len(csic_2010_feature_names), "Mismatch in feature count and feature names length."
assert pkdd_2007_features.shape[1] == len(pkdd_2007_feature_names), "Mismatch in PKDD 2007 feature count and feature names length."

# Proceed with the calculation
csic_2010_info_gain = calculate_information_gain(csic_2010_features, csic_2010_df['Class'], csic_2010_feature_names)
pkdd_2007_info_gain = calculate_information_gain(pkdd_2007_features, pkdd_2007_df['Class'], pkdd_2007_feature_names)

# Example usage with your dataset



Number of features in matrix: 32746
Length of feature names: 10000


AssertionError: Mismatch in feature count and feature names length.

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

# Fit a fresh vectorizer (no vocabulary cap) on the CSIC 2010 request strings so
# that the count matrix X and the feature names below come from the same fit.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
X = vectorizer.fit_transform(csic_2010_combined_requests)  # csic_2010_combined_requests: combined request strings built in an earlier cell

# Take the feature names from this fitted vectorizer; this re-assigns
# csic_2010_feature_names so it matches the columns of X.
csic_2010_feature_names = vectorizer.get_feature_names_out()

# Now you can calculate information gain
def calculate_information_gain(features, target, feature_names):
    """
    Calculate the mutual information between each feature column and the target.

    NOTE: a function with this name is also defined in an earlier cell; when
    the cells run in order, this later definition is the one that stays bound.

    Args:
        features (scipy.sparse matrix): The feature matrix.
        target (pd.Series): The target variable (Class labels).
        feature_names (list): Feature names, one per column of ``features``.

    Returns:
        pd.Series: Mutual information per feature, indexed by feature name.

    Raises:
        ValueError: If the number of columns in ``features`` differs from
            ``len(feature_names)``.
    """
    # Fail fast on a matrix/vocabulary mismatch rather than after the
    # mutual-information pass. Inputs without a 2-D ``shape`` skip the check.
    shape = getattr(features, "shape", None)
    if shape is not None and len(shape) == 2 and shape[1] != len(feature_names):
        raise ValueError(
            f"features has {shape[1]} columns but {len(feature_names)} feature names were given; "
            "both must come from the same fitted vectorizer."
        )

    info_gain = mutual_info_classif(features, target, discrete_features=True)
    return pd.Series(info_gain, index=feature_names)

# Calculate information gain for CSIC 2010 using the matrix X and the feature
# names taken from the same fitted vectorizer.
csic_2010_info_gain = calculate_information_gain(X, csic_2010_df['Class'], csic_2010_feature_names)

# Display the top 10 features with the highest information gain
# (sorted in descending order, then truncated).
print("Top 10 CSIC 2010 features by information gain:")
print(csic_2010_info_gain.sort_values(ascending=False).head(10))


Top 10 CSIC 2010 features by information gain:
imagenes    0.048587
b1          0.030431
tienda1     0.029950
27          0.029316
publico     0.028146
2f          0.026662
3d          0.025685
jpg         0.023272
modo        0.021713
3b          0.021557
dtype: float64


In [21]:
def select_top_features(info_gain, top_n=20):
    """
    Pick the names of the highest-scoring features.

    Args:
        info_gain (pd.Series): Information gain values, indexed by feature name.
        top_n (int): Number of top features to select.

    Returns:
        pd.Index: The index labels of the ``top_n`` largest values, ordered
        from highest to lowest information gain.
    """
    # Rank every feature from best to worst, then keep the leading labels.
    ranked = info_gain.sort_values(ascending=False)
    return ranked.index[:top_n]

# Select the 20 CSIC 2010 features with the highest information gain and show
# their names; the result is reused by the filtering cell further down.
top_csic_2010_features = select_top_features(csic_2010_info_gain, top_n=20)
print("Top 20 CSIC 2010 features:")
print(top_csic_2010_features)


Top 20 CSIC 2010 features:
Index(['imagenes', 'b1', 'tienda1', '27', 'publico', '2f', '3d', 'jpg', 'modo',
       '3b', 'wide', 'asf', 'global', '3e', '3c', 'cookie', '29', '253a',
       'get', 'nombre'],
      dtype='object')


In [23]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

def filter_features(features, top_feature_names, feature_names):
    """
    Filter the features to include only the top feature names.

    Args:
        features (scipy.sparse matrix): The original feature matrix; anything
            that supports ``features[:, index_array]`` works.
        top_feature_names (list): Feature names to retain, in the order the
            output columns should appear.
        feature_names (numpy.ndarray): All feature names, one per column of
            ``features``. Any sequence (list, pd.Index, ndarray) is accepted.

    Returns:
        The column subset ``features[:, indices]`` - the same kind of matrix
        as ``features``, not a DataFrame - with one column per entry of
        ``top_feature_names``.

    Raises:
        IndexError: If any entry of ``top_feature_names`` is not present in
            ``feature_names``.
    """
    # Build the name -> column lookup once. setdefault keeps the first
    # occurrence of a repeated name, matching a first-match search.
    column_of = {}
    for column, name in enumerate(np.asarray(feature_names).tolist()):
        column_of.setdefault(name, column)

    missing = [name for name in top_feature_names if name not in column_of]
    if missing:
        raise IndexError(f"Feature names not found in feature_names: {missing}")

    # An explicit integer dtype keeps an empty selection usable as an index
    # array (an empty list would otherwise become a float array).
    indices = np.array([column_of[name] for name in top_feature_names], dtype=int)

    # Create a new feature matrix with only the top features
    return features[:, indices]

# Fit another vectorizer (no vocabulary cap) on the CSIC 2010 request strings to
# obtain a count matrix and the matching feature names for column selection.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
X = vectorizer.fit_transform(csic_2010_combined_requests)  # Fit and transform with your combined requests
feature_names = vectorizer.get_feature_names_out()

# top_csic_2010_features must already exist; it is assigned in the
# feature-selection cell above.
csic_2010_top_features_matrix = filter_features(X, top_csic_2010_features, feature_names)

# Convert to DataFrame for easier handling. Only the selected columns are
# converted to a dense array here, not the full matrix X.
csic_2010_top_features_df = pd.DataFrame(csic_2010_top_features_matrix.toarray(), columns=top_csic_2010_features)
print("Shape of top CSIC 2010 features DataFrame:", csic_2010_top_features_df.shape)


Shape of top CSIC 2010 features DataFrame: (61065, 20)


In [24]:
def combine_with_target(features_df, target):
    """
    Append the target column to a frame of selected features.

    The target's index is replaced by a fresh 0..n-1 range before the
    column-wise concatenation; ``features_df`` keeps its own index. Because
    the concatenation aligns rows on index labels, the rows pair up
    one-to-one when ``features_df`` also uses a default range index.

    Args:
        features_df (pd.DataFrame): DataFrame of selected features.
        target (pd.Series): The target variable (Class labels).

    Returns:
        pd.DataFrame: ``features_df`` with the target as an additional last column.
    """
    # Discard the target's original row labels so that they cannot drive the alignment.
    labels = target.reset_index(drop=True)
    combined = pd.concat([features_df, labels], axis=1)
    return combined

# Combine top features with the target variable: the selected CSIC 2010 feature
# columns plus the 'Class' column from the original frame.
csic_2010_final_df = combine_with_target(csic_2010_top_features_df, csic_2010_df['Class'])
print("Final dataset shape:", csic_2010_final_df.shape)


Final dataset shape: (61065, 21)
