In [4]:
import pandas as pd
import urllib.parse
from typing import List, Dict
from sklearn.preprocessing import LabelEncoder

def decode_url_encoded(encoded_string: str) -> str:
    return urllib.parse.unquote_plus(encoded_string)

def parse_query_string(query: str) -> Dict[str, str]:
    return dict(urllib.parse.parse_qsl(query))

def extract_features(row: pd.Series) -> Dict[str, any]:
    features = {}
    
    # Basic features
    features['class'] = row['Class']
    features['method'] = row['Method']
    
    # URI features
    uri_parts = urllib.parse.urlparse(row['URI'])
    features['path_depth'] = len(uri_parts.path.split('/')) - 1
    features['file_extension'] = uri_parts.path.split('.')[-1] if '.' in uri_parts.path else ''
    
    # Header features
    features['host_length'] = len(row['Host'])
    features['accept_count'] = len(row['Accept'].split(','))
    features['accept_language'] = row['Accept-Language'].split(',')[0]
    features['has_cache_control'] = int(row['Cache-control'] != '')
    features['has_cookie'] = int(row['Cookie'] != '')
    features['user_agent_length'] = len(row['User-Agent'])
    
    # Content features
    features['has_content'] = int(row['Content-Length'] != '')
    features['content_type'] = row['Content-Type'] if row['Content-Type'] else 'None'
    
    # Query features
    if row['Method'] == 'GET':
        query_params = parse_query_string(row['GET-Query'])
    else:  # POST
        query_params = parse_query_string(row['POST-Data'])
    
    features['query_param_count'] = len(query_params)
    features['max_param_length'] = max([len(v) for v in query_params.values()]) if query_params else 0
    features['has_numeric_param'] = int(any(v.isdigit() for v in query_params.values()))
    
    return features

def preprocess_dataset(file_path: str) -> pd.DataFrame:
    # Read the CSV file
    df = pd.read_csv(file_path)
    
    # Decode URL-encoded fields
    for column in ['URI', 'GET-Query', 'POST-Data']:
        df[column] = df[column].apply(decode_url_encoded)
    
    # Extract features
    features_df = df.apply(extract_features, axis=1, result_type='expand')
    
    # Encode categorical variables
    le = LabelEncoder()
    for column in features_df.select_dtypes(include=['object']):
        features_df[column] = le.fit_transform(features_df[column].astype(str))
    
    return features_df

# Example usage
file_path = 'csic_final.csv'
preprocessed_data = preprocess_dataset(file_path)
print(preprocessed_data.head())

# Optional: Save preprocessed data to a new CSV file
preprocessed_data.to_csv('preprocessed_data.csv', index=False)

AttributeError: 'float' object has no attribute 'replace'

In [5]:
import pandas as pd
import urllib.parse
from typing import List, Dict
from sklearn.preprocessing import LabelEncoder

def decode_url_encoded(encoded_string: str) -> str:
    if pd.isna(encoded_string):
        return ''
    return urllib.parse.unquote_plus(str(encoded_string))

def parse_query_string(query: str) -> Dict[str, str]:
    return dict(urllib.parse.parse_qsl(query))

def extract_features(row: pd.Series) -> Dict[str, any]:
    features = {}
    
    # Basic features
    features['class'] = row['Class']
    features['method'] = row['Method']
    
    # URI features
    uri_parts = urllib.parse.urlparse(row['URI'])
    features['path_depth'] = len(uri_parts.path.split('/')) - 1
    features['file_extension'] = uri_parts.path.split('.')[-1] if '.' in uri_parts.path else ''
    
    # Header features
    features['host_length'] = len(str(row['Host']))
    features['accept_count'] = len(str(row['Accept']).split(','))
    features['accept_language'] = str(row['Accept-Language']).split(',')[0]
    features['has_cache_control'] = int(str(row['Cache-control']) != '')
    features['has_cookie'] = int(str(row['Cookie']) != '')
    features['user_agent_length'] = len(str(row['User-Agent']))
    
    # Content features
    features['has_content'] = int(str(row['Content-Length']) != '')
    features['content_type'] = row['Content-Type'] if pd.notna(row['Content-Type']) else 'None'
    
    # Query features
    if row['Method'] == 'GET':
        query_params = parse_query_string(str(row['GET-Query']))
    else:  # POST
        query_params = parse_query_string(str(row['POST-Data']))
    
    features['query_param_count'] = len(query_params)
    features['max_param_length'] = max([len(str(v)) for v in query_params.values()]) if query_params else 0
    features['has_numeric_param'] = int(any(str(v).isdigit() for v in query_params.values()))
    
    return features

def preprocess_dataset(file_path: str) -> pd.DataFrame:
    # Read the CSV file
    df = pd.read_csv(file_path, na_values=[''], keep_default_na=False)
    
    # Ensure string type for specific columns
    string_columns = ['URI', 'GET-Query', 'POST-Data', 'Host', 'Accept', 'Accept-Language', 'Cache-control', 'Cookie', 'User-Agent', 'Content-Type']
    for column in string_columns:
        df[column] = df[column].astype(str)
    
    # Decode URL-encoded fields
    for column in ['URI', 'GET-Query', 'POST-Data']:
        df[column] = df[column].apply(decode_url_encoded)
    
    # Extract features
    features_df = df.apply(extract_features, axis=1, result_type='expand')
    
    # Encode categorical variables
    le = LabelEncoder()
    for column in features_df.select_dtypes(include=['object']):
        features_df[column] = le.fit_transform(features_df[column].astype(str))
    
    return features_df

# Example usage
file_path = 'csic_final.csv'
preprocessed_data = preprocess_dataset(file_path)
print(preprocessed_data.head())

# Optional: Save preprocessed data to a new CSV file
preprocessed_data.to_csv('preprocessed_data.csv', index=False)

   class  method  path_depth  file_extension  host_length  accept_count  \
0      1       0           2              43           14             7   
1      1       0           3              43           14             7   
2      1       1           3              43           14             7   
3      1       0           3              43           14             7   
4      1       1           3              43           14             7   

   accept_language  has_cache_control  has_cookie  user_agent_length  \
0                0                  1           1                 71   
1                0                  1           1                 71   
2                0                  1           1                 71   
3                0                  1           1                 71   
4                0                  1           1                 71   

   has_content  content_type  query_param_count  max_param_length  \
0            1             1                  0