In [None]:
!pip install pandas

In [None]:
import pandas

In [None]:
import re
import pandas as pd
from datetime import datetime

# -------------------------------------------------------------------
# 1) Regex pattern for your specific log format
# -------------------------------------------------------------------
# A line example is:
# - - [29/Oct/2019:09:18:00 +0000] "POST /storage/store_sess_total_mousemv_db.php HTTP/1.1" 200 449 "https://160.40.52.164/" g2gh9qmk9krld14h5uojlg7g10 "Mozilla/5.0 ..."
#
# Explanation of groups:
#   (?P<ident1>\S+)        -> The first '-' (could be IP if you have logs with IP)
#   (?P<ident2>\S+)        -> The second '-'
#   \[(?P<time>[^\]]+)\]   -> [29/Oct/2019:09:18:00 +0000]
#   "(?P<method>\S+)       -> "POST
#   (?P<endpoint>\S+)      -> /storage/store_sess_total_mousemv_db.php
#   (?P<protocol>[^"]+)"   -> HTTP/1.1"
#   (?P<status>\d+)        -> 200
#   (?P<size>\d+|-)        -> 449, or '-' (Apache's %b writes '-' when no body was sent)
#   "(?P<referrer>[^"]*)"  -> "https://160.40.52.164/"
#   (?P<session_id>\S+)    -> g2gh9qmk9krld14h5uojlg7g10
#   "(?P<user_agent>[^"]*)"-> "Mozilla/5.0 ..."
#
# We anchor with ^ and $ so it must match the full line.
# If your log lines differ, you'll need to adjust this pattern.
# -------------------------------------------------------------------
log_pattern = re.compile(
    r'^'
    r'(?P<ident1>\S+)\s+'                     # e.g. '-'
    r'(?P<ident2>\S+)\s+'                     # e.g. '-'
    r'\[(?P<time>[^\]]+)\]\s+'                # [29/Oct/2019:09:18:00 +0000]
    r'"(?P<method>\S+)\s+(?P<endpoint>\S+)\s+(?P<protocol>[^"]+)"\s+'  # "POST /storage/... HTTP/1.1"
    r'(?P<status>\d+)\s+'                     # 200
    r'(?P<size>\d+|-)\s+'                     # 449, or '-' for an empty body
    r'"(?P<referrer>[^"]*)"\s+'               # "https://160.40.52.164/"
    r'(?P<session_id>\S+)\s+'                 # g2gh9qmk9krld14h5uojlg7g10
    r'"(?P<user_agent>[^"]*)"'                # "Mozilla/5.0 ..."
    r'$'
)

def parse_apache_line(line):
    """
    Parse a single log line using the regex above.

    Args:
        line (str): One log line without its trailing newline.

    Returns:
        dict | None: The named regex groups, or None if the line doesn't match.
        In the returned dict:
          - "time" is a timezone-aware datetime, or None when the bracketed
            value is not in the "%d/%b/%Y:%H:%M:%S %z" layout
          - "status" is an int
          - "size" is an int; a logged '-' is reported as 0
          - every other field is the raw matched string
    """
    match = log_pattern.search(line)
    if match is None:
        return None

    data = match.groupdict()

    # Convert the time string to a datetime object
    # Example: 29/Oct/2019:09:18:00 +0000 (%z consumes the +0000 offset)
    try:
        data["time"] = datetime.strptime(data["time"], "%d/%b/%Y:%H:%M:%S %z")
    except ValueError:
        # Keep the entry but mark its timestamp as unknown
        data["time"] = None

    # Convert status and size to integer; '-' means "no bytes sent"
    data["status"] = int(data["status"])
    data["size"] = 0 if data["size"] == "-" else int(data["size"])

    return data

def extract_features_from_logs(log_file_path, line_parser=None):
    """
    Reads a log file, groups lines by session_id, and computes a few
    example features per session:
      - total_requests
      - total_session_bytes
      - number of GET/POST requests
      - time_spent (difference between the earliest and latest known timestamp)
      - average inter-request time

    Args:
        log_file_path (str): Path of the log file to read (decoded as UTF-8).
        line_parser (callable, optional): Maps one stripped log line to a dict
            with at least "session_id", "time", "size" and "method", or to
            None for lines that should be skipped. Defaults to
            parse_apache_line.

    Returns:
        pandas.DataFrame: One row per session, in order of first appearance.
        The feature columns are always present, even when no line could be
        parsed, so the frame can be merged on "session_id" safely.
    """
    if line_parser is None:
        line_parser = parse_apache_line

    # 1) Read the log file and group the parsed lines by session
    sessions = {}
    with open(log_file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parsed = line_parser(line)
            if parsed is None:
                # Could not parse; skip
                continue
            sessions.setdefault(parsed["session_id"], []).append(parsed)

    # 2) Compute features per session
    rows = []
    for sess_id, entries in sessions.items():
        # Entries whose timestamp failed to parse carry time=None. They still
        # count as requests, but are left out of the timing features: mixing
        # them into the sort (e.g. via the naive datetime.min) would raise
        # TypeError against the timezone-aware parsed timestamps.
        timestamps = sorted(e["time"] for e in entries if e["time"] is not None)

        # --- Feature: time spent (approx)
        time_spent = 0
        if timestamps:
            time_spent = (timestamps[-1] - timestamps[0]).total_seconds()

        # --- Feature: average inter-request time
        inter_times = [
            (later - earlier).total_seconds()
            for earlier, later in zip(timestamps, timestamps[1:])
        ]
        avg_inter_request = sum(inter_times) / len(inter_times) if inter_times else 0

        rows.append({
            "session_id": sess_id,
            "total_requests": len(entries),
            "total_session_bytes": sum(e["size"] for e in entries),
            "num_get": sum(1 for e in entries if e["method"] == "GET"),
            "num_post": sum(1 for e in entries if e["method"] == "POST"),
            "time_spent": time_spent,
            "avg_inter_request": avg_inter_request,
        })

    # 3) Create a DataFrame with a stable column layout
    columns = [
        "session_id",
        "total_requests",
        "total_session_bytes",
        "num_get",
        "num_post",
        "time_spent",
        "avg_inter_request",
    ]
    return pd.DataFrame(rows, columns=columns)

# -------------------------------------------------------------------
# Example usage
# -------------------------------------------------------------------
if __name__ == "__main__":
    # Demo run: extract per-session features from a single log file and show
    # the first rows of the resulting DataFrame.
    # Update this path to your actual log file
    log_file = "web_bot_detection_dataset/phase1/data/web_logs/bots/access_advanced_bots.log"
    df_features = extract_features_from_logs(log_file)
    print(df_features.head())


In [None]:
!pip install scikit-learn

In [None]:
import os
import re
import json
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Regex for parsing the log lines. Expected layout:
#   - - [29/Oct/2019:09:18:00 +0000] "POST /path HTTP/1.1" 200 449 "referrer" session_id "user agent"
# The size field also accepts '-', which Apache's %b writes when no body was sent.
log_pattern = re.compile(
    r'^'
    r'(?P<ident1>\S+)\s+'
    r'(?P<ident2>\S+)\s+'
    r'\[(?P<time>[^\]]+)\]\s+'
    r'"(?P<method>\S+)\s+(?P<endpoint>\S+)\s+(?P<protocol>[^"]+)"\s+'
    r'(?P<status>\d+)\s+'
    r'(?P<size>\d+|-)\s+'
    r'"(?P<referrer>[^"]*)"\s+'
    r'(?P<session_id>\S+)\s+'
    r'"(?P<user_agent>[^"]*)"'
    r'$'
)

def parse_log_line(line):
    """
    Parse one log line with ``log_pattern``.

    Args:
        line (str): One log line without its trailing newline.

    Returns:
        dict | None: The named regex groups, or None if the line doesn't match.
        "time" becomes a timezone-aware datetime (None when it is not in the
        "%d/%b/%Y:%H:%M:%S %z" layout), "status" becomes an int, and "size"
        becomes an int with a logged '-' reported as 0.
    """
    match = log_pattern.search(line)
    if match is None:
        return None
    data = match.groupdict()
    try:
        data["time"] = datetime.strptime(data["time"], "%d/%b/%Y:%H:%M:%S %z")
    except ValueError:
        # Keep the entry but mark its timestamp as unknown
        data["time"] = None
    data["status"] = int(data["status"])
    # '-' means the response carried no body
    data["size"] = 0 if data["size"] == "-" else int(data["size"])
    return data

def extract_log_features_from_dir(logs_dir):
    """
    Parse every regular file directly inside ``logs_dir`` and bucket the
    parsed lines by session id.

    Blank lines and lines that parse_log_line rejects are skipped.
    Sub-directories are not descended into.

    Returns:
        dict: ``{session_id: {"entries": [parsed_line, ...]}}``
    """
    grouped = {}
    for name in os.listdir(logs_dir):
        path = os.path.join(logs_dir, name)
        if os.path.isfile(path):
            with open(path, "r", encoding="utf-8") as handle:
                for raw in handle:
                    text = raw.strip()
                    record = parse_log_line(text) if text else None
                    if record is not None:
                        bucket = grouped.setdefault(record["session_id"], {"entries": []})
                        bucket["entries"].append(record)
    return grouped

def extract_log_features(phase_dir):
    """
    Build one row of request-level features per session for a dataset phase.

    Log files are read from ``<phase_dir>/data/web_logs/<subdir>`` for the
    sub-directories "human", "humans" and "bots". Both spellings of the human
    folder are accepted because other cells in this notebook read
    ``web_logs/humans/``; a folder that does not exist is skipped. Entries
    that share a session id across folders are combined into one session.

    Args:
        phase_dir (str): Root folder of the phase (e.g. ".../phase1").

    Returns:
        pandas.DataFrame: Columns session_id, total_requests,
        total_session_bytes, num_get, num_post, time_spent (seconds between
        the earliest and latest known timestamp) and avg_inter_request (mean
        gap in seconds between consecutive known timestamps). The columns are
        present even when no session was found, so the frame can always be
        merged on "session_id".
    """
    web_logs_dir = os.path.join(phase_dir, "data", "web_logs")

    sessions = {}
    for subdir in ("human", "humans", "bots"):
        logs_dir = os.path.join(web_logs_dir, subdir)
        if not os.path.isdir(logs_dir):
            continue
        for sid, data in extract_log_features_from_dir(logs_dir).items():
            if sid in sessions:
                sessions[sid]["entries"].extend(data["entries"])
            else:
                sessions[sid] = data

    rows = []
    for sid, sess_data in sessions.items():
        entries = sess_data["entries"]
        if not entries:
            continue

        # Entries with an unparseable timestamp carry time=None. They count as
        # requests but are excluded from the timing features; substituting the
        # naive datetime.min for them would raise TypeError when compared with
        # the timezone-aware parsed timestamps.
        timestamps = sorted(e["time"] for e in entries if e["time"] is not None)

        time_spent = 0
        if timestamps:
            time_spent = (timestamps[-1] - timestamps[0]).total_seconds()

        inter_times = [
            (later - earlier).total_seconds()
            for earlier, later in zip(timestamps, timestamps[1:])
        ]
        avg_inter_request = sum(inter_times) / len(inter_times) if inter_times else 0

        rows.append({
            "session_id": sid,
            "total_requests": len(entries),
            "total_session_bytes": sum(e["size"] for e in entries),
            "num_get": sum(1 for e in entries if e["method"] == "GET"),
            "num_post": sum(1 for e in entries if e["method"] == "POST"),
            "time_spent": time_spent,
            "avg_inter_request": avg_inter_request,
        })

    columns = [
        "session_id",
        "total_requests",
        "total_session_bytes",
        "num_get",
        "num_post",
        "time_spent",
        "avg_inter_request",
    ]
    return pd.DataFrame(rows, columns=columns)

def extract_mouse_features(mouse_root):
    """
    Summarise the mouse-movement recording of every session under ``mouse_root``.

    Each sub-directory name is used as the session id and is expected to hold
    a ``mouse_movements.json`` file. Sessions without that file, with a file
    that is not valid UTF-8 JSON, or whose JSON top level is not an object are
    skipped.

    Args:
        mouse_root (str): Folder containing one sub-directory per session.

    Returns:
        pandas.DataFrame: Columns session_id, num_moves, avg_x, avg_y, one row
        per usable session in sorted session-id order. num_moves is the length
        of the "mousemove_total_behaviour" value. avg_x / avg_y are the column
        means when that value starts with a two-item list, and 0 otherwise.
        The columns are present even when no session was usable.
    """
    all_features = []
    # sorted() makes the row order independent of the filesystem's listing order
    for session_id in sorted(os.listdir(mouse_root)):
        mouse_json = os.path.join(mouse_root, session_id, "mouse_movements.json")
        if not os.path.isfile(mouse_json):
            continue
        with open(mouse_json, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except ValueError:
                # json.JSONDecodeError and UnicodeDecodeError are both
                # ValueError subclasses: the file is unreadable as JSON.
                continue
        if not isinstance(data, dict):
            # .get() below needs a JSON object at the top level
            continue

        coords = data.get("mousemove_total_behaviour", [])
        avg_x, avg_y = 0, 0
        if coords and isinstance(coords[0], list) and len(coords[0]) == 2:
            avg_x = np.mean([c[0] for c in coords])
            avg_y = np.mean([c[1] for c in coords])
        all_features.append({
            "session_id": session_id,
            "num_moves": len(coords),
            "avg_x": avg_x,
            "avg_y": avg_y,
        })
    return pd.DataFrame(all_features, columns=["session_id", "num_moves", "avg_x", "avg_y"])

def merge_features(df_logs, df_mouse):
    """Inner-join the log features with the mouse features on "session_id".

    Only sessions present in both frames are kept.
    """
    combined = df_logs.merge(df_mouse, how="inner", on="session_id")
    return combined

def preprocess_data(df, label_col="label"):
    """
    Drop incomplete rows and standardise the feature columns.

    The "session_id" column and ``label_col`` are removed from the feature
    matrix when present.

    Returns:
        If ``label_col`` is a column of ``df``: the tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)`` from an 80/20
        split with random_state=42, where the scaler is fitted on the
        training part only.
        Otherwise: the scaled feature matrix for all rows.
    """
    complete_rows = df.dropna()
    features = complete_rows.drop(["session_id", label_col], axis=1, errors="ignore")
    scaler = StandardScaler()

    # Unlabelled data: nothing to split, scale everything at once
    if label_col not in complete_rows.columns:
        return scaler.fit_transform(features)

    labels = complete_rows[label_col]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)
    return train_scaled, test_scaled, y_train, y_test

if __name__ == "__main__":
    # End-to-end preparation for Phase 1: build log features and mouse
    # features, join them per session, attach labels when an annotations file
    # exists, then scale (and, when labelled, split) the data.
    #
    # Directory structure for Phase 1
    # dataset/
    # └── phase1/
    #     ├── annotations/
    #     │   └── humans_and_moderate_bots/
    #     │       ├── train
    #     │       └── test
    #     └── data/
    #         ├── web_logs/
    #         │   ├── human/ (multiple log files: access_1.log, access_2.log, ...)
    #         │   └── bots/  (access_advanced_bots.log, access_moderate_bots.log)
    #         └── mouse_movements/
    #             └── humans_and_moderate_bots/
    #                 ├── <session_id1>/mouse_movements.json
    #                 ├── <session_id2>/mouse_movements.json
    #                 └── ...
    # NOTE(review): another cell in this notebook reads
    # web_logs/humans/access_1.log, so the folder may actually be named
    # "humans" rather than "human" - confirm against the dataset on disk.

    phase1_dir = "web_bot_detection_dataset/phase1"
    # Change this to the appropriate mouse movements folder (e.g., humans_and_moderate_bots)
    mouse_dir = os.path.join(phase1_dir, "data", "mouse_movements", "humans_and_moderate_bots")
    # Annotations file for training
    annotations_file = os.path.join(phase1_dir, "annotations", "humans_and_moderate_bots", "train")

    # Per-session features from both sources; the merge is an inner join on
    # "session_id"
    df_logs = extract_log_features(phase1_dir)
    df_mouse = extract_mouse_features(mouse_dir)
    df_merged = merge_features(df_logs, df_mouse)

    # The annotations file is read as space-separated "<session_id> <label>"
    # rows without a header; sessions lacking a label are dropped by the
    # inner join.
    if os.path.exists(annotations_file):
        df_labels = pd.read_csv(annotations_file, sep=" ", header=None, names=["session_id", "label"])
        df_merged = pd.merge(df_merged, df_labels, on="session_id", how="inner")

    # With labels: scaled train/test split. Without: only a scaled matrix.
    if "label" in df_merged.columns:
        X_train, X_test, y_train, y_test = preprocess_data(df_merged, label_col="label")
    else:
        X_scaled = preprocess_data(df_merged)


In [None]:
import json
import logging
import os
import re
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd

# Machine Learning Imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Logging setup: INFO and above, each record prefixed with timestamp and level.
# `logger` is the logger used by the code in this cell.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

class BotDetectionFeatureExtractor:
    """
    Extracts log-file and mouse-movement features for bot detection.

    One feature dictionary is produced per log file (not per session): the
    log features are computed over all lines of the file and combined with
    the features of a mouse-movement file.
    """

    # Part of a log line shared by both accepted layouts:
    #   - - [time] "METHOD URL PROTOCOL" status bytes "referrer"
    # `bytes` may be '-', which Apache's %b writes when no body was sent.
    _REQUEST_PREFIX = (
        r'^- - \[(?P<timestamp>[^\]]+)\] '
        r'"(?P<method>\S+) (?P<url>\S+) (?P<protocol>\S+)" '
        r'(?P<status>\d+) (?P<bytes>\d+|-) '
        r'"(?P<referrer>[^"]*)" '
    )
    # Regular layout: ... session_id "user agent"
    _LOG_PATTERN = re.compile(
        _REQUEST_PREFIX
        + r'(?P<session_id>\S+) '
        + r'"(?P<user_agent>[^"]*)"'
    )
    # Alternative layout: session id is '-' and the user agent may be unquoted
    _ALT_LOG_PATTERN = re.compile(
        _REQUEST_PREFIX
        + r'(?P<session_id>-) '
        + r'"?(?P<user_agent>[^"]*)"?'
    )

    def __init__(self, dataset_path: str):
        """
        Initialize the feature extractor with dataset path

        Args:
            dataset_path (str): Root path of the dataset
        """
        self.dataset_path = dataset_path

    def parse_custom_log(self, log_file: str) -> pd.DataFrame:
        """
        Parse a log file in the custom access-log format

        Lines matching neither accepted layout are skipped with a warning.

        Args:
            log_file (str): Path to log file

        Returns:
            pd.DataFrame: One row per parsed line with the string columns
            method, url, protocol, status, bytes ('-' is stored as '0'),
            referrer, session_id and user_agent, plus a parsed `timestamp`
            column. Empty when nothing could be parsed or the file could not
            be read (the error is logged, not raised).
        """
        logs = []
        try:
            # errors='replace' keeps one undecodable byte (e.g. in a user
            # agent) from discarding the whole file.
            with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    match = self._LOG_PATTERN.match(line) or self._ALT_LOG_PATTERN.match(line)
                    if match is None:
                        logger.warning(f"Skipping line {line_num} in {log_file}: {line}")
                        continue

                    log_entry = match.groupdict()
                    if log_entry['bytes'] == '-':
                        # Keep the column convertible with astype(int)
                        log_entry['bytes'] = '0'
                    logs.append(log_entry)

            if not logs:
                logger.error(f"No logs parsed from file: {log_file}")
                return pd.DataFrame()

            df = pd.DataFrame(logs)

            # Timestamp parsing: strict format first, lenient parsing (bad
            # values become NaT) as a fallback
            try:
                df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d/%b/%Y:%H:%M:%S %z')
            except ValueError:
                df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

            return df

        except Exception as e:
            logger.error(f"Error parsing log file {log_file}: {e}")
            return pd.DataFrame()

    def process_dataset(self, phase: str = 'phase1') -> Dict[str, List[Dict[str, Any]]]:
        """
        Process entire dataset and extract features with robust error handling

        Reads `<dataset_path>/<phase>/data/web_logs/humans` (label 0) and the
        files containing "moderate" in `.../web_logs/bots` (label 1). If the
        human folder is missing the bot folder is not processed either.

        Args:
            phase (str): Dataset phase to process

        Returns:
            Dict[str, List[Dict[str, Any]]]: Extracted features for different
            classes under the keys 'human', 'moderate_bots' and
            'advanced_bots'. 'advanced_bots' is never filled by this method.
            Errors are logged and whatever was collected so far is returned.
        """
        phase_path = os.path.join(self.dataset_path, phase)
        features = {
            'human': [],
            'moderate_bots': [],
            'advanced_bots': []
        }

        try:
            web_logs_path = os.path.join(phase_path, 'data', 'web_logs')
            mouse_movements_path = os.path.join(phase_path, 'data', 'mouse_movements')

            # Process human logs
            human_log_dir = os.path.join(web_logs_path, 'humans')
            if not os.path.exists(human_log_dir):
                logger.error(f"Human log directory not found: {human_log_dir}")
                return features
            self._append_log_features(
                features['human'], human_log_dir, mouse_movements_path, label=0
            )

            # Process moderate bot logs
            bot_log_dir = os.path.join(web_logs_path, 'bots')
            if not os.path.exists(bot_log_dir):
                logger.error(f"Bot log directory not found: {bot_log_dir}")
                return features
            self._append_log_features(
                features['moderate_bots'], bot_log_dir, mouse_movements_path,
                label=1, name_filter='moderate'
            )

        except Exception as e:
            logger.error(f"Error processing dataset: {e}")

        return features

    def _append_log_features(self, target: List[Dict[str, Any]], log_dir: str,
                             mouse_movements_path: str, label: int,
                             name_filter: Optional[str] = None) -> None:
        """
        Append one combined feature dict per usable log file in `log_dir`

        Results are appended to `target` in place so that entries collected
        before an exception are not lost.

        Args:
            target: List that receives the feature dictionaries
            log_dir (str): Folder whose entries are parsed as log files
            mouse_movements_path (str): Path to mouse movements directory
            label (int): Value stored under the 'label' key
            name_filter (str): When given, only file names containing this
                text (compared in lower case) are processed
        """
        for log_file in os.listdir(log_dir):
            if name_filter is not None and name_filter not in log_file.lower():
                continue

            log_path = os.path.join(log_dir, log_file)
            log_df = self.parse_custom_log(log_path)
            if log_df.empty:
                logger.warning(f"Skipping empty log file: {log_path}")
                continue

            log_features = self.extract_log_features(log_df)

            # A log file without a mouse movement file contributes nothing
            mouse_file = self._find_mouse_movement_file(mouse_movements_path, log_file)
            if mouse_file:
                mouse_features = self.extract_mouse_movement_features(mouse_file)
                target.append({**log_features, **mouse_features, 'label': label})

    def _find_mouse_movement_file(self, mouse_movements_path: str, log_file: str) -> Optional[str]:
        """
        Find a mouse movement file

        Returns the first `<mouse_movements_path>/<entry>/mouse_movements.json`
        that exists, in directory-listing order.

        NOTE(review): `log_file` is not used, so every log file is paired with
        the same mouse file. The mouse recordings appear to be stored per
        session, not per log file - confirm the intended mapping before
        relying on the combined features.

        Args:
            mouse_movements_path (str): Path to mouse movements directory
            log_file (str): Log filename to match (currently ignored)

        Returns:
            Optional[str]: Path to mouse movement file, or None when none
            exists or the directory cannot be listed (the error is logged)
        """
        try:
            for session_dir in os.listdir(mouse_movements_path):
                mouse_file = os.path.join(mouse_movements_path, session_dir, 'mouse_movements.json')
                if os.path.exists(mouse_file):
                    return mouse_file
        except Exception as e:
            logger.error(f"Error finding mouse movement file: {e}")
        return None

    def extract_log_features(self, log_df: pd.DataFrame) -> Dict[str, Any]:
        """
        Extract aggregate features from a parsed log DataFrame

        Args:
            log_df (pd.DataFrame): Output of parse_custom_log

        Returns:
            Dict[str, Any]: total_requests, total_session_bytes,
            session_duration (seconds between the earliest and latest
            timestamp) and browsing_speed (requests per second). Duration and
            speed are 0 when there is no `timestamp` column or no timestamp
            could be parsed. Empty dict for an empty frame or on error (the
            error is logged).
        """
        try:
            if len(log_df) == 0:
                return {}

            total_requests = len(log_df)
            total_bytes = log_df['bytes'].astype(int).sum() if 'bytes' in log_df.columns else 0

            # Timing features
            session_duration = 0
            browsing_speed = 0
            if 'timestamp' in log_df.columns:
                span = log_df['timestamp'].max() - log_df['timestamp'].min()
                # The span is NaT when every timestamp failed to parse;
                # report "unknown" as 0 instead of propagating NaN features.
                if not pd.isna(span):
                    session_duration = span.total_seconds()
                    # The epsilon avoids division by zero for zero-length spans
                    browsing_speed = total_requests / (session_duration + 1e-10)

            return {
                'total_requests': total_requests,
                'total_session_bytes': total_bytes,
                'session_duration': session_duration,
                'browsing_speed': browsing_speed
            }

        except Exception as e:
            logger.error(f"Error extracting log features: {e}")
            return {}

    def extract_mouse_movement_features(self, mouse_movement_file: str) -> Dict[str, Any]:
        """
        Simplified mouse movement feature extraction

        Args:
            mouse_movement_file (str): Path to a mouse_movements.json file

        Returns:
            Dict[str, Any]: `total_mouse_movements`, the number of rows
            pandas builds from the JSON content; empty dict on error (the
            error is logged).
        """
        try:
            with open(mouse_movement_file, 'r', encoding='utf-8') as f:
                mouse_data = json.load(f)

            mouse_df = pd.DataFrame(mouse_data)

            return {
                'total_mouse_movements': len(mouse_df)
            }

        except Exception as e:
            logger.error(f"Error processing mouse movement file {mouse_movement_file}: {e}")
            return {}

    def prepare_training_data(self, features: Dict[str, List[Dict[str, Any]]]):
        """
        Prepare features and labels for machine learning

        Uses the 'human' and 'moderate_bots' entries only, fills missing
        values with 0, makes an 80/20 split (random_state=42) and
        standardises the features with a scaler fitted on the training part.

        Args:
            features: Output of process_dataset

        Returns:
            Tuple (X_train_scaled, X_test_scaled, y_train, y_test), or four
            None values when preparation fails (the error is logged).
        """
        try:
            # Combine features from different classes
            all_features = features['human'] + features['moderate_bots']

            # Convert to DataFrame and handle missing data
            df = pd.DataFrame(all_features).fillna(0)

            # Separate features and labels
            X = df.drop('label', axis=1)
            y = df['label']

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Scale features
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            return X_train_scaled, X_test_scaled, y_train, y_test

        except Exception as e:
            logger.error(f"Error preparing training data: {e}")
            return None, None, None, None

def train_bot_detection_model(X_train, X_test, y_train, y_test):
    """
    Train a Random Forest classifier for bot detection

    Fits the model on the training data, then prints a classification report
    and a confusion matrix for the test data.

    Labels are expected to be 0 (human) and 1 (moderate bot), the encoding
    used by BotDetectionFeatureExtractor.process_dataset.

    Returns:
        The fitted RandomForestClassifier, or None when training or
        evaluation fails (the error is logged).
    """
    try:
        # Initialize and train model
        rf_classifier = RandomForestClassifier(
            n_estimators=100,
            random_state=42,
            n_jobs=-1
        )
        rf_classifier.fit(X_train, y_train)

        # Predict and evaluate
        y_pred = rf_classifier.predict(X_test)

        # Pin the label set explicitly: without `labels`, classification_report
        # raises when the test split happens to contain a single class (the
        # number of observed classes no longer matches `target_names`), which
        # would discard a successfully trained model.
        class_labels = [0, 1]

        print("Classification Report:")
        print(classification_report(
            y_test, y_pred,
            labels=class_labels,
            target_names=['Human', 'Moderate Bot']
        ))

        print("\nConfusion Matrix:")
        print(confusion_matrix(y_test, y_pred, labels=class_labels))

        return rf_classifier

    except Exception as e:
        logger.error(f"Error training bot detection model: {e}")
        return None

def main():
    """Extract the phase-1 features and print the raw result.

    Only feature extraction is active; the validation, data preparation and
    training steps below are kept commented out.
    """
    # Root folder of the dataset - set this to your dataset path
    extractor = BotDetectionFeatureExtractor('web_bot_detection_dataset')

    # Dump the per-class feature lists so they can be inspected
    print(extractor.process_dataset('phase1'))

    # Validate features
    # if not features['human'] and not features['moderate_bots']:
    #     logger.error("No features extracted. Check dataset path and log files.")
    #     return

    # # Prepare training data
    # X_train, X_test, y_train, y_test = extractor.prepare_training_data(features)

    # # Validate training data
    # if X_train is None:
    #     logger.error("Failed to prepare training data.")
    #     return

    # # Train bot detection model
    # model = train_bot_detection_model(X_train, X_test, y_train, y_test)

if __name__ == "__main__":
    main()

In [11]:
def parse_custom_log(log_file: str) -> pd.DataFrame:
    """
    Parse a log file in the custom access-log format

    Two line layouts are accepted:
      - - [time] "METHOD URL PROTOCOL" status bytes "referrer" session_id "user agent"
      - - [time] "METHOD URL PROTOCOL" status bytes "referrer" - user agent (quotes optional)
    Lines matching neither layout are skipped with a warning.

    Args:
        log_file (str): Path to log file

    Returns:
        pd.DataFrame: One row per parsed line with the string columns method,
        url, protocol, status, bytes ('-' is stored as '0'), referrer,
        session_id and user_agent, plus a parsed `timestamp` column. Empty
        when nothing could be parsed or the file could not be read (the error
        is logged, not raised).
    """
    # Part shared by both layouts. `bytes` may be '-', which Apache's %b
    # writes when no body was sent.
    request_prefix = (
        r'^- - \[(?P<timestamp>[^\]]+)\] '
        r'"(?P<method>\S+) (?P<url>\S+) (?P<protocol>\S+)" '
        r'(?P<status>\d+) (?P<bytes>\d+|-) '
        r'"(?P<referrer>[^"]*)" '
    )
    # Both patterns are built once, before the line loop
    log_pattern = re.compile(
        request_prefix
        + r'(?P<session_id>\S+) '
        + r'"(?P<user_agent>[^"]*)"'
    )
    # Alternative layout: session id is '-' and the user agent may be
    # unquoted. Capturing the '-' keeps the session_id column populated.
    alt_pattern = re.compile(
        request_prefix
        + r'(?P<session_id>-) '
        + r'"?(?P<user_agent>[^"]*)"?'
    )

    logs = []
    try:
        # errors='replace' keeps one undecodable byte from discarding the file
        with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                match = log_pattern.match(line) or alt_pattern.match(line)
                if match is None:
                    logger.warning(f"Skipping line {line_num} in {log_file}: {line}")
                    continue

                log_entry = match.groupdict()
                if log_entry['bytes'] == '-':
                    # Keep the column convertible with astype(int)
                    log_entry['bytes'] = '0'
                logs.append(log_entry)

        if not logs:
            logger.error(f"No logs parsed from file: {log_file}")
            return pd.DataFrame()

        df = pd.DataFrame(logs)

        # Timestamp parsing: strict format first, lenient parsing (bad values
        # become NaT) as a fallback
        try:
            df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d/%b/%Y:%H:%M:%S %z')
        except ValueError:
            df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

        return df

    except Exception as e:
        logger.error(f"Error parsing log file {log_file}: {e}")
        return pd.DataFrame()

In [23]:
def extract_log_features(log_df: pd.DataFrame) -> Dict[str, Any]:
    """
    Extract aggregate features from a parsed log DataFrame

    Args:
        log_df (pd.DataFrame): Parsed log lines; the `bytes` and `timestamp`
            columns are used when present.

    Returns:
        Dict[str, Any]: total_requests, total_session_bytes, session_duration
        (seconds between the earliest and latest timestamp) and
        browsing_speed (requests per second). Duration and speed are 0 when
        there is no `timestamp` column or no timestamp could be parsed.
        Empty dict for an empty frame or on error (the error is logged).
    """
    try:
        if len(log_df) == 0:
            return {}

        total_requests = len(log_df)
        total_bytes = log_df['bytes'].astype(int).sum() if 'bytes' in log_df.columns else 0

        # Timing features
        session_duration = 0
        browsing_speed = 0
        if 'timestamp' in log_df.columns:
            span = log_df['timestamp'].max() - log_df['timestamp'].min()
            # The span is NaT when every timestamp failed to parse; report
            # "unknown" as 0 instead of propagating NaN features.
            if not pd.isna(span):
                session_duration = span.total_seconds()
                # The epsilon avoids division by zero for zero-length spans
                browsing_speed = total_requests / (session_duration + 1e-10)

        return {
            'total_requests': total_requests,
            'total_session_bytes': total_bytes,
            'session_duration': session_duration,
            'browsing_speed': browsing_speed
        }

    except Exception as e:
        logger.error(f"Error extracting log features: {e}")
        return {}

In [24]:
# Ad-hoc check: parse one human log file and compute its aggregate features.
# `temp` holds the parsed lines (one row per line), `temp2` the feature dict.
temp = parse_custom_log(log_file="web_bot_detection_dataset/phase1/data/web_logs/humans/access_1.log")
temp2 =extract_log_features(temp)



In [25]:
temp2

{'total_requests': 25101,
 'total_session_bytes': np.int64(16094655),
 'session_duration': 72912.0,
 'browsing_speed': 0.3442643186306776}

In [20]:
temp[10:20]

Unnamed: 0,timestamp,method,url,protocol,status,bytes,referrer,session_id,user_agent
10,2019-10-29 09:17:58+00:00,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,449,https://160.40.52.164/,g2gh9qmk9krld14h5uojlg7g10,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...
11,2019-10-29 09:18:00+00:00,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,449,https://160.40.52.164/,g2gh9qmk9krld14h5uojlg7g10,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...
12,2019-10-29 09:18:02+00:00,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,449,https://160.40.52.164/,g2gh9qmk9krld14h5uojlg7g10,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...
13,2019-10-29 09:18:04+00:00,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,449,https://160.40.52.164/,g2gh9qmk9krld14h5uojlg7g10,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...
14,2019-10-29 09:18:06+00:00,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,449,https://160.40.52.164/,g2gh9qmk9krld14h5uojlg7g10,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...
15,2019-10-29 09:18:06+00:00,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,449,https://160.40.52.164/,g2gh9qmk9krld14h5uojlg7g10,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...
16,2019-10-29 09:18:06+00:00,GET,/content/computer_security.php,HTTP/1.1,200,1574,https://160.40.52.164/,g2gh9qmk9krld14h5uojlg7g10,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...
17,2019-10-29 09:18:06+00:00,POST,/storage/store_mousemv_db.php,HTTP/1.1,200,449,https://160.40.52.164/,g2gh9qmk9krld14h5uojlg7g10,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...
18,2019-10-29 09:18:06+00:00,GET,/favicon.ico,HTTP/1.1,200,1940,https://160.40.52.164/content/computer_securit...,g2gh9qmk9krld14h5uojlg7g10,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...
19,2019-10-29 09:18:08+00:00,POST,/storage/store_sess_total_mousemv_db.php,HTTP/1.1,200,449,https://160.40.52.164/content/computer_securit...,g2gh9qmk9krld14h5uojlg7g10,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...


In [None]:
df_mouse.head(100)

In [None]:
df_merged