In [1]:
import re
from datetime import datetime
import pandas as pd


In [2]:
##Helpers

def categorize_status_code(status_code):
    """Map an HTTP status code to the binary label used as the model target.

    Args:
        status_code: Integer HTTP status code.

    Returns:
        0 for a server error (500-599), 1 for a success (200-299), and None
        for every other code.
    """
    # Half-open ranges: 300 is a redirect and 600 is not a 5xx code, so
    # neither boundary value belongs to the class below it.
    if 500 <= status_code < 600:
        return 0
    if 200 <= status_code < 300:
        return 1
    return None

## Data Preprocessing

In [3]:

# Read the raw HAProxy log; each element of log_entries is one unparsed line.
with open('data/haproxy.log', 'r') as file:
    log_entries = file.readlines()

# Regular expression for field extraction. It is compiled once, outside the
# per-line loop.
regex_pattern = r'^(?P<timestamp>\w+\s+\d+\s\d+:\d+:\d+)\s+(?P<server>\S+)\s+haproxy\[\d+\]:\s(?P<client_ip>[\d.:]+):(?P<client_port>\d+)\s+\[(?P<datetime>[^\]]+)\]\s+(?P<frontend>\S+)~\s+(?P<backend>[^\s/]+)/(?P<backendservername>[^\s]+)\s+(?P<timing>[^\s]+)\s+(?P<http_status>\d+)\s+(?P<bytes_read>\d+)\s+.*\s+(?P<act_conn>\d+)/(?P<fe_conn>\d+)/(?P<be_conn>\d+)/(?P<srv_conn>\d+)/(?P<retries>\d+)\s+.*\s+"(?P<request_line>[^"]+)"'
log_line_re = re.compile(regex_pattern)

# Column names given, in order, to the slash-separated values of `timing`.
timing_columns = ['act_time', 'fe_time', 'be_time', 'srv_time', 'total_time']

# An endpoint containing one of these markers is cut off right after the
# marker, so every URL sharing it collapses into a single endpoint value.
# Markers are tried in this order and the first one found wins.
# NOTE(review): presumably the dropped remainder is a per-request id - confirm.
ENDPOINT_MARKERS = (
    "/svapayments/transactions/",
    "/api/v2/quickteller/agent/accountBalance/",
    "/api/v1/appservice/users/",
    "/tugende/recon/v1/check-transaction-status/",
    "/api/v2/quickteller/agent/appNotifications/",
)


def _normalize_endpoint(endpoint):
    """Return `endpoint` truncated after its first known marker, if any.

    Endpoints that contain none of ENDPOINT_MARKERS are returned unchanged.
    """
    for marker in ENDPOINT_MARKERS:
        head, found, _ = endpoint.partition(marker)
        if found:
            return head + marker
    return endpoint


def _parse_log_entry(log_entry, year):
    """Turn one raw log line into a flat record dict.

    Args:
        log_entry: One line of the log file.
        year: Year to attach to the leading timestamp, which carries none.

    Returns:
        A dict with one key per output column, or None when the line does not
        match regex_pattern, has fewer timing values than timing_columns, or
        its request line has fewer than three whitespace-separated parts.

    Raises:
        ValueError: If the timestamp cannot be parsed or a timing value is
            not an integer.
    """
    match = log_line_re.match(log_entry)
    if match is None:
        return None

    # Parse the year together with the rest of the timestamp. Parsing without
    # a year and patching it in afterwards rejects "Feb 29", because strptime
    # then assumes the non-leap default year 1900.
    timestamp = datetime.strptime(
        f"{year} {match.group('timestamp')}", '%Y %b %d %H:%M:%S'
    )

    timing_values = match.group('timing').split("/")
    if len(timing_values) < len(timing_columns):
        return None
    timing = {
        column: int(value)
        for column, value in zip(timing_columns, timing_values)
    }

    # A request line must provide method, endpoint and HTTP version; lines
    # with fewer parts are skipped instead of aborting the whole parse.
    request_line_parts = match.group('request_line').split()
    if len(request_line_parts) < 3:
        return None
    http_method, endpoint, http_version = request_line_parts[:3]

    return {
        'xtimestamp': timestamp,
        'agg_timestamp': timestamp.strftime('%Y:%m:%d %H:%M'),
        'day_of_week': timestamp.strftime('%A'),
        'hour': timestamp.strftime('%H'),
        'server': match.group('server'),
        'client_ip': match.group('client_ip'),
        'client_port': int(match.group('client_port')),
        'frontend': match.group('frontend'),
        'backend': match.group('backend'),
        'backendservername': match.group('backendservername'),
        'timing': match.group('timing'),
        'http_status': categorize_status_code(int(match.group('http_status'))),
        'bytes_read': int(match.group('bytes_read')),
        'request_cookie': '-',
        'response_cookie': '-',
        'termination_state': '-',
        'act_conn': int(match.group('act_conn')),
        'fe_conn': int(match.group('fe_conn')),
        'be_conn': int(match.group('be_conn')),
        'srv_conn': int(match.group('srv_conn')),
        # Adds act_time, fe_time, be_time, srv_time, total_time in that order.
        **timing,
        'act_sess': 0,
        'fe_sess': 0,
        'be_sess': 0,
        'srv_sess': 0,
        'retries': int(match.group('retries')),
        'http_method': http_method,
        'endpoint': _normalize_endpoint(endpoint),
        'http_version': http_version,
        'count_value': 1,
    }


# The leading timestamp has no year, so the current year is used as a
# placeholder for every entry.
# NOTE(review): the bracketed `datetime` field captured by the regex does
# include a year and could replace this placeholder - confirm before switching.
current_year = datetime.now().year

# Parse every line, keeping only those that produced a record.
fields = []
for log_entry in log_entries:
    record = _parse_log_entry(log_entry, current_year)
    if record is not None:
        fields.append(record)

# Print the first n extracted fields
for field in fields[:1]:
    print(field)

# Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(fields)


{'xtimestamp': datetime.datetime(2023, 6, 6, 7, 35, 12), 'agg_timestamp': '2023:06:06 07:35', 'day_of_week': 'Tuesday', 'hour': '07', 'server': 'interswitch-virtual-machine', 'client_ip': '41.210.186.148', 'client_port': 51732, 'frontend': 'www-https', 'backend': 'interswitchstore_backend', 'backendservername': 'interswitchstore_webserver', 'timing': '371/0/1/37/409', 'http_status': 1, 'bytes_read': 217, 'request_cookie': '-', 'response_cookie': '-', 'termination_state': '-', 'act_conn': 452, 'fe_conn': 452, 'be_conn': 0, 'srv_conn': 1, 'act_time': 371, 'fe_time': 0, 'be_time': 1, 'srv_time': 37, 'total_time': 409, 'act_sess': 0, 'fe_sess': 0, 'be_sess': 0, 'srv_sess': 0, 'retries': 0, 'http_method': 'POST', 'endpoint': '/interswitchstore/device/remoteConfigs', 'http_version': 'HTTP/1.1', 'count_value': 1}


In [4]:
# Remove every row that has a missing value in any column, modifying df in place.
# categorize_status_code yields None for status codes outside its two buckets,
# so those requests are among the rows removed here.
df.dropna(axis=0, how='any', inplace=True)

## Perform analysis on the DataFrame

In [5]:
# Request analysis: number of rows per endpoint value.
request_counts = df['endpoint'].value_counts()
print("Request Counts:", request_counts, sep="\n")

# Response analysis: number of rows per http_status label.
response_counts = df['http_status'].value_counts()
print("\nResponse Counts:", response_counts, sep="\n")

Request Counts:
/api/v2/quickteller/agent/accountBalance/                                                                                        695483
/api/v1A/svapayments/validateCustomer                                                                                            572551
/interswitchstore/device/trackupdate                                                                                             140157
/interswitchstore/device/geolocationstatus                                                                                       130513
/api/v1A/svapayments/sendAdviceRequest                                                                                            92300
                                                                                                                                  ...  
/extraswitch/reportDownload.do?reportId=272680078&show=attachment&con=EBO_SACCO_Issuer_pdf_2023_0606_012008.pdf.pdf                   1
/extraswitch/reportDownload.do?r

In [6]:
# Timing analysis: rebuild the integer timing columns from the raw
# slash-separated `timing` string, then summarise them.
timing_columns = ['act_time', 'fe_time', 'be_time', 'srv_time', 'total_time']
df[timing_columns] = (
    df['timing']
    .str.split('/', expand=True)
    .astype(int)
)
timing_summary = df[timing_columns].describe()
print("\nTiming Summary:", timing_summary, sep="\n")


Timing Summary:
           act_time       fe_time       be_time      srv_time    total_time
count  2.370364e+06  2.370364e+06  2.370364e+06  2.370364e+06  2.370364e+06
mean   6.305557e+03 -5.943349e-02  5.888496e-01  1.600037e+03  9.825243e+03
std    2.200778e+04  2.364343e-01  3.676219e+01  9.693202e+03  3.230876e+04
min   -1.000000e+00 -1.000000e+00 -1.000000e+00 -1.000000e+00  4.000000e+00
25%    6.600000e+01  0.000000e+00  0.000000e+00  3.000000e+01  1.700000e+02
50%    2.070000e+02  0.000000e+00  0.000000e+00  9.400000e+01  5.340000e+02
75%    1.403000e+03  0.000000e+00  1.000000e+00  5.080000e+02  3.163000e+03
max    2.624990e+05  0.000000e+00  1.501000e+04  2.557710e+05  4.796950e+05


In [7]:
# Connection analysis: descriptive statistics of the four connection counters.
connection_columns = ['act_conn', 'fe_conn', 'be_conn', 'srv_conn']
connection_summary = df[connection_columns].describe()
print("\nConnection Summary:", connection_summary, sep="\n")


Connection Summary:
           act_conn       fe_conn       be_conn      srv_conn
count  2.370364e+06  2.370364e+06  2.370364e+06  2.370364e+06
mean   3.013297e+03  3.013277e+03  5.213312e+01  1.911712e+01
std    8.170410e+02  8.170341e+02  1.766426e+02  8.419456e+01
min    4.050000e+02  4.050000e+02  0.000000e+00  0.000000e+00
25%    2.545000e+03  2.545000e+03  2.000000e+00  2.000000e+00
50%    3.332000e+03  3.332000e+03  7.000000e+00  4.000000e+00
75%    3.516000e+03  3.516000e+03  3.400000e+01  1.200000e+01
max    5.219000e+03  5.219000e+03  2.125000e+03  2.125000e+03


In [8]:
# Bytes analysis: descriptive statistics of the bytes_read column.
bytes_summary = df['bytes_read'].describe()
print("\nBytes Summary:", bytes_summary, sep="\n")


Bytes Summary:
count    2.370364e+06
mean     3.676995e+03
std      1.074225e+05
min      1.060000e+02
25%      3.140000e+02
50%      5.140000e+02
75%      5.180000e+02
max      6.900052e+07
Name: bytes_read, dtype: float64


In [9]:
# from sklearn.feature_extraction import FeatureHasher

# # Initialize the FeatureHasher
# hasher = FeatureHasher(n_features=1, input_type='string')

# # Apply the Hashing Trick on the 'category' column
# hashed_features = hasher.transform(df['day_of_week'])

# # Convert the hashed features to a NumPy array
# hashed_array = hashed_features.toarray()

# # Create a new DataFrame with the hashed features
# hashed_df = pd.DataFrame(hashed_array)

# # Concatenate the original DataFrame with the hashed DataFrame
# df_hashed = pd.concat([df, hashed_df], axis=1)

# # Print the resulting DataFrame
# print(df_hashed.shape)
# df_hashed.head(5)

In [10]:
# import matplotlib.pyplot as plt
# # Plotting
# plt.figure(figsize=(12, 6))
# yearly_traffic.plot(kind='bar', color='blue')
# plt.title('Yearly Traffic')
# plt.xlabel('Year')
# plt.ylabel('Bytes Read')
# plt.show()

## Feature Engineering

1. Date and Time Features: Extract additional information from the timestamp field, such as hour of the day, day of the week, month, etc. These features can help capture any temporal patterns or trends in the data.

2. Categorical Encoding: Convert categorical variables like server, frontend, backend, etc., into numeric representations using one-hot encoding or label encoding. This allows machine learning models to work with categorical data effectively.

3. Session Duration: Calculate the duration of each session by subtracting its start timestamp from its end timestamp. This can provide insights into session lengths and help identify sessions with unusually long or short durations.

4. Request Analysis: Extract features from the request_line, such as the HTTP method (GET, POST, etc.), endpoint, or API version. These features can help analyze request patterns and identify popular endpoints or API versions.

5. Response Analysis: Create binary features based on the HTTP status code, such as whether the response was successful (200-299) or had an error (400-599). This can help identify patterns in successful and failed responses.

In [11]:
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per encoded column, keyed by column name. Re-fitting a
# single shared encoder would keep only the last column's mapping, leaving no
# way to translate the integer codes of the other columns back to their values.
# Re-running this cell on an already-encoded frame finds no object columns and
# therefore leaves this dict empty.
label_encoders = {}

# Kept for backward compatibility: after the loop this name refers to the
# encoder fitted on the last object column (or an unfitted one if none exist).
label_encoder = LabelEncoder()

# Iterate over each column in the DataFrame
for column in df.columns:
    # Only object-dtype columns are treated as categorical
    if df[column].dtype == 'object':
        # Replace the column with integer codes from its own encoder
        label_encoder = LabelEncoder()
        df[column] = label_encoder.fit_transform(df[column].astype(str))
        label_encoders[column] = label_encoder


# Show the frame's dimensions and its first rows after encoding
print(df.shape)
df.head(5)

(2370364, 34)


Unnamed: 0,xtimestamp,agg_timestamp,day_of_week,hour,server,client_ip,client_port,frontend,backend,backendservername,...,total_time,act_sess,fe_sess,be_sess,srv_sess,retries,http_method,endpoint,http_version,count_value
0,2023-06-06 07:35:12,0,0,0,0,11490,51732,0,6,9,...,409,0,0,0,0,0,3,11114,1,1
1,2023-06-06 07:35:12,0,0,0,0,6347,59225,0,1,3,...,191,0,0,0,0,0,3,1918,1,1
2,2023-06-06 07:35:12,0,0,0,0,6481,1517,0,6,9,...,173,0,0,0,0,0,3,11113,1,1
3,2023-06-06 07:35:12,0,0,0,0,6347,59225,0,1,2,...,246,0,0,0,0,0,3,1918,1,1
4,2023-06-06 07:35:13,0,0,0,0,6347,59225,0,1,3,...,237,0,0,0,0,0,3,1918,1,1


In [12]:
# Encode categorical values
# Select only the categorical columns
# categorical_columns = df.select_dtypes(include='object').columns

# Apply one-hot encoding to categorical columns
# df_encoded = pd.get_dummies(df, columns=categorical_columns)

# Print the encoded DataFrame
# print(df_encoded.shape)
# df_encoded.head(5)

## Model Selection: 
Explore various deep learning models that are suitable for time-series prediction tasks. Some popular models include Long Short-Term Memory (LSTM) networks, Gated Recurrent Units (GRU), Convolutional Neural Networks (CNN), or Transformer-based architectures. These models can capture temporal dependencies and patterns in the log data.

### TensorFlow

In [13]:
# import tensorflow as tf
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder

# # Preprocess the data and prepare X and y
# df_filtered = df[df['http_status'] != 'other']
# X = df_filtered.drop(['http_status'], axis=1)
# y = df_filtered['http_status']

# # Encode the categorical target variable
# encoder = OneHotEncoder(sparse=False)
# y_encoded = encoder.fit_transform(y)

# # Split the data into training and testing datasets
# X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# # Build the TensorFlow model
# model = tf.keras.models.Sequential([
#     tf.keras.layers.Dense(64, activation='relu', input_shape=(2,1)),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dense(2, activation='sigmoid')
# ])

# # Compile the model
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# # Train the model
# model.fit(X_train, y_train, epochs=10, batch_size=32)

# # Evaluate the model
# loss, accuracy = model.evaluate(X_test, y_test)
# print("Model Accuracy:", accuracy)

# # Make predictions
# predictions = model.predict(X_test)


In [None]:
# Build, compile and train a feed-forward binary classifier for http_status.
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Training configuration.
# NOTE(review): a batch size of 2 implies a very large number of optimizer
# steps per epoch on a large frame - consider raising it.
BATCH_SIZE = 2
EPOCHS = 5
INIT_LEARNING_RATE = 0.001
dense_units = 50  # not used by the architecture below
activation_func = 'relu'
activation_func_out = 'sigmoid'
loss = 'binary_crossentropy'

# Separate the independent and dependent variables. xtimestamp is dropped
# from the features along with the target column.
X = df.drop(['http_status', 'xtimestamp'], axis=1)
y_status = df['http_status']

# Hold out 10% of the rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y_status, test_size=0.1, random_state=42)

# Take 11.11% of the remaining 90% (about 10% of all rows) as the validation
# set, giving roughly an 80/10/10 train/validation/test split.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1111, random_state=42)

# Input layer: one unit per feature column.
inputs = Input(shape=(X_train.shape[1],))

# Hidden layers of 64, 32 and 16 units, with dropout after the first two.
# They use the configured activation instead of repeating the literal.
dense1 = Dense(64, activation=activation_func)(inputs)
dropout1 = Dropout(0.2)(dense1)
dense2 = Dense(32, activation=activation_func)(dropout1)
dropout2 = Dropout(0.2)(dense2)
dense3 = Dense(16, activation=activation_func)(dropout2)

# Single-unit output layer using the configured output activation.
output = Dense(1, activation=activation_func_out)(dense3)

# Create the model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer=Adam(learning_rate=INIT_LEARNING_RATE), loss=loss, metrics=['accuracy'])

# Train the model, evaluating on the validation split after each epoch.
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=BATCH_SIZE)

Epoch 1/5

In [6]:
# Sanity check: report the frame's dimensions, then preview its first five rows.
print(df.shape)
df.head(n=5)

(2376657, 34)


Unnamed: 0,xtimestamp,agg_timestamp,day_of_week,hour,server,client_ip,client_port,frontend,backend,backendservername,...,total_time,act_sess,fe_sess,be_sess,srv_sess,retries,http_method,endpoint,http_version,count_value
0,2023-06-06 07:35:12,2023:06:06 07:35,Tuesday,7,interswitch-virtual-machine,41.210.186.148,51732,www-https,interswitchstore_backend,interswitchstore_webserver,...,409,0,0,0,0,0,POST,/interswitchstore/device/remoteConfigs,HTTP/1.1,1
1,2023-06-06 07:35:12,2023:06:06 07:35,Tuesday,7,interswitch-virtual-machine,196.13.203.201,59225,www-https,api_backend,api_webserver2,...,191,0,0,0,0,0,POST,/api/v1A/svapayments/validateCustomer,HTTP/1.1,1
2,2023-06-06 07:35:12,2023:06:06 07:35,Tuesday,7,interswitch-virtual-machine,197.239.10.226,1517,www-https,interswitchstore_backend,interswitchstore_webserver,...,173,0,0,0,0,0,POST,/interswitchstore/device/geolocationstatus,HTTP/1.1,1
3,2023-06-06 07:35:12,2023:06:06 07:35,Tuesday,7,interswitch-virtual-machine,196.13.203.201,59225,www-https,api_backend,api_webserver1,...,246,0,0,0,0,0,POST,/api/v1A/svapayments/validateCustomer,HTTP/1.1,1
4,2023-06-06 07:35:13,2023:06:06 07:35,Tuesday,7,interswitch-virtual-machine,196.13.203.201,59225,www-https,api_backend,api_webserver2,...,237,0,0,0,0,0,POST,/api/v1A/svapayments/validateCustomer,HTTP/1.1,1


### Keras

## Training and Evaluation:
Split your dataset into training, validation, and testing sets. Train your deep learning model on the training data and fine-tune hyperparameters using the validation set. Evaluate the model's performance using appropriate metrics like accuracy, precision, recall, F1-score, or area under the ROC curve (AUC-ROC).

## Model Interpretability:
Consider using techniques that enhance model interpretability, especially in critical systems like predicting downtime. Methods like attention mechanisms, feature importance analysis, or SHAP (SHapley Additive exPlanations) values can help you understand the factors contributing to downtime predictions.