In [142]:
import re
from datetime import datetime
import pandas as pd


In [143]:
##Helpers

def categorize_status_code(status_code):
    if status_code >= 500 and status_code <= 600:
        return "downtime"
    elif status_code >= 200 and status_code <= 300:
        return "uptime"
    else:
        return "other"

## Data Preprocessing

In [144]:

log_entries = []

# Read the log file
with open('data/haproxytmp.log', 'r') as file:
    log_entries = file.readlines()

# Define regular expression for field extraction
regex_pattern = r'^(?P<timestamp>\w+\s+\d+\s\d+:\d+:\d+)\s+(?P<server>\S+)\s+haproxy\[\d+\]:\s(?P<client_ip>[\d.:]+):(?P<client_port>\d+)\s+\[(?P<datetime>[^\]]+)\]\s+(?P<frontend>\S+)~\s+(?P<backend>[^\s/]+)/(?P<backendservername>[^\s]+)\s+(?P<timing>[^\s]+)\s+(?P<http_status>\d+)\s+(?P<bytes_read>\d+)\s+.*\s+(?P<act_conn>\d+)/(?P<fe_conn>\d+)/(?P<be_conn>\d+)/(?P<srv_conn>\d+)/(?P<retries>\d+)\s+.*\s+"(?P<request_line>[^"]+)"'

fields = []

# Iterate through log entries
for log_entry in log_entries:
    # Extract fields using regular expression
    match = re.match(regex_pattern, log_entry)
    if match:
        
         # Convert timestamp to datetime value
        timestamp_str = match.group('timestamp')
        timestamp = datetime.strptime(timestamp_str, '%b %d %H:%M:%S')
        # Set the current year as the placeholder
        current_year = datetime.now().year
        timestamp = timestamp.replace(year=current_year)
        
        
         # Convert datetime to datetime value
        datetime_str =  match.group('datetime')
        datetime_format = '%d/%b/%Y:%H:%M:%S.%f'
        mdatetime = datetime.strptime(datetime_str, datetime_format).timestamp()
        
        timing_dict['act_time']=0
        timing_dict['fe_time']=0
        timing_dict['be_time']=0
        timing_dict['srv_time']=0
        timing_dict['total_time']=0
        
        timing_columns = ['act_time', 'fe_time', 'be_time', 'srv_time', 'total_time']
        values_list = match.group('timing').split("/")
        timing_dict = {column: int(value) for column, value in zip(timing_columns, values_list)}
        
        # Split the request line by spaces
        request_line_parts = match.group('request_line').split()
        http_method = request_line_parts[0]
        endpoint = request_line_parts[1]
        http_version = request_line_parts[2]

        #print(timing_dict)
        
        fields.append({
            'xtimestamp': timestamp,
            'agg_timestamp': timestamp.strftime('%Y:%m:%d %H:%M'),
            'day_of_week': timestamp.strftime('%A'),
            'hour': timestamp.strftime('%H'),
            'server': match.group('server'),
            'client_ip': match.group('client_ip'),
            'client_port': int(match.group('client_port')),
            #'xdatetime': mdatetime,
            'frontend': match.group('frontend'),
            'backend': match.group('backend'),
            'backendservername': match.group('backendservername'),
            'timing': match.group('timing'),
            'http_status': categorize_status_code(int(match.group('http_status'))),
            'bytes_read': int(match.group('bytes_read')),
            'request_cookie': '-',
            'response_cookie': '-',
            'termination_state': '-',
            'act_conn': int(match.group('act_conn')),
            'fe_conn': int(match.group('fe_conn')),
            'be_conn': int(match.group('be_conn')),
            'srv_conn': int(match.group('srv_conn')),
            'act_time': int(timing_dict['act_time']),
            'fe_time': int(timing_dict['fe_time']),
            'be_time': int(timing_dict['be_time']),
            'srv_time': int(timing_dict['srv_time']),
            'total_time': int(timing_dict['total_time']),
            'act_sess': 0,
            'fe_sess': 0,
            'be_sess': 0,
            'srv_sess': 0,
            'retries': int(match.group('retries')),
            #'request_line': match.group('request_line'),
            'http_method': http_method,
            'endpoint': endpoint,
            'http_version': http_version,
            'count_value': 1
            
        })

# Print the first n extracted fields
for field in fields[:1]:
    print(field)
    
# Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(fields)


{'xtimestamp': datetime.datetime(2023, 6, 6, 12, 49, 8), 'agg_timestamp': '2023:06:06 12:49', 'day_of_week': 'Tuesday', 'hour': '12', 'server': 'interswitch-virtual-machine', 'client_ip': '41.75.170.169', 'client_port': 13805, 'frontend': 'www-https', 'backend': 'svaAgentLogin_backend', 'backendservername': 'api_webserver5', 'timing': '19203/0/0/525/19735', 'http_status': 'uptime', 'bytes_read': 517, 'request_cookie': '-', 'response_cookie': '-', 'termination_state': '-', 'act_conn': 4046, 'fe_conn': 4046, 'be_conn': 101, 'srv_conn': 54, 'act_time': 19203, 'fe_time': 0, 'be_time': 0, 'srv_time': 525, 'total_time': 19735, 'act_sess': 0, 'fe_sess': 0, 'be_sess': 0, 'srv_sess': 0, 'retries': 0, 'http_method': 'GET', 'endpoint': '/api/v2/quickteller/agent/accountBalance/23027333', 'http_version': 'HTTP/1.1', 'count_value': 1}


In [145]:
#print(df.drop('http_status', axis=1).values)
print(df.shape)
df.head(5)

(3336, 34)


Unnamed: 0,xtimestamp,agg_timestamp,day_of_week,hour,server,client_ip,client_port,frontend,backend,backendservername,...,total_time,act_sess,fe_sess,be_sess,srv_sess,retries,http_method,endpoint,http_version,count_value
0,2023-06-06 12:49:08,2023:06:06 12:49,Tuesday,12,interswitch-virtual-machine,41.75.170.169,13805,www-https,svaAgentLogin_backend,api_webserver5,...,19735,0,0,0,0,0,GET,/api/v2/quickteller/agent/accountBalance/23027333,HTTP/1.1,1
1,2023-06-06 12:49:08,2023:06:06 12:49,Tuesday,12,interswitch-virtual-machine,41.75.170.75,15702,www-https,interswitchstore_backend,interswitchstore_webserver,...,194,0,0,0,0,0,POST,/interswitchstore/device/geolocationstatus,HTTP/1.1,1
2,2023-06-06 12:49:08,2023:06:06 12:49,Tuesday,12,interswitch-virtual-machine,41.210.155.99,38338,www-https,interswitchstore_backend,interswitchstore_webserver,...,228,0,0,0,0,0,POST,/interswitchstore/device/geolocationstatus,HTTP/1.1,1
3,2023-06-06 12:49:08,2023:06:06 12:49,Tuesday,12,interswitch-virtual-machine,197.221.137.205,34708,www-https,api_backend,api_webserver2,...,72,0,0,0,0,0,DELETE,/api/v1/appservice/users/23026724,HTTP/1.1,1
4,2023-06-06 12:49:08,2023:06:06 12:49,Tuesday,12,interswitch-virtual-machine,102.87.162.247,36122,www-https,svaAgentLogin_backend,api_webserver5,...,781,0,0,0,0,0,GET,/api/v2/quickteller/agent/accountBalance/23020622,HTTP/1.1,1


In [146]:
print(df.isnull().sum())

xtimestamp           0
agg_timestamp        0
day_of_week          0
hour                 0
server               0
client_ip            0
client_port          0
frontend             0
backend              0
backendservername    0
timing               0
http_status          0
bytes_read           0
request_cookie       0
response_cookie      0
termination_state    0
act_conn             0
fe_conn              0
be_conn              0
srv_conn             0
act_time             0
fe_time              0
be_time              0
srv_time             0
total_time           0
act_sess             0
fe_sess              0
be_sess              0
srv_sess             0
retries              0
http_method          0
endpoint             0
http_version         0
count_value          0
dtype: int64


## Perform analysis on the DataFrame

In [147]:
# Request Analysis
request_counts = df['endpoint'].value_counts()
print("Request Counts:")
print(request_counts)


# Response Analysis
response_counts = df['http_status'].value_counts()
print("\nResponse Counts:")
print(response_counts)

Request Counts:
/api/v1A/svapayments/validateCustomer                   333
/interswitchstore/device/trackupdate                    195
/interswitchstore/device/geolocationstatus              179
/api/v2/quickteller/agent/agentLogin                    172
/api/v1A/svapayments/sendAdviceRequest                  161
                                                       ... 
/api/v1A/svapayments/transactions/EVERSEND1682754639      1
/api/v2/quickteller/agent/appNotifications/3Is37494       1
/api/v1A/svapayments/transactions/01272854120199          1
/api/v2/quickteller/agent/accountBalance/3IS14365         1
/api/v2/quickteller/agent/accountBalance/23029472         1
Name: endpoint, Length: 892, dtype: int64

Response Counts:
uptime      2943
downtime     341
other         52
Name: http_status, dtype: int64


In [148]:
# Timing Analysis
timing_columns = ['act_time', 'fe_time', 'be_time', 'srv_time', 'total_time']
df[timing_columns] = df['timing'].str.split('/', expand=True).astype(int)
timing_summary = df[timing_columns].describe()
print("\nTiming Summary:")
print(timing_summary)


Timing Summary:
            act_time      fe_time      be_time       srv_time     total_time
count    3336.000000  3336.000000  3336.000000    3336.000000    3336.000000
mean    10362.869604    -0.005396     0.554257    9036.652878   19496.448741
std     26650.251019     0.073268     1.138423   18906.966389   34318.256343
min        -1.000000    -1.000000    -1.000000      -1.000000      35.000000
25%       161.000000     0.000000     0.000000     195.500000     609.750000
50%       540.500000     0.000000     0.000000     693.000000    4019.500000
75%      5868.000000     0.000000     1.000000    7714.000000   21420.250000
max    231518.000000     0.000000    16.000000  155133.000000  298442.000000


In [149]:
# Connection Analysis
connection_columns = ['act_conn', 'fe_conn', 'be_conn', 'srv_conn']
connection_summary = df[connection_columns].describe()
print("\nConnection Summary:")
print(connection_summary)


Connection Summary:
          act_conn      fe_conn      be_conn     srv_conn
count  3336.000000  3336.000000  3336.000000  3336.000000
mean   3994.172962  3994.172962   104.748501    55.988609
std      53.038910    53.038910    99.072632    61.903217
min    3924.000000  3924.000000     0.000000     0.000000
25%    3948.000000  3948.000000    23.000000    13.000000
50%    3974.000000  3974.000000    72.000000    39.000000
75%    4051.000000  4051.000000   175.250000    66.000000
max    4096.000000  4096.000000   359.000000   291.000000


In [150]:
# Bytes Analysis
bytes_summary = df['bytes_read'].describe()
print("\nBytes Summary:")
print(bytes_summary)


Bytes Summary:
count      3336.000000
mean       3202.988309
std       35530.296954
min         169.000000
25%         441.000000
50%         517.000000
75%         662.000000
max      970778.000000
Name: bytes_read, dtype: float64


In [151]:
# Time Series Analysis
# import pandas as pd
# import matplotlib.pyplot as plt
#count_per_10min = df.resample('10T').count()

In [152]:
# import matplotlib.pyplot as plt
# # Plotting
# plt.figure(figsize=(12, 6))
# yearly_traffic.plot(kind='bar', color='blue')
# plt.title('Yearly Traffic')
# plt.xlabel('Year')
# plt.ylabel('Bytes Read')
# plt.show()

## Feature Engineering

1. Date and Time Features: Extract additional information from the timestamp field, such as hour of the day, day of the week, month, etc. These features can help capture any temporal patterns or trends in the data.

2. Categorical Encoding: Convert categorical variables like server, frontend, backend, etc., into numeric representations using one-hot encoding or label encoding. This allows machine learning models to work with categorical data effectively.

3. Session Duration: Calculate the duration of each session by subtracting the start and end timestamps. This can provide insights into session lengths and help identify sessions with unusually long or short durations.

4. Request Analysis: Extract features from the request_line, such as the HTTP method (GET, POST, etc.), endpoint, or API version. These features can help analyze request patterns and identify popular endpoints or API versions.

5. Response Analysis: Create binary features based on the HTTP status code, such as whether the response was successful (200-299) or had an error (400-599). This can help identify patterns in successful and failed responses.

## Model Selection: 
Explore various deep learning models that are suitable for time-series prediction tasks. Some popular models include Long Short-Term Memory (LSTM) networks, Gated Recurrent Units (GRU), Convolutional Neural Networks (CNN), or Transformer-based architectures. These models can capture temporal dependencies and patterns in the log data.

### TensorFlow

In [153]:
# import tensorflow as tf
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder

# # Preprocess the data and prepare X and y
# df_filtered = df[df['http_status'] != 'other']
# X = df_filtered.drop(['http_status'], axis=1)
# y = df_filtered['http_status']

# # Encode the categorical target variable
# encoder = OneHotEncoder(sparse=False)
# y_encoded = encoder.fit_transform(y)

# # Split the data into training and testing datasets
# X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# # Build the TensorFlow model
# model = tf.keras.models.Sequential([
#     tf.keras.layers.Dense(64, activation='relu', input_shape=(2,1)),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dense(2, activation='sigmoid')
# ])

# # Compile the model
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# # Train the model
# model.fit(X_train, y_train, epochs=10, batch_size=32)

# # Evaluate the model
# loss, accuracy = model.evaluate(X_test, y_test)
# print("Model Accuracy:", accuracy)

# # Make predictions
# predictions = model.predict(X_test)


In [155]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Preprocess the data and prepare X and y
df_filtered = df[df['http_status'] != 'other']
X = df_filtered[['bytes_read', 'request_cookie', 'response_cookie', 'termination_state', 'act_conn', 'request_line']]
y = df_filtered['http_status']

# Encode the categorical variables in X
label_encoder = LabelEncoder()
X['request_cookie'] = label_encoder.fit_transform(X['request_cookie'])
X['response_cookie'] = label_encoder.fit_transform(X['response_cookie'])
X['termination_state'] = label_encoder.fit_transform(X['termination_state'])
X['request_line'] = label_encoder.fit_transform(X['request_line'])

# Scale the numerical features in X
scaler = StandardScaler()
X[['bytes_read', 'act_conn']] = scaler.fit_transform(X[['bytes_read', 'act_conn']])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the TensorFlow model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(6,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")


KeyError: "['request_line'] not in index"

### Keras

## Training and Evaluation:
Split your dataset into training, validation, and testing sets. Train your deep learning model on the training data and fine-tune hyperparameters using the validation set. Evaluate the model's performance using appropriate metrics like accuracy, precision, recall, F1-score, or area under the ROC curve (AUC-ROC).

## Model Interpretability:
Consider using techniques that enhance model interpretability, especially in critical systems like predicting downtime. Methods like attention mechanisms, feature importance analysis, or SHAP (SHapley Additive exPlanations) values can help you understand the factors contributing to downtime predictions.