In [28]:
!pip install --upgrade pip
!pip install pandas numpy scikit-learn scipy graphviz shap matplotlib tensorflow

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.5.4-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow)
  Downloading h5py-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.5 kB)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 k

In [30]:
# Data Processing
import pandas as pd
import numpy as np

from shap import TreeExplainer
from shap import summary_plot
import shap.plots
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV, train_test_split
from scipy.stats import randint
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn import svm

import matplotlib.pyplot as plt

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [6]:

# fraction = 0.25

# sampled_chunks = []
# chunk_size = 1000000

# for i, chunk in enumerate(pd.read_csv('traces/upb-hyccups2012/upb2012.csv', chunksize=chunk_size)):
#     if i > 3:
#         break
#     # Sample a fraction of rows from each chunk
#     sampled_chunk = chunk.sample(frac=fraction, random_state=1)
#     sampled_chunks.append(sampled_chunk)
# df = pd.concat(sampled_chunks, ignore_index=True)


df = pd.read_csv("traces/upb-hyccups2012/upb2012.csv", nrows=50000)

In [7]:
df

Unnamed: 0,messageId,messageSource,messageHopCount,oldRelayId,oldFriendWithDestination,oldRelayBattery,oldCommonCommunity,oldDataMemory,newRelayId,newFriendWithDestination,newRelayBattery,newCommonCommunity,newDataMemory
0,0,0,1,5,0,0.333218,0,0.045,0,0,0.730968,0,0.000
1,1,0,1,5,0,0.333218,0,0.045,0,0,0.730968,0,0.000
2,2,0,1,5,0,0.333218,0,0.045,0,0,0.730968,0,0.000
3,3,0,1,5,0,0.333218,0,0.045,0,0,0.730968,0,0.000
4,4,0,1,5,0,0.333218,0,0.045,0,0,0.730968,0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,1281,5,2,0,0,0.730968,0,0.286,2,0,0.637417,0,0.222
49996,1282,5,2,0,0,0.730968,0,0.286,2,0,0.637417,0,0.222
49997,1283,5,2,0,0,0.730968,0,0.286,2,0,0.637417,0,0.222
49998,1284,5,2,0,0,0.730968,0,0.286,2,0,0.637417,0,0.222


In [8]:
successful_messages = pd.read_csv("traces/upb-hyccups2012/successful2012.csv", nrows=10000)

In [9]:
successful_messages

Unnamed: 0,messageId,lastRelay,destination
0,14,5,0
1,579,5,0
2,595,5,0
3,604,6,1
4,1176,6,1
...,...,...,...
9995,1797,4,0
9996,1798,4,0
9997,1291,4,0
9998,1864,4,0


In [10]:
df["usefulTransfer"] = np.NaN
df

Unnamed: 0,messageId,messageSource,messageHopCount,oldRelayId,oldFriendWithDestination,oldRelayBattery,oldCommonCommunity,oldDataMemory,newRelayId,newFriendWithDestination,newRelayBattery,newCommonCommunity,newDataMemory,usefulTransfer
0,0,0,1,5,0,0.333218,0,0.045,0,0,0.730968,0,0.000,
1,1,0,1,5,0,0.333218,0,0.045,0,0,0.730968,0,0.000,
2,2,0,1,5,0,0.333218,0,0.045,0,0,0.730968,0,0.000,
3,3,0,1,5,0,0.333218,0,0.045,0,0,0.730968,0,0.000,
4,4,0,1,5,0,0.333218,0,0.045,0,0,0.730968,0,0.000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,1281,5,2,0,0,0.730968,0,0.286,2,0,0.637417,0,0.222,
49996,1282,5,2,0,0,0.730968,0,0.286,2,0,0.637417,0,0.222,
49997,1283,5,2,0,0,0.730968,0,0.286,2,0,0.637417,0,0.222,
49998,1284,5,2,0,0,0.730968,0,0.286,2,0,0.637417,0,0.222,


In [11]:
# make sure column is integer not float
df["usefulTransfer"] = pd.Series(dtype=np.int64)

for index, row in successful_messages.iterrows():
  message_id = row.iloc[0]
  last_relay = row.iloc[1]
  destination = row.iloc[2]

  reached_source = False
  queue = df[(df['messageId'] == message_id) & (df['newRelayId'] == last_relay)].index.tolist()
  visited = []
  while (True):
    if (len(queue) == 0):
       break

    curr_idx = queue.pop()
    visited.append(curr_idx)

    df_row = df.iloc[curr_idx]
    message_source = df_row.iloc[1]
    old_relay_id = df_row.iloc[3]

    if old_relay_id == message_source:
        break

    df.loc[curr_idx, "usefulTransfer"] = 1
    last_relay = old_relay_id


    for idx in df[(df['messageId'] == message_id) & (df['newRelayId'] == last_relay)].index.tolist():
       if (idx not in visited):
         queue.insert(0, idx)

df.loc[(df['usefulTransfer'] != 1), 'usefulTransfer'] = 0
df = df.drop(columns=['messageId', 'oldRelayId', 'newRelayId', 'messageSource'])
df['usefulTransfer'].value_counts()[1]


2663

In [12]:
df.to_csv('traces/upb-hyccups2012/with_result.csv', index=False)


In [13]:
positive_df = df.loc[df['usefulTransfer'] == 1]
negative_df = df.loc[df['usefulTransfer'] == 0].sample(len(positive_df))
balanced_df = pd.concat([positive_df, negative_df], ignore_index=True)
print(balanced_df.min())
print(balanced_df.max())
balanced_df

messageHopCount             0.000000
oldFriendWithDestination    0.000000
oldRelayBattery             0.128897
oldCommonCommunity          0.000000
oldDataMemory               0.045000
newFriendWithDestination    0.000000
newRelayBattery             0.128897
newCommonCommunity          0.000000
newDataMemory               0.000000
usefulTransfer              0.000000
dtype: float64
messageHopCount             6.000000
oldFriendWithDestination    1.000000
oldRelayBattery             0.964487
oldCommonCommunity          1.000000
oldDataMemory               0.808000
newFriendWithDestination    1.000000
newRelayBattery             0.984842
newCommonCommunity          1.000000
newDataMemory               0.813000
usefulTransfer              1.000000
dtype: float64


Unnamed: 0,messageHopCount,oldFriendWithDestination,oldRelayBattery,oldCommonCommunity,oldDataMemory,newFriendWithDestination,newRelayBattery,newCommonCommunity,newDataMemory,usefulTransfer
0,0,0,0.730968,1,0.047,1,0.637417,0,0.001,1.0
1,0,0,0.730968,1,0.047,1,0.637417,0,0.009,1.0
2,0,0,0.730968,1,0.047,1,0.637417,0,0.019,1.0
3,2,0,0.730968,0,0.047,0,0.637417,0,0.041,1.0
4,2,0,0.637417,0,0.108,0,0.730968,0,0.047,1.0
...,...,...,...,...,...,...,...,...,...,...
5321,4,1,0.240536,0,0.450,1,0.385189,0,0.449,0.0
5322,1,0,0.730968,0,0.213,0,0.333218,0,0.187,0.0
5323,5,0,0.240536,0,0.579,0,0.385189,0,0.572,0.0
5324,3,0,0.546740,0,0.379,0,0.128897,0,0.354,0.0


In [14]:
balanced_df['messageHopCount']

0       0
1       0
2       0
3       2
4       2
       ..
5321    4
5322    1
5323    5
5324    3
5325    1
Name: messageHopCount, Length: 5326, dtype: int64

In [15]:
# add preprocessing
min_max_scaler = MinMaxScaler((0,1))
preprocessed_df = balanced_df.copy()

result_df = pd.DataFrame(balanced_df['usefulTransfer'].copy())
one_hot_encoder = LabelEncoder()
encoded = one_hot_encoder.fit_transform(result_df)
result_df = pd.DataFrame(encoded, columns=['usefulTransfer'])

preprocessed_df = preprocessed_df.drop(columns=['usefulTransfer'])
columns = preprocessed_df.columns
indices = preprocessed_df.index
preprocessed_df['messageHopCount'] = min_max_scaler.fit_transform(preprocessed_df['messageHopCount'].values.reshape(-1,1))
standard_scaler = StandardScaler()
preprocessed_df = standard_scaler.fit_transform(preprocessed_df)
preprocessed_df = pd.DataFrame(preprocessed_df, indices, columns)
preprocessed_df


  y = column_or_1d(y, warn=True)


Unnamed: 0,messageHopCount,oldFriendWithDestination,oldRelayBattery,oldCommonCommunity,oldDataMemory,newFriendWithDestination,newRelayBattery,newCommonCommunity,newDataMemory
0,-1.399206,-0.689454,1.296519,2.057141,-1.816290,1.299961,1.045775,-0.302683,-1.917095
1,-1.399206,-0.689454,1.296519,2.057141,-1.816290,1.299961,1.045775,-0.302683,-1.879593
2,-1.399206,-0.689454,1.296519,2.057141,-1.816290,1.299961,1.045775,-0.302683,-1.832715
3,-0.295753,-0.689454,1.296519,-0.486112,-1.816290,-0.769254,1.045775,-0.302683,-1.729585
4,-0.295753,-0.689454,0.899040,-0.486112,-1.521322,-0.769254,1.480524,-0.302683,-1.701458
...,...,...,...,...,...,...,...,...,...
5321,0.807701,1.450424,-0.787237,-0.486112,0.132434,1.299961,-0.126384,-0.302683,0.183023
5322,-0.847480,-0.689454,1.296519,-0.486112,-1.013590,-0.769254,-0.367903,-0.302683,-1.045171
5323,1.359427,-0.689454,-0.787237,-0.486112,0.756219,-0.769254,-0.126384,-0.302683,0.759617
5324,0.255974,-0.689454,0.513767,-0.486112,-0.210890,-0.769254,-1.317428,-0.302683,-0.262315


In [16]:
result_df

Unnamed: 0,usefulTransfer
0,1
1,1
2,1
3,1
4,1
...,...
5321,0
5322,0
5323,0
5324,0


In [17]:
preprocessed_df

Unnamed: 0,messageHopCount,oldFriendWithDestination,oldRelayBattery,oldCommonCommunity,oldDataMemory,newFriendWithDestination,newRelayBattery,newCommonCommunity,newDataMemory
0,-1.399206,-0.689454,1.296519,2.057141,-1.816290,1.299961,1.045775,-0.302683,-1.917095
1,-1.399206,-0.689454,1.296519,2.057141,-1.816290,1.299961,1.045775,-0.302683,-1.879593
2,-1.399206,-0.689454,1.296519,2.057141,-1.816290,1.299961,1.045775,-0.302683,-1.832715
3,-0.295753,-0.689454,1.296519,-0.486112,-1.816290,-0.769254,1.045775,-0.302683,-1.729585
4,-0.295753,-0.689454,0.899040,-0.486112,-1.521322,-0.769254,1.480524,-0.302683,-1.701458
...,...,...,...,...,...,...,...,...,...
5321,0.807701,1.450424,-0.787237,-0.486112,0.132434,1.299961,-0.126384,-0.302683,0.183023
5322,-0.847480,-0.689454,1.296519,-0.486112,-1.013590,-0.769254,-0.367903,-0.302683,-1.045171
5323,1.359427,-0.689454,-0.787237,-0.486112,0.756219,-0.769254,-0.126384,-0.302683,0.759617
5324,0.255974,-0.689454,0.513767,-0.486112,-0.210890,-0.769254,-1.317428,-0.302683,-0.262315


In [19]:
X = preprocessed_df.copy() # All columns except the last one
y = result_df.copy().squeeze()
print(X.shape)
print(y.shape)

(5326, 9)
(5326,)


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.87
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.79      0.85       520
           1       0.83      0.94      0.88       546

    accuracy                           0.87      1066
   macro avg       0.87      0.87      0.87      1066
weighted avg       0.87      0.87      0.87      1066

Confusion Matrix:
[[412 108]
 [ 34 512]]


In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
base = svm.SVC(kernel='rbf')

# Hyperparameter tuning using Grid Search
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],  # Kernel coefficient
}

grid_poly = GridSearchCV(base, param_grid, refit=True, cv=5)
grid_poly.fit(X_train, y_train)

In [41]:
best_svm = grid_poly.best_estimator_
y_pred = best_svm.predict(X_test)
print(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

[1 0 0 ... 1 1 1]
Accuracy: 0.84
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.82      0.84       520
           1       0.84      0.86      0.85       546

    accuracy                           0.84      1066
   macro avg       0.84      0.84      0.84      1066
weighted avg       0.84      0.84      0.84      1066

Confusion Matrix:
[[427  93]
 [ 75 471]]


In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Sequential()
model.add(Dense(64, input_dim=9, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])


trained = model.fit(X_train, y_train, epochs=50, batch_size=10, validation_split=0.2)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m341/341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 990us/step - accuracy: 0.7275 - loss: 0.5560 - val_accuracy: 0.7934 - val_loss: 0.4500
Epoch 2/50
[1m341/341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 667us/step - accuracy: 0.7998 - loss: 0.4204 - val_accuracy: 0.8005 - val_loss: 0.4310
Epoch 3/50
[1m341/341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 565us/step - accuracy: 0.8414 - loss: 0.3773 - val_accuracy: 0.8063 - val_loss: 0.4305
Epoch 4/50
[1m341/341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 535us/step - accuracy: 0.8212 - loss: 0.3816 - val_accuracy: 0.8087 - val_loss: 0.4206
Epoch 5/50
[1m341/341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 534us/step - accuracy: 0.8349 - loss: 0.3550 - val_accuracy: 0.8005 - val_loss: 0.4099
Epoch 6/50
[1m341/341[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 573us/step - accuracy: 0.8289 - loss: 0.3561 - val_accuracy: 0.8005 - val_loss: 0.4101
Epoch 7/50
[1m341/341[0m 

In [37]:
neural_model = trained.model
y_pred = neural_model.predict(X_test)

# Set the threshold
threshold = 0.5

# Convert probabilities to binary labels
y_pred = (y_pred > threshold).astype(int)


# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 540us/step
Accuracy: 0.84
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.74      0.82       520
           1       0.79      0.94      0.86       546

    accuracy                           0.84      1066
   macro avg       0.86      0.84      0.84      1066
weighted avg       0.86      0.84      0.84      1066

Confusion Matrix:
[[385 135]
 [ 31 515]]


In [18]:
explainer = TreeExplainer(rf_classifier, feature_names=preprocessed_df)
shap_values = np.array(explainer.shap_values(X_test))

In [19]:
truncated_shap = shap_values[:20][:][:]
truncated_x = X_test.head(20).to_numpy()

In [20]:
truncated_shap

array([[[-4.18734230e-01,  4.18734230e-01],
        [-4.69185409e-02,  4.69185409e-02],
        [ 3.17443729e-02, -3.17443729e-02],
        [ 2.89864573e-02, -2.89864573e-02],
        [ 6.64429314e-03, -6.64429314e-03],
        [-1.12317601e-01,  1.12317601e-01],
        [-2.91764105e-02,  2.91764105e-02],
        [-1.16940708e-03,  1.16940708e-03],
        [ 3.71992826e-02, -3.71992826e-02]],

       [[ 1.14869537e-01, -1.14869537e-01],
        [ 4.46056069e-02, -4.46056069e-02],
        [ 6.13322627e-02, -6.13322627e-02],
        [ 8.40323059e-02, -8.40323059e-02],
        [ 1.44697711e-02, -1.44697711e-02],
        [ 6.79872144e-02, -6.79872144e-02],
        [ 9.84617612e-02, -9.84617612e-02],
        [ 1.03233097e-02, -1.03233097e-02],
        [ 1.76447467e-04, -1.76447467e-04]],

       [[ 8.41004339e-02, -8.41004339e-02],
        [ 1.89792582e-02, -1.89792582e-02],
        [ 5.02679040e-02, -5.02679040e-02],
        [ 7.65510226e-02, -7.65510226e-02],
        [ 1.00925676e-01, -1

In [22]:

# desired_features = list(balanced_df.columns[:-1])
# for i in range(0, len(desired_features) - 1):
#     for j in range(i+1, len(desired_features)):
#         feature_indices = [i, j]
#         shap_values_subset = shap_values[:, feature_indices]  
#         shap_values_subset
#         feat_subset = X_test[list(map(lambda f: desired_features[f], feature_indices))]
#         plt.figure()
#         shap.summary_plot(shap_values_subset, feat_subset, plot_type="bar")  
#         display(plt.gcf()) 


