In [3]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [4]:
raw_train = pd.read_csv('train_network.csv')

In [5]:
raw_test = pd.read_csv('test_network.csv')


In [6]:
raw_train.head()

Unnamed: 0,timestamp,src_ip,dest_ip,packet_size,latency,throughput,scenario,protocol,errors,retransmissions,device_type,bandwidth,signal_strength,network_load,error_rate,sdwan_link_status,bgp_link_status,mpls_link_status,congestion,time_category
0,2024-07-14 15:31:03,192.168.1.27,192.168.2.50,841,84,30,packet_loss,UDP,4.0,5.0,firewall,998,-50,0.428483,0.01708,down,down,down,1,non_business_hours
1,2024-07-14 15:32:03,192.168.1.50,192.168.2.30,724,46,48,peak_hours,UDP,9.0,4.0,firewall,187,-57,0.291209,0.013705,up,up,up,0,non_business_hours
2,2024-07-14 15:33:03,192.168.1.43,192.168.2.36,1219,22,58,jitter,UDP,1.0,,switch,328,-97,0.221508,0.027237,up,up,up,0,non_business_hours
3,2024-07-14 15:34:03,192.168.1.27,192.168.2.35,1394,100,48,jitter,TCP,8.0,3.0,router,491,-100,0.632264,0.011104,down,down,down,1,non_business_hours
4,2024-07-14 15:35:03,192.168.1.18,192.168.2.11,1470,40,55,high_traffic,TCP,1.0,1.0,router,860,-70,0.866514,0.048174,down,down,up,0,non_business_hours


In [7]:
raw_train.nunique()

timestamp            5000
src_ip                 50
dest_ip                50
packet_size          1398
latency               141
throughput             91
scenario                5
protocol                2
errors                 21
retransmissions        16
device_type             3
bandwidth             985
signal_strength        71
network_load         5000
error_rate           5000
sdwan_link_status       2
bgp_link_status         2
mpls_link_status        2
congestion              2
time_category           2
dtype: int64

In [8]:
raw_train.shape

(5000, 20)

In [9]:
train_df = raw_train.copy()
test_df = raw_test.copy()

In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   timestamp          5000 non-null   object 
 1   src_ip             5000 non-null   object 
 2   dest_ip            5000 non-null   object 
 3   packet_size        5000 non-null   int64  
 4   latency            5000 non-null   int64  
 5   throughput         5000 non-null   int64  
 6   scenario           5000 non-null   object 
 7   protocol           5000 non-null   object 
 8   errors             4797 non-null   float64
 9   retransmissions    4614 non-null   float64
 10  device_type        5000 non-null   object 
 11  bandwidth          5000 non-null   int64  
 12  signal_strength    5000 non-null   int64  
 13  network_load       5000 non-null   float64
 14  error_rate         5000 non-null   float64
 15  sdwan_link_status  5000 non-null   object 
 16  bgp_link_status    5000 

In [11]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   timestamp          1000 non-null   object 
 1   src_ip             1000 non-null   object 
 2   dest_ip            1000 non-null   object 
 3   packet_size        1000 non-null   int64  
 4   latency            1000 non-null   int64  
 5   throughput         1000 non-null   int64  
 6   scenario           1000 non-null   object 
 7   protocol           1000 non-null   object 
 8   errors             963 non-null    float64
 9   retransmissions    926 non-null    float64
 10  device_type        1000 non-null   object 
 11  bandwidth          1000 non-null   int64  
 12  signal_strength    1000 non-null   int64  
 13  network_load       1000 non-null   float64
 14  error_rate         1000 non-null   float64
 15  sdwan_link_status  1000 non-null   object 
 16  bgp_link_status    1000 n

In [12]:
train_y = train_df['congestion'].copy()

In [13]:
train_df.drop(columns = ['congestion'], inplace=True)
test_df.drop(columns='congestion', inplace=True)

In [14]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   timestamp          5000 non-null   object 
 1   src_ip             5000 non-null   object 
 2   dest_ip            5000 non-null   object 
 3   packet_size        5000 non-null   int64  
 4   latency            5000 non-null   int64  
 5   throughput         5000 non-null   int64  
 6   scenario           5000 non-null   object 
 7   protocol           5000 non-null   object 
 8   errors             4797 non-null   float64
 9   retransmissions    4614 non-null   float64
 10  device_type        5000 non-null   object 
 11  bandwidth          5000 non-null   int64  
 12  signal_strength    5000 non-null   int64  
 13  network_load       5000 non-null   float64
 14  error_rate         5000 non-null   float64
 15  sdwan_link_status  5000 non-null   object 
 16  bgp_link_status    5000 

In [15]:
train_df.drop(columns = 'timestamp', inplace=True)
test_df.drop(columns = 'timestamp', inplace=True)

In [16]:
train_df.drop(columns = ['src_ip', 'dest_ip'], inplace=True)
test_df.drop(columns = ['src_ip', 'dest_ip'], inplace=True)

In [17]:
train_df.columns

Index(['packet_size', 'latency', 'throughput', 'scenario', 'protocol',
       'errors', 'retransmissions', 'device_type', 'bandwidth',
       'signal_strength', 'network_load', 'error_rate', 'sdwan_link_status',
       'bgp_link_status', 'mpls_link_status', 'time_category'],
      dtype='object')

In [18]:
train_df[train_df.duplicated()]

Unnamed: 0,packet_size,latency,throughput,scenario,protocol,errors,retransmissions,device_type,bandwidth,signal_strength,network_load,error_rate,sdwan_link_status,bgp_link_status,mpls_link_status,time_category


In [19]:
test_df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

In [20]:
test_df.drop_duplicates(inplace=True)

In [21]:
train_df.isna().sum()

packet_size            0
latency                0
throughput             0
scenario               0
protocol               0
errors               203
retransmissions      386
device_type            0
bandwidth              0
signal_strength        0
network_load           0
error_rate             0
sdwan_link_status      0
bgp_link_status        0
mpls_link_status       0
time_category          0
dtype: int64

In [22]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   packet_size        5000 non-null   int64  
 1   latency            5000 non-null   int64  
 2   throughput         5000 non-null   int64  
 3   scenario           5000 non-null   object 
 4   protocol           5000 non-null   object 
 5   errors             4797 non-null   float64
 6   retransmissions    4614 non-null   float64
 7   device_type        5000 non-null   object 
 8   bandwidth          5000 non-null   int64  
 9   signal_strength    5000 non-null   int64  
 10  network_load       5000 non-null   float64
 11  error_rate         5000 non-null   float64
 12  sdwan_link_status  5000 non-null   object 
 13  bgp_link_status    5000 non-null   object 
 14  mpls_link_status   5000 non-null   object 
 15  time_category      5000 non-null   object 
dtypes: float64(4), int64(5),

In [23]:
num_cols = ['packet_size', 'latency', 'throughput', 'errors', 'retransmissions','bandwidth', 'signal_strength', 'network_load','error_rate']
cat_cols = ['scenario', 'protocol', 'sdwan_link_status', 'bgp_link_status', 'mpls_link_status', 'time_category']
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_imputer.fit(train_df[cat_cols])

In [24]:
num_imputer = SimpleImputer(strategy="mean")
num_imputer.fit(train_df[num_cols])

In [25]:
train_df[num_cols] = num_imputer.transform(train_df[num_cols])
test_df[num_cols] = num_imputer.transform(test_df[num_cols])
train_df[cat_cols] = cat_imputer.transform(train_df[cat_cols])
test_df[cat_cols] = cat_imputer.transform(test_df[cat_cols])

In [26]:
train_df.isna().sum()

packet_size          0
latency              0
throughput           0
scenario             0
protocol             0
errors               0
retransmissions      0
device_type          0
bandwidth            0
signal_strength      0
network_load         0
error_rate           0
sdwan_link_status    0
bgp_link_status      0
mpls_link_status     0
time_category        0
dtype: int64

In [27]:
train_df.head()

Unnamed: 0,packet_size,latency,throughput,scenario,protocol,errors,retransmissions,device_type,bandwidth,signal_strength,network_load,error_rate,sdwan_link_status,bgp_link_status,mpls_link_status,time_category
0,841.0,84.0,30.0,packet_loss,UDP,4.0,5.0,firewall,998.0,-50.0,0.428483,0.01708,down,down,down,non_business_hours
1,724.0,46.0,48.0,peak_hours,UDP,9.0,4.0,firewall,187.0,-57.0,0.291209,0.013705,up,up,up,non_business_hours
2,1219.0,22.0,58.0,jitter,UDP,1.0,4.821413,switch,328.0,-97.0,0.221508,0.027237,up,up,up,non_business_hours
3,1394.0,100.0,48.0,jitter,TCP,8.0,3.0,router,491.0,-100.0,0.632264,0.011104,down,down,down,non_business_hours
4,1470.0,40.0,55.0,high_traffic,TCP,1.0,1.0,router,860.0,-70.0,0.866514,0.048174,down,down,up,non_business_hours


In [28]:
# feature engineering
train_df['combined_load'] = train_df['network_load'] + train_df['error_rate']
test_df['combined_load'] = test_df['network_load'] + test_df['error_rate']

In [29]:
train_df['efficiency'] = train_df['throughput'] / (train_df['latency'] + 1)  # Add 1 to avoid division by zero
test_df['efficiency'] = test_df['throughput'] / (test_df['latency'] + 1)
train_df['reliability_issues'] = train_df['errors'].fillna(0) + train_df['retransmissions'].fillna(0)
test_df['reliability_issues'] = test_df['errors'].fillna(0) + test_df['retransmissions'].fillna(0)
train_df['signal_to_bandwidth'] = train_df['signal_strength'] / train_df['bandwidth']
test_df['signal_to_bandwidth'] = test_df['signal_strength'] / test_df['bandwidth']


In [30]:
# drop the old columns
train_df.drop(columns=['network_load', 'error_rate', 'throughput', 'latency', 'errors', 'retransmissions', 'signal_strength','bandwidth'], inplace=True)



In [31]:
# drop the old columns
test_df.drop(columns=['network_load', 'error_rate', 'throughput', 'latency', 'errors', 'retransmissions', 'signal_strength','bandwidth'], inplace=True)



In [32]:
train_df.head()

Unnamed: 0,packet_size,scenario,protocol,device_type,sdwan_link_status,bgp_link_status,mpls_link_status,time_category,combined_load,efficiency,reliability_issues,signal_to_bandwidth
0,841.0,packet_loss,UDP,firewall,down,down,down,non_business_hours,0.445563,0.352941,9.0,-0.0501
1,724.0,peak_hours,UDP,firewall,up,up,up,non_business_hours,0.304913,1.021277,13.0,-0.304813
2,1219.0,jitter,UDP,switch,up,up,up,non_business_hours,0.248745,2.521739,5.821413,-0.295732
3,1394.0,jitter,TCP,router,down,down,down,non_business_hours,0.643368,0.475248,11.0,-0.203666
4,1470.0,high_traffic,TCP,router,down,down,up,non_business_hours,0.914687,1.341463,2.0,-0.081395


In [33]:
train_df.nunique()

packet_size            1398
scenario                  5
protocol                  2
device_type               3
sdwan_link_status         2
bgp_link_status           2
mpls_link_status          2
time_category             2
combined_load          5000
efficiency             2716
reliability_issues       54
signal_to_bandwidth    4533
dtype: int64

In [34]:
train_df.scenario.unique()

array(['packet_loss', 'peak_hours', 'jitter', 'high_traffic', 'normal'],
      dtype=object)

In [35]:
for col in cat_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

In [36]:
train_df.head()

Unnamed: 0,packet_size,scenario,protocol,device_type,sdwan_link_status,bgp_link_status,mpls_link_status,time_category,combined_load,efficiency,reliability_issues,signal_to_bandwidth
0,841.0,3,1,firewall,0,0,0,1,0.445563,0.352941,9.0,-0.0501
1,724.0,4,1,firewall,1,1,1,1,0.304913,1.021277,13.0,-0.304813
2,1219.0,1,1,switch,1,1,1,1,0.248745,2.521739,5.821413,-0.295732
3,1394.0,1,0,router,0,0,0,1,0.643368,0.475248,11.0,-0.203666
4,1470.0,0,0,router,0,0,1,1,0.914687,1.341463,2.0,-0.081395


In [37]:
train_df['device_type'] = le.fit_transform(train_df['device_type'])
test_df['device_type'] = le.transform(test_df['device_type'])

In [38]:
train_df.head()

Unnamed: 0,packet_size,scenario,protocol,device_type,sdwan_link_status,bgp_link_status,mpls_link_status,time_category,combined_load,efficiency,reliability_issues,signal_to_bandwidth
0,841.0,3,1,0,0,0,0,1,0.445563,0.352941,9.0,-0.0501
1,724.0,4,1,0,1,1,1,1,0.304913,1.021277,13.0,-0.304813
2,1219.0,1,1,2,1,1,1,1,0.248745,2.521739,5.821413,-0.295732
3,1394.0,1,0,1,0,0,0,1,0.643368,0.475248,11.0,-0.203666
4,1470.0,0,0,1,0,0,1,1,0.914687,1.341463,2.0,-0.081395


In [39]:
log_transform_cols = ['packet_size', 'combined_load', 'efficiency', 'reliability_issues']
scaling_cols = log_transform_cols + ['signal_to_bandwidth']

In [40]:
train_df[log_transform_cols] = np.log1p(train_df[log_transform_cols])
test_df[log_transform_cols] = np.log1p(test_df[log_transform_cols])

In [41]:
for col in log_transform_cols:
    if (train_df[col] <= 0).any():
        print(f"Column '{col}' contains zero or negative values.")
    else:
        print(f"Column '{col}' is safe for log transformation.")

Column 'packet_size' is safe for log transformation.
Column 'combined_load' is safe for log transformation.
Column 'efficiency' is safe for log transformation.
Column 'reliability_issues' contains zero or negative values.


In [42]:
# to handle negative values while log transformation
for col in log_transform_cols:
    if (train_df[col] <= 0).any():
        min_val = train_df[col].min()
        constant = abs(min_val) + 1
        train_df[col] += constant
        print(f"Column '{col}' contains zero or negative values. Added constant {constant} to make all values positive.")

Column 'reliability_issues' contains zero or negative values. Added constant 1.0 to make all values positive.


In [43]:
for col in log_transform_cols:
    if (test_df[col] <= 0).any():
        min_val = test_df[col].min()
        constant = abs(min_val) + 1
        test_df[col] += constant
        print(f"Column '{col}' contains zero or negative values. Added constant {constant} to make all values positive.")

Column 'reliability_issues' contains zero or negative values. Added constant 1.0 to make all values positive.


In [44]:
train_df[log_transform_cols] = np.log1p(train_df[log_transform_cols])
test_df[log_transform_cols] = np.log1p(test_df[log_transform_cols])



In [45]:
#scaling

minmax = MinMaxScaler()
train_df = minmax.fit_transform(train_df)
test_df = minmax.transform(test_df)

In [46]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(train_df, train_y, test_size=0.3, random_state=0)

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [48]:
model = RandomForestClassifier(n_estimators=100, random_state=0)

In [49]:
model.fit(X_train, Y_train)

In [50]:
y_pred = model.predict(X_test)

In [51]:
acc = accuracy_score(Y_test, y_pred)

In [52]:
print(acc)

0.9926666666666667


In [53]:
class_distribution = Y_train.value_counts()

In [54]:
print(class_distribution)


congestion
0    2467
1    1033
Name: count, dtype: int64


In [55]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model, X_train, Y_train, cv=5)  # 5-fold cross-validation

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

Cross-validation scores: [0.99428571 0.99571429 0.98857143 0.99285714 0.99      ]
Mean cross-validation score: 0.9922857142857143


In [56]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Predict on the validation set
y_pred = model.predict(X_test)

# Calculate and print precision, recall, and F1-score
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Print the confusion matrix
conf_matrix = confusion_matrix(Y_test, y_pred)
print("Confusion matrix:")
print(conf_matrix)

Precision: 0.9858823529411764
Recall: 0.9882075471698113
F1-score: 0.9870435806831567
Confusion matrix:
[[1070    6]
 [   5  419]]


In [57]:
# serialization
import joblib

In [58]:
joblib.dump(model, "network-cong-classifier.pkl")

['network-cong-classifier.pkl']

In [62]:
model_to_import = joblib.load('network-cong-classifier.pkl')

In [63]:
model_to_import.intercept_, model_to_import.coef_

AttributeError: 'RandomForestClassifier' object has no attribute 'intercept_'

In [65]:
RandomForestClassifier.__dict__

mappingproxy({'__module__': 'sklearn.ensemble._forest',
              '__annotations__': {'_parameter_constraints': dict},
              '_parameter_constraints': {'n_estimators': [<sklearn.utils._param_validation.Interval at 0x12ec973a0>],
               'bootstrap': ['boolean'],
               'oob_score': ['boolean', <function callable(obj, /)>],
               'n_jobs': [numbers.Integral, None],
               'random_state': ['random_state'],
               'verbose': ['verbose'],
               'warm_start': ['boolean'],
               'max_samples': [None,
                <sklearn.utils._param_validation.Interval at 0x12ec973d0>,
                <sklearn.utils._param_validation.Interval at 0x12ec97430>],
               'max_depth': [<sklearn.utils._param_validation.Interval at 0x12ec4c190>,
                None],
               'min_samples_split': [<sklearn.utils._param_validation.Interval at 0x12ec4cfa0>,
                <sklearn.utils._param_validation.Interval at 0x12ec4fbe0

In [70]:
train_df.head

AttributeError: 'numpy.ndarray' object has no attribute 'head'