In [11]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [7]:
df = pd.read_csv("../util/network_failure_systems_data_with_id.csv",index_col="ID")

In [9]:
df.head(10)

Unnamed: 0_level_0,Unnamed: 0,Component_Type,Manufacturer,Installation_Date,Traffic_Load,CPU_Usage,Memory_Usage,Temperature,Failure_Type,Failure_Timestamp,Failure_Duration_Hours,Maintenance_Activity,Maintenance_Date,Maintenance_Cost
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
N1,0,load balancer,Cisco,2017-09-20,14.793688,98.684986,86.506722,27.580917,hardware,2023-01-28,20,inspection,2022-10-08,989.878718
N2,1,firewall,Cisco,2014-12-16,8.401736,94.045871,21.520613,37.433907,configuration,2023-05-10,3,inspection,2023-12-29,375.043681
N3,2,load balancer,Dell,2013-10-17,55.54011,62.387288,51.770011,31.115325,software,2023-04-28,23,upgrade,2022-10-01,786.470249
N4,3,switch,HP,2013-02-09,45.821845,38.581668,80.366058,46.517821,hardware,2022-09-29,19,repair,2022-12-07,898.526466
N5,4,firewall,Dell,2013-09-29,92.582371,10.666095,89.997417,18.900332,hardware,2022-07-18,1,upgrade,2023-09-24,276.162455
N6,5,load balancer,Cisco,2011-08-03,57.900779,42.865349,82.045208,19.714776,configuration,2023-04-07,18,upgrade,2022-02-13,304.059698
N7,6,firewall,HP,2016-07-22,3.327435,90.299558,65.661427,20.806854,hardware,2022-09-23,15,inspection,2023-08-29,863.552721
N8,7,load balancer,Dell,2011-04-25,21.081388,54.874531,86.323673,20.660357,configuration,2022-01-22,5,inspection,2022-09-08,493.639065
N9,8,router,Juniper,2012-08-26,1.695848,44.709806,73.123967,28.766554,configuration,2023-02-14,10,repair,2022-11-20,413.873598
N10,9,switch,HP,2017-08-23,98.901836,59.100503,4.531991,5.7008,software,2022-02-25,13,upgrade,2023-07-29,994.271999


In [14]:
df.fillna(method='ffill', inplace=True)


label_encoders = {}
categorical_columns = ['Component_Type', 'Manufacturer', 'Failure_Type', 'Maintenance_Activity']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


scaler = StandardScaler()
numerical_columns = ['Traffic_Load', 'CPU_Usage', 'Memory_Usage', 'Temperature', 'Failure_Duration_Hours', 'Maintenance_Cost']
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

X = df[['Component_Type', 'Manufacturer', 'Traffic_Load', 'CPU_Usage', 'Memory_Usage', 'Temperature']]
y = df['Failure_Type']


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))


Accuracy: 0.31675
              precision    recall  f1-score   support

           0       0.31      0.35      0.33      1274
           1       0.33      0.30      0.32      1375
           2       0.31      0.30      0.30      1351

    accuracy                           0.32      4000
   macro avg       0.32      0.32      0.32      4000
weighted avg       0.32      0.32      0.32      4000



In [19]:
from datetime import datetime

data = pd.read_csv('../util/network_failure_systems_data_with_id.csv', index_col='ID')


data['Installation_Date'] = pd.to_datetime(data['Installation_Date'])
data['Failure_Timestamp'] = pd.to_datetime(data['Failure_Timestamp'])
data['Maintenance_Date'] = pd.to_datetime(data['Maintenance_Date'])


data['Installation_Age'] = (datetime.now() - data['Installation_Date']).dt.days
data['Time_Since_Last_Maintenance'] = (datetime.now() - data['Maintenance_Date']).dt.days
data['Failure_Duration_Days'] = data['Failure_Duration_Hours'] / 24  # Convert hours to days


data['Average_Traffic_Load'] = data['Traffic_Load'].mean()
data['Average_CPU_Usage'] = data['CPU_Usage'].mean()
data['Average_Memory_Usage'] = data['Memory_Usage'].mean()


data = pd.get_dummies(data, columns=['Component_Type', 'Manufacturer'])


features = ['Installation_Age', 'Time_Since_Last_Maintenance', 'Failure_Duration_Days',
            'Average_Traffic_Load', 'Average_CPU_Usage', 'Average_Memory_Usage',
            'Temperature', 'Component_Type_load balancer', 'Component_Type_firewall',
            'Component_Type_switch', 'Component_Type_router',
            'Manufacturer_Cisco', 'Manufacturer_Dell', 'Manufacturer_HP', 'Manufacturer_Juniper']

X = data[features]
y = data['Failure_Type']  

In [20]:
X

Unnamed: 0_level_0,Installation_Age,Time_Since_Last_Maintenance,Failure_Duration_Days,Average_Traffic_Load,Average_CPU_Usage,Average_Memory_Usage,Temperature,Component_Type_load balancer,Component_Type_firewall,Component_Type_switch,Component_Type_router,Manufacturer_Cisco,Manufacturer_Dell,Manufacturer_HP,Manufacturer_Juniper
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
N1,2488,644,0.833333,49.513525,50.237834,49.714009,27.580917,1,0,0,0,1,0,0,0
N2,3497,197,0.125000,49.513525,50.237834,49.714009,37.433907,0,1,0,0,1,0,0,0
N3,3922,651,0.958333,49.513525,50.237834,49.714009,31.115325,1,0,0,0,0,1,0,0
N4,4172,584,0.791667,49.513525,50.237834,49.714009,46.517821,0,0,1,0,0,0,1,0
N5,3940,293,0.041667,49.513525,50.237834,49.714009,18.900332,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N19996,4789,884,0.833333,49.513525,50.237834,49.714009,30.502409,1,0,0,0,1,0,0,0
N19997,3936,859,0.166667,49.513525,50.237834,49.714009,24.509304,0,0,1,0,0,1,0,0
N19998,2499,883,0.541667,49.513525,50.237834,49.714009,13.486821,1,0,0,0,1,0,0,0
N19999,3416,262,0.125000,49.513525,50.237834,49.714009,42.053106,0,0,0,1,0,0,1,0


In [23]:
correlation_matrix = X.corr()
print("Correlation Matrix:\n", correlation_matrix)


from statsmodels.stats.outliers_influence import variance_inflation_factor


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X_scaled, i) for i in range(len(X.columns))]
print("\nVariance Inflation Factors (VIF):\n", vif_data)

Correlation Matrix:
                               Installation_Age  Time_Since_Last_Maintenance  \
Installation_Age                      1.000000                     0.012297   
Time_Since_Last_Maintenance           0.012297                     1.000000   
Failure_Duration_Days                -0.002988                    -0.006540   
Average_Traffic_Load                       NaN                          NaN   
Temperature                           0.007987                    -0.008623   
Component_Type_load balancer          0.002768                     0.004639   
Component_Type_firewall              -0.007521                    -0.005535   
Component_Type_switch                 0.005658                    -0.004311   
Component_Type_router                -0.000914                     0.005191   
Manufacturer_Cisco                    0.011556                     0.007391   
Manufacturer_Dell                    -0.003862                    -0.002755   
Manufacturer_HP                

  vif = 1. / (1. - r_squared_i)



Variance Inflation Factors (VIF):
                          Feature           VIF
0               Installation_Age  1.000434e+00
1    Time_Since_Last_Maintenance  1.000397e+00
2          Failure_Duration_Days  1.000262e+00
3           Average_Traffic_Load  1.000000e+00
4                    Temperature  1.000343e+00
5   Component_Type_load balancer  4.075656e+12
6        Component_Type_firewall  5.316742e+10
7          Component_Type_switch  1.445918e+11
8          Component_Type_router           inf
9             Manufacturer_Cisco  5.151033e+10
10             Manufacturer_Dell  4.794353e+10
11               Manufacturer_HP  1.344157e+12
12          Manufacturer_Juniper  1.695632e+12


In [22]:
reduced_features = [
    'Installation_Age', 'Time_Since_Last_Maintenance', 'Failure_Duration_Days',
    'Average_Traffic_Load', 'Temperature',
    'Component_Type_load balancer', 'Component_Type_firewall',
    'Component_Type_switch', 'Component_Type_router',
    'Manufacturer_Cisco', 'Manufacturer_Dell', 'Manufacturer_HP', 'Manufacturer_Juniper'
]

X_reduced = X[reduced_features]

X = X_reduced

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.32425

Classification Report:
                precision    recall  f1-score   support

configuration       0.31      0.33      0.32      1274
     hardware       0.35      0.34      0.34      1375
     software       0.32      0.31      0.32      1351

     accuracy                           0.32      4000
    macro avg       0.32      0.32      0.32      4000
 weighted avg       0.32      0.32      0.32      4000



In [26]:
predictions = rf_classifier.predict(X)
predictions

array(['configuration', 'configuration', 'software', ..., 'software',
       'configuration', 'software'], dtype=object)

In [28]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np


param_dist = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}


random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_dist,
                                   n_iter=50, cv=5, n_jobs=-1, verbose=2, random_state=42)


random_search.fit(X_train, y_train)


print("Best Parameters:", random_search.best_params_)

# Best estimator found by RandomizedSearchCV
best_rf_classifier = random_search.best_estimator_

# Predictions using the best estimator
y_pred_best_rf = best_rf_classifier.predict(X_test)

# Evaluation of the best estimator
print("Accuracy of Best RF Model:", accuracy_score(y_test, y_pred_best_rf))
print("\nClassification Report of Best RF Model:\n", classification_report(y_test, y_pred_best_rf))


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 20, 'bootstrap': True}
Accuracy of Best RF Model: 0.32525

Classification Report of Best RF Model:
                precision    recall  f1-score   support

configuration       0.31      0.32      0.32      1274
     hardware       0.34      0.32      0.33      1375
     software       0.32      0.33      0.33      1351

     accuracy                           0.33      4000
    macro avg       0.33      0.33      0.33      4000
 weighted avg       0.33      0.33      0.33      4000



In [None]:
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)


print("Best Parameters:", grid_search.best_params_)

best_rf_classifier = grid_search.best_estimator_


y_pred_best_rf = best_rf_classifier.predict(X_test)

print("Accuracy of Best RF Model:", accuracy_score(y_test, y_pred_best_rf))
print("\nClassification Report of Best RF Model:\n", classification_report(y_test, y_pred_best_rf))

logistic_classifier = LogisticRegression(random_state=42, max_iter=10000)
svc_classifier = SVC(random_state=42)
gb_classifier = GradientBoostingClassifier(random_state=42)


logistic_classifier.fit(X_train, y_train)
y_pred_logistic = logistic_classifier.predict(X_test)


svc_classifier.fit(X_train, y_train)
y_pred_svc = svc_classifier.predict(X_test)

gb_classifier.fit(X_train, y_train)
y_pred_gb = gb_classifier.predict(X_test)


print("Accuracy of Logistic Regression:", accuracy_score(y_test, y_pred_logistic))
print("\nClassification Report of Logistic Regression:\n", classification_report(y_test, y_pred_logistic))

print("Accuracy of SVM:", accuracy_score(y_test, y_pred_svc))
print("\nClassification Report of SVM:\n", classification_report(y_test, y_pred_svc))

print("Accuracy of Gradient Boosting:", accuracy_score(y_test, y_pred_gb))
print("\nClassification Report of Gradient Boosting:\n", classification_report(y_test, y_pred_gb))

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


In [None]:
648 => 3240