In [4]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.metrics import matthews_corrcoef
import numpy as np
from sklearn.metrics import matthews_corrcoef, multilabel_confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


In [5]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
#pip install --upgrade scikit-learn

In [7]:
df = pd.read_csv("/content/nine_systems_data.csv")

In [8]:
df

Unnamed: 0,static_text,log_level,project
0,No serialized RegionInfo in,warn,HBase
1,Scanning META | starting at row= | stopping ...,trace,HBase
2,Got exception in closing the meta scanner visitor,debug,HBase
3,Ignoring invalid region for server | ; cell=,error,HBase
4,Added region *,debug,HBase
...,...,...,...
17686,SpnegoClient with userPrincipalName : *,info,elasticsearch
17687,"privileged action exception, with root cause",error,elasticsearch
17688,SimpleKdcLdapServer started.,info,elasticsearch
17689,error occurred while cleaning up after init fa...,debug,elasticsearch


In [9]:
df1 = df.drop(['project'], axis=1)

In [10]:
df1

Unnamed: 0,static_text,log_level
0,No serialized RegionInfo in,warn
1,Scanning META | starting at row= | stopping ...,trace
2,Got exception in closing the meta scanner visitor,debug
3,Ignoring invalid region for server | ; cell=,error
4,Added region *,debug
...,...,...
17686,SpnegoClient with userPrincipalName : *,info
17687,"privileged action exception, with root cause",error
17688,SimpleKdcLdapServer started.,info
17689,error occurred while cleaning up after init fa...,debug


### ***Data Cleaning***

In [11]:
import nltk
from nltk.corpus import stopwords

In [12]:
import string
string.punctuation


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
import re
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [15]:
 def data_preprocessing(static_text):
    static_text=static_text.lower()                  #lowercase
    static_text=nltk.word_tokenize(static_text)  #tokenization


   # remove splecial characters
    x=[]
    for i in static_text:
        if i.isalnum():
            x.append(i)

    #remove number

    static_text=x[:]
    x.clear()

    num = "1234567890"
    for i in static_text:
        if i not in num:
            x.append(i)

    #remove stopwords

    static_text=x[:]
    x.clear()

    for i in static_text:
        if i not in stopwords.words('english'):
            x.append(i)

    #remove punctuation
    static_text=x[:]
    x.clear()

    for i in static_text:
        if i not in string.punctuation:
            x.append(i)


    #Stemming

    static_text= x[:]
    x.clear()

    for i in static_text:
        x.append(ps.stem(i))

    return " ".join(x)


In [16]:
#apply preprocessing
df1['preprocessed_text']=df1['static_text'].apply(lambda x: data_preprocessing(x))

In [17]:
df1['preprocessed_text'].fillna('', inplace=True)
df1 = df1.dropna(subset=['preprocessed_text'])

In [18]:
df1

Unnamed: 0,static_text,log_level,preprocessed_text
0,No serialized RegionInfo in,warn,serial regioninfo
1,Scanning META | starting at row= | stopping ...,trace,scan meta start stop
2,Got exception in closing the meta scanner visitor,debug,got except close meta scanner visitor
3,Ignoring invalid region for server | ; cell=,error,ignor invalid region server
4,Added region *,debug,ad region
...,...,...,...
17686,SpnegoClient with userPrincipalName : *,info,spnegocli userprincipalnam
17687,"privileged action exception, with root cause",error,privileg action except root caus
17688,SimpleKdcLdapServer started.,info,simplekdcldapserv start
17689,error occurred while cleaning up after init fa...,debug,error occur clean init failur simplekdcldapserv


In [19]:
df1['log_level'].value_counts()

debug    4779
info     3903
warn     3755
error    3312
trace    1938
fatal       4
Name: log_level, dtype: int64

In [20]:
df2= df1.copy()

In [21]:
df2

Unnamed: 0,static_text,log_level,preprocessed_text
0,No serialized RegionInfo in,warn,serial regioninfo
1,Scanning META | starting at row= | stopping ...,trace,scan meta start stop
2,Got exception in closing the meta scanner visitor,debug,got except close meta scanner visitor
3,Ignoring invalid region for server | ; cell=,error,ignor invalid region server
4,Added region *,debug,ad region
...,...,...,...
17686,SpnegoClient with userPrincipalName : *,info,spnegocli userprincipalnam
17687,"privileged action exception, with root cause",error,privileg action except root caus
17688,SimpleKdcLdapServer started.,info,simplekdcldapserv start
17689,error occurred while cleaning up after init fa...,debug,error occur clean init failur simplekdcldapserv


# ***Info, Error, Warn***

In [22]:
desired_log_levels = ['error', 'warn', 'info']
df2 = df2[df2['log_level'].str.lower().isin(desired_log_levels)]


In [23]:
df2

Unnamed: 0,static_text,log_level,preprocessed_text
0,No serialized RegionInfo in,warn,serial regioninfo
3,Ignoring invalid region for server | ; cell=,error,ignor invalid region server
5,Added * regions to meta.,info,ad region meta
6,Updated * in hbase:meta,info,updat hbase meta
7,Deleted table | state from META,info,delet tabl state meta
...,...,...,...
17684,not executing watch [*] on this scheduler beca...,info,execut watch schedul paus
17686,SpnegoClient with userPrincipalName : *,info,spnegocli userprincipalnam
17687,"privileged action exception, with root cause",error,privileg action except root caus
17688,SimpleKdcLdapServer started.,info,simplekdcldapserv start


In [24]:
df2['log_level'].value_counts()

info     3903
warn     3755
error    3312
Name: log_level, dtype: int64

In [25]:
df2

Unnamed: 0,static_text,log_level,preprocessed_text
0,No serialized RegionInfo in,warn,serial regioninfo
3,Ignoring invalid region for server | ; cell=,error,ignor invalid region server
5,Added * regions to meta.,info,ad region meta
6,Updated * in hbase:meta,info,updat hbase meta
7,Deleted table | state from META,info,delet tabl state meta
...,...,...,...
17684,not executing watch [*] on this scheduler beca...,info,execut watch schedul paus
17686,SpnegoClient with userPrincipalName : *,info,spnegocli userprincipalnam
17687,"privileged action exception, with root cause",error,privileg action except root caus
17688,SimpleKdcLdapServer started.,info,simplekdcldapserv start


In [26]:
# Create binary columns for 'info', 'warning', and 'error'
df2['info'] = df2['log_level'].apply(lambda x: 1 if x == 'info' else 0)
df2['warning'] = df2['log_level'].apply(lambda x: 1 if x == 'warn' else 0)  # Changed to 'warn'
df2['error'] = df2['log_level'].apply(lambda x: 1 if x == 'error' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['info'] = df2['log_level'].apply(lambda x: 1 if x == 'info' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['error'] = df2['log_level'].apply(lambda x: 1 if x == 'error' else 0)


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df2['preprocessed_text'])
y = df2[['info', 'warning', 'error']]

In [29]:
# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [30]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [31]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC


In [32]:
param_grid = {
    'estimator__n_estimators': [20, 50, 100],
    'estimator__max_depth': [None, 5, 10],
    'estimator__min_samples_split': [2, 5, 10],
}

In [33]:
# Build a multi-label classifier using a RandomForestClassifier
base_classifier = RandomForestClassifier()
classifier = MultiOutputClassifier(base_classifier)

In [34]:
# Perform GridSearchCV for hyperparameter tuning and k-fold cross-validation
grid_search = GridSearchCV(classifier, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [35]:
# Get the best classifier from the hyperparameter tuning
best_classifier = grid_search.best_estimator_

In [36]:
# Evaluate on the validation set
y_val_pred = best_classifier.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
report_val = classification_report(y_val, y_val_pred, target_names=['info', 'warning', 'error'])
print("Validation Set Performance:")
print(f"Accuracy: {accuracy_val}")
print(report_val)

conf_matrices_val = multilabel_confusion_matrix(y_val, y_val_pred)
mcc_val = np.mean([matthews_corrcoef(conf_matrix[:, 1], conf_matrix[:, 1]) for conf_matrix in conf_matrices_val])
print("Matthews Correlation Coefficient (Validation - RandomForest):", mcc_val)

Validation Set Performance:
Accuracy: 0.6153144940747494
              precision    recall  f1-score   support

        info       0.81      0.81      0.81       384
       error       0.81      0.54      0.65       338

   micro avg       0.78      0.63      0.70      1097
   macro avg       0.78      0.63      0.69      1097
weighted avg       0.78      0.63      0.69      1097
 samples avg       0.62      0.63      0.63      1097

Matthews Correlation Coefficient (Validation - RandomForest): 1.0


  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
# Evaluate on the test set
y_test_pred = best_classifier.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
report_test = classification_report(y_test, y_test_pred, target_names=['info', 'warning', 'error'])
print("\nTest Set Performance:")
print(f"Accuracy: {accuracy_test}")
print(report_test)

# Evaluate on the test set for RandomForestClassifier
conf_matrices_test = multilabel_confusion_matrix(y_test, y_test_pred)

mcc_test = np.mean([matthews_corrcoef(conf_matrix[:, 1], conf_matrix[:, 1]) for conf_matrix in conf_matrices_test])
print("Matthews Correlation Coefficient (Test - RandomForest):", mcc_test)




Test Set Performance:
Accuracy: 0.6089334548769371
              precision    recall  f1-score   support

        info       0.80      0.82      0.81       397
       error       0.85      0.50      0.63       343

   micro avg       0.78      0.62      0.69      1097
   macro avg       0.78      0.61      0.68      1097
weighted avg       0.78      0.62      0.68      1097
 samples avg       0.62      0.62      0.62      1097

Matthews Correlation Coefficient (Test - RandomForest): 1.0


  _warn_prf(average, modifier, msg_start, len(result))


In [38]:

# Find indices where RandomForestClassifier predictions are incorrect
incorrect_indices_rf = np.where(~np.all(y_test.values == y_test_pred, axis=1))[0]

# Print the records where RandomForestClassifier failed to predict labels
print("Records where RandomForestClassifier failed to predict labels:")
for idx in incorrect_indices_rf:
    print(f"Index: {idx}")
    print(f"True Labels: {y_test.iloc[idx].values}")
    print(f"Predicted Labels: {y_test_pred[idx]}\n")

Records where RandomForestClassifier failed to predict labels:
Index: 2
True Labels: [0 1 0]
Predicted Labels: [0 0 0]

Index: 3
True Labels: [1 0 0]
Predicted Labels: [0 0 0]

Index: 10
True Labels: [0 1 0]
Predicted Labels: [0 0 1]

Index: 12
True Labels: [0 1 0]
Predicted Labels: [1 0 0]

Index: 13
True Labels: [0 1 0]
Predicted Labels: [0 0 1]

Index: 18
True Labels: [0 0 1]
Predicted Labels: [0 1 0]

Index: 19
True Labels: [0 0 1]
Predicted Labels: [0 1 0]

Index: 20
True Labels: [1 0 0]
Predicted Labels: [0 0 0]

Index: 22
True Labels: [1 0 0]
Predicted Labels: [0 0 1]

Index: 23
True Labels: [0 1 0]
Predicted Labels: [0 0 0]

Index: 25
True Labels: [1 0 0]
Predicted Labels: [0 0 0]

Index: 26
True Labels: [0 1 0]
Predicted Labels: [1 0 0]

Index: 27
True Labels: [0 1 0]
Predicted Labels: [0 0 0]

Index: 28
True Labels: [0 1 0]
Predicted Labels: [0 0 0]

Index: 30
True Labels: [0 0 1]
Predicted Labels: [1 0 0]

Index: 34
True Labels: [0 1 0]
Predicted Labels: [1 0 0]

Index: 35
T

In [39]:
# Print the records where RandomForestClassifier predicted labels correctly
correct_indices_rf = np.where(np.all(y_test.values == y_test_pred, axis=1))[0]
print("\nRecords where RandomForestClassifier predicted labels correctly:")
for idx in correct_indices_rf[:5]:  # Print only the first 5 correct predictions for brevity
    print(f"Index: {idx}")
    print(f"True Labels: {y_test.iloc[idx].values}")
    print(f"Predicted Labels: {y_test_pred[idx]}\n")


Records where RandomForestClassifier predicted labels correctly:
Index: 0
True Labels: [0 1 0]
Predicted Labels: [0 1 0]

Index: 1
True Labels: [1 0 0]
Predicted Labels: [1 0 0]

Index: 4
True Labels: [1 0 0]
Predicted Labels: [1 0 0]

Index: 5
True Labels: [0 0 1]
Predicted Labels: [0 0 1]

Index: 6
True Labels: [0 0 1]
Predicted Labels: [0 0 1]



In [40]:
# Set up the parameter grid for hyperparameter tuning
param_grid_svm = {
    'estimator__C': [0.1, 1, 10, 25],
    'estimator__gamma': ['scale', 'auto',0.1, 1, 10],
}

In [41]:
# Build a multi-label classifier using an SVC (Support Vector Machine)
base_classifier_svm = SVC()
classifier_svm = MultiOutputClassifier(base_classifier_svm)

In [42]:
# Perform GridSearchCV for hyperparameter tuning and k-fold cross-validation
grid_search_svm = GridSearchCV(classifier_svm, param_grid_svm, cv=3, scoring='accuracy')
grid_search_svm.fit(X_train, y_train)

In [43]:
# Get the best classifier from the hyperparameter tuning
best_classifier_svm = grid_search_svm.best_estimator_

In [44]:
# Evaluate on the validation set
y_val_pred_svm = best_classifier_svm.predict(X_val)
accuracy_val_svm = accuracy_score(y_val, y_val_pred_svm)
report_val_svm = classification_report(y_val, y_val_pred_svm, target_names=['info', 'warning', 'error'])
print("Validation Set Performance (SVM):")
print(f"Accuracy: {accuracy_val_svm}")
print(report_val_svm)

conf_matrices_val_svm = multilabel_confusion_matrix(y_val, y_val_pred_svm)
mcc_val_svm = np.mean([matthews_corrcoef(conf_matrix[:, 1], conf_matrix[:, 1]) for conf_matrix in conf_matrices_val_svm])
print("\nMatthews Correlation Coefficient (Validation - SVM):", mcc_val_svm)


Validation Set Performance (SVM):
Accuracy: 0.6426618049225159
              precision    recall  f1-score   support

        info       0.80      0.81      0.81       384
       error       0.81      0.61      0.70       338

   micro avg       0.78      0.66      0.72      1097
   macro avg       0.78      0.66      0.71      1097
weighted avg       0.78      0.66      0.71      1097
 samples avg       0.65      0.66      0.65      1097


Matthews Correlation Coefficient (Validation - SVM): 1.0


  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
# Evaluate on the test set
y_test_pred_svm = best_classifier_svm.predict(X_test)
accuracy_test_svm = accuracy_score(y_test, y_test_pred_svm)
report_test_svm = classification_report(y_test, y_test_pred_svm, target_names=['info', 'warning', 'error'])
print("\nTest Set Performance (SVM):")
print(f"Accuracy: {accuracy_test_svm}")
print(report_test_svm)

conf_matrices_test_svm = multilabel_confusion_matrix(y_test, y_test_pred_svm)
mcc_test_svm = np.mean([matthews_corrcoef(conf_matrix[:, 1], conf_matrix[:, 1]) for conf_matrix in conf_matrices_test_svm])
print("Matthews Correlation Coefficient (Test - SVM):", mcc_test_svm)






Test Set Performance (SVM):
Accuracy: 0.6335460346399271
              precision    recall  f1-score   support

        info       0.80      0.79      0.80       397
       error       0.83      0.56      0.67       343

   micro avg       0.77      0.65      0.71      1097
   macro avg       0.78      0.65      0.70      1097
weighted avg       0.78      0.65      0.70      1097
 samples avg       0.64      0.65      0.65      1097

Matthews Correlation Coefficient (Test - SVM): 1.0


  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
# Find indices where Support Vector Machine (SVM) predictions are incorrect
incorrect_indices_svm = np.where(~np.all(y_test.values == y_test_pred_svm, axis=1))[0]

# Print the records where Support Vector Machine (SVM) failed to predict labels
print("\nRecords where Support Vector Machine (SVM) failed to predict labels:")
for idx in incorrect_indices_svm:
    print(f"Index: {idx}")
    print(f"True Labels: {y_test.iloc[idx].values}")
    print(f"Predicted Labels: {y_test_pred_svm[idx]}\n")


Records where Support Vector Machine (SVM) failed to predict labels:
Index: 2
True Labels: [0 1 0]
Predicted Labels: [0 0 0]

Index: 3
True Labels: [1 0 0]
Predicted Labels: [0 0 0]

Index: 5
True Labels: [0 0 1]
Predicted Labels: [0 1 0]

Index: 10
True Labels: [0 1 0]
Predicted Labels: [0 0 1]

Index: 12
True Labels: [0 1 0]
Predicted Labels: [1 0 0]

Index: 13
True Labels: [0 1 0]
Predicted Labels: [0 0 0]

Index: 16
True Labels: [1 0 0]
Predicted Labels: [0 0 0]

Index: 18
True Labels: [0 0 1]
Predicted Labels: [0 1 0]

Index: 20
True Labels: [1 0 0]
Predicted Labels: [0 0 0]

Index: 22
True Labels: [1 0 0]
Predicted Labels: [0 1 0]

Index: 23
True Labels: [0 1 0]
Predicted Labels: [0 0 0]

Index: 25
True Labels: [1 0 0]
Predicted Labels: [0 0 0]

Index: 26
True Labels: [0 1 0]
Predicted Labels: [1 0 0]

Index: 27
True Labels: [0 1 0]
Predicted Labels: [0 0 0]

Index: 28
True Labels: [0 1 0]
Predicted Labels: [1 0 0]

Index: 34
True Labels: [0 1 0]
Predicted Labels: [1 0 0]

Index

In [47]:
# Print the records where Support Vector Machine (SVM) predicted labels correctly
correct_indices_svm = np.where(np.all(y_test.values == y_test_pred_svm, axis=1))[0]
print("Records where Support Vector Machine (SVM) predicted labels correctly:")
for idx in correct_indices_svm[:5]:  # Print only the first 5 correct predictions for brevity
    print(f"Index: {idx}")
    print(f"True Labels: {y_test.iloc[idx].values}")
    print(f"Predicted Labels: {y_test_pred_svm[idx]}\n")

Records where Support Vector Machine (SVM) predicted labels correctly:
Index: 0
True Labels: [0 1 0]
Predicted Labels: [0 1 0]

Index: 1
True Labels: [1 0 0]
Predicted Labels: [1 0 0]

Index: 4
True Labels: [1 0 0]
Predicted Labels: [1 0 0]

Index: 6
True Labels: [0 0 1]
Predicted Labels: [0 0 1]

Index: 7
True Labels: [1 0 0]
Predicted Labels: [1 0 0]

