In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df = pd.read_csv("/content/nine_systems_data.csv")

In [4]:
df

Unnamed: 0,static_text,log_level,project
0,No serialized RegionInfo in,warn,HBase
1,Scanning META | starting at row= | stopping ...,trace,HBase
2,Got exception in closing the meta scanner visitor,debug,HBase
3,Ignoring invalid region for server | ; cell=,error,HBase
4,Added region *,debug,HBase
...,...,...,...
17686,SpnegoClient with userPrincipalName : *,info,elasticsearch
17687,"privileged action exception, with root cause",error,elasticsearch
17688,SimpleKdcLdapServer started.,info,elasticsearch
17689,error occurred while cleaning up after init fa...,debug,elasticsearch


In [5]:
df1 = df.drop(['project'], axis=1)

In [6]:
df1

Unnamed: 0,static_text,log_level
0,No serialized RegionInfo in,warn
1,Scanning META | starting at row= | stopping ...,trace
2,Got exception in closing the meta scanner visitor,debug
3,Ignoring invalid region for server | ; cell=,error
4,Added region *,debug
...,...,...
17686,SpnegoClient with userPrincipalName : *,info
17687,"privileged action exception, with root cause",error
17688,SimpleKdcLdapServer started.,info
17689,error occurred while cleaning up after init fa...,debug


### ***Data Cleaning***

In [7]:
import nltk
from nltk.corpus import stopwords

In [8]:
import string
string.punctuation


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
import re
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [11]:
 def data_preprocessing(static_text):
    static_text=static_text.lower()                  #lowercase
    static_text=nltk.word_tokenize(static_text)  #tokenization


   # remove splecial characters
    x=[]
    for i in static_text:
        if i.isalnum():
            x.append(i)

    #remove number

    static_text=x[:]
    x.clear()

    num = "1234567890"
    for i in static_text:
        if i not in num:
            x.append(i)

    #remove stopwords

    static_text=x[:]
    x.clear()

    for i in static_text:
        if i not in stopwords.words('english'):
            x.append(i)

    #remove punctuation
    static_text=x[:]
    x.clear()

    for i in static_text:
        if i not in string.punctuation:
            x.append(i)


    #Stemming

    static_text= x[:]
    x.clear()

    for i in static_text:
        x.append(ps.stem(i))

    return " ".join(x)


In [12]:
#apply preprocessing
df1['preprocessed_text']=df1['static_text'].apply(lambda x: data_preprocessing(x))

In [13]:
df1['preprocessed_text'].fillna('', inplace=True)
df1 = df1.dropna(subset=['preprocessed_text'])

In [14]:
df1

Unnamed: 0,static_text,log_level,preprocessed_text
0,No serialized RegionInfo in,warn,serial regioninfo
1,Scanning META | starting at row= | stopping ...,trace,scan meta start stop
2,Got exception in closing the meta scanner visitor,debug,got except close meta scanner visitor
3,Ignoring invalid region for server | ; cell=,error,ignor invalid region server
4,Added region *,debug,ad region
...,...,...,...
17686,SpnegoClient with userPrincipalName : *,info,spnegocli userprincipalnam
17687,"privileged action exception, with root cause",error,privileg action except root caus
17688,SimpleKdcLdapServer started.,info,simplekdcldapserv start
17689,error occurred while cleaning up after init fa...,debug,error occur clean init failur simplekdcldapserv


In [15]:
df1['log_level'].value_counts()

debug    4779
info     3903
warn     3755
error    3312
trace    1938
fatal       4
Name: log_level, dtype: int64

In [16]:
df3=df1.copy()

In [17]:
df3

Unnamed: 0,static_text,log_level,preprocessed_text
0,No serialized RegionInfo in,warn,serial regioninfo
1,Scanning META | starting at row= | stopping ...,trace,scan meta start stop
2,Got exception in closing the meta scanner visitor,debug,got except close meta scanner visitor
3,Ignoring invalid region for server | ; cell=,error,ignor invalid region server
4,Added region *,debug,ad region
...,...,...,...
17686,SpnegoClient with userPrincipalName : *,info,spnegocli userprincipalnam
17687,"privileged action exception, with root cause",error,privileg action except root caus
17688,SimpleKdcLdapServer started.,info,simplekdcldapserv start
17689,error occurred while cleaning up after init fa...,debug,error occur clean init failur simplekdcldapserv


In [18]:
df3['log_level'].value_counts()

debug    4779
info     3903
warn     3755
error    3312
trace    1938
fatal       4
Name: log_level, dtype: int64

In [19]:
desired_log_levels = ['warn', 'info']
df3 = df3[df3['log_level'].str.lower().isin(desired_log_levels)]

In [20]:
df3

Unnamed: 0,static_text,log_level,preprocessed_text
0,No serialized RegionInfo in,warn,serial regioninfo
5,Added * regions to meta.,info,ad region meta
6,Updated * in hbase:meta,info,updat hbase meta
7,Deleted table | state from META,info,delet tabl state meta
8,Updated row * with server=,info,updat row
...,...,...,...
17679,caught an interrupted exception when waiting w...,warn,caught interrupt except wait close ticker thread
17684,not executing watch [*] on this scheduler beca...,info,execut watch schedul paus
17686,SpnegoClient with userPrincipalName : *,info,spnegocli userprincipalnam
17688,SimpleKdcLdapServer started.,info,simplekdcldapserv start


In [21]:
df3['log_level'].value_counts()

info    3903
warn    3755
Name: log_level, dtype: int64

In [22]:
df3.to_csv('warn_info.csv', index=False)

In [23]:
df4= df1.copy()

In [24]:
df4

Unnamed: 0,static_text,log_level,preprocessed_text
0,No serialized RegionInfo in,warn,serial regioninfo
1,Scanning META | starting at row= | stopping ...,trace,scan meta start stop
2,Got exception in closing the meta scanner visitor,debug,got except close meta scanner visitor
3,Ignoring invalid region for server | ; cell=,error,ignor invalid region server
4,Added region *,debug,ad region
...,...,...,...
17686,SpnegoClient with userPrincipalName : *,info,spnegocli userprincipalnam
17687,"privileged action exception, with root cause",error,privileg action except root caus
17688,SimpleKdcLdapServer started.,info,simplekdcldapserv start
17689,error occurred while cleaning up after init fa...,debug,error occur clean init failur simplekdcldapserv


In [25]:
df4['log_level'].value_counts()

debug    4779
info     3903
warn     3755
error    3312
trace    1938
fatal       4
Name: log_level, dtype: int64

In [26]:
desired_log_levels = ['warn', 'error']
df4 = df4[df4['log_level'].str.lower().isin(desired_log_levels)]

In [27]:
df4

Unnamed: 0,static_text,log_level,preprocessed_text
0,No serialized RegionInfo in,warn,serial regioninfo
3,Ignoring invalid region for server | ; cell=,error,ignor invalid region server
16,Failed to parse the passed region name:,warn,fail pars pass region name
17,No serialized RegionInfo in,warn,serial regioninfo
19,Ignoring invalid region for server | ; cell=,error,ignor invalid region server
...,...,...,...
17674,failed to parse [*],error,fail pars
17675,could not update watcher stopped status to [*]...,error,could updat watcher stop statu sourc
17679,caught an interrupted exception when waiting w...,warn,caught interrupt except wait close ticker thread
17682,Unexpected failure,error,unexpect failur


In [28]:
df4['log_level'].value_counts()

warn     3755
error    3312
Name: log_level, dtype: int64

In [29]:
df4.to_csv('warn_error.csv', index=False)

In [30]:
df5= df1.copy()

In [31]:
df5

Unnamed: 0,static_text,log_level,preprocessed_text
0,No serialized RegionInfo in,warn,serial regioninfo
1,Scanning META | starting at row= | stopping ...,trace,scan meta start stop
2,Got exception in closing the meta scanner visitor,debug,got except close meta scanner visitor
3,Ignoring invalid region for server | ; cell=,error,ignor invalid region server
4,Added region *,debug,ad region
...,...,...,...
17686,SpnegoClient with userPrincipalName : *,info,spnegocli userprincipalnam
17687,"privileged action exception, with root cause",error,privileg action except root caus
17688,SimpleKdcLdapServer started.,info,simplekdcldapserv start
17689,error occurred while cleaning up after init fa...,debug,error occur clean init failur simplekdcldapserv


In [32]:
df5['log_level'].value_counts()

debug    4779
info     3903
warn     3755
error    3312
trace    1938
fatal       4
Name: log_level, dtype: int64

In [33]:
desired_log_levels = ['info', 'error']
df5 = df5[df5['log_level'].str.lower().isin(desired_log_levels)]

In [34]:
df5['log_level'].value_counts()

info     3903
error    3312
Name: log_level, dtype: int64

In [35]:
df5.to_csv('info_error.csv', index=False)

In [36]:
df2= df1.copy()

In [37]:
df2

Unnamed: 0,static_text,log_level,preprocessed_text
0,No serialized RegionInfo in,warn,serial regioninfo
1,Scanning META | starting at row= | stopping ...,trace,scan meta start stop
2,Got exception in closing the meta scanner visitor,debug,got except close meta scanner visitor
3,Ignoring invalid region for server | ; cell=,error,ignor invalid region server
4,Added region *,debug,ad region
...,...,...,...
17686,SpnegoClient with userPrincipalName : *,info,spnegocli userprincipalnam
17687,"privileged action exception, with root cause",error,privileg action except root caus
17688,SimpleKdcLdapServer started.,info,simplekdcldapserv start
17689,error occurred while cleaning up after init fa...,debug,error occur clean init failur simplekdcldapserv


# ***Info, Error, Warn***

In [38]:
desired_log_levels = ['error', 'warn', 'info']
df2 = df2[df2['log_level'].str.lower().isin(desired_log_levels)]


In [39]:
df2

Unnamed: 0,static_text,log_level,preprocessed_text
0,No serialized RegionInfo in,warn,serial regioninfo
3,Ignoring invalid region for server | ; cell=,error,ignor invalid region server
5,Added * regions to meta.,info,ad region meta
6,Updated * in hbase:meta,info,updat hbase meta
7,Deleted table | state from META,info,delet tabl state meta
...,...,...,...
17684,not executing watch [*] on this scheduler beca...,info,execut watch schedul paus
17686,SpnegoClient with userPrincipalName : *,info,spnegocli userprincipalnam
17687,"privileged action exception, with root cause",error,privileg action except root caus
17688,SimpleKdcLdapServer started.,info,simplekdcldapserv start


In [40]:
df2['log_level'].value_counts()

info     3903
warn     3755
error    3312
Name: log_level, dtype: int64

In [41]:
df2

Unnamed: 0,static_text,log_level,preprocessed_text
0,No serialized RegionInfo in,warn,serial regioninfo
3,Ignoring invalid region for server | ; cell=,error,ignor invalid region server
5,Added * regions to meta.,info,ad region meta
6,Updated * in hbase:meta,info,updat hbase meta
7,Deleted table | state from META,info,delet tabl state meta
...,...,...,...
17684,not executing watch [*] on this scheduler beca...,info,execut watch schedul paus
17686,SpnegoClient with userPrincipalName : *,info,spnegocli userprincipalnam
17687,"privileged action exception, with root cause",error,privileg action except root caus
17688,SimpleKdcLdapServer started.,info,simplekdcldapserv start


In [42]:
# Create binary columns for 'info', 'warning', and 'error'
df2['info'] = df2['log_level'].apply(lambda x: 1 if x == 'info' else 0)
df2['warning'] = df2['log_level'].apply(lambda x: 1 if x == 'warn' else 0)  # Changed to 'warn'
df2['error'] = df2['log_level'].apply(lambda x: 1 if x == 'error' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['info'] = df2['log_level'].apply(lambda x: 1 if x == 'info' else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['error'] = df2['log_level'].apply(lambda x: 1 if x == 'error' else 0)


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df2['preprocessed_text'])
y = df2[['info', 'warning', 'error']]

In [45]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [47]:
# Build a multi-label classifier using a RandomForestClassifier
classifier = MultiOutputClassifier(RandomForestClassifier())
classifier.fit(X_train, y_train)

In [48]:

y_pred = classifier.predict(X_test)

In [49]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['info', 'warning', 'error'])


  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
print(f"Accuracy: {accuracy}")
print(report)

Accuracy: 0.6112123974475843
              precision    recall  f1-score   support

        info       0.81      0.80      0.80       781
       error       0.85      0.50      0.63       681

   micro avg       0.79      0.62      0.70      2194
   macro avg       0.79      0.62      0.69      2194
weighted avg       0.79      0.62      0.69      2194
 samples avg       0.62      0.62      0.62      2194



In [51]:
# Build a multi-label classifier using a RandomForestClassifier
classifier = MultiOutputClassifier(SVC())
classifier.fit(X_train, y_train)

In [52]:
y_pred = classifier.predict(X_test)

In [53]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['info', 'warning', 'error'])

  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
print(f"Accuracy: {accuracy}")
print(report)

Accuracy: 0.5838650865998177
              precision    recall  f1-score   support

        info       0.82      0.78      0.80       781
       error       0.86      0.49      0.62       681

   micro avg       0.80      0.59      0.68      2194
   macro avg       0.80      0.58      0.67      2194
weighted avg       0.80      0.59      0.67      2194
 samples avg       0.59      0.59      0.59      2194



In [55]:
s=df = pd.read_csv("/content/warn_info.csv")