In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
import pandas as pd
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType,DoubleType
spark = SparkSession.builder.getOrCreate()

In [2]:
df_train = spark.read.csv("Token1_Falcon_HC_Model_Data_CSV.csv",inferSchema=False,header=True)
# df_test = spark.read.csv("Token1_Test_Falcon_HC_Model_Data_CSV.csv",inferSchema=True,header=True)

In [3]:
df_train.show(155)

+-----+------------------+----------------------------+------------------+-----------------+------------------------------+--------------------+---------------------------------+------------------------+-------------------+--------------+-----------+-------------------+-------------------------+------------------------+-------------------+-------------------------------------+---------------------+--------------------+--------------------------+--------------------------+---------------------+----------------------------+----------------------------+-----------------------+
|Index|      Solution Key|           Error Information|Source Line Number|Remote IP Address|Program/Method/Function Module|             Package|Name of Method or Function Module|Name of Class or Program|       Message Area|Message number|Expiry Date|  Error Subcategory|         Error Short Text|Application component ID|   Application Area|ABAP Name of Consumer or Server Proxy|Error Log Information|        Sender par

In [4]:
# cols=(  'Source Line Number',
#         'Remote IP Address',
#         'Program/Method/Function Module',
#         'Message Area',
#         'Message number',
#         'Expiry Date',
#         'Application component ID',
#         'Application Area',
#         'Sender party',
#         'Sender interface namespace',
#         'Receiver interface namespace'
#         )
# df_train.drop(*cols)


In [5]:
indexer = StringIndexer(inputCols=[
    'Solution Key',
    'Error Information',
    'Source Line Number',
    'Remote IP Address',
    'Program/Method/Function Module',
    'Package',
    'Name of Method or Function Module',
    'Name of Class or Program',
    'Message Area',
    'Message number',
    'Expiry Date',
    'Error Subcategory',
    'Error Short Text',
    'Application component ID',
    'Application Area',
    'ABAP Name of Consumer or Server Proxy',
    'Error Log Information',
    'Sender party',
    'Sender interface operation',
    'Sender interface namespace',
    'Sender interface name',
    'Receiver interface operation',
    'Receiver interface namespace',
    'Receiver interface name'
],
                        outputCols=[
    'label',
    'Error Information_index',
    'Source Line Number_index',
    'Remote IP Address_index',
    'Program/Method/Function Module_index',
    'Package_index',
    'Name of Method or Function Module_index',
    'Name of Class or Program_index',
    'Message Area_index',
    'Message number_index',
    'Expiry Date_index',
    'Error Subcategory_index',
    'Error Short Text_index',
    'Application component ID_index',
    'Application Area_index',
    'ABAP Name of Consumer or Server Proxy_index',
    'Error Log Information_index',
    'Sender party_index',
    'Sender interface operation_index',
    'Sender interface namespace_index',
    'Sender interface name_index',
    'Receiver interface operation_index',
    'Receiver interface namespace_index',
    'Receiver interface name_index']).setHandleInvalid("keep")
indexed_train = indexer.fit(df_train).transform(df_train)


                        

In [6]:
indexed_train.show(155)

+-----+------------------+----------------------------+------------------+-----------------+------------------------------+--------------------+---------------------------------+------------------------+-------------------+--------------+-----------+-------------------+-------------------------+------------------------+-------------------+-------------------------------------+---------------------+--------------------+--------------------------+--------------------------+---------------------+----------------------------+----------------------------+-----------------------+-----+-----------------------+------------------------+-----------------------+------------------------------------+-------------+---------------------------------------+------------------------------+------------------+--------------------+-----------------+-----------------------+----------------------+------------------------------+----------------------+-------------------------------------------+----------------

In [7]:
indexed_test = indexed_train.where(indexed_train.Index>113)
indexed_test.show(100)

+-----+------------------+----------------------------+------------------+-----------------+------------------------------+--------------------+---------------------------------+------------------------+-------------------+--------------+-----------+-------------------+-------------------------+------------------------+-------------------+-------------------------------------+---------------------+--------------------+--------------------------+--------------------------+---------------------+----------------------------+----------------------------+-----------------------+-----+-----------------------+------------------------+-----------------------+------------------------------------+-------------+---------------------------------------+------------------------------+------------------+--------------------+-----------------+-----------------------+----------------------+------------------------------+----------------------+-------------------------------------------+----------------

In [8]:
from pyspark.ml.feature import VectorAssembler
numericCols = [
    'Error Information_index',
    'Source Line Number_index',
    'Remote IP Address_index',
    'Program/Method/Function Module_index',
    'Package_index',
    'Name of Method or Function Module_index',
    'Name of Class or Program_index',
    'Message Area_index',
    'Message number_index',
    'Expiry Date_index',
    'Error Subcategory_index',
    'Error Short Text_index',
    'Application component ID_index',
    'Application Area_index',
    'ABAP Name of Consumer or Server Proxy_index',
    'Error Log Information_index',
    'Sender party_index',
    'Sender interface operation_index',
    'Sender interface namespace_index',
    'Sender interface name_index',
    'Receiver interface operation_index',
    'Receiver interface namespace_index',
    'Receiver interface name_index'
]
assembler_train = VectorAssembler(inputCols=numericCols, outputCol="features")
indexed_train = assembler_train.transform(indexed_train)
assembler_test = VectorAssembler(inputCols=numericCols, outputCol="features")
indexed_test = assembler_test.transform(indexed_test)

In [9]:
indexed_train.show(111,truncate=False)

+-----+------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------+-----------------+------------------------------+-----------------------------+-----------------------------------+--------------------------------+-------------------+--------------+-----------+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------+-------------------+-------------------------------------+---------------------+----------------------------+----------------------------------------------------------+----------------------------------------------+---------------------------------------------------+------------------------------------------------------------------------+-------------------------------------

In [10]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label').setMaxBins(52)
rfModel = rf.fit(indexed_train)
predictions = rfModel.transform(indexed_test)
predictions.select('Solution Key', 'Error Information', 'label', 'rawPrediction', 'prediction', 'probability').show(300)

+------------------+----------------------------+-----+--------------------+----------+--------------------+
|      Solution Key|           Error Information|label|       rawPrediction|prediction|         probability|
+------------------+----------------------------+-----+--------------------+----------+--------------------+
|SCM_FALCON001_2022|        ['able', 'CM_AP_P...|  3.0|[0.19967847075405...|       3.0|[0.00998392353770...|
|SCM_FALCON002_2022|        ['account', 'date...| 11.0|[0.44408527648161...|      11.0|[0.02220426382408...|
|SCM_FALCON003_2022|        ['-', '.', '>', '...| 31.0|[0.44408527648161...|      28.0|[0.02220426382408...|
|SCM_FALCON004_2022|        ['able', 'CM_AP_P...|  4.0|[0.07454004329004...|       4.0|[0.00372700216450...|
|SCM_FALCON005_2022|        ['able', 'CM_APDL...| 12.0|[0.16875022107580...|      12.0|[0.00843751105379...|
|SCM_FALCON006_2022|        ['able', 'CM_APDL...| 32.0|[0.49964083203716...|      32.0|[0.02498204160185...|
|SCM_FALCON007_2022

In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print('Accuracy:', accuracy)

Accuracy: 0.6938775510204082


In [12]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.pipeline import Pipeline
nb = NaiveBayes(featuresCol='features', labelCol='label')
nb_pipeline = Pipeline(stages=[nb])
nb_model = nb_pipeline.fit(indexed_train)

nb_predictions = nb_model.transform(indexed_test)
nb_predictions.select('Solution Key', 'Error Information','Sender interface name','Receiver interface name','label', 'rawPrediction', 'prediction', 'probability').show(300)

+------------------+----------------------------+---------------------+-----------------------+-----+--------------------+----------+--------------------+
|      Solution Key|           Error Information|Sender interface name|Receiver interface name|label|       rawPrediction|prediction|         probability|
+------------------+----------------------------+---------------------+-----------------------+-----+--------------------+----------+--------------------+
|SCM_FALCON001_2022|        ['able', 'CM_AP_P...| OutboundDeliveryP...|   InboundDeliveryPr...|  3.0|[-105.03041685577...|       3.0|[0.00264799503177...|
|SCM_FALCON002_2022|        ['account', 'date...| LogisticsExecutio...|   InboundDeliveryPr...| 11.0|[-91.873654013907...|      11.0|[5.14756761165205...|
|SCM_FALCON003_2022|        ['-', '.', '>', '...| OutboundDeliveryP...|   InboundDeliveryPr...| 31.0|[-165.34429151845...|      31.0|[6.41177923410936...|
|SCM_FALCON004_2022|        ['able', 'CM_AP_P...| InboundDeliveryPr...

In [13]:
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
accuracy = evaluator.evaluate(nb_predictions)
print('Accuracy:', accuracy)

Accuracy: 0.7755102040816326


In [14]:
import fileinput
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
masterDict={
    'Solution Key':'nan',
    'Error Information':'nan',
    'Source Line Number':'nan',
    'Remote IP Address':'nan',
    'Program/Method/Function Module':'nan',
    'Package':'nan',
    'Name of Method or Function Module':'nan',
    'Name of Class or Program':'nan',
    'Message Area':'nan',
    'Message number':'nan',
    'Expiry Date':'nan',
    'Error Subcategory':'nan',
    'Error Short Text':'nan',
    'Application component ID':'nan',
    'Application Area':'nan',
    'ABAP Name of Consumer or Server Proxy':'nan',
    'Error Log Information':'nan',
    'Sender party':'nan',
    'Sender interface operation':'nan',
    'Sender interface namespace':'nan',
    'Sender interface name':'nan',
    'Receiver interface operation':'nan',
    'Receiver interface namespace':'nan',
    'Receiver interface name':'nan'
}
def tokennizeData(data):
    sw = stopwords.words('english')
    # remove stop words from the string
    X_list = word_tokenize(data)
    X_set = {w for w in X_list if not w in sw}
    return X_set
def fileAnalyzer(filename):

    global masterDict
    for line in fileinput.input(files=filename, encoding="utf-8"):
        if(line.startswith("Receiver interface name:", 0)):
            data = line[25:-1]

            data = data.strip()
            data = data.strip('" \"')

            masterDict['Receiver interface name'] = (
                data if data != "" else 'nan')
            # if(isErrorData):
            #     global receiver
            #     receiver = data if data!="" else 'nan'
        elif(line.startswith("Receiver interface namespace:", 0)):
            data = line[30:-1]
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Receiver interface namespace'] = (
                data if data != "" else 'nan')
        elif(line.startswith("Receiver interface operation:", 0)):
            data = line[30:-1]
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Receiver interface operation'] = (
                data if data != "" else 'nan')
        elif(line.startswith("Sender interface name:", 0)):
            data = line[23:-1]
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Sender interface name'] = (
                data if data != "" else 'nan')
            # if(isErrorData):
            #     global sender
            #     sender=data if data!="" else 'nan'
        elif(line.startswith("Sender interface namespace:", 0)):
            data = line[28:-1]
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Sender interface namespace'] = (
                data if data != "" else 'nan')
        elif(line.startswith("Sender interface operation:", 0)):
            data = line[28:-1]
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Sender interface operation'] = (
                data if data != "" else 'nan')
        elif(line.startswith("Sender party:", 0)):
            data = line[14:-1]
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Sender party'] = (data if data != "" else 'nan')
        elif(line.startswith("Error Log Information:", 0)):
            data = line[23:-1]
            data = data.strip()
            data = data.strip('" \"')

            data = tokennizeData(data)
            data=sorted(data,key=str.casefold)
            masterDict['Error Log Information'] = (
                data if data != "" else 'nan')
        elif(line.startswith("Error information:", 0)):
            data = line[19:-1]
            data = data.strip()
            data = data.strip('" \"')

            data = tokennizeData(data)
            data=sorted(data,key=str.casefold)
            masterDict['Error Information'] = (
                data if data != "" else 'nan')
        elif(line.startswith("ABAP Name of Consumer or Server Proxy:", 0)):
            data = line[40:-1]
            data = data.strip()
            data = data.strip('" \"')


            masterDict['ABAP Name of Consumer or Server Proxy'] = (
                data if data != "" else 'nan')
        elif(line.startswith("Application Area:", 0)):
            data = line[18:-1]
            data = data.strip()
            data = data.strip('" \"')


            masterDict['Application Area'] = (data if data != "" else 'nan')
        elif(line.startswith("Application component ID:", 0)):
            data = line[26:-1]
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Application component ID'] = (
                data if data != "" else 'nan')
        elif(line.startswith("Error Short Text:", 0)):
            data = line[18:-1]
            data = data.strip()
            data = data.strip('" \"')

            data = tokennizeData(data)
            data=sorted(data,key=str.casefold)
            masterDict['Error Short Text'] = (data if data != "" else 'nan')
            # masterDict['PAF Error']=(data if data != "" else 'nan' )
        elif(line.startswith("Error Subcategory:", 0)):
            data = line[19:-1]
            data = data.strip()
            data = data.strip('" \"')

            data = tokennizeData(data)
            data=sorted(data,key=str.casefold)
            masterDict['Error Subcategory'] = (
                data if data != "" else 'nan')
        elif(line.startswith("Expiry Date:", 0)):
            data = line[13:-1]
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Expiry Date'] = (data if data != "" else 'nan')
        elif(line.startswith("Message number:", 0)):
            data = str(line[16:-1])
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Message number'] = (data if data != "" else 'nan')
        elif(line.startswith("Message Area:", 0)):
            data = line[14:-1]
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Message Area'] = (data if data != "" else 'nan')
        elif(line.startswith("Name of Class or Program:", 0)):
            data = line[26:-1]
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Name of Class or Program'] = (
                data if data != "" else 'nan')
        elif(line.startswith("Name of Method or Function Module:", 0)):
            data = line[35:-1]
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Name of Method or Function Module'] = (
                data if data != "" else 'nan')
        elif(line.startswith("Package:", 0)):
            data = line[9:-1]
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Package'] = (data if data != "" else 'nan')
        elif(line.startswith("Program/Method/Function Module:", 0)):
            data = line[32:-1]
            data = data.strip()
            data = data.strip('" \"')
            # print(data)
            masterDict['Program/Method/Function Module'] = (
                data if data != "" else 'nan')
        elif(line.startswith("Remote IP Address:", 0)):
            data = line[19:-1]
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Remote IP Address'] = (
                data if data != "" else 'nan')
        elif(line.startswith("Source Line Number:", 0)):
            # print(line)
            data = (line[20:])
            # print(data)
            data = data.strip()
            data = data.strip('" \"')

            masterDict['Source Line Number'] = (
                data if data != "" else 'nan')
    fileinput.close()



fileAnalyzer('errorInput.txt')


In [15]:
print(masterDict)

{'Solution Key': 'nan', 'Error Information': ['account', 'date', 'User', 'validity'], 'Source Line Number': '0', 'Remote IP Address': 'nan', 'Program/Method/Function Module': 'nan', 'Package': 'nan', 'Name of Method or Function Module': 'nan', 'Name of Class or Program': 'nan', 'Message Area': 'SRT_HC', 'Message number': '007', 'Expiry Date': '0000-00-00', 'Error Subcategory': [], 'Error Short Text': ['Application', 'Commit', 'Error', 'execution', 'Work'], 'Application component ID': 'nan', 'Application Area': 'SRT_HC', 'ABAP Name of Consumer or Server Proxy': 'nan', 'Error Log Information': [], 'Sender party': 'LogisticsExecutionControl', 'Sender interface operation': 'RequestDeliveryFulfilment', 'Sender interface namespace': 'http://sap.com/xi/AP/SupplyChainControl/Global', 'Sender interface name': 'LogisticsExecutionControlFulfilmentOut', 'Receiver interface operation': 'MaintainInboundDeliveryRequest', 'Receiver interface namespace': 'http://sap.com/xi/AP/LogisticsExecution/Global'

In [16]:
import pandas as pd
df = pd.DataFrame.from_dict(masterDict, orient='index')
df = df.transpose()
df.to_csv('my_file.csv', index=False, header=True)

In [17]:
df_to_pred = spark.read.csv("my_file.csv",inferSchema=False,header=True)
df_to_pred.show()

+------------+--------------------+------------------+-----------------+------------------------------+-------+---------------------------------+------------------------+------------+--------------+-----------+-----------------+--------------------+------------------------+----------------+-------------------------------------+---------------------+--------------------+--------------------------+--------------------------+---------------------+----------------------------+----------------------------+-----------------------+
|Solution Key|   Error Information|Source Line Number|Remote IP Address|Program/Method/Function Module|Package|Name of Method or Function Module|Name of Class or Program|Message Area|Message number|Expiry Date|Error Subcategory|    Error Short Text|Application component ID|Application Area|ABAP Name of Consumer or Server Proxy|Error Log Information|        Sender party|Sender interface operation|Sender interface namespace|Sender interface name|Receiver interface op

In [18]:
indexed_to_pred = indexer.fit(df_train).transform(df_to_pred)
indexed_to_pred.show()

+------------+--------------------+------------------+-----------------+------------------------------+-------+---------------------------------+------------------------+------------+--------------+-----------+-----------------+--------------------+------------------------+----------------+-------------------------------------+---------------------+--------------------+--------------------------+--------------------------+---------------------+----------------------------+----------------------------+-----------------------+-----+-----------------------+------------------------+-----------------------+------------------------------------+-------------+---------------------------------------+------------------------------+------------------+--------------------+-----------------+-----------------------+----------------------+------------------------------+----------------------+-------------------------------------------+---------------------------+------------------+-------------------

In [19]:
assembler_to_pred = VectorAssembler(inputCols=numericCols, outputCol="features")
indexed_to_pred = assembler_to_pred.transform(indexed_to_pred)


In [20]:
indexed_to_pred.show(truncate=False)

+------------+---------------------------------------+------------------+-----------------+------------------------------+-------+---------------------------------+------------------------+------------+--------------+-----------+-----------------+-------------------------------------------------------+------------------------+----------------+-------------------------------------+---------------------+-------------------------+--------------------------+----------------------------------------------+--------------------------------------+------------------------------+----------------------------------------------+-------------------------------------+-----+-----------------------+------------------------+-----------------------+------------------------------------+-------------+---------------------------------------+------------------------------+------------------+--------------------+-----------------+-----------------------+----------------------+------------------------------+---

In [21]:
# nb_predictions = nb_model.predict(indexed_to_pred)
nb_predictions=nb_model.transform(indexed_to_pred)
# nb_predictions.columns
nb_predictions.select('Solution Key', 'Error Information','Sender interface name','Receiver interface name','label', 'rawPrediction', 'prediction', 'probability').show(300,truncate=False)

+------------+---------------------------------------+--------------------------------------+-------------------------------------+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [23]:
from pyspark.sql.functions import desc
# nb_predictions = nb_model.predict(indexed_to_pred)
sorted_pred = nb_predictions.sort(desc("probability",))
sorted_pred.show(truncate=False)



TypeError: Invalid argument, not a string or column: ('probability', 'label') of type <class 'tuple'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

In [None]:
# top3_label = sorted_pred.select("label","probability")\
#                         .limit(3)\
#                         .rdd\
#                         .map(lambda x:(x[0],round(float(x[1][1]),2)*100))\
#                         .collect()
                        

# nb_predictions=nb_model.predict_proba(indexed_to_pred) 
# ff = clf_pf.predict_proba(test) 
sorted_prob=nb_predictions.collect()[0][50]
# print(sorted_prob[27])
i=0
for x in sorted_prob:
  percentage=round(sorted_prob[i],2)*100
#   print("i={}  percentage={}%",(i+1,percentage))
  print("index : %2d, percentage : %5.2f" % (i, percentage))
  i=i+1
# for i,(label,probability) in enumerate(top3_label):
#     print("Rank {}: Label = {}, probability={}%".format(i+1,label,probability))

index :  0, percentage :  0.00
index :  1, percentage :  0.00
index :  2, percentage :  0.00
index :  3, percentage :  0.00
index :  4, percentage :  0.00
index :  5, percentage :  0.00
index :  6, percentage :  0.00
index :  7, percentage :  0.00
index :  8, percentage :  0.00
index :  9, percentage :  0.00
index : 10, percentage :  0.00
index : 11, percentage : 79.00
index : 12, percentage :  0.00
index : 13, percentage :  0.00
index : 14, percentage :  0.00
index : 15, percentage :  0.00
index : 16, percentage :  0.00
index : 17, percentage :  0.00
index : 18, percentage :  0.00
index : 19, percentage :  0.00
index : 20, percentage : 21.00
index : 21, percentage :  0.00
index : 22, percentage :  0.00
index : 23, percentage :  0.00
index : 24, percentage :  0.00
index : 25, percentage :  0.00
index : 26, percentage :  0.00
index : 27, percentage :  0.00
index : 28, percentage :  0.00
index : 29, percentage :  0.00
index : 30, percentage :  0.00
index : 31, percentage :  0.00
index : 

In [None]:
sorted_prob=nb_predictions.collect()[0][50]
def sort_index(lst, rev=True):
    index = range(len(lst))
    s = sorted(index, reverse=rev, key=lambda i: lst[i])
    return s[:3]
res=sort_index(sorted_prob)
print(res)


[11, 20, 41]


In [None]:
pred=nb_predictions.collect()[0][51]
print(pred)
result=indexed_train.filter(indexed_train.label==pred).collect()[0][1]
print(result)

11.0
SCM_FALCON002_2022
