# Information Extraction as described in Section 3.1 of Salesforce Paper

## Importing Packages and Files

In [1]:
import os
import json
import pandas as pd
from io import StringIO
import html
from html.parser import HTMLParser
import time
from bs4 import BeautifulSoup
import re

In [2]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
model_name = "deepset/roberta-base-squad2"
roberta_model = pipeline('question-answering', model=model_name, tokenizer=model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [3]:
json_file = './CSO_data/CSO_all_scraped_Sign.json'

## Cleaning Functions

## RC Extraction

In [4]:
t = time.time()
with open(json_file, 'r') as f:
    data = json.load(f)

rc_ques = ['What was the root cause?', 'What was the root cause of the CSO?', 'What caused the CSO?', 'Why did CSO occur?', 'What was the root cause of the incident?', 'What caused the incident?', 'Why did incident occur?']
rc_dict = {}
for cso in list(data.keys())[:-2]:
    max_score = 0
    rc = 'None'
    if (data[cso]['problems'][0]['u_root_cause_description']== None) or (data[cso]['problems'][0]['u_root_cause_description']==''):
        x = "None"
    else:
        x = BeautifulSoup(data[cso]['problems'][0]['u_root_cause_description']).get_text().replace('\n',' ').replace('\r',' ').replace('\xa0',' ')

    trial = {}
    trial['context'] = x

    for i in rc_ques:
        trial['question'] = i
        ans = roberta_model(trial)
        if ans['score']>max_score:
            rc = ans['answer']
            max_score = ans['score']
    rc_dict[int(cso)] = rc
    print(cso, " done")
rc_time = time.time()-t

17536  done
17522  done
17510  done
17401  done
17368  done
17088  done
16755  done
16742  done
16649  done
16644  done
16624  done
16589  done
16571  done
16536  done
16532  done
16516  done
16480  done
16425  done
16280  done
16263  done
15934  done
15873  done
15869  done
15849  done
15839  done
15763  done
15739  done
15704  done
15572  done
15560  done
15558  done
15539  done
15493  done
15484  done
15463  done
15461  done
15334  done
15221  done
15215  done
15126  done
15019  done
15005  done
14965  done
14961  done
14947  done
14932  done
14902  done
14886  done
14865  done
14804  done
14797  done
14767  done
14757  done
14722  done
14710  done
14573  done
14452  done
14449  done
14385  done
14379  done
14377  done
14224  done
14073  done
14055  done
13850  done
13840  done
13808  done
13738  done
13678  done
13662  done
13660  done
13630  done
13558  done
12826  done
12686  done
12626  done
12615  done
12606  done
12187  done
12088  done
12078  done
12070  done
12052  done
1204



6326  done
6323  done
6288  done
6215  done
6186  done
6165  done
6147  done
6119  done
6065  done
6025  done


In [5]:
len(rc_dict)

180

## Rem Extraction

In [6]:
t1 = time.time()
with open(json_file, 'r') as f:
    data = json.load(f)

rem_ques = ['What was the remediation?', 'What steps were taken?', 'How was the issue resolved?','How was the problem fixed?']
rem_dict = {}
for cso in list(data.keys())[:-2]:
    max_score = 0
    rem = 'None'
    if (data[cso]['problems'][0]['u_permanent_solution'] == None) or (BeautifulSoup(data[cso]['problems'][0]['u_permanent_solution']).get_text() == ''):
        x = "None"
    else:
        x = BeautifulSoup(data[cso]['problems'][0]['u_permanent_solution']).get_text().replace('\n',' ').replace('\r',' ').replace('\xa0',' ')

    trial = {}
    trial['context'] = x

    for i in rem_ques:
        trial['question'] = i
        ans = roberta_model(trial)
        if ans['score']>max_score:
            rem = ans['answer']
            max_score = ans['score']
    rem_dict[int(cso)] = rem
    print(cso, " done")
rem_time = time.time()-t1

17536  done
17522  done
17510  done
17401  done
17368  done
17088  done
16755  done
16742  done
16649  done
16644  done
16624  done
16589  done
16571  done
16536  done
16532  done
16516  done
16480  done
16425  done
16280  done
16263  done
15934  done
15873  done
15869  done
15849  done
15839  done
15763  done
15739  done
15704  done
15572  done
15560  done
15558  done
15539  done
15493  done
15484  done
15463  done
15461  done
15334  done
15221  done
15215  done
15126  done
15019  done
15005  done
14965  done
14961  done
14947  done
14932  done
14902  done
14886  done
14865  done
14804  done
14797  done
14767  done
14757  done
14722  done
14710  done
14573  done
14452  done
14449  done
14385  done
14379  done
14377  done
14224  done
14073  done
14055  done
13850  done
13840  done
13808  done
13738  done
13678  done
13662  done
13660  done
13630  done
13558  done
12826  done
12686  done
12626  done
12615  done
12606  done
12187  done
12088  done
12078  done
12070  done
12052  done
1204

In [7]:
len(rem_dict)

180

## Symptom Extraction

In [8]:
sym_dict = {}
for i in list(data.keys())[:-2]:
    sym_dict[int(i)] = re.sub(r'http\S+', ' ', BeautifulSoup(data[i]['primaryIncident']['u_customer_impacts']).get_text().replace('\n',' ').replace('\r',' ').replace('\xa0',' '))

In [9]:
len(sym_dict)

180

## Exporting to CSV Sheet

In [10]:
df_dict = {
    'cso_number':[],
    'symptom':[],
    'root_cause':[],
    'remediations':[]
}

In [11]:
for cso in list(data.keys())[:-2]:
    cso_number = int(cso)
    df_dict['cso_number'].append(cso_number)
    df_dict['symptom'].append(sym_dict[cso_number])
    df_dict['root_cause'].append(rc_dict[cso_number])
    df_dict['remediations'].append(rem_dict[cso_number])

In [12]:
cso_df = pd.DataFrame(df_dict)
cso_df.to_csv('./CSO_data/CSO_salesforce_extracted_entities.csv')

In [13]:
cso_df

Unnamed: 0,cso_number,symptom,root_cause,remediations
0,17536,Between 2022-05-25 at 22:55 UTC and 2022-05-26...,implemented a change to get email connected in...,
1,17522,Starting on 2022-05-24 between 11:29 UTC to 11...,canary deployment,
2,17510,Starting on 2022-05-18 08:30 UTC and ending at...,misstep in converting from the sign v5 to v6 apis,1. review the manual testing procedure with te...
3,17401,"On 2022-05-06, between 07:19 UTC and 07:36 UTC...",Termination of box from AWS console,Validate alerting for nameservice tier
4,17368,"On 2022-04-28 between 08:00 UTC and 16:12 UTC,...",known behavior of the JMS message clients,message handling
...,...,...,...,...
175,6165,"Between 3:59 AM PT and 4:02 AM PT, the Adobe S...",execution of a database ALTER statement,Add a lock timeout to ALTER ENUM statements
176,6147,"Between 2:53 PM and 4:13 PM PT, Adobe Sign Mic...",degradation,ASR
177,6119,"Between 13:37 PT and 22:15 PT, 214 requests o...",periodic smoke tests were failing,autoscaling
178,6065,The issue causing impact has been resolved and...,Outage,No permanent solution being implemented
