# Information Extraction as described in Section 3.1 of Salesforce Paper

## Importing Packages and Files

In [1]:
import os
import json
import pandas as pd
from io import StringIO
import html
from html.parser import HTMLParser
import time
from bs4 import BeautifulSoup
import re

In [None]:
# Build the extractive question-answering pipeline used by the extraction
# cells below. The same checkpoint name is passed for both the model and the
# tokenizer.
# NOTE(review): AutoModelForQuestionAnswering and AutoTokenizer are imported
# but not referenced in the visible cells — confirm whether they are needed.
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
model_name = "deepset/roberta-base-squad2"
roberta_model = pipeline('question-answering', model=model_name, tokenizer=model_name)

In [3]:
# Path of the JSON input file; the extraction cells load it with json.load
# and iterate over its top-level keys.
json_file = './CSO_data/CSO.json'

## Cleaning Functions

## Root Cause (RC) Extraction

In [None]:
# Root-cause extraction: for every record, ask the QA model each root-cause
# question against the record's root-cause description and keep the answer
# with the highest score. Results go to rc_dict (int record key -> answer) and
# the elapsed wall-clock time to rc_time.
t = time.time()
# Read as UTF-8 explicitly so loading does not depend on the platform's
# default text encoding.
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

rc_ques = ['What was the root cause?', 'What was the root cause of the CSO?', 'What caused the CSO?', 'Why did CSO occur?', 'What was the root cause of the incident?', 'What caused the incident?', 'Why did incident occur?']
rc_dict = {}
# NOTE(review): the last two top-level keys are skipped — presumably they are
# not records to process; confirm against the data file.
for cso in list(data.keys())[:-2]:
    max_score = 0
    rc = 'None'

    raw_description = data[cso]['problems'][0]['u_root_cause_description']
    if raw_description is None:
        context = ''
    else:
        # Strip the HTML markup (explicit parser, so the result does not
        # depend on which parsers happen to be installed), then flatten line
        # breaks and non-breaking spaces into plain spaces.
        context = (
            BeautifulSoup(raw_description, 'html.parser')
            .get_text()
            .replace('\n', ' ')
            .replace('\r', ' ')
            .replace('\xa0', ' ')
        )
    # Emptiness is judged on the extracted text rather than on the raw field,
    # so markup-only or whitespace-only descriptions also fall back to the
    # "None" placeholder instead of reaching the model as a blank context.
    if not context.strip():
        context = "None"

    for question in rc_ques:
        ans = roberta_model({'context': context, 'question': question})
        # Strict comparison against a starting score of 0: the first
        # best-scoring question wins, and rc stays 'None' if no answer
        # scores above 0.
        if ans['score'] > max_score:
            rc = ans['answer']
            max_score = ans['score']
    rc_dict[int(cso)] = rc
    print(cso, " done")
rc_time = time.time()-t

In [5]:
# Sanity check: number of entries stored in rc_dict.
len(rc_dict)

180

## Remediation (Rem) Extraction

In [None]:
# Remediation extraction: for every record, ask the QA model each remediation
# question against the record's permanent-solution text and keep the answer
# with the highest score. Results go to rem_dict (int record key -> answer)
# and the elapsed wall-clock time to rem_time.
t1 = time.time()
# Read as UTF-8 explicitly so loading does not depend on the platform's
# default text encoding.
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

rem_ques = ['What was the remediation?', 'What steps were taken?', 'How was the issue resolved?','How was the problem fixed?']
rem_dict = {}
# NOTE(review): the last two top-level keys are skipped — presumably they are
# not records to process; confirm against the data file.
for cso in list(data.keys())[:-2]:
    max_score = 0
    rem = 'None'

    raw_solution = data[cso]['problems'][0]['u_permanent_solution']
    if raw_solution is None:
        context = ''
    else:
        # Parse the HTML once (explicit parser, so the result does not depend
        # on which parsers happen to be installed), then flatten line breaks
        # and non-breaking spaces into plain spaces.
        context = (
            BeautifulSoup(raw_solution, 'html.parser')
            .get_text()
            .replace('\n', ' ')
            .replace('\r', ' ')
            .replace('\xa0', ' ')
        )
    # Whitespace-only text (e.g. a field holding just a non-breaking space)
    # is treated like an empty one, so it falls back to the "None"
    # placeholder instead of reaching the model as a blank context.
    if not context.strip():
        context = "None"

    for question in rem_ques:
        ans = roberta_model({'context': context, 'question': question})
        # Strict comparison against a starting score of 0: the first
        # best-scoring question wins, and rem stays 'None' if no answer
        # scores above 0.
        if ans['score'] > max_score:
            rem = ans['answer']
            max_score = ans['score']
    rem_dict[int(cso)] = rem
    print(cso, " done")
rem_time = time.time()-t1

In [7]:
# Sanity check: number of entries stored in rem_dict.
len(rem_dict)

180

## Symptom Extraction

In [8]:
# Symptom extraction: the symptom of each record is its customer-impact text
# with HTML markup removed, line breaks / non-breaking spaces flattened to
# spaces, and URLs blanked out. No QA model is involved here.
url_pattern = re.compile(r'http\S+')  # compiled once, reused for every record
sym_dict = {}
# NOTE(review): the last two top-level keys are skipped — presumably they are
# not records to process; confirm against the data file.
for i in list(data.keys())[:-2]:
    raw_impact = data[i]['primaryIncident']['u_customer_impacts']
    # A missing (null) field is treated as empty markup instead of being
    # handed to BeautifulSoup, which does not accept None.
    if raw_impact is None:
        raw_impact = ''
    # Explicit parser so the result does not depend on which parsers happen
    # to be installed.
    impact_text = (
        BeautifulSoup(raw_impact, 'html.parser')
        .get_text()
        .replace('\n', ' ')
        .replace('\r', ' ')
        .replace('\xa0', ' ')
    )
    sym_dict[int(i)] = url_pattern.sub(' ', impact_text)

In [9]:
# Sanity check: number of entries stored in sym_dict.
len(sym_dict)

180

## Exporting to CSV Sheet

In [10]:
# Column-oriented accumulator for the export table: one independent, initially
# empty list per output column, in the order the columns should appear.
df_dict = {
    column: []
    for column in ('cso_number', 'symptom', 'root_cause', 'remediations')
}

In [11]:
# Append one row per record to the column lists in df_dict. The record key is
# converted to int because the per-entity dictionaries are keyed by int.
# Each pair maps an output column to the dictionary holding its values.
column_sources = (
    ('symptom', sym_dict),
    ('root_cause', rc_dict),
    ('remediations', rem_dict),
)
for cso in list(data.keys())[:-2]:
    cso_number = int(cso)
    df_dict['cso_number'].append(cso_number)
    for column, source in column_sources:
        df_dict[column].append(source[cso_number])

In [12]:
# Turn the accumulated columns into a DataFrame and write it out as CSV
# (default to_csv options are used, so the DataFrame index is written too).
cso_df = pd.DataFrame(data=df_dict)
cso_df.to_csv(path_or_buf='./CSO_data/CSO_salesforce_extracted_entities.csv')