# Data Preparation

The datasets provided for procedure detection contain content from:
1. LibreOffice Help - help.libreoffice.org
2. Ubuntu Help - help.ubuntu.com

In [3]:
import pandas as pd
import re
import html
import os

# Relative directory that holds the input CSVs and receives every file this notebook writes.
DATA_PATH = 'data'

In [4]:
# Raw LibreOffice help lists, straight from the CSV.
libre_df = pd.read_csv(os.path.join(DATA_PATH, "libre_help_lists.csv"))

# Peek at the first five rows.
libre_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Page_title,Subtitle,Lists,Context,List_tags
0,0,creating numbered or bulleted lists as you type,to enable automatic numbering and bulleting,choose tools - autocorrect - autocorrect optio...,libreoffice can automatically apply numbering ...,1
1,1,creating numbered or bulleted lists as you type,to create a numbered or bulleted list while yo...,"type 1., i., or i. to start a numbered list. t...",,1
2,2,changing page background,to change the page background,to only apply the new page style to a single p...,next style,1
3,3,changing page background,to use different page backgrounds in the same ...,to change the background of the current page o...,style,1
4,4,changing page background,to change the page background,choose view - styles . <st> click the page sty...,libreoffice uses page styles to specify the ba...,1


In [5]:
# Raw Ubuntu help lists, straight from the CSV.
ubuntu_df = pd.read_csv(os.path.join(DATA_PATH, "ubuntu_help_lists.csv"))

# Peek at the first five rows.
ubuntu_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Page_title,Subtitle,Lists,Context,List_tags
0,0,How do I calibrate my camera?,More Information,Calibration,More Information,0
1,1,How do I calibrate my camera?,See Also,How do I calibrate my printer? — Calibrating y...,See Also,0
2,3,Control location services,Turn off the geolocation features of your desktop,Open the Activities overview and\r\n star...,Turn off the geolocation features of your desktop,1
3,4,Files list columns preferences,Files list columns preferences,The first character is the file type. - means ...,drwxrw-r--,0
4,5,Files list columns preferences,Files list columns preferences,"r : readable, meaning that you can open the fi...",Each permission has the following meanings:,0


In [6]:
# Number of rows in each raw dataset.
libre_df.shape[0], ubuntu_df.shape[0]

(550, 457)

Let's clean up the text content in both files. We are mostly interested in the columns 'Subtitle' and 'Lists'. We will remove extra and unwanted characters from the text, and we will also drop any list that is 50 characters or shorter.
 

In [7]:
def fixup(x):
    """Return a cleaned copy of a text cell.

    Steps, in order:
      1. decode HTML entities (``&amp;``, ``&lt;``, ``&#39;``, ``&nbsp;`` ...);
      2. map typographic quotes to ``'`` and non-breaking spaces to a space;
      3. delete the characters ``{ } [ ] / & * % $ # @ '``;
      4. strip leading and trailing whitespace.

    Parameters
    ----------
    x : object
        Cell value. Anything that is not a ``str`` (e.g. the NaN pandas uses
        for a missing cell) is treated as empty.

    Returns
    -------
    str
        The cleaned text, or ``''`` for non-string input.
    """
    if not isinstance(x, str):
        return ''

    # Entities must be decoded *before* '&' and '#' are deleted below;
    # otherwise '&lt;' degrades to 'lt;' and nothing is ever unescaped.
    x = html.unescape(x)
    x = (
        x.replace('\u2019', "'")
        .replace('\xa0', ' ')
        .replace('\u201c', "'")
        .replace('\u201d', "'")
    )
    x = re.sub(r"[{\[\]}/&*%$#@\']", "", x)
    # Strip last so whitespace exposed by the steps above is trimmed as well.
    return x.strip()

def fixup_lists(x):
    """Clean a 'Lists' cell and reject it when too little text remains.

    Parameters
    ----------
    x : object
        Cell value; non-string values (e.g. NaN) are rejected.

    Returns
    -------
    str or None
        The text cleaned by ``fixup`` when it is longer than 50 characters,
        otherwise ``None`` so the row can be removed with ``dropna``.
    """
    cleaned = fixup(x)
    # Measure and return the *cleaned* text: previously the raw value was both
    # measured and returned, so the clean-up never reached the 'Lists' column.
    if len(cleaned) > 50:
        return cleaned
    return None

In [8]:
# Sanity check: "Calibration" is well under the 50-character cut-off, so
# fixup_lists returns None and the cell shows no output.
fixup_lists("Calibration")

In [9]:
# Working frames with three columns each: the cleaned subtitle, the list text
# filtered through fixup_lists (rejected entries become None), and the label.
libre_lists = pd.DataFrame({
    'Subtitle': libre_df['Subtitle'].apply(fixup),
    'Lists': libre_df['Lists'].apply(fixup_lists),
    'Labels': libre_df['List_tags'],
})
ubuntu_lists = pd.DataFrame({
    'Subtitle': ubuntu_df['Subtitle'].apply(fixup),
    'Lists': ubuntu_df['Lists'].apply(fixup_lists),
    'Labels': ubuntu_df['List_tags'],
})

In [10]:
# Row counts are still the raw ones: rejected lists are None but not yet dropped.
libre_lists.shape[0], ubuntu_lists.shape[0]

(550, 457)

In [11]:
# Remove, in place, every row whose 'Lists' entry is missing.
for lists_frame in (libre_lists, ubuntu_lists):
    lists_frame.dropna(subset=['Lists'], inplace=True)

In [12]:
# Row counts after dropping rows with a missing 'Lists' value.
libre_lists.shape[0], ubuntu_lists.shape[0]

(546, 388)

In [13]:
# Keep only the first occurrence of each distinct 'Lists' text (in place).
for lists_frame in (libre_lists, ubuntu_lists):
    lists_frame.drop_duplicates(subset=['Lists'], inplace=True)

In [14]:
# Row counts after removing duplicated 'Lists' texts.
libre_lists.shape[0], ubuntu_lists.shape[0]

(546, 301)

In [15]:
# Non-null entries per label in the LibreOffice data.
libre_lists.groupby('Labels').count()

Unnamed: 0_level_0,Subtitle,Lists
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1
0,137,137
1,409,409


In [16]:
# Non-null entries per label in the Ubuntu data.
ubuntu_lists.groupby('Labels').count()

Unnamed: 0_level_0,Subtitle,Lists
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1
0,168,168
1,133,133


In [17]:
# Stack both sources into one frame; ignore_index=True renumbers the rows
# 0..n-1 instead of keeping each source's own (gappy) index.
list_data = pd.concat(
    [libre_lists, ubuntu_lists],
    ignore_index=True,
)
list_data.shape[0]

847

In [18]:
# Non-null entries per label in the combined data.
list_data.groupby('Labels').count()

Unnamed: 0_level_0,Subtitle,Lists
Labels,Unnamed: 1_level_1,Unnamed: 2_level_1
0,305,305
1,542,542


Note that the data is imbalanced (about 64% of the lists are labelled 1), so we need to keep this in mind when analysing our results (accuracy, etc.).

In [19]:
# Share of rows labelled 1, i.e. the accuracy obtained on this data by always
# predicting label 1.
baseline = len(list_data.loc[list_data['Labels'].eq(1)]) / len(list_data)
baseline

0.6399055489964581

In [20]:
# Persist the combined data. The row index is written too (pandas default),
# so the file starts with an unnamed index column.
list_data.to_csv(path_or_buf=os.path.join(DATA_PATH, "procedure_data.csv"))

Let us divide the data into train and test splits.

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    list_data['Lists'],
    list_data['Labels'],
    test_size=0.2,
    random_state=42,
)

# Re-assemble text and label of each split into a two-column frame.
train_df = pd.DataFrame(dict(Lists=X_train, Labels=y_train))
test_df = pd.DataFrame(dict(Lists=X_test, Labels=y_test))

# Save both splits as CSV (row index included).
train_df.to_csv(path_or_buf=os.path.join(DATA_PATH, 'procedure_train_data.csv'))
test_df.to_csv(path_or_buf=os.path.join(DATA_PATH, 'procedure_test_data.csv'))

In [22]:
# Number of rows in the train and test splits.
train_df.shape[0], test_df.shape[0]

(677, 170)

In [23]:
# One "__label__<y> <text>" record per example and per line (presumably the
# fastText supervised layout; confirm against the consumer of these files).
# The list text can contain embedded line breaks, which would spread a single
# record over several lines and leave the continuation lines without a label,
# so every run of whitespace is collapsed to a single space first.
train_lines = [
    "__label__" + str(y) + ' ' + ' '.join(x.split())
    for y, x in zip(y_train, X_train)
]
test_lines = [
    "__label__" + str(y) + ' ' + ' '.join(x.split())
    for y, x in zip(y_test, X_test)
]

with open(os.path.join(DATA_PATH, 'procedure_data.train.txt'), 'w', encoding="utf-8") as f:
    f.writelines(line + "\n" for line in train_lines)
with open(os.path.join(DATA_PATH, 'procedure_data.test.txt'), 'w', encoding="utf-8") as f:
    f.writelines(line + "\n" for line in test_lines)

In [24]:
# Full data set as one "__label__<y> <text>" record per line. Whitespace runs
# (including line breaks embedded in the list text) are collapsed to a single
# space so that each record stays on exactly one line.
with open(os.path.join(DATA_PATH, 'procedure_data.txt'), 'w', encoding="utf-8") as f:
    for x, y in zip(list_data['Lists'], list_data['Labels']):
        f.write("__label__" + str(y) + ' ' + ' '.join(x.split()) + "\n")