# Data Preprocessing  
(1) Build dataframe from the json file.  
(2) Remove the two data points from the dataframe whose goals contain non-English words.  
(3) Remove the methods 1926747_3, 2191502_0 and 985548_2 together with their steps from the dataframe.  
(4) Write the preprocessed dataframe object to a .p file for later usage.

In [1]:
import os

import dill as pickle
import pandas as pd

import utils

### (1) Build dataframe from the json file.

In [3]:
# Read the WikiHow text data from the json file (requested with mode='uncased').
articles = utils.load_text(
    path='../data/preprocessing/WikihowText_data.json',
    mode='uncased',
)

# Normalize the loaded json records into a flat dataframe.
articles_df = pd.json_normalize(data=articles)

The default path for the textual data is:
    '/mount/studenten/arbeitsdaten-studenten1/shencg/BA/models/VGSI/WikihowText_data.json'
Please adapt it if your textual data is stored under a different path.



In [3]:
# Inspect the freshly built dataframe.
articles_df

Unnamed: 0,file_id,goal,goal_description,category_hierarchy,methods
0,10000798,how to hang an ironing board,for people who like to have an organized and c...,"[home and garden, housekeeping, home organizat...",[{'name': 'hanging an ironing board over a doo...
1,10000832,how to plant lavender in pots,lavender plants are beautiful and fragrant pla...,"[home and garden, gardening, indoor and patio ...",[{'name': 'setting up the right growing condit...
2,10002000,how to deal with talkative students,delivering the day’s lesson to your class can ...,"[education and communications, teaching, class...","[{'name': 'redirecting talkative students', 's..."
3,100021,how to make a dress,ever seen a really gorgeous dress on the runwa...,"[personal care and style, clothing, clothes by...","[{'name': 'starting your dress', 'steps': [{'h..."
4,1000266,how to avoid dangerous dog toys,,"[pets and animals, dogs, dog toys and play]","[{'name': 'finding the right toys', 'steps': [..."
...,...,...,...,...,...
53184,9997212,how to obtain a turkish visa,,"[travel, destinations, asia travel]",[{'name': 'following general application proce...
53185,9997262,how to relieve a hypertension headache,,"[health, cardiovascular system health, blood h...",[{'name': 'responding immediately to a headach...
53186,9997619,how to hide electrical outlets,"although essential, electrical outlets can be ...","[home and garden, home maintenance, electrical...","[{'name': 'covering outlets', 'steps': [{'head..."
53187,9997744,how to hang a volleyball net,"whether played indoors or outdoors, volleyball...","[sports and fitness, team sports, volleyball]","[{'name': 'securing indoor posts', 'steps': [{..."


In [4]:
# Look up the article with file_id '385799' (its goal contains the non-English word "Éclairs").
articles_df.loc[articles_df['file_id'].eq('385799')]

Unnamed: 0,file_id,goal,goal_description,category_hierarchy,methods
27502,385799,how to make Éclairs,,"[food and entertaining, recipes, baking, pastr...","[{'name': 'making the custard', 'steps': [{'he..."


In [5]:
# Look up the article with file_id '5323060' (its goal contains the non-English words "Ălăk dolăk").
articles_df.loc[articles_df['file_id'].eq('5323060')]

Unnamed: 0,file_id,goal,goal_description,category_hierarchy,methods
33281,5323060,how to play alak dolak (Ălăk dolăk),alak dolak (spelled in iranian as Ălăk dolăk) ...,"[hobbies and crafts, games]","[{'name': 'preparation', 'steps': [{'headline'..."


### (2) Remove the two data points from the dataframe whose goals contain non-English words, as the authors did in their experiments.

In [9]:
# Remove the two data points from the dataframe whose goals contain non-English words, i.e.
# (1) file_id 385799 (index 27502 in the freshly loaded dataframe), goal "how to make Éclairs",
# (2) file_id 5323060 (index 33281 in the freshly loaded dataframe), goal "how to play alak dolak (Ălăk dolăk)".
#
# The rows are selected by file_id instead of hard-coded index labels, and the result is
# re-assigned instead of dropped in place. Re-running this cell is therefore a no-op,
# whereas dropping already-removed index labels would raise a KeyError.

non_english_file_ids = ['385799', '5323060']
articles_df = articles_df[~articles_df['file_id'].isin(non_english_file_ids)].copy()
len(articles_df)

53187

In [10]:
# Verify that the article with file_id '385799' is no longer in the dataframe (expect no rows).
articles_df.loc[articles_df['file_id'].eq('385799')]

Unnamed: 0,file_id,goal,goal_description,category_hierarchy,methods


In [11]:
# Verify that the article with file_id '5323060' is no longer in the dataframe (expect no rows).
articles_df.loc[articles_df['file_id'].eq('5323060')]

Unnamed: 0,file_id,goal,goal_description,category_hierarchy,methods


### (3) Remove the methods with method_id '1926747_3', '2191502_0' and '985548_2' together with their steps from the dataframe to ensure the dependency parser raises no errors.

In [13]:
# Collect the step headlines that consist of exactly one character such as ".", ",", "?" etc.
# State before removing.
# Maps file_id -> [method_id, step_id, headline]; if one file has several such headlines,
# only the last one encountered is kept.

fid2wrongheadline = {
    fid: [method['method_id'], step['step_id'], step['headline']]
    for fid, methods in zip(articles_df['file_id'], articles_df['methods'])
    for method in methods
    for step in method['steps']
    if len(step['headline']) == 1
}

fid2wrongheadline

{'1926747': ['1926747_3', '1926747_3_0', '.'],
 '2191502': ['2191502_0', '2191502_0_0', '.'],
 '985548': ['985548_2', '985548_2_0', '.']}

In [14]:
# Remove the methods with method_id '1926747_3', '2191502_0' and '985548_2' together with their steps
# from the dataframe.
#
# Each methods list is rebuilt with a comprehension and written back through slice assignment,
# which mutates the list objects stored in the dataframe in place. Calling list.remove() while
# iterating over the same list would make the iterator skip the element that follows every
# removed one, so adjacent matching methods could be missed.

method_ids_to_remove = {'1926747_3', '2191502_0', '985548_2'}
for methods in articles_df['methods']:
    methods[:] = [method for method in methods
                  if method['method_id'] not in method_ids_to_remove]

In [15]:
# Collect the step headlines that consist of exactly one character such as ".", ",", "?" etc.
# State after removing (an empty dict means no such headline is left).
# Maps file_id -> [method_id, step_id, headline]; if one file has several such headlines,
# only the last one encountered is kept.

fid2wrongheadline = {
    fid: [method['method_id'], step['step_id'], step['headline']]
    for fid, methods in zip(articles_df['file_id'], articles_df['methods'])
    for method in methods
    for step in method['steps']
    if len(step['headline']) == 1
}

fid2wrongheadline

{}

In [16]:
# Number of articles (rows) currently in the dataframe.
len(articles_df)

53187

### (4) Write the preprocessed dataframe object to a pickle file for subsequent usage.

In [17]:
# Write articles_df to articles_df.p file.
# The output directory is created first so the cell also works when './output' does not
# exist yet; open() in 'wb' mode does not create missing parent directories.

os.makedirs('./output', exist_ok=True)
with open('./output/articles_df.p', 'wb') as file:
    pickle.dump(articles_df, file)

In [18]:
# Final look at the preprocessed dataframe.
articles_df

Unnamed: 0,file_id,goal,goal_description,category_hierarchy,methods
0,10000798,how to hang an ironing board,for people who like to have an organized and c...,"[home and garden, housekeeping, home organizat...",[{'name': 'hanging an ironing board over a doo...
1,10000832,how to plant lavender in pots,lavender plants are beautiful and fragrant pla...,"[home and garden, gardening, indoor and patio ...",[{'name': 'setting up the right growing condit...
2,10002000,how to deal with talkative students,delivering the day’s lesson to your class can ...,"[education and communications, teaching, class...","[{'name': 'redirecting talkative students', 's..."
3,100021,how to make a dress,ever seen a really gorgeous dress on the runwa...,"[personal care and style, clothing, clothes by...","[{'name': 'starting your dress', 'steps': [{'h..."
4,1000266,how to avoid dangerous dog toys,,"[pets and animals, dogs, dog toys and play]","[{'name': 'finding the right toys', 'steps': [..."
...,...,...,...,...,...
53184,9997212,how to obtain a turkish visa,,"[travel, destinations, asia travel]",[{'name': 'following general application proce...
53185,9997262,how to relieve a hypertension headache,,"[health, cardiovascular system health, blood h...",[{'name': 'responding immediately to a headach...
53186,9997619,how to hide electrical outlets,"although essential, electrical outlets can be ...","[home and garden, home maintenance, electrical...","[{'name': 'covering outlets', 'steps': [{'head..."
53187,9997744,how to hang a volleyball net,"whether played indoors or outdoors, volleyball...","[sports and fitness, team sports, volleyball]","[{'name': 'securing indoor posts', 'steps': [{..."
