#### Imports

In [33]:
import os
import re
import nltk
import pandas as pd
import regex
from nltk.tokenize.treebank import TreebankWordDetokenizer
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

#### Retrieving data

In [34]:
dataset_folder = "dataset"
raw_df = pd.DataFrame()
for f in os.listdir(dataset_folder):
    raw_df = pd.concat([raw_df,pd.read_csv(os.path.join(dataset_folder, f))], ignore_index=True)

#### Featurisation

In [35]:
# Removes the proper nouns from long description sentences and retrurns updated sentence
def remove_proper_nouns(sentence):
    for i,sen in enumerate(sentence):
        if sen[1] == "NNP":
            sentence[i] = ("worker","NN")
    sen = list(filter(lambda i: i not in [i for i in sentence if (i[0] == "." or i[0] == "?" or i[0] == "#") and len(sentence) < 3], sentence))
    return [w for i, w in enumerate(sen) if i == 0 or w != sen[i-1]]

# Removes the proper nouns from long description sentences and retrurns list of words of sentence
def remove_pronoun(sentence):
    words = []
    for word in sentence:
        if (word[1] == "PRP" or word[1] == "PRP$") and (word[0] == "height" or word[0] == "it"):
            words.append(word[0])
        if word[1] != "PRP" and word[1] != "PRP$":
            words.append(word[0])
    return words

def process_data(df_removed_duplicates,long_description):
    # nltk pos tags of verbs
    verb_tags = ["VB","VBD","VBG","VBN","VBP","VBZ"]

    # key words for us3-curing dictionary
    curing_dictionary = ['ls12','ls10','ls18','ls24','rtd','d11','d13','d14','v24','limit','ls9','v2','v7','chariot',
                            'ls15','v22','ls2','membrane','locking','rotation','d10','open','d17','precon','v6','ls8,',
                            'ls21','press','lock','opening','loading','d1bis','ls5','steam','ls27','v3','ls20','vinj',
                            'd02','d19','d06','d18','lid','raise','d09','valves','ls28','cylinder','d07','ls6','vter',
                            'v1','v4bis','d03','v9','hooks','release','unloading','ls16','v1bis','v4ebis','to','ls17',
                            'd19bis','close','jib','d04','hydraulic','ls1','d1ter','d20','ring','v23','sectors','d15',
                            'preconfirmation','hook','d01','closing','unlock','head','v34','ls7,','pin','ls19','lower',
                            'v19','d20bis','d12','fingers','d05','d3bis','switches','moldback', 'orifice', 'valve', 
                            'kpot','reheat','dilate']
    
    # creating copy of the dataset
    df_processed = df_removed_duplicates.copy(deep=True)
    
    for i in range(len(df_removed_duplicates)):
        sentences = df_removed_duplicates[long_description][i]
        is_verb = False
        
        # removing proper nouns and "?" or "." from the sentences 
        sentence = remove_proper_nouns(sentences)
        for token in sentence:
            # if sentence contains verb or curing dictionary keywords mark flag as true
            if token[1] in verb_tags or token[0] in curing_dictionary: 
                is_verb = True
                break

        # if flag is false drop the row from dataset
        if not is_verb or len(sentence) < 2:
            df_processed.drop(i,inplace=True)
        else:
            # remove pos tags and convert to back to sentence
            df_processed[long_description][i] = TreebankWordDetokenizer().detokenize(remove_pronoun(sentence))
    return df_processed

# Capitalising each workers name long description column
def capitalise_workers_name(workers_list,df_removed_duplicates,long_description,generic_names):
    workers_name = set()
    for name in workers_list:
        for n in re.split(r"[,-; ]",name):
            if len(n)>2:
                workers_name.add(n)
    for name in generic_names:
        workers_name.add(name)       
    # Changing proper nouns (workers' name) to capital case
    for i in range(len(df_removed_duplicates)):
        for n in workers_name:
            if re.search(r'\b' + n + r'\b', df_removed_duplicates[long_description][i].lower()):
                df_removed_duplicates[long_description][i] = df_removed_duplicates[long_description][i].replace(n, n.capitalize())

def update_pronoun_and_special_characters(tagged_list):
    for i,token in enumerate(tagged_list):
            if token[0] == "i":
                tagged_list[i] = ('worker', 'NN')
            if token[0].count('.') > 1:
                tagged_list[i] = ('.', '.')
            if token[0].count('-') > 1:
                tagged_list[i] = ('-', ':')

def remove_work_order(df,long_description,work_order):
    # removal of work order entries
    df_removed = df.copy(deep=True)
    drop_index = []
    for i in range(len(df)):    
        sen = df[long_description][i].lower()
        for w in work_order:
            if re.search(w, sen): 
                drop_index.append(i)
    return df_removed.drop(index=drop_index).reset_index(drop=True)

def remove_date_and_time(df_removed_duplicates,long_description):
    for i in range(len(df_removed_duplicates)):
        sentence = df_removed_duplicates[long_description][i].split()
        output = list(filter(lambda word: not regex.search('(?:(?:[0-9]{1,2}[:\/,]){1,2}[0-9]{2,4}|am|pm)', word),sentence))
        df_removed_duplicates[long_description][i] = " ".join(output)

In [36]:
def main(df):
    # Assigning dataframe column name into constant variables
    long_description = 'WO rem long desc'
    cause_object = 'Cause object'
    short_description = 'WO Description'
    workers = 'WO workers name'
    
    # Filling NAN values with ""
    df[long_description].fillna("",inplace=True)
    df[cause_object].fillna("",inplace=True)
    df[short_description].fillna("",inplace=True)
    df[workers].fillna("",inplace=True)
    
    # extractng short description, long description and cause object column and dropping duplicates from them
    df_removed_duplicates = df[[short_description,long_description,cause_object]].drop_duplicates(ignore_index=True) 
    
    # remove date and time     
    remove_date_and_time(df_removed_duplicates,long_description)

    # list of top 100 generic names region wise
    generic_names = ['sammy', 'james', 'robert', 'john', 'michael', 'david', 'william', 'richard', 'joseph', 'thomas', 'christopher', 'charles', 'daniel', 'matthew', 'anthony', 'mark', 'donald', 'steven', 'andrew', 'paul', 'joshua', 'kenneth', 'kevin', 'brian', 'george', 'timothy', 'ronald', 'jason', 'edward', 'jeffrey', 'ryan', 'jacob', 'gary', 'nicholas', 'eric', 'jonathan', 'stephen', 'larry', 'justin', 'scott', 'brandon', 'benjamin', 'samuel', 'gregory', 'alexander', 'patrick', 'frank', 'raymond', 'jack', 'dennis', 'jerry', 'tyler', 'aaron', 'jose', 'adam', 'nathan', 'henry', 'zachary', 'douglas', 'peter', 'kyle', 'noah', 'ethan', 'jeremy', 'walter', 'christian', 'keith', 'roger', 'terry', 'austin', 'sean', 'gerald', 'carl', 'harold', 'dylan', 'arthur', 'lawrence', 'jordan', 'jesse', 'bryan', 'billy', 'bruce', 'gabriel', 'joe', 'logan', 'alan', 'juan', 'albert', 'willie', 'elijah', 'wayne', 'randy', 'vincent', 'mason', 'roy', 'ralph', 'bobby', 'russell', 'bradley', 'philip', 'eugene']

    # extracting workers' name from the dataset     
    capitalise_workers_name(df[workers],df_removed_duplicates,long_description,generic_names)
    
    # remove work order
    df_removed_duplicates = remove_work_order(df_removed_duplicates,long_description,["see grs","see w/o","see wo"])
    
    # Tokenisation of long description column
    for i in range(len(df_removed_duplicates)):    
        sentence = df_removed_duplicates[long_description][i]
        token = nltk.word_tokenize(sentence)
        tagged_list = nltk.pos_tag(token)
        update_pronoun_and_special_characters(tagged_list)
        ## adding in dataframe ##
        df_removed_duplicates[long_description][i] = tagged_list

    df_processed = process_data(df_removed_duplicates,long_description)
    save_path = "./Preprocessed/processed_dataset.csv"
    df_processed_final = df_processed[[short_description,long_description,cause_object]].drop_duplicates(ignore_index=True)
    df_processed_final.to_csv(path_or_buf = save_path, index = False)
main(raw_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[long_description].fillna("",inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[cause_object].fillna("",inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always 