In [1]:
import os

import pandas as pd
import nltk
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Shared tokenizer: extracts runs of word characters (regex \w+) from a string.
# get_tokens() uses it to split each message before filtering and lowercasing.
tokenizer = nltk.RegexpTokenizer(r"\w+")

In [2]:
# Read the three dataset splits from disk, in training / test / validation order.
training, test, validation = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./dataset/disaster_response_messages_training.csv",
        "./dataset/disaster_response_messages_test.csv",
        "./dataset/disaster_response_messages_validation.csv",
    )
)

# Report (rows, columns) of each split as a quick sanity check.
for split_frame in (training, test, validation):
    print(split_frame.shape)

(21046, 42)
(2629, 42)
(2573, 42)


In [3]:
# Columns carried over from the raw splits into the cleaned frames.
chosen_cols = [
    'id',
    'message',
    'request',
    'food',
    'shelter',
    'water',
]

In [4]:
# Keep only the chosen columns of each split. `.copy()` makes the cleaned
# frames independent of the raw ones, so columns added later do not touch
# the originals.
training_clean = training[chosen_cols].copy()
test_clean = test[chosen_cols].copy()
validation_clean = validation[chosen_cols].copy()

In [5]:
# Porter stemmer instance; stem_sentence() calls porter.stem() on every token.
porter = PorterStemmer()

# WordNet lemmatizer instance; lemmatize_sentence() calls lemmatizer.lemmatize()
# with a part-of-speech hint for each token.
lemmatizer = WordNetLemmatizer()

In [6]:
def get_tokens(sentence):
    """Split `sentence` into lowercase, purely alphabetic tokens.

    The module-level `tokenizer` extracts the raw tokens; any token that is
    not entirely alphabetic (e.g. contains digits or underscores) is dropped,
    and the survivors are lowercased.

    Returns a list of strings (possibly empty).
    """
    return [
        raw_token.lower()
        for raw_token in tokenizer.tokenize(sentence)
        if raw_token.isalpha()
    ]

def lemmatize_sentence(sentence):
    """Return `sentence` with every token replaced by its lemma.

    The sentence is tokenized with get_tokens(), tagged with pos_tag(), and
    each token is lemmatized using the part of speech implied by its tag.
    Tokens whose tag matches none of the known prefixes are kept unchanged.
    The resulting words are joined with single spaces.
    """
    # Tag prefix -> POS code passed to the lemmatizer; first match wins,
    # so the order of the entries matters.
    prefix_to_pos = (
        ("NN", 'n'),
        ('VB', 'v'),
        ('JJ', 'a'),
        ('R', 'r'),
    )

    lemmas = []
    for token, tag in pos_tag(get_tokens(sentence)):
        pos_code = next(
            (code for prefix, code in prefix_to_pos if tag.startswith(prefix)),
            None,
        )
        if pos_code is None:
            lemmas.append(token)
        else:
            lemmas.append(lemmatizer.lemmatize(token, pos=pos_code))

    return " ".join(lemmas)

def stem_sentence(sentence):
    """Return `sentence` with every token replaced by its Porter stem.

    Tokens come from get_tokens(); the stemmed words are joined with
    single spaces.
    """
    return " ".join(porter.stem(token) for token in get_tokens(sentence))

In [7]:
def add_normalized_messages(clean_frame, raw_frame):
    """Add stemmed and lemmatized versions of the messages to `clean_frame`.

    Parameters
    ----------
    clean_frame : pandas.DataFrame
        Frame that receives the new columns; modified in place.
    raw_frame : pandas.DataFrame
        Frame whose `message` column supplies the text. It is expected to
        share its index with `clean_frame` (true here, since each cleaned
        frame was built from the columns of its raw counterpart).

    Adds two columns to `clean_frame`:
    `message_stem` (via stem_sentence) and `message_lemma`
    (via lemmatize_sentence).
    """
    raw_messages = raw_frame['message']
    clean_frame['message_stem'] = raw_messages.map(stem_sentence)
    clean_frame['message_lemma'] = raw_messages.map(lemmatize_sentence)


# Apply the same normalization to every split instead of repeating the
# code once per split with shared scratch variables.
for clean_split, raw_split in (
    (training_clean, training),
    (test_clean, test),
    (validation_clean, validation),
):
    add_normalized_messages(clean_split, raw_split)

In [8]:
# Shapes after adding the stemmed / lemmatized message columns.
for clean_split in (training_clean, test_clean, validation_clean):
    print(clean_split.shape)

(21046, 8)
(2629, 8)
(2573, 8)


In [9]:
def drop_empty_messages(frame):
    """Return the rows of `frame` whose normalized messages are non-empty.

    A row is kept only when both `message_stem` and `message_lemma` differ
    from the empty string.
    """
    has_stem = frame.message_stem != ''
    has_lemma = frame.message_lemma != ''
    return frame[has_stem & has_lemma]


training_clean = drop_empty_messages(training_clean)
test_clean = drop_empty_messages(test_clean)
validation_clean = drop_empty_messages(validation_clean)

In [10]:
# Shapes after removing rows whose normalized message is empty.
for clean_split in (training_clean, test_clean, validation_clean):
    print(clean_split.shape)

(21042, 8)
(2628, 8)
(2572, 8)


In [11]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs('./preprocessed', exist_ok=True)

# Persist the cleaned splits; the index is omitted because the `id` column
# is already included among the saved columns.
training_clean.to_csv('./preprocessed/training.csv', index=False)
test_clean.to_csv('./preprocessed/test.csv', index=False)
validation_clean.to_csv('./preprocessed/validation.csv', index=False)