### Text Data Preprocessing

In [1]:
# TODO sentence preprocessing, turn big paragraphs into single sentences

import pandas as pd

from nltk.stem import PorterStemmer, WordNetLemmatizer

Reading the Excel File into a pandas DataFrame

In [2]:
file_path = "data/interrater_data.xlsx"

excel_file = pd.read_excel(file_path)

Extracting the text and associated label for each excel row and storing them in _sentences_ and _labels_ respectively

In [3]:
excel_file

Unnamed: 0,Time,Speaker,Utterance,Not Classified,Statement of Intent,Statement of Prediction,Statement of Situation,Statement of Action,Request for Intent,Request for Prediction,Request for Situation,Request for Action
0,00:00:00,Bravo,"Alpha, Charlie. Bravo check.",,,,,,,,x,
1,00:00:05,Charlie,Alpha you're loud_and_clear.,,,,x,,,,,
2,00:00:06,Alpha,Charlie. Good to me,,,,x,,,,,
3,00:00:10,Bravo,"Charlie, Charlie one, Bravo radio check.",,,,,,,,x,
4,00:00:13,Alpha,Yeah. Charlie good to me. Over,,,,x,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
129,00:15:52,Charlie,Yep,X,,,,,,,,
130,00:15:54,Alpha,Don't forget we still got a fire in the kind o...,,,,X,,,,,
131,00:16:00,Bravo,Yeah. Now.,,,,,X,,,,
132,00:16:05,Charlie,OK.,,,,,,,,,


In [4]:
excel_file.shape
# The first number is the number of rows/entries there are, the second is the number of columns of the .shape function

(134, 12)

In [5]:
excel_file.nunique()

Time                       115
Speaker                      4
Utterance                  132
Not Classified               2
Statement of Intent          2
Statement of Prediction      2
Statement of Situation       2
Statement of Action          2
Request for Intent           0
Request for Prediction       0
Request for Situation        2
Request for Action           2
dtype: int64

In [6]:
from nltk.tokenize import sent_tokenize

actual_labels = []
for col in excel_file.columns[3:]:
    actual_labels.append(col)

sentences = []
labels = []

for row in excel_file.iterrows():
    for index, speech_act in enumerate(row[1].iloc[3:10]):
        if speech_act == "x":
            # TODO assign different labels for the different parts of the text?
            row_sentences = sent_tokenize(row[1].iloc[2].lower().strip())
            for s in row_sentences:
                sentences.append(s)
                labels.append(actual_labels[index])
            break

print("Sentences: ", sentences[:5])
print("I have sentences: ", len(sentences))
print("Correct Labels: ", labels[:5])
print("I have labels: ", len(labels))

Sentences:  ["alpha you're loud_and_clear.", 'charlie.', 'good to me', 'yeah.', 'charlie good to me.']
I have sentences:  125
Correct Labels:  ['Statement of Situation', 'Statement of Situation', 'Statement of Situation', 'Statement of Situation', 'Statement of Situation']
I have labels:  125


In [7]:
sentences   

["alpha you're loud_and_clear.",
 'charlie.',
 'good to me',
 'yeah.',
 'charlie good to me.',
 'over',
 "you're loud_and_clear.",
 'from alpha.',
 'splendid.',
 "ok, my piece of news is all about the lowtown hospital, that's in grid square november_30.",
 'how they have been able to save the hospital.',
 'so i would expect it to be lots of people there, potentially.',
 'bravo, i’ve some irrelevant drivel about squirrels in firwood.',
 "yeah, i've something about steel.",
 "probably given it to me because i'm northern.",
 'but again, its useless.',
 '5 minutes for planning.',
 'starting now.',
 'ok, so i think the grid idea from alex mark one is a good idea.',
 'that leaves you, alex two, with your  firetruck_six that you can then go either side of that barrier line depending on where you feel the biggest threat is or where the people are.',
 'i assume alex one you’ll take  firetruck _seven and  firetruck _eight for replenishment into a position that will best serve serve all of that.'

In [8]:
labels

['Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Not Classified',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Action',
 'Statement of Action',
 'Statement of Intent',
 'Statement of Intent',
 'Statement of Intent',
 'Statement of Intent',
 'Statement of Intent',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Not Classified',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Not Classified',
 'Not Classified',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',
 'Statement of Situation',


# Preprocessing using NLTK
Tokenisation

In [9]:
from nltk.tokenize import word_tokenize
# nltk.download('punkt')

tokens = []

for text in sentences:
    tokens = tokens + word_tokenize(text)

print(tokens)



Removing Stop Words

In [10]:
from nltk.corpus import stopwords

# nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

filtered_tokens

['alpha',
 "'re",
 'loud_and_clear',
 '.',
 'charlie',
 '.',
 'good',
 'yeah',
 '.',
 'charlie',
 'good',
 '.',
 "'re",
 'loud_and_clear',
 '.',
 'alpha',
 '.',
 'splendid',
 '.',
 'ok',
 ',',
 'piece',
 'news',
 'lowtown',
 'hospital',
 ',',
 "'s",
 'grid',
 'square',
 'november_30',
 '.',
 'able',
 'save',
 'hospital',
 '.',
 'would',
 'expect',
 'lots',
 'people',
 ',',
 'potentially',
 '.',
 'bravo',
 ',',
 '’',
 'irrelevant',
 'drivel',
 'squirrels',
 'firwood',
 '.',
 'yeah',
 ',',
 "'ve",
 'something',
 'steel',
 '.',
 'probably',
 'given',
 "'m",
 'northern',
 '.',
 ',',
 'useless',
 '.',
 '5',
 'minutes',
 'planning',
 '.',
 'starting',
 '.',
 'ok',
 ',',
 'think',
 'grid',
 'idea',
 'alex',
 'mark',
 'one',
 'good',
 'idea',
 '.',
 'leaves',
 ',',
 'alex',
 'two',
 ',',
 'firetruck_six',
 'go',
 'either',
 'side',
 'barrier',
 'line',
 'depending',
 'feel',
 'biggest',
 'threat',
 'people',
 '.',
 'assume',
 'alex',
 'one',
 '’',
 'take',
 'firetruck',
 '_seven',
 'firetruck'

Attempt at preprocessing:
- Stemming
- Lemmatisation

In [11]:
# nltk.download('wordnet')

# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in tokens]

stemmed_tokens

['alpha',
 'you',
 "'re",
 'loud_and_clear',
 '.',
 'charli',
 '.',
 'good',
 'to',
 'me',
 'yeah',
 '.',
 'charli',
 'good',
 'to',
 'me',
 '.',
 'over',
 'you',
 "'re",
 'loud_and_clear',
 '.',
 'from',
 'alpha',
 '.',
 'splendid',
 '.',
 'ok',
 ',',
 'my',
 'piec',
 'of',
 'news',
 'is',
 'all',
 'about',
 'the',
 'lowtown',
 'hospit',
 ',',
 'that',
 "'s",
 'in',
 'grid',
 'squar',
 'november_30',
 '.',
 'how',
 'they',
 'have',
 'been',
 'abl',
 'to',
 'save',
 'the',
 'hospit',
 '.',
 'so',
 'i',
 'would',
 'expect',
 'it',
 'to',
 'be',
 'lot',
 'of',
 'peopl',
 'there',
 ',',
 'potenti',
 '.',
 'bravo',
 ',',
 'i',
 '’',
 've',
 'some',
 'irrelev',
 'drivel',
 'about',
 'squirrel',
 'in',
 'firwood',
 '.',
 'yeah',
 ',',
 'i',
 "'ve",
 'someth',
 'about',
 'steel',
 '.',
 'probabl',
 'given',
 'it',
 'to',
 'me',
 'becaus',
 'i',
 "'m",
 'northern',
 '.',
 'but',
 'again',
 ',',
 'it',
 'useless',
 '.',
 '5',
 'minut',
 'for',
 'plan',
 '.',
 'start',
 'now',
 '.',
 'ok',
 ',',

In [12]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

lemmatized_tokens

['alpha',
 'you',
 "'re",
 'loud_and_clear',
 '.',
 'charlie',
 '.',
 'good',
 'to',
 'me',
 'yeah',
 '.',
 'charlie',
 'good',
 'to',
 'me',
 '.',
 'over',
 'you',
 "'re",
 'loud_and_clear',
 '.',
 'from',
 'alpha',
 '.',
 'splendid',
 '.',
 'ok',
 ',',
 'my',
 'piece',
 'of',
 'news',
 'is',
 'all',
 'about',
 'the',
 'lowtown',
 'hospital',
 ',',
 'that',
 "'s",
 'in',
 'grid',
 'square',
 'november_30',
 '.',
 'how',
 'they',
 'have',
 'been',
 'able',
 'to',
 'save',
 'the',
 'hospital',
 '.',
 'so',
 'i',
 'would',
 'expect',
 'it',
 'to',
 'be',
 'lot',
 'of',
 'people',
 'there',
 ',',
 'potentially',
 '.',
 'bravo',
 ',',
 'i',
 '’',
 've',
 'some',
 'irrelevant',
 'drivel',
 'about',
 'squirrel',
 'in',
 'firwood',
 '.',
 'yeah',
 ',',
 'i',
 "'ve",
 'something',
 'about',
 'steel',
 '.',
 'probably',
 'given',
 'it',
 'to',
 'me',
 'because',
 'i',
 "'m",
 'northern',
 '.',
 'but',
 'again',
 ',',
 'it',
 'useless',
 '.',
 '5',
 'minute',
 'for',
 'planning',
 '.',
 'startin

In [13]:
# Sample text
text = "This is a sample sentence. And here is another one."

# Split into sentences
test_sentences = sent_tokenize(text)

test_sentences

['This is a sample sentence.', 'And here is another one.']

In [14]:
tokenized_sentences = [word_tokenize(sentence) for sentence in test_sentences]

tokenized_sentences

[['This', 'is', 'a', 'sample', 'sentence', '.'],
 ['And', 'here', 'is', 'another', 'one', '.']]