In [None]:
# import necessary libraries 

import string
import re


import joblib
import spacy
import contractions
from unidecode import unidecode
from textblob import TextBlob
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


In [2]:
# load the text data

textData = joblib.load('../artifacts/data.txt')

In [3]:
textData[:100] # show first 100 characters

"Maria's life has been reduced to waiting for the next phone call from her husband - never knowing if"

# step one sentence Boundary Detection 

In [4]:

sentences = sent_tokenize(textData)

In [5]:
sentences[:5]

["Maria's life has been reduced to waiting for the next phone call from her husband - never knowing if it might be the last.Ivan, a 31-year-old Ukrainian fighter pilot, began defending the skies from the very first hours of Russia's full-scale invasion in February 2022, and has now flown more than 200 perilous missions in his old Soviet-era Mig-29 warplane.The squadron commander has lost several comrades in the war.",
 'Some were close friends.',
 "Others were godfathers to each other's children.",
 'The location of his current air base in western Ukraine cannot be revealed for security reasons.But as US-led efforts to negotiate a ceasefire gather pace - and fresh talks with Russia and Ukraine planned on Monday - things have changed.',
 '"If any ceasefire comes [about], we will feel safer," says Maria.Across Ukraine, more and more people are openly talking about war fatigue.']

# step two Lower case conversion 

In [6]:
lower_sentences = [sentence.lower() for sentence in sentences]

In [7]:
lower_sentences[:5]

["maria's life has been reduced to waiting for the next phone call from her husband - never knowing if it might be the last.ivan, a 31-year-old ukrainian fighter pilot, began defending the skies from the very first hours of russia's full-scale invasion in february 2022, and has now flown more than 200 perilous missions in his old soviet-era mig-29 warplane.the squadron commander has lost several comrades in the war.",
 'some were close friends.',
 "others were godfathers to each other's children.",
 'the location of his current air base in western ukraine cannot be revealed for security reasons.but as us-led efforts to negotiate a ceasefire gather pace - and fresh talks with russia and ukraine planned on monday - things have changed.',
 '"if any ceasefire comes [about], we will feel safer," says maria.across ukraine, more and more people are openly talking about war fatigue.']

# step three spelling correction 

In [10]:
correct_sentences = [str(TextBlob(sentence).correct()) for sentence in lower_sentences]

In [11]:
correct_sentences[:5]

["maria's life has been reduced to waiting for the next phone call from her husband - never knowing if it might be the last.ivan, a 31-year-old ukrainian fighter pilot, began defending the skies from the very first hours of russia's full-scale invasion in february 2022, and has now flown more than 200 perilous missions in his old soviet-era fig-29 airplane.the squadron commander has lost several comrades in the war.",
 'some were close friends.',
 "others were godfather to each other's children.",
 'the location of his current air base in western ukraine cannot be revealed for security reasons.but as us-led efforts to negotiate a ceasefire gather pace - and fresh talks with russia and ukraine planned on monday - things have changed.',
 '"if any ceasefire comes [about], we will feel safer," says maria.across ukraine, more and more people are openly talking about war fatigue.']

# step four Punctuation removal 

In [14]:
punctuation_free_sentences = [sentence.translate(str.maketrans('', '', string.punctuation)) for sentence in correct_sentences]

In [16]:
punctuation_free_sentences[:5]

['marias life has been reduced to waiting for the next phone call from her husband  never knowing if it might be the lastivan a 31yearold ukrainian fighter pilot began defending the skies from the very first hours of russias fullscale invasion in february 2022 and has now flown more than 200 perilous missions in his old sovietera fig29 airplanethe squadron commander has lost several comrades in the war',
 'some were close friends',
 'others were godfather to each others children',
 'the location of his current air base in western ukraine cannot be revealed for security reasonsbut as usled efforts to negotiate a ceasefire gather pace  and fresh talks with russia and ukraine planned on monday  things have changed',
 'if any ceasefire comes about we will feel safer says mariaacross ukraine more and more people are openly talking about war fatigue']

# Step five Stop Words Removal

In [18]:
stop_words = list(ENGLISH_STOP_WORDS)

In [19]:
filter_sentences = [" ".join([word for word in sentence.split() if word not in stop_words]) for sentence in punctuation_free_sentences]

In [21]:
filter_sentences[:5]

['marias life reduced waiting phone husband knowing lastivan 31yearold ukrainian fighter pilot began defending skies hours russias fullscale invasion february 2022 flown 200 perilous missions old sovietera fig29 airplanethe squadron commander lost comrades war',
 'close friends',
 'godfather children',
 'location current air base western ukraine revealed security reasonsbut usled efforts negotiate ceasefire gather pace fresh talks russia ukraine planned monday things changed',
 'ceasefire comes feel safer says mariaacross ukraine people openly talking war fatigue']

# Step six Lemmatization

In [27]:
nlp = spacy.load('en_core_web_sm')

In [28]:
lemmatized_sentences = [" ".join([token.lemma_ for token in nlp(sentence)]) for sentence in filter_sentences]

#  Step 7: Text Normalization

In [33]:
def normalize_text(text):
    text = contractions.fix(text)
    text = re.sub('r\s+', ' ', text).strip()
    text = unidecode(text)
    return text

In [34]:
normalized_sentences  = [normalize_text(sentence) for sentence in lemmatized_sentences]

In [35]:
normalized_sentences[:5]

['marias life reduce wait phone husband know lastivan 31yearold ukrainian fighte pilot begin defend sky hou russias fullscale invasion february 2022 fly 200 perilous mission old sovietera fig29 airplanethe squadron commande lose comrade war',
 'close friend',
 'godfathe child',
 'location current ai base western ukraine reveal security reasonsbut usled effort negotiate ceasefire gathe pace fresh talk russia ukraine plan monday thing change',
 'ceasefire come feel safe say mariaacross ukraine people openly talk wa fatigue']

In [36]:
# save the preprocessed data 

joblib.dump(normalized_sentences, '../artifacts/cleanData.txt')

['../artifacts/cleanData.txt']