#### Preprocessing: enhancements added to the preprocessing pipeline

---

✔ Elongated Word Normalization (e.g., "goooood" → "good") 
<br>✔ Contraction Expansion (e.g., "you're" → "you are") 
<br>✔ POS-based Lemmatization (improves accuracy of lemmatization)

---

In [86]:
#import required libraries
import pandas as pd

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

import warnings
warnings.filterwarnings("ignore")

In [87]:
# Download every NLTK resource this notebook uses, not just the stopword list:
# word_tokenize needs the punkt models, pos_tag needs the perceptron tagger and
# WordNetLemmatizer needs wordnet. Both old and new resource ids are requested
# because the names differ between NLTK releases; nltk.download is expected to
# report (not raise on) ids the installed release does not know -- TODO confirm
# against the NLTK version in use.
for nltk_resource in (
    'stopwords',
    'punkt',
    'punkt_tab',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/suvarnaaglave/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [88]:
# Load dataset: read the cleaned reviews CSV into a DataFrame
input_path = "BA_Airline_Reviews_Cleaned.csv"
df = pd.read_csv(input_path)


In [89]:
# Negation terms that must NOT be treated as stopwords
negative_words = [
    'not', 'no', 'never',
    "isn't", "aren't", "wasn't", "weren't",
    "haven't", "hasn't", "hadn't",
    "won't", "wouldn't",
    "don't", "doesn't", "didn't",
    "can't", "couldn't", "shouldn't", "mightn't", "mustn't",
]

# English stopwords minus the negations above, plus airline-specific words
stop_words = set(stopwords.words('english')).difference(negative_words)
stop_words |= {'ba', 'british', 'airways'}

# Initialize lemmatizer and detokenizer
lemmatizer = WordNetLemmatizer()
detokenizer = TreebankWordDetokenizer()


In [90]:
# Function to convert emojis to words
def convert_emojis(text):
    """Replace known emojis/emoticons in `text` with a descriptive word.

    The inserted word is separated from neighbouring non-space characters by a
    single space, so "great😃" becomes "great happy" rather than "greathappy".
    Emojis that are already surrounded by whitespace are replaced without
    adding any extra spaces.

    Parameters:
        text (str): input text.

    Returns:
        str: text with every emoji from the internal table replaced by its word.
    """
    emoji_dict = {":)": "happy", ":(": "sad", "😃": "happy", "😢": "sad", "😡": "angry", "👍": "positive", "👎": "negative"}
    # One alternation over all keys; keys are escaped because ":)" and ":(" contain regex metacharacters
    emoji_pattern = re.compile("|".join(re.escape(emoji) for emoji in emoji_dict))

    def _to_word(match):
        source = match.string
        start, end = match.span()
        # Pad only on sides that touch a non-whitespace character
        left = " " if start > 0 and not source[start - 1].isspace() else ""
        right = " " if end < len(source) and not source[end].isspace() else ""
        return f"{left}{emoji_dict[match.group(0)]}{right}"

    return emoji_pattern.sub(_to_word, text)

# Function to correct spelling
def correct_spelling(text):
    """Return `text` after passing it through TextBlob's `correct()`, as a plain string."""
    corrected_blob = TextBlob(text).correct()
    return str(corrected_blob)

# Function to normalize elongated words
def normalize_elongated_words(text):
    """Collapse runs of three or more identical letters down to two.

    Only letters are collapsed. Digits and underscores are left untouched so
    that numbers such as "1000" are not silently rewritten to "100".

    Parameters:
        text (str): input text.

    Returns:
        str: text with elongated letter runs shortened (e.g. "goooood" -> "good").
    """
    # [^\W\d_] is "a word character that is neither a digit nor an underscore", i.e. a letter
    return re.sub(r'([^\W\d_])\1{2,}', r'\1\1', text)

# Function to expand contractions
def expand_contractions(text):
    """Expand a fixed set of English contractions in `text`.

    Matching is case-insensitive and restricted to whole words, so "I'm" and
    "You're" are expanded while "unit's" is left alone. Both the straight (')
    and the typographic (’) apostrophe are recognised. Expansions are always
    emitted in lowercase.

    Parameters:
        text (str): input text.

    Returns:
        str: text with the known contractions replaced by their expanded form.
    """
    contractions_dict = {"you're": "you are", "they're": "they are", "we're": "we are", "it's": "it is", "i'm": "i am"}
    # Keys consist only of letters and an apostrophe, so no regex escaping is required;
    # the apostrophe is widened to a character class that also accepts ’
    alternatives = "|".join(key.replace("'", "['’]") for key in contractions_dict)
    contraction_pattern = re.compile(r"\b(?:" + alternatives + r")\b", flags=re.IGNORECASE)

    def _expand(match):
        matched = match.group(0)
        key = matched.lower().replace("’", "'")
        # Fall back to the matched text if case folding produced an unexpected key
        return contractions_dict.get(key, matched)

    return contraction_pattern.sub(_expand, text)

# Function for POS-based Lemmatization
def pos_based_lemmatization(tokens):
    """Lemmatize each token using the part of speech assigned by `pos_tag`.

    The first letter of each tag selects the lemmatizer's `pos` argument
    (N -> 'n', V -> 'v', J -> 'a', R -> 'r'); any other tag falls back to 'n'.

    Parameters:
        tokens (list of str): tokens to lemmatize, in sentence order.

    Returns:
        list of str: one lemma per input token, in the same order.
    """
    tagged_tokens = pos_tag(tokens)
    pos_by_tag_prefix = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}
    lemmas = []
    for token, tag in tagged_tokens:
        lemmatizer_pos = pos_by_tag_prefix.get(tag[0], 'n')
        lemmas.append(lemmatizer.lemmatize(token, pos=lemmatizer_pos))
    return lemmas

In [91]:
def preprocess_text(text):
    """Run the full cleaning pipeline on a single review string.

    Steps, in order: lowercase, emoji-to-word conversion, contraction
    expansion, elongated-word normalization, punctuation removal, spelling
    correction, tokenization, POS-based lemmatization and stopword removal.

    Parameters:
        text (str): raw review text; null values (per `pd.isnull`) are accepted.

    Returns:
        str: space-joined lemmas of the tokens that are not stopwords, or ""
        when `text` is null.
    """
    if pd.isnull(text):
        return ""
    # Lowercase first: the contraction table and the elongation regex are
    # case-sensitive, so "I'm" / "GoOOd" would otherwise slip through untouched
    text = text.lower()
    # Convert emojis to words
    text = convert_emojis(text)
    # Expand contractions
    text = expand_contractions(text)
    # Normalize elongated words
    text = normalize_elongated_words(text)
    # Remove punctuation but keep alphanumeric characters and numbers
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Spelling correction
    text = correct_spelling(text)
    # Tokenization
    tokens = word_tokenize(text)
    # POS-based lemmatization on the complete token sequence, so the tagger
    # still sees the function words that are about to be removed
    lemmas = pos_based_lemmatization(tokens)
    # Remove stopwords but retain negative words; the check uses the original
    # token, so the set of kept positions is the same as filtering before lemmatizing
    kept_lemmas = [lemma for word, lemma in zip(tokens, lemmas) if word not in stop_words]
    return " ".join(kept_lemmas)

In [92]:
# Apply preprocessing
# Missing values are passed through unchanged so preprocess_text's null guard
# returns "" for them; a blanket .astype(str) would turn NaN into the literal
# string "nan" and push that through the pipeline as if it were a review.
for text_column in ('ReviewBody', 'ReviewHeader'):
    df[text_column] = [
        preprocess_text(value if pd.isnull(value) else str(value))
        for value in df[text_column]
    ]

In [93]:
# Save the cleaned dataset (row index is not written)
output_path = "BA_Airline_Reviews_Preprocessed.csv"
df.to_csv(output_path, index=False)

print(f"Preprocessing complete. File saved as {output_path}")


Preprocessing complete. File saved as BA_Airline_Reviews_Preprocessed.csv
