In [2]:
import numpy as np
import pandas as pd
import joblib


In [4]:
# Load the previously trained sentiment model from disk.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load artifacts that come from a trusted source.
model = joblib.load('random_forest_model.pkl')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
model

In [8]:
df = pd.read_csv('BA_Processed_Table.csv')
df.columns

Index(['Unnamed: 0', 'reviews', 'Review Sentiment', 'Sentiment'], dtype='object')

Using the vectorizer I trained in the previous notebook.

In [10]:
# Load the fitted TF-IDF vectorizer so new text is mapped to the same features.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load artifacts that come from a trusted source.
vectorizer = joblib.load('tfidf_vectorizer.pkl')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [12]:
X = vectorizer.transform(df['reviews'])

In [14]:
df['sentiment_score'] = model.predict(X)

In [16]:
# Keep only the review text and its predicted score.
# .copy() makes the subset an independent frame, so adding columns to it later
# is unambiguous and does not trigger pandas' SettingWithCopyWarning.
df = df[['reviews', 'sentiment_score']].copy()
df

Unnamed: 0,reviews,sentiment_score
0,next time it will be jet2. not a great experie...,0.593942
1,become the ryanair of long-haul travel. i was ...,0.160517
2,they never help us. i just want to warn everyo...,-0.578817
3,uncomfortable seat and disgusting food. paid f...,-0.816998
4,zero customer service. the plane was extremely...,-0.761117
...,...,...
1215,staff try so very hard to please. gatwick to m...,-0.129066
1216,extremely unprofessional. london to casablanca...,-0.667269
1217,manager is very professional. british airways ...,0.197459
1218,regret choosing ba. hyderabad to brussels via ...,-0.330155


In [18]:
# Bucket the continuous score into three labels:
#   score >= 0.5  -> Positive
#   score <= -0.5 -> Negative
#   anything else -> Neutral
rev = df['sentiment_score'].to_numpy()
df['sentiment'] = np.where(
    rev >= 0.5,
    'Positive',
    np.where(rev <= -0.5, 'Negative', 'Neutral'),
)
df


Unnamed: 0,reviews,sentiment_score,sentiment
0,next time it will be jet2. not a great experie...,0.593942,Positive
1,become the ryanair of long-haul travel. i was ...,0.160517,Neutral
2,they never help us. i just want to warn everyo...,-0.578817,Negative
3,uncomfortable seat and disgusting food. paid f...,-0.816998,Negative
4,zero customer service. the plane was extremely...,-0.761117,Negative
...,...,...,...
1215,staff try so very hard to please. gatwick to m...,-0.129066,Neutral
1216,extremely unprofessional. london to casablanca...,-0.667269,Negative
1217,manager is very professional. british airways ...,0.197459,Neutral
1218,regret choosing ba. hyderabad to brussels via ...,-0.330155,Neutral


In [20]:
# Everything that is not labelled Positive, i.e. both Negative and Neutral rows.
not_positive = df['sentiment'] != 'Positive'
negative_reviews = df[not_positive]
negative_reviews

Unnamed: 0,reviews,sentiment_score,sentiment
1,become the ryanair of long-haul travel. i was ...,0.160517,Neutral
2,they never help us. i just want to warn everyo...,-0.578817,Negative
3,uncomfortable seat and disgusting food. paid f...,-0.816998,Negative
4,zero customer service. the plane was extremely...,-0.761117,Negative
5,baggage arrival nearly took 1.5 hours. overall...,-0.727227,Negative
...,...,...,...
1214,entertainment was not working. my family and i...,0.221360,Neutral
1215,staff try so very hard to please. gatwick to m...,-0.129066,Neutral
1216,extremely unprofessional. london to casablanca...,-0.667269,Negative
1217,manager is very professional. british airways ...,0.197459,Neutral


**Stepwise topic modelling using LDA**

1. Cleaning the text to remove the stopwords

Source: ChatGPT

In [22]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean and tokenize
def preprocess(texts):
    """
    Turn an iterable of raw documents into lists of cleaned tokens.

    Each document is tokenised with gensim's ``simple_preprocess``
    (``deacc=True``); tokens that are stopwords or have three characters or
    fewer are dropped, and the remaining tokens are lemmatised.

    Returns a list with one token list per input document, in input order.
    """
    def _clean(doc):
        # simple_preprocess removes punctuation and lowercases the words;
        # filtering happens on the raw token, lemmatisation afterwards.
        return [
            lemmatizer.lemmatize(token)
            for token in simple_preprocess(doc, deacc=True)
            if token not in stop_words and len(token) > 3
        ]

    return [_clean(doc) for doc in texts]

# Apply to your reviews
texts = negative_reviews['reviews'].astype(str).tolist()
processed_texts = preprocess(texts)

[nltk_data] Downloading package stopwords to /Users/mohan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mohan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
from gensim import corpora

# Map every distinct token in the cleaned reviews to an integer id
id2word = corpora.Dictionary(processed_texts)

# Encode each cleaned review as a bag-of-words: a list of (token_id, count) pairs
corpus = list(map(id2word.doc2bow, processed_texts))

In [26]:
from gensim.models.ldamodel import LdaModel

# Fit an LDA topic model on the bag-of-words corpus built from the
# non-positive reviews. random_state is fixed so repeated runs on the same
# corpus use the same seed.
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=5,  # You can tune this
                     random_state=42,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

In [28]:
from pprint import pprint

# Print the topics
pprint(lda_model.print_topics())

[(0,
  '0.032*"seat" + 0.022*"airline" + 0.021*"service" + 0.015*"customer" + '
  '0.011*"flight" + 0.011*"airway" + 0.011*"british" + 0.009*"paid" + '
  '0.009*"price" + 0.008*"would"'),
 (1,
  '0.042*"flight" + 0.015*"customer" + 0.014*"told" + 0.014*"call" + '
  '0.013*"refund" + 0.012*"airway" + 0.012*"british" + 0.010*"would" + '
  '0.009*"cancelled" + 0.009*"airport"'),
 (2,
  '0.018*"flight" + 0.017*"food" + 0.016*"crew" + 0.014*"seat" + 0.014*"drink" '
  '+ 0.013*"service" + 0.013*"cabin" + 0.012*"class" + 0.012*"meal" + '
  '0.010*"airline"'),
 (3,
  '0.058*"flight" + 0.023*"hour" + 0.018*"london" + 0.015*"staff" + '
  '0.015*"check" + 0.013*"time" + 0.013*"plane" + 0.012*"heathrow" + '
  '0.011*"boarding" + 0.010*"passenger"'),
 (4,
  '0.045*"seat" + 0.019*"economy" + 0.014*"premium" + 0.013*"food" + '
  '0.012*"class" + 0.011*"business" + 0.011*"breakfast" + '
  '0.009*"uncomfortable" + 0.009*"screen" + 0.008*"cabin"')]


In [30]:
from pprint import pprint

# Print the topics
pprint(lda_model.print_topics())

[(0,
  '0.032*"seat" + 0.022*"airline" + 0.021*"service" + 0.015*"customer" + '
  '0.011*"flight" + 0.011*"airway" + 0.011*"british" + 0.009*"paid" + '
  '0.009*"price" + 0.008*"would"'),
 (1,
  '0.042*"flight" + 0.015*"customer" + 0.014*"told" + 0.014*"call" + '
  '0.013*"refund" + 0.012*"airway" + 0.012*"british" + 0.010*"would" + '
  '0.009*"cancelled" + 0.009*"airport"'),
 (2,
  '0.018*"flight" + 0.017*"food" + 0.016*"crew" + 0.014*"seat" + 0.014*"drink" '
  '+ 0.013*"service" + 0.013*"cabin" + 0.012*"class" + 0.012*"meal" + '
  '0.010*"airline"'),
 (3,
  '0.058*"flight" + 0.023*"hour" + 0.018*"london" + 0.015*"staff" + '
  '0.015*"check" + 0.013*"time" + 0.013*"plane" + 0.012*"heathrow" + '
  '0.011*"boarding" + 0.010*"passenger"'),
 (4,
  '0.045*"seat" + 0.019*"economy" + 0.014*"premium" + 0.013*"food" + '
  '0.012*"class" + 0.011*"business" + 0.011*"breakfast" + '
  '0.009*"uncomfortable" + 0.009*"screen" + 0.008*"cabin"')]


In [48]:
lda_model.save('lda_model.gensim')

Converting the topics generated by the LDA model into a dictionary of keywords.


In [34]:
# Build {topic_id: {"keywords": [top 10 words]}} for every topic in the model.
# num_topics=-1 asks show_topics for all topics; its default is 10, which would
# silently drop topics (and later cause a KeyError on lookup) if the model is
# ever trained with more than 10 topics.
topics_dict = {
    topic_id: {"keywords": [word for word, _ in word_weights]}
    for topic_id, word_weights in lda_model.show_topics(
        num_topics=-1, formatted=False, num_words=10
    )
}
topics_dict

{0: {'keywords': ['seat',
   'airline',
   'service',
   'customer',
   'flight',
   'airway',
   'british',
   'paid',
   'price',
   'would']},
 1: {'keywords': ['flight',
   'customer',
   'told',
   'call',
   'refund',
   'airway',
   'british',
   'would',
   'cancelled',
   'airport']},
 2: {'keywords': ['flight',
   'food',
   'crew',
   'seat',
   'drink',
   'service',
   'cabin',
   'class',
   'meal',
   'airline']},
 3: {'keywords': ['flight',
   'hour',
   'london',
   'staff',
   'check',
   'time',
   'plane',
   'heathrow',
   'boarding',
   'passenger']},
 4: {'keywords': ['seat',
   'economy',
   'premium',
   'food',
   'class',
   'business',
   'breakfast',
   'uncomfortable',
   'screen',
   'cabin']}}

**Building a recommender system by inspecting the themes and attaching a manual suggestion to each one**

-> `preprocess_review()` cleans and tokenises the given review text.

-> `is_negative_neutral_review()` predicts the sentiment score of the review with the RandomForestRegressor model; based on that score we decide whether to give a recommendation or not.

-> `recommend_action()` is the recommendation engine: it suggests an action based on the review's dominant theme. If no theme matches, the review is flagged for manual review.

In [36]:
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string



stop_words = set(stopwords.words('english'))

def preprocess_review(text):
    """
    Clean and tokenise a single review exactly as the LDA training data was.

    The tokens are looked up in the dictionary built from ``preprocess``
    output, so this delegates to ``preprocess`` instead of re-implementing the
    cleaning. The earlier standalone version skipped lemmatisation and the
    ``len > 3`` filter, so its tokens did not line up with the dictionary, and
    it relied on ``word_tokenize``, which needs NLTK 'punkt' data that this
    notebook never downloads.

    Parameters
    ----------
    text : str
        Raw review text. Coerced with ``str()``, mirroring the ``astype(str)``
        applied to the training reviews.

    Returns
    -------
    list of str
        Lemmatised tokens with stopwords and short words removed.
    """
    # preprocess works on a batch of documents; wrap and unwrap the single review.
    return preprocess([str(text)])[0]

def is_negative_neutral_review(review, threshold=0.5):
    """
    Score one review and report whether it falls below the positive cutoff.

    The review is vectorised with the loaded ``vectorizer`` and scored with
    the loaded ``model``.

    Returns a ``(flag, score)`` tuple where ``flag`` is ``score < threshold``,
    i.e. True when the review is negative or neutral.
    """
    features = vectorizer.transform([review])
    predictions = model.predict(features)
    sentiment = predictions[0]
    below_cutoff = sentiment < threshold
    return below_cutoff, sentiment


def recommend_action(review):
    """
    Print a follow-up recommendation for a negative or neutral review.

    Steps:
      1. Score the review with ``is_negative_neutral_review``; positive
         reviews only get a short message.
      2. Tokenise the review with ``preprocess_review`` and convert it to a
         bag-of-words using ``id2word``.
      3. Pick the most probable LDA topic and print its id, its keywords
         (from ``topics_dict``) and the recommendation mapped to that topic.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    None or str
        None after printing (positive review or normal recommendation). If
        the review shares no tokens with the dictionary, or the model reports
        no topics, the manual-review message is returned as a string instead.
    """
    is_negative, score = is_negative_neutral_review(review)

    if not is_negative:
        print("This is a positive review and no need for any recommendation. The sentiment score is :",score)
        return

    processed = preprocess_review(review)
    bow_vector = id2word.doc2bow(processed)

    # An empty bag-of-words means none of the review's tokens are in the
    # dictionary. The model has no evidence to work with in that case and can
    # still report a topic mixture (typically just its prior), so this must be
    # checked on the bag-of-words rather than on the returned topics.
    if not bow_vector:
        return "No dominant topic detected. Needs manual review."

    topic_probs = lda_model.get_document_topics(bow_vector)

    if not topic_probs:
        return "No dominant topic detected. Needs manual review."

    # Get the topic with the highest probability
    top_topic = max(topic_probs, key=lambda x: x[1])[0]

    # Retrieve keywords for human-readable interpretation
    keywords = topics_dict[top_topic]['keywords']

    # Hand-written suggestions keyed by topic id; these match the themes of
    # the 5-topic model fitted above and must be revisited if it is retrained.
    recommendation_map = {
        0: "Flag for pricing/value concerns. Suggest loyalty program or voucher.",
        1: "Urgent refund/cancellation issue. Escalate to billing team.",
        2: "In-flight service complaint. Share with cabin crew operations.",
        3: "Delay/ground staff complaint. Check Heathrow or gate management logs.",
        4: "Seat/cabin discomfort. Suggest upgrade policy or improvements."
    }

    result =  {
        "topic_id": top_topic,
        "keywords": keywords,
        "recommendation": recommendation_map.get(top_topic, "Review manually.")
    }

    print("Topic :" , result["topic_id"] )
    print("Keywords :", result["keywords"] )
    print("Recommendation : ", result["recommendation"])

    

In [38]:
text = "The experience was very bad. I had to wait a lot in the boarding area and the flight was delayed. Very poor service"
recommend_action(text)


Topic : 3
Keywords : ['flight', 'hour', 'london', 'staff', 'check', 'time', 'plane', 'heathrow', 'boarding', 'passenger']
Recommendation :  Delay/ground staff complaint. Check Heathrow or gate management logs.


In [40]:
text = "Such nice service! I had never expected that British Airways would be so good. I recommend this to my friends and family. Excited and enthusiastic for my next trip with British Airways."
recommend_action(text)


This is a positive review and no need for any recommendation. The sentiment score is : 0.7714870000000001


In [42]:
id2word.save('id2word.dict')

In [46]:
import os
os.getcwd()

'/Users/mohan/Library/Mobile Documents/com~apple~CloudDocs/Saras AI/Semester 4/AI in Cloud/Final Project /Week 3'