In [72]:
import pandas as pd
from top2vec import Top2Vec
from nltk.corpus import stopwords

df = pd.read_csv('BA_reviews.csv')

# displaying all columns/rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# removing Unnamed: 0 column
df = df.drop(columns=['Unnamed: 0'])
# Rows with a Overall Rating of 5 or less (out of 10)
negative_df = df[df['Overall Rating'] <= 5]

In [73]:
negative_reviews = negative_df['Reviews']
negative_reviews.iloc[:5]

0    ✅ Trip Verified | The check in process and rew...
1    ✅ Trip Verified |   We flew in November 2023, ...
2    ✅ Trip Verified | I left for London from Johan...
3    ✅ Trip Verified |   After an excellent flight ...
4    ✅ Trip Verified |   On a recent flight from Cy...
Name: Reviews, dtype: object

In [74]:
# removing "✅ Trip Verified |", "Not Verified |", and stop words
negative_reviews = negative_reviews.str.replace('✅ Trip Verified', '')
negative_reviews = negative_reviews.str.replace('Not Verified', '')
negative_reviews = negative_reviews.str.replace('|', '')

stop_words = set(stopwords.words('english'))
def remove_stopwords(sentence):
    words = sentence.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

negative_reviews_cleaned = negative_reviews.apply(remove_stopwords)

negative_reviews_cleaned.iloc[:5]


  negative_reviews = negative_reviews.str.replace('|', '')


0    check process reward/loyalty program mess. nev...
1    flew November 2023, took long seek satisfactor...
2    left London Johannesburg 21:15 22 December 202...
3    excellent flight 777 CPT LHR return good. BA m...
4    recent flight Cyprus BA621 23/11/24, second ca...
Name: Reviews, dtype: object

In [75]:

negative_reviews_list = negative_reviews_cleaned.tolist()
negative_reviews_list[0]

'check process reward/loyalty program mess. never able get points, even trying partner airline. Unfortunately work travel sometimes. time, flight delayed.'

In [76]:
model = Top2Vec(documents=negative_reviews_list)

2025-01-07 17:51:58,029 - top2vec - INFO - Pre-processing documents for training
2025-01-07 17:51:58,295 - top2vec - INFO - Creating joint document/word embedding
2025-01-07 17:52:01,600 - top2vec - INFO - Creating lower dimension embedding of documents
2025-01-07 17:52:04,824 - top2vec - INFO - Finding dense areas of documents
2025-01-07 17:52:04,883 - top2vec - INFO - Finding topics


In [77]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(topic_nums)

[0 1 2 3 4]


In [78]:
topic_words, word_scores, topic_nums = model.get_topics(4)

In [82]:
for words, scores, num in zip(topic_words, word_scores, topic_nums):
    print(f"Topic #{num} with size {topic_sizes[num]}")
    print(words)
    print(scores)
    print()
    break
    

Topic #0 with size 716
['getting' 'done' 'wanted' 'instead' 'unfortunately' 'holiday'
 'everything' 'back' 'apparently' 'we' 'it' 'anything' 'give' 'delays'
 'wait' 'less' 'night' 'without' 'lot' 've' 'know' 'going' 'possible'
 'using' 'one' 'given' 'didn' 'would' 'absolutely' 'two' 'services'
 'could' 'option' 'everyone' 'different' 'reason' 'went' 're' 'card'
 'whole' 'case' 'either' 'this' 'trip' 'end' 'again' 'kept' 'simply'
 'care' 'london']
[0.9757851  0.9718951  0.96510684 0.9649189  0.96307343 0.9557885
 0.9508037  0.95024085 0.94780254 0.94748753 0.946199   0.94579965
 0.9456085  0.94430614 0.94420725 0.9438486  0.9427116  0.9426683
 0.9414065  0.9409725  0.9384174  0.93671566 0.9364039  0.93519616
 0.9351533  0.9350832  0.9339901  0.9339392  0.9331397  0.9331018
 0.9326582  0.9325386  0.9313867  0.9302538  0.9297083  0.9280297
 0.9279285  0.9271908  0.9271342  0.92709386 0.927029   0.9268063
 0.9262425  0.92619973 0.9261159  0.9259852  0.9258251  0.92519224
 0.92512536 0.9247

In [80]:
# Looking at the documents that most relate to that topic, a higher score means the document is very similar to that topic. 
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=0, num_docs=4)

for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document #{doc_id} with score {score}")
    print(doc)
    print()


Document #238 with score 0.9909299612045288
luggage mis-tagged Dallas way Cairo via London. Luggage arrived LHR planned Saturday 6/24. However, upon arrival due luggage mis-tagged incorrect name luggage, never sent Cairo. sitting Terminal 3 LHR (I air tag luggage) British Airways saying nothing it. filed 3 claims, made 13 phone calls, hung 3x simple solve problem workers BA refuse solve problem.

Document #119 with score 0.9885177612304688
actually get fly BA cancelled flight 3 days notice refused offer acceptable alternative, resulting 2 people losing holiday (a river cruise). Since then, appears standard protocol, ignored every communication regarding obligations compensation option pursue via official channels.

Document #1036 with score 0.9883452653884888
London Houston. Executive gold club refused help keep us informed case get put flight subsequently missed another two days flights 6 possible flights per day. staff steadfastly opinion problem received card 801 USD buy essential i