# Reading the Data

In [2]:
#importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)

In [3]:
# Having a look at the data
airlines_df=pd.read_csv("Airline_review.csv")
airlines_df

Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Verified,Review,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Value For Money,Recommended
0,AB Aviation,9,"""pretty decent airline""",11th November 2019,True,Moroni to Moheli. Turned out to be a pretty ...,Solo Leisure,Economy Class,Moroni to Moheli,Nov-19,4.0,5.0,4.0,4.0,3.0,yes
1,AB Aviation,1,"""Not a good airline""",25th June 2019,True,Moroni to Anjouan. It is a very small airline...,Solo Leisure,Economy Class,Moroni to Anjouan,Jun-19,2.0,2.0,1.0,1.0,2.0,no
2,AB Aviation,1,"""flight was fortunately short""",25th June 2019,True,Anjouan to Dzaoudzi. A very small airline an...,Solo Leisure,Economy Class,Anjouan to Dzaoudzi,Jun-19,2.0,1.0,1.0,1.0,2.0,no
3,Adria Airways,1,"""I will never fly again with Adria""",28th September 2019,False,Please do a favor yourself and do not fly wi...,Solo Leisure,Economy Class,Frankfurt to Pristina,Sep-19,1.0,1.0,,1.0,1.0,no
4,Adria Airways,1,"""it ruined our last days of holidays""",24th September 2019,True,Do not book a flight with this airline! My fr...,Couple Leisure,Economy Class,Sofia to Amsterdam via Ljubljana,Sep-19,1.0,1.0,1.0,1.0,1.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23166,ZIPAIR,1,"""customer service is terrible""",5th July 2022,False,Bangkok to Tokyo. I’ve flown many low cost ai...,Couple Leisure,Economy Class,Bangkok to Tokyo,Jun-22,2.0,1.0,,1.0,1.0,no
23167,ZIPAIR,1,"""Avoid at all costs""",1st June 2022,True,Avoid at all costs. I booked flights to go f...,Solo Leisure,Economy Class,Singapore to Tokyo,Jun-22,,,,,1.0,no
23168,ZIPAIR,3,"""Will not recommend to anyone""",31st May 2022,True,Flight was leaving at 23.15 and after an hou...,Business,Economy Class,Bangkok to Tokyo,May-22,2.0,4.0,2.0,1.0,2.0,no
23169,ZIPAIR,6,"""It was immaculately clean""",23rd May 2022,True,Zipair is JAL’s budget airline. They don’t ha...,Business,Business Class,Tokyo to Los Angeles,May-22,3.0,4.0,3.0,1.0,5.0,yes


In [4]:
#to check the number of rows and columns
airlines_df.shape

(23171, 16)

In [5]:
# number of rows only
airlines_df.shape[0]

23171

In [6]:
# Checking the fraction of null values per column
airlines_df.isnull().sum()/airlines_df.shape[0]

Airline Name           0.000000
Overall_Rating         0.000000
Review_Title           0.000000
Review Date            0.000000
Verified               0.000000
Review                 0.000000
Type Of Traveller      0.161322
Seat Type              0.047301
Route                  0.165207
Date Flown             0.162013
Seat Comfort           0.179319
Cabin Staff Service    0.183851
Food & Beverages       0.374218
Ground Service         0.206853
Value For Money        0.046006
Recommended            0.000000
dtype: float64

In [12]:
# Checking number of airlines
airlines_df["Airline Name"].nunique()

497

In [14]:
#Checking the number of reviews for each airline
airlines= airlines_df["Airline Name"].value_counts()
airlines

Airline Name
Caribbean Airlines                  100
GoAir                               100
Germanwings                         100
Philippine Airlines                 100
Bangkok Airways                     100
Garuda Indonesia                    100
Batik Air                           100
Swoop                               100
Frontier Airlines                   100
Sunwing Airlines                    100
Sun Country Airlines                100
Blue Air                            100
FlySafair                           100
PLAY                                100
flydubai                            100
Porter Airlines                     100
Qantas Airways                      100
SriLankan Airlines                  100
Flair Airlines                      100
Go First                            100
TAM Airlines                        100
British Airways                     100
Pegasus Airlines                    100
American Airlines                   100
American Eagle             

# Overall Analysis

##  Average of Overall_Rating

In [18]:
# Converting the data type to numeric
airlines_df['Overall_Rating'] = pd.to_numeric(airlines_df['Overall_Rating'], errors='coerce')
# Taking the median as average rating
avg_ratings = airlines_df.groupby('Airline Name')['Overall_Rating'].median().round(2)
avg_ratings

Airline Name
AB Aviation                         1.0
ANA All Nippon Airways              1.5
ASKY Airlines                       1.0
ATA Airlines                        1.5
Adria Airways                       5.0
Aegean Airlines                     1.5
Aer Lingus                          1.0
Aero VIP                            9.0
AeroItalia                          2.0
Aerocaribbean                       1.0
Aeroflot Russian Airlines           1.0
Aerolineas Argentinas               3.0
Aeromar                             1.0
Aeromexico                          1.0
Aerosur                             5.0
Africa World Airlines               1.0
Afriqiyah Airways                   4.0
Aigle Azur                          1.0
Air Algerie                         2.0
Air Antilles                        1.0
Air Arabia                          1.0
Air Astana                          1.0
Air Austral                         5.0
Air Bagan                           5.0
Air Berlin                 

## Top Airlines by Number of Reviews and average Rating

In [None]:
# Grouping by airline and calculating the number of reviews and the median rating
airline_stats = airlines_df.groupby('Airline Name').agg({
    'Overall_Rating': ['count', 'median']
}).reset_index()

# Renaming the columns ('Average Rating' holds the median computed above)
airline_stats.columns = ['Airline Name', 'Review Count', 'Average Rating']

# Filtering: only airlines with at least 50 reviews
filtered = airline_stats[airline_stats['Review Count'] >= 50]

# Sorting by average rating and getting the top 10
top10_airlines = filtered.sort_values(by='Average Rating', ascending=False).head(10)

# Plot: combining both metrics on a shared x-axis
fig, ax1 = plt.subplots(figsize=(14, 8))

# Barplot for number of reviews (left y-axis)
sns.barplot(data=top10_airlines, x='Airline Name', y='Review Count', color='cornflowerblue', ax=ax1)
ax1.set_xlabel(' ', color='darkgreen', fontsize=17)
ax1.set_ylabel('Number of Reviews', color='cornflowerblue', fontsize=17)
# Restyle the existing category labels in place instead of re-setting their text
plt.setp(ax1.get_xticklabels(), rotation=60, ha='right', fontsize=14)

# Line plot for average rating (right y-axis)
ax2 = ax1.twinx()
sns.lineplot(data=top10_airlines, x='Airline Name', y='Average Rating', color='darkgreen', marker='o', ax=ax2)
ax2.set_ylabel('Average Rating', color='darkgreen', fontsize=17)
ax2.set_ylim(0, 10)

# Style the tick labels through tick_params so each y-axis keeps its own tick values;
# copying label text from ax1 onto ax2 would print review counts on the rating axis.
ax1.tick_params(axis='y', labelcolor='cornflowerblue', labelsize=12)
ax2.tick_params(axis='y', labelcolor='darkgreen', labelsize=12)

plt.title('QantasLink and China Southern Airlines have the Highest Ratings', fontsize=20, color='darkslategray', fontweight= 'bold')
plt.tight_layout(pad=2)
plt.show()

## Average Rating by Seat Type

In [None]:
# Grouping by seat type and calculating the average rating
avg_rating_by_seat = airlines_df.groupby('Seat Type')['Overall_Rating'].median().reset_index()

# Sorting the values descending
avg_rating_by_seat = avg_rating_by_seat.sort_values(by='Overall_Rating', ascending=False)

# Plot
plt.figure(figsize=(8, 5))
sns.barplot(data=avg_rating_by_seat, x='Seat Type', y='Overall_Rating', palette='Blues_d')
plt.title('Median Rating by Seat Type', fontsize=25)
plt.xlabel('Seat Type', fontsize=17)
plt.ylabel('Median Rating', fontsize=17)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
#Checking the number of reviews for each seat type
num_of_ratings = airlines_df['Seat Type'].value_counts()
num_of_ratings

In [None]:
# Calculating average rating per seat type
seat_order = airlines_df.groupby('Seat Type')['Overall_Rating'].median().sort_values(ascending=False).index

# Plotting the sorted boxplot
plt.figure(figsize=(10, 7))
sns.boxplot(data=airlines_df, x='Overall_Rating', y='Seat Type', palette='viridis', order=seat_order)
plt.title('Business and First Class Passengers Gave Higher Ratings', fontsize=25, fontweight='bold', color='darkslategray')
plt.xlabel('Average Rating', fontsize=20)
plt.ylabel('Seat Type', fontsize=20)
plt.xticks(fontsize=17)
plt.yticks(fontsize=15)
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


## Correlation between Overall Rating and Rating Categories

In [None]:
# Choosing the columns to do the correlation test
corr_categories = airlines_df[['Overall_Rating', 'Value For Money', 'Ground Service',
                              'Food & Beverages', 'Cabin Staff Service', 'Seat Comfort']].corr()

# Extracting and sorting the correlations with rating (excluding rating itself)
correlations_with_rating = corr_categories['Overall_Rating'].drop('Overall_Rating').sort_values(ascending=True)

# Plotting: one horizontal bar per rating category
plt.figure(figsize=(8, 5))
correlations_with_rating.plot(kind='barh', color='cornflowerblue')
plt.title('Value for Money Affects the Most on the Average Rating', fontsize=20, fontweight='bold', color='darkslategray')
# The plotted values come from DataFrame.corr(), i.e. correlation coefficients
plt.xlabel('Correlation coefficient', fontsize=17)
plt.tick_params(axis='x', labelsize=14)
plt.tick_params(axis='y', labelsize=14)
plt.grid(True, axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

## Average Rating by Type of Traveller

In [None]:
# Taking the median as the average rating per traveller type
traveller_avg_rating = airlines_df.groupby('Type Of Traveller')['Overall_Rating'].median().reset_index()

# Sorting the values
traveller_avg_rating = traveller_avg_rating.sort_values(by='Overall_Rating', ascending=True)

# Plotting
plt.figure(figsize=(10, 6))
sns.barplot(data=traveller_avg_rating, x='Overall_Rating', y='Type Of Traveller', palette='viridis')

plt.title('Average Overall Rating by Type of Traveller', fontsize = 27)
plt.xlabel('Average Rating', fontsize = 17)
plt.ylabel('Type of Traveller', fontsize = 17)
# Overall_Rating is on a 10-point scale (values such as 9 occur in the data),
# so a 0-5 limit would clip any bar above 5.
plt.xlim(0, 10)
plt.grid(True, axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

In [None]:
# Checking the number of reviews in traveller type
num_of_traveller_type = airlines_df['Type Of Traveller'].value_counts()
num_of_traveller_type

In [None]:
 # Taking the average of Overall Rating and sorting by descending
traveller_order = airlines_df.groupby('Type Of Traveller')['Overall_Rating'].median().sort_values(ascending=False).index

# Plotting boxplot 
plt.figure(figsize=(10, 6))
sns.boxplot(data=airlines_df, x='Overall_Rating', y='Type Of Traveller', palette='viridis', order=traveller_order)

plt.title('Solo and Business Travellers Gave Higher Ratings', fontsize=27, fontweight='bold', color='darkslategray')
plt.xlabel('Overall Rating', fontsize=20)
plt.ylabel('Type of Traveller', fontsize=20)
plt.xticks(fontsize=17)
plt.yticks(fontsize=15)
plt.xlim(0, 10) 
plt.grid(True, axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

## Overall Rating vs Recommendation

In [None]:
# Taking the average
recommended_rating = airlines_df.groupby('Recommended')['Overall_Rating'].median().reset_index()
# Sorting the values
recommended_rating = recommended_rating.sort_values(by='Overall_Rating', ascending=False)

# Plotting
plt.figure(figsize=(8, 6))
sns.barplot(data=recommended_rating, x='Recommended', y='Overall_Rating', palette='muted')

plt.title('Higher Rating = Recommended', fontsize=27, fontweight='bold', color='darkslategray')
plt.xlabel('Recommendation', fontsize=17)
plt.ylabel('Average Rating', fontsize=17)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.ylim(0, 10)  
plt.tight_layout()
plt.show()

## Time Based Analysis

In [None]:
# Removing ordinal suffixes and creating a new cleaned column
airlines_df['Cleaned Review Date'] = airlines_df['Review Date'].str.replace(
    r'(\d{1,2})(st|nd|rd|th)', r'\1', regex=True)

# Converting the cleaned text to datetime format
airlines_df['Cleaned Review Date'] = pd.to_datetime(
    airlines_df['Cleaned Review Date'], format='%d %B %Y')
airlines_df

In [None]:
#Saving the csv file
airlines_df.to_csv("airlines_cleaned.csv", index=False)

# Text and Sentiment Analysis

# Cleaning the Text

In [None]:
# Installing the library to clean the written text
!pip install nltk

In [None]:
# importing all the libraries for text analysis
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Choosing the languages English
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Normalising a piece of review text
def clean_text(text):
    """Return a lower-cased, stop-word-free, lemmatized version of ``text``.

    Null input (``None``/NaN) yields an empty string. Otherwise the text is
    lower-cased and tokenized with ``nltk.word_tokenize``; tokens that are not
    purely alphabetic or that appear in ``stop_words`` are dropped, the rest
    are passed through ``lemmatizer.lemmatize`` and joined with single spaces.
    """
    if pd.isnull(text):
        return ""
    kept = []
    for token in nltk.word_tokenize(text.lower()):
        # Skip punctuation, numbers, mixed tokens and stop words
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)
# Creating new and cleaned columns   
airlines_df['clean_title'] = airlines_df['Review_Title'].apply(clean_text)
airlines_df['clean_review'] = airlines_df['Review'].apply(clean_text)

# Sentiment Analysis

In [None]:
# Installing the model for sentiment analysis
! pip install vaderSentiment

In [None]:
# Importing the classifier
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# giving it a simple name
analyzer = SentimentIntensityAnalyzer()

# Computing the polarity scores for one piece of text
def get_sentiment_scores(text):
    """Return the polarity scores of ``text`` as a dict.

    Null input (``None``/NaN) gets an all-zero dict with the keys
    ``'neg'``, ``'neu'``, ``'pos'`` and ``'compound'``; any other input is
    passed to ``analyzer.polarity_scores``.
    """
    has_text = not pd.isnull(text)
    return analyzer.polarity_scores(text) if has_text else dict(neg=0, neu=0, pos=0, compound=0)


# Applying it to the raw review text rather than the cleaned column: the cleaning
# step lower-cases the text and drops non-alphabetic tokens and stop words
# (negations such as "not"/"no" are stop words), so "not good" would be scored
# as "good". VADER uses exactly those cues, so it needs the original wording.
score_columns = {
    'neg': 'review_negative',
    'neu': 'review_neutral',
    'pos': 'review_positive',
    'compound': 'review_compound',
}
review_scores = pd.DataFrame(
    airlines_df['Review'].apply(get_sentiment_scores).tolist(),
    index=airlines_df.index,
).rename(columns=score_columns)

# Combining; score columns left over from an earlier run of this cell are dropped
# first so that re-running it does not create duplicate columns.
airlines_df = pd.concat(
    [airlines_df.drop(columns=list(score_columns.values()), errors='ignore'), review_scores],
    axis=1,
)

In [None]:
# Converting to numeric (In case)
airlines_df['review_compound'] = pd.to_numeric(airlines_df['review_compound'], errors='coerce')

# Mapping a compound score to a sentiment label
def classify_sentiment(score):
    """Turn a compound polarity score into a sentiment label.

    Returns ``'unknown'`` for null scores, ``'positive'`` for scores of at
    least 0.05, ``'negative'`` for scores of at most -0.05 and ``'neutral'``
    for everything in between.
    """
    if pd.isnull(score):
        return 'unknown'
    if -0.05 < score < 0.05:
        return 'neutral'
    # Outside the neutral band the sign alone decides the label
    return 'positive' if score > 0 else 'negative'

# Applying the classification
airlines_df['review_sentiment'] = airlines_df['review_compound'].apply(classify_sentiment)

In [None]:
airlines_df

In [None]:
# Checking the count
reviews_sent= airlines_df['review_sentiment'].value_counts()
reviews_sent

In [None]:
# Defining custom colors for each sentiment
custom_palette = {
    'positive': 'seagreen',
    'negative': 'firebrick',
    'neutral': 'gold'
}

# Sorting the order by value count
order = airlines_df['review_sentiment'].value_counts().index

# Plotting
plt.figure(figsize=(8, 5))
sns.countplot(data=airlines_df, x='review_sentiment', order=order, palette=custom_palette)
plt.title('Positives are More Than the Negatives', fontsize=18, fontweight='bold', color='darkslategray')
plt.xlabel(' ', fontsize=14)
plt.ylabel('Count', fontsize=17)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.tight_layout()
plt.show()

# Text Analysis

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Trigram Extraction
vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=10)
X2 = vectorizer.fit_transform(airlines_df['clean_review'])

# Getting the top trigrams
trigrams = vectorizer.get_feature_names_out()
counts = X2.sum(axis=0).A1
trigram_freq = pd.DataFrame({'Trigram': trigrams, 'Count': counts}).sort_values(by='Count', ascending=False)
trigram_freq.rename(columns={'Trigram': ''}, inplace=True)

# Creating color list based on negatives
highlight_indices = [0, 1, 5, 6, 7]  # 0-based indexing
colors = ['firebrick' if i in highlight_indices else 'cornflowerblue' for i in range(len(trigram_freq))]

# Plotting
plt.figure(figsize=(10, 6))
sns.barplot(data=trigram_freq, x='Count', y='', palette=colors)
plt.title('Negativity Detected', fontsize=27, fontweight='bold', color='firebrick')
plt.xlabel('Count', fontsize=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()

In [None]:
#filtering the positive and negative reviews
positive_reviews = airlines_df[airlines_df['review_sentiment'] == 'positive']['Review']
negative_reviews = airlines_df[airlines_df['review_sentiment'] == 'negative']['Review']


In [None]:
positive_text = " ".join(positive_reviews.astype(str))
negative_text = " ".join(negative_reviews.astype(str))

In [None]:
from wordcloud import STOPWORDS

# Setting default stopwords
custom_stopwords = set(STOPWORDS)

# Adding stopwords manually.
# Every entry needs its own trailing comma: adjacent string literals without one
# are silently concatenated (e.g. 'even' 'get' would become 'evenget').
custom_stopwords.update([
    # common airline words
    'flight', 'flights', 'airport', 'airline', 'plane', 'air', 'seat', 'fly', 'travel', 'passenger', 'even',
    # generic filler words
    'get', 'got', 'make', 'just', 'one', 'like', 'really', 'still', 'will',
])

In [None]:
# Using the library to choose bigrams and filtering positive reviews
vectorizer_pos = CountVectorizer(ngram_range=(2, 2), stop_words=list(custom_stopwords))
X_pos = vectorizer_pos.fit_transform([positive_text])

# Creating frequency dictionary
bigrams_freq_pos = dict(zip(vectorizer_pos.get_feature_names_out(), X_pos.toarray().sum(axis=0)))

# Generating word cloud
wordcloud_positive_bigram = WordCloud(
    width=800, height=400, background_color='white', colormap='Greens'
).generate_from_frequencies(bigrams_freq_pos)

# Plotting
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_positive_bigram, interpolation='bilinear')
plt.axis('off')
plt.title(' ', fontsize=18, fontweight='bold', color='green')
plt.tight_layout()
plt.show()

In [None]:
# Using the library to choose bigrams and filtering negative reviews
vectorizer_neg = CountVectorizer(ngram_range=(2, 2), stop_words=list(custom_stopwords))
X_neg = vectorizer_neg.fit_transform([negative_text])

# Creating frequency dictionary
bigrams_freq_neg = dict(zip(vectorizer_neg.get_feature_names_out(), X_neg.toarray().sum(axis=0)))

# Generating word cloud
wordcloud_negative_bigram = WordCloud(
    width=800, height=400, background_color='white', colormap='Reds'
).generate_from_frequencies(bigrams_freq_neg)

#  Plotting
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_negative_bigram, interpolation='bilinear')
plt.axis('off')
plt.title(' ', fontsize=18, fontweight='bold', color='red')
plt.tight_layout()
plt.show()

# Arab Airlines Analysis

## Filtering the data to Arab Airlines Only

In [None]:
# List of airlines to filter
target_airlines = [
    'Air Arabia',
    'Emirates',
    'Etihad Airways',
    'Gulf Air',
    'Qatar Airways',
    'Saudi Arabian Airlines'
]

# Filtering the DataFrame
filtered_df = airlines_df[airlines_df['Airline Name'].isin(target_airlines)]
filtered_df

In [None]:
# Calculating the Average Rating per airline
# NOTE(review): this uses the arithmetic mean although the result is stored in
# `median_ratings`; confirm whether mean or median is intended for this chart.
median_ratings = filtered_df.groupby('Airline Name')['Overall_Rating'].mean().reset_index()

# Sorting by rating descending and renumbering the rows from 0, so that row 0
# is the highest-rated airline
median_ratings = median_ratings.sort_values(by='Overall_Rating', ascending=False).reset_index(drop=True)

# Defining colors 
colors = ['cornflowerblue'] * len(median_ratings)
colors[0] = 'tomato'  
colors[1] = 'darkorange'   

# Plotting
plt.figure(figsize=(10, 8))
sns.barplot(data=median_ratings, y='Airline Name', x='Overall_Rating', palette=colors)

plt.xlabel('Average Rating', color='black', fontsize=14)
plt.ylabel(' ', fontsize=14)
plt.title('Qatar and Gulf Air Have the Highest Ratings', fontsize=20, fontweight='bold', color='darkslategray')
plt.xlim(0, 10)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout(pad=2)
plt.grid(True, axis='x', linestyle='--', alpha=0.5)
plt.show()

## Repeating the wordcloud steps to generate positive and Negative wordcloud for GCC Airlines

In [None]:
positive_reviewss = filtered_df[filtered_df['review_sentiment'] == 'positive']['Review']
negative_reviewss = filtered_df[filtered_df['review_sentiment'] == 'negative']['Review']

In [None]:
filtered_positive_text = " ".join(positive_reviewss.astype(str))
filtered_negative_text = " ".join(negative_reviewss.astype(str))

In [None]:
# For positive reviews
wordcloud_positive_filtered = WordCloud(width=800, height=400, background_color='white', colormap='Greens', stopwords=custom_stopwords).generate(filtered_positive_text)

# For negative reviews
wordcloud_negative_filtered = WordCloud(width=800, height=400, background_color='white', colormap='Reds', stopwords=custom_stopwords).generate(filtered_negative_text)

In [None]:
#Positive
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_positive_filtered, interpolation='bilinear')
plt.axis('off')
plt.title('Positive Reviews Word Cloud')
plt.show()

In [None]:
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_negative_filtered, interpolation='bilinear')
plt.axis('off')
plt.title('Negative Reviews Word Cloud')
plt.show()

# Gulf Air Analysis

In [None]:
# Selecting the Gulf Air
targett_airlines = ['Gulf Air']

# Filtering the DataFrame
gulf_df = airlines_df[airlines_df['Airline Name'].isin(targett_airlines)]
gulf_df

In [None]:
# Counting the sentiment values
sentiment_counts = gulf_df['review_sentiment'].value_counts()

# Plotting only the positive and negative classes; reindex supplies a zero bar
# if one of the two labels does not occur instead of raising a KeyError.
plt.figure(figsize=(6, 4))
sentiment_counts.reindex(['positive', 'negative'], fill_value=0).plot(kind='bar', color=['forestgreen', 'firebrick'])

plt.title('Almost Equal', fontsize=20, fontweight='bold', color='darkslategray')
plt.xlabel(' ')
plt.ylabel('Number of Reviews', fontsize=17)
plt.xticks(rotation=0, fontsize=17)
plt.yticks(fontsize=15)
plt.tight_layout()
plt.show()

## Repeating the wordcloud steps to generate positive and Negative wordcloud for Gulf Air Reviews

In [None]:
gulf_positive_reviews = gulf_df[gulf_df['review_sentiment'] == 'positive']['Review']
gulf_negative_reviews = gulf_df[gulf_df['review_sentiment'] == 'negative']['Review']

In [None]:
gulf_positive_text = " ".join(gulf_positive_reviews.astype(str))
gulf_negative_text = " ".join(gulf_negative_reviews.astype(str))

In [None]:
# For positive reviews
wordcloud_positive_filtered = WordCloud(width=800, height=400, background_color='white', colormap='Greens', stopwords=custom_stopwords).generate(gulf_positive_text)

# For negative reviews
wordcloud_negative_filtered = WordCloud(width=800, height=400, background_color='white', colormap='Reds', stopwords=custom_stopwords).generate(gulf_negative_text)

In [None]:
#Positive
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_positive_filtered, interpolation='bilinear')
plt.axis('off')
plt.title('Positive Reviews Word Cloud')
plt.show()

In [None]:
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_negative_filtered, interpolation='bilinear')
plt.axis('off')
plt.title('Negative Reviews Word Cloud')
plt.show()

In [None]:
# Filtering only negative reviews
negative_reviews = gulf_df[gulf_df['review_sentiment'] == 'negative'].copy()

# Defining the issue classification rules, in priority order: the first category
# with a matching word wins. Each rule is
#   (label, stems matched at the START of a word, words matched exactly).
# Matching at word starts keeps 'late' from firing inside "chocolate"/"related"
# and 'crew' inside "screwed"; 'app' is matched as a whole word because it is
# also the start of "appalling", "apparently", etc. Prefixed forms that are real
# hits ("rebooked", "overbooked", "unhelpful") are listed as their own stems.
_ISSUE_RULES = (
    ('Flight Delay/Cancellation', ('delay', 'cancel', 'late', 'reschedule'), ()),
    ('Baggage Issue', ('baggage', 'luggage', 'bag'), ()),
    ('Booking Problem', ('booking', 'ticket', 'reservation', 'book', 'rebook', 'overbook'), ()),
    ('Customer Service Issue', ('rude', 'service', 'staff', 'crew', 'help', 'unhelpful'), ()),
    ('Seat Comfort Issue', ('seat', 'uncomfortable', 'legroom'), ()),
    ('In-flight Service Issue', ('food', 'meal', 'entertainment'), ()),
    ('Check-in/Boarding Issue', ('checkin', 'check-in', 'boarding', 'gate'), ()),
    ('Refund/Payment Issue', ('refund', 'payment', 'voucher', 'charged'), ()),
    ('App/Website Issue', ('website', 'online'), ('app', 'apps')),
)

# Punctuation stripped from both ends of a word before matching
_EDGE_PUNCT = '.,;:!?()[]"\''


def classify_issue(text):
    """Assign a review text to the first matching complaint category.

    Parameters
    ----------
    text : str
        Review text. It is lower-cased and split on whitespace before matching.

    Returns
    -------
    str
        The label of the first rule in ``_ISSUE_RULES`` for which some word
        starts with one of the rule's stems or equals one of its exact words;
        ``'Other/Uncategorized'`` when nothing matches or ``text`` is not a
        string (e.g. ``None``/NaN).
    """
    if not isinstance(text, str):
        return 'Other/Uncategorized'
    words = [word.strip(_EDGE_PUNCT) for word in text.lower().split()]
    for label, stems, exact_words in _ISSUE_RULES:
        # str.startswith accepts a tuple and is False for an empty tuple
        if any(word.startswith(stems) or word in exact_words for word in words):
            return label
    return 'Other/Uncategorized'

# Applying classification to only negative reviews
negative_reviews['issue_classification'] = negative_reviews['clean_review'].apply(classify_issue)

# Counting the issue types
issue_summary = negative_reviews['issue_classification'].value_counts().reset_index()
issue_summary.columns = ['issue_classification', 'count']
issue_summary

In [None]:
# List of categories to analyze
rating_columns = ['Seat Comfort', 'Cabin Staff Service', 'Food & Beverages', 'Ground Service', 'Value For Money']

# Calculating average ratings
median_ratings = gulf_df[rating_columns].median().sort_values()

# Plotting
plt.figure(figsize=(8, 5))
median_ratings.plot(kind='barh', color='darkorange')

plt.title('Average Rating by Category', fontsize=20, fontweight='bold', color='darkslategray')
plt.xlabel('Average Rating', fontsize=17)
plt.ylabel('')
plt.xlim(0, 10)  # Correct: horizontal chart -> limit x-axis
plt.xticks(rotation=0, fontsize=15)
plt.yticks(fontsize=15)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Filtering negative reviews only
negative_reviews = gulf_df[gulf_df['review_sentiment'] == 'negative']

# Generating trigrams
vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=10)
X2 = vectorizer.fit_transform(negative_reviews['clean_review'])

# Counting trigrams
trigrams = vectorizer.get_feature_names_out()
counts = X2.sum(axis=0).A1
trigram_freq = pd.DataFrame({'Trigram': trigrams, 'Count': counts}).sort_values(by='Count', ascending=False)
trigram_freq.rename(columns={'Trigram': ''}, inplace=True)

# Plotting
plt.figure(figsize=(12, 6))
sns.barplot(data=trigram_freq, x='Count', y='', color='indianred')
plt.title('Most Repeated Complaints are Related to Staff', fontsize=24, fontweight='bold', color='darkred')
plt.xlabel('Count', fontsize=16)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# Ordering the seat types by their median Gulf Air rating (highest first)
seat_order = gulf_df.groupby('Seat Type')['Overall_Rating'].median().sort_values(ascending=False).index

# Plot the sorted boxplot from the Gulf Air rows only, so the boxes show the
# same data the ordering was computed from
plt.figure(figsize=(10, 7))
sns.boxplot(data=gulf_df, x='Overall_Rating', y='Seat Type', palette='viridis', order=seat_order)
plt.title('Business Class Passengers Gave Higher Ratings', fontsize=25, fontweight='bold', color='darkslategray')
plt.xlabel('Average Rating', fontsize=20)
plt.ylabel('Seat Type', fontsize=20)
plt.xticks(fontsize=17)
plt.yticks(fontsize=15)
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Sorting type of traveller by median Gulf Air rating (descending)
traveller_order = gulf_df.groupby('Type Of Traveller')['Overall_Rating'].median().sort_values(ascending=False).index

# Plotting the Gulf Air rows only, so the boxes show the same data the
# ordering was computed from
plt.figure(figsize=(10, 6))
sns.boxplot(data=gulf_df, x='Overall_Rating', y='Type Of Traveller', palette='viridis', order=traveller_order)

plt.title('Solo and Business Travellers Gave Higher Ratings', fontsize=27, fontweight='bold', color='darkslategray')
plt.xlabel('Overall Rating', fontsize=20)
plt.ylabel('Type of Traveller', fontsize=20)
plt.xticks(fontsize=17)
plt.yticks(fontsize=15)
plt.xlim(0, 10)
plt.grid(True, axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

# Done by: Sadia Liaqat