In [17]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


# Load Data Set

In [18]:
import re

import pandas as pd
import torch
from scipy.special import softmax
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# ----------------------------
# 1. Load Excel datasets
# ----------------------------
# Source workbooks, one per airline (absolute paths on the author's machine).
ba_path = r"C:\Users\Samiksha\Downloads\BA_updated.xlsx"
emirates_path = r"C:\Users\Samiksha\Downloads\Emirates_updated.xlsx"

# Read both workbooks in order; each one lands in its own DataFrame.
ba_df, emirates_df = map(pd.read_excel, (ba_path, emirates_path))

# Report (rows, columns) for each frame as a quick sanity check.
print("British Airways shape:", ba_df.shape)
print("Emirates shape:", emirates_df.shape)

British Airways shape: (3218, 29)
Emirates shape: (1475, 29)


# torch / torchvision / torchaudio → PyTorch’s deep learning framework plus its vision and audio toolkits.
# transformers → Hugging Face’s library for using and fine-tuning state-of-the-art pretrained AI models for text, images, and audio.

In [19]:
!pip install torch torchvision torchaudio



In [20]:
!pip install transformers



#  Loads a pretrained Twitter-specific RoBERTa sentiment model and tokenizer, ready to process text and output sentiment probabilities.

In [21]:
# Checkpoint id passed to both `from_pretrained` calls so the tokenizer and the
# classifier come from the same pretrained model.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer, model = (
    AutoTokenizer.from_pretrained(MODEL),
    AutoModelForSequenceClassification.from_pretrained(MODEL),
)

# Put the network in evaluation mode. As the cell's last expression this also
# displays the model's layer summary.
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [22]:
def clean_review_text(text):
    """Normalise a raw review into lowercase, single-spaced plain text.

    Parameters
    ----------
    text : Any
        Raw review value. Anything ``pd.isna`` reports as missing yields an
        empty string; every other value is converted with ``str()`` first.

    Returns
    -------
    str
        The lowercased text with newlines turned into spaces, ``http...``
        links removed, every character other than ``a-z``, ``0-9``,
        whitespace and ``. , ! ? ' "`` removed, runs of whitespace collapsed
        to one space, and leading/trailing whitespace stripped.
    """
    if pd.isna(text):
        return ""

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'\n+', ' '),                          # newlines -> single space
        (r'http\S+', ''),                       # drop links
        (r'[^a-z0-9\s\.\,\!\?\'\"]+', ''),      # keep only basic chars
        (r'\s+', ' '),                          # collapse whitespace runs
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

### Text Cleaning and Sentiment Scoring with RoBERTa

In [23]:
def _empty_sentiment(label):
    """Build the placeholder result row used when no scores are available."""
    return pd.Series({
        'sentiment_label': label,
        'sentiment_confidence': None,
        'score_neg': None,
        'score_neu': None,
        'score_pos': None
    })


def get_sentiment_roberta(text):
    """Score one piece of text with the notebook's RoBERTa sentiment model.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects and on
    ``softmax`` being in scope.

    Parameters
    ----------
    text : str
        Text to classify (normally the cleaned review). Anything that is not
        a string (``None``, NaN, numbers) or that is empty/whitespace-only is
        reported as ``"missing"`` without running the model.

    Returns
    -------
    pd.Series
        Keys ``sentiment_label``, ``sentiment_confidence``, ``score_neg``,
        ``score_neu`` and ``score_pos``. The label is ``"negative"``,
        ``"neutral"`` or ``"positive"`` when scoring succeeds, ``"missing"``
        when there is no text, and ``"error"`` when scoring raised; in the
        last two cases all numeric fields are ``None``.
    """
    # Guard on type as well as emptiness: a NaN float is truthy, so the old
    # `not text or text.strip() == ""` check raised AttributeError on it and
    # aborted the whole apply.
    if not isinstance(text, str) or text.strip() == "":
        return _empty_sentiment("missing")

    try:
        # Inputs longer than 512 tokens are truncated rather than rejected.
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        with torch.no_grad():
            output = model(**encoded_input)

        # Single input, so row 0 of the logits holds its class scores.
        scores = softmax(output.logits.numpy()[0])
        # NOTE(review): assumes the checkpoint's output indices are ordered
        # negative / neutral / positive - confirm against the model card.
        labels = ['negative', 'neutral', 'positive']

        return pd.Series({
            'sentiment_label': labels[scores.argmax()],
            'sentiment_confidence': float(scores.max()),
            'score_neg': float(scores[0]),
            'score_neu': float(scores[1]),
            'score_pos': float(scores[2])
        })
    except Exception as e:
        # Best-effort: report the failing row and carry on with the rest.
        print(f"Error processing: {text[:100]}... | {e}")
        return _empty_sentiment("error")

In [24]:
# Register `progress_apply` on pandas objects so long runs show a progress bar.
tqdm.pandas()

ba_df['review_text_clean'] = ba_df['review_text'].progress_apply(clean_review_text)

# One row of sentiment columns per review (slow: the model runs once per row).
sentiments = ba_df['review_text_clean'].progress_apply(get_sentiment_roberta)

# Merge results back. Sentiment columns left over from an earlier run of this
# cell are dropped first; otherwise re-running would append duplicate column
# names and `ba_df['sentiment_label']` would no longer be a single column.
ba_df = pd.concat(
    [ba_df.drop(columns=sentiments.columns, errors="ignore"), sentiments],
    axis=1,
)

100%|██████████| 3218/3218 [00:00<00:00, 10143.57it/s]
100%|██████████| 3218/3218 [17:14<00:00,  3.11it/s]


In [30]:
# Register `progress_apply` on pandas objects so long runs show a progress bar.
tqdm.pandas()

# Clean review text
emirates_df['review_text_clean'] = emirates_df['review_text'].progress_apply(clean_review_text)

# Run sentiment scoring (slow: the model runs once per row)
sentiments_em = emirates_df['review_text_clean'].progress_apply(get_sentiment_roberta)

# Merge results back. Sentiment columns left over from an earlier run of this
# cell are dropped first; otherwise re-running would append duplicate column
# names and `emirates_df['sentiment_label']` would no longer be a single column.
emirates_df = pd.concat(
    [emirates_df.drop(columns=sentiments_em.columns, errors="ignore"), sentiments_em],
    axis=1,
)

100%|██████████| 1475/1475 [00:00<00:00, 8558.17it/s]
100%|██████████| 1475/1475 [07:31<00:00,  3.27it/s]


In [31]:
# Count Emirates rows whose label is a placeholder ("missing" / "error")
# rather than a real sentiment class.
missing_count = (
    emirates_df['sentiment_label']
    .isin(["missing", "error"])
    .sum()
)
print(f"Missing/error sentiments: {missing_count} out of {len(emirates_df)}")

Missing/error sentiments: 0 out of 1475


In [32]:
# Export both scored frames as CSV, leaving the DataFrame index out of the file.
ba_df.to_csv(path_or_buf=r"C:\Users\Samiksha\Downloads\2BA_with_sentiment.csv", index=False)
emirates_df.to_csv(path_or_buf=r"C:\Users\Samiksha\Downloads\2Em_with_sentiment.csv", index=False)

In [11]:
# Save the updated dataframes.
# Each frame goes to the file named after its own airline: previously the two
# frames were swapped, so the "ba1" file held Emirates data and the "em1" file
# held British Airways data.
ba_df.to_csv(r"C:\Users\Samiksha\Documents\Dissertataion python\Anu_ba1_Updated_with_sentiment.csv", index=False)
emirates_df.to_csv(r"C:\Users\Samiksha\Documents\Dissertataion python\Anu_em1_updated_with_sentiment.csv", index=False)

print("\n Sentiment scores added and files saved.")


 Sentiment scores added and files saved.
