# Sentiment Analysis on Customer Conversations
>Applied NLP models to classify customer survey comments from medical facilities as positive, negative, or neutral—helping surface insights for service quality evaluation and improvement

In [1]:
!pip install "transformers[torch]"




In [2]:
import pandas as pd
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_recall_fscore_support,
)
from sklearn.model_selection import train_test_split
from transformers import pipeline
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def get_clean_hospital_data():
    df = pd.read_csv("../data/hospital.csv")
    # preprocess
    df.drop(columns=["Unnamed: 3"], errors="ignore")
    df.dropna(subset=["Feedback", "Sentiment Label"])

    return df
    
df = get_clean_hospital_data()

In [4]:
def process_baseline():
    sentiment_pipeline = pipeline("sentiment-analysis")
    results = sentiment_pipeline(df["Feedback"].tolist())
    # print(results)

    df["predicted_label"] = [1 if r["label"] == "POSITIVE" else 0 for r in results]
    df["confidence"] = [r["score"] for r in results]

    print("Baseline on pretrained model\n")
    print("Accuracy: ", accuracy_score(df["Sentiment Label"], df["predicted_label"]))
    print(
        "Classification report\n",
        classification_report(df["Sentiment Label"], df["predicted_label"]),
    )
process_baseline()

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


Baseline on pretrained model

Accuracy:  0.8463855421686747
Classification report
               precision    recall  f1-score   support

           0       0.65      0.93      0.76       268
           1       0.97      0.82      0.89       728

    accuracy                           0.85       996
   macro avg       0.81      0.87      0.83       996
weighted avg       0.88      0.85      0.85       996



In [5]:
df = df.rename(columns={"Feedback": "text", "Sentiment Label": "label"})
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels})

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=128)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary"
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [8]:
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████| 796/796 [00:00<00:00, 6921.51 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 10588.74 examples/s]


In [10]:
trainer.train()
eval_results = trainer.evaluate()

print("Evaluation results:", eval_results)

trainer.save_model("./models/hospital-sentiment-model")
tokenizer.save_pretrained("./tokenizers/hospital-sentiment-model")

print("Complete!!!!")

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.277331,0.885,0.976744,0.863014,0.916364
2,No log,0.240162,0.905,0.950355,0.917808,0.933798
3,No log,0.26395,0.91,0.921053,0.958904,0.939597
4,No log,0.257687,0.915,0.944828,0.938356,0.941581


Evaluation results: {'eval_loss': 0.25768688321113586, 'eval_accuracy': 0.915, 'eval_precision': 0.9448275862068966, 'eval_recall': 0.9383561643835616, 'eval_f1': 0.9415807560137457, 'eval_runtime': 1.8984, 'eval_samples_per_second': 105.349, 'eval_steps_per_second': 3.687, 'epoch': 4.0}
Complete!!!!
