# PyTorch Demo: Sentiment Analysis and Question Detection with Rivulet Dataset

This demo showcases how to process a dataset of messages to perform **sentiment analysis** and **question detection** using pre-trained transformer models from Hugging Face. It leverages the `transformers` library for inference and **DeltaCat** for efficient dataset management.

**Data Handling with DeltaCat:**
- **Importing Data:** Easily imports data from CSV files into a DeltaCat `Dataset`.
- **PyTorch Integration:** Makes it easy to pass data between the dataset and PyTorch models and transformers.
- **Non-Destructive Transformation:** Transforms the data (e.g., adding sentiment and question classification) without modifying the original dataset.
- **Exporting Data:** Exports the modified dataset to supported formats such as Parquet and JSON for further analysis.

In [None]:
import torch
from typing import List
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import deltacat as dc
import pathlib
import pyarrow as pa
import pyarrow.csv as csv

In [None]:
# Each task uses its own pre-trained Hugging Face checkpoint; the tokenizer and
# the model for a task are always loaded from the same checkpoint name.
sentiment_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
question_checkpoint = "shahrukhx01/question-vs-statement-classifier"

# Sentiment classifier, switched to evaluation mode since it is only used for inference.
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_checkpoint)
sentiment_model.eval()

# Question-vs-statement classifier, likewise in evaluation mode.
question_tokenizer = AutoTokenizer.from_pretrained(question_checkpoint)
question_model = AutoModelForSequenceClassification.from_pretrained(question_checkpoint)
question_model.eval()

In [None]:
def compute_sentiments(batch: pa.RecordBatch) -> List[float]:
    """Score the sentiment of every message in an Arrow record batch.

    Args:
        batch: Record batch exposing a "message" column with the texts to score.

    Returns:
        One float per row, in row order: the softmax probability of class
        index 1 produced by ``sentiment_model``. An empty batch yields ``[]``.
    """
    messages = batch.column("message").to_pylist()
    if not messages:
        # Nothing to classify: avoid calling the tokenizer/model with no inputs.
        return []

    # Pad and truncate so messages of different lengths fit into one tensor batch.
    inputs = sentiment_tokenizer(messages, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        logits = sentiment_model(**inputs).logits

    # Column 1 of the per-row probabilities is reported as the sentiment score.
    # NOTE(review): presumably the POSITIVE label for this checkpoint -- confirm
    # against the model's id2label mapping.
    return torch.softmax(logits, dim=1)[:, 1].tolist()

In [None]:
def compute_is_questions(batch: pa.RecordBatch) -> List[float]:
    """Score how question-like every message in an Arrow record batch is.

    Args:
        batch: Record batch exposing a "message" column with the texts to score.

    Returns:
        One float per row, in row order: the softmax probability of class
        index 1 produced by ``question_model``. An empty batch yields ``[]``.
    """
    messages = batch.column("message").to_pylist()
    if not messages:
        # Nothing to classify: avoid calling the tokenizer/model with no inputs.
        return []

    # Pad and truncate so messages of different lengths fit into one tensor batch.
    inputs = question_tokenizer(messages, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        logits = question_model(**inputs).logits

    # Column 1 of the per-row probabilities is reported as the is_question score.
    # NOTE(review): presumably the "question" label for this checkpoint --
    # confirm against the model's id2label mapping.
    return torch.softmax(logits, dim=1)[:, 1].tolist()

In [None]:
# Build the "chat" rivulet dataset from data.csv in the current working
# directory. The same directory (as a URI) holds the dataset metadata, and
# msg_id is the merge key for the rows.
cwd = pathlib.Path.cwd()
csv_file_path = cwd.joinpath("data.csv")
ds = dc.Dataset.from_csv(
    name="chat",
    file_uri=csv_file_path,
    metadata_uri=cwd.as_uri(),
    merge_keys="msg_id",
)

# Preview the first ten records.
ds.print(num_records=10)

In [None]:
# Register a separate "message_classifier" schema for the model outputs.
# It carries msg_id as its merge key alongside the two float score fields.
classifier_fields = [
    ("msg_id", dc.Datatype.int64()),
    ("sentiment", dc.Datatype.float()),
    ("is_question", dc.Datatype.float()),
]
ds.add_fields(classifier_fields, schema_name="message_classifier", merge_keys=["msg_id"])

In [None]:
# Score every batch of the dataset and write the results through the
# "message_classifier" schema, one record per msg_id.
dataset_writer = ds.writer(schema_name="message_classifier")
for batch in ds.scan().to_arrow():
    msg_ids = batch.column("msg_id").to_pylist()
    if not msg_ids:
        # An empty batch has nothing to classify or write.
        continue

    # compute message data statistics (one score per row, in row order)
    sentiments = compute_sentiments(batch)
    is_questions = compute_is_questions(batch)

    # construct one record per row, carrying the merge key plus the new fields
    dataset_writer.write([
        {"msg_id": msg_id, "sentiment": sentiment, "is_question": is_question}
        for msg_id, sentiment, is_question in zip(msg_ids, sentiments, is_questions)
    ])

dataset_writer.flush()
print("Sentiment and is_question values have been computed and updated in the dataset.")

In [None]:
# Write the dataset out as JSON; supported export formats are JSON, PARQUET and FEATHER.
output_uri = "./output.json"
ds.export(file_uri=output_uri, format="json")