Downloading Libraries

In [None]:
!pip install transformers datasets nltk


Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

Context Mapping

In [16]:
import json
import pandas as pd

# Load JSON file
with open("/content/sarcasm_data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Convert JSON to a structured list
rows = []
for key, value in data.items():
    context_mapping = [
        f"{speaker}: {sentence}" for speaker, sentence in zip(value["context_speakers"], value["context"])
    ]

    rows.append({
        "ID": key,
        "Utterance": value["utterance"],
        "Speaker": value["speaker"],
        "Context": " || ".join(value["context"]),
        "Context_Speakers": " || ".join(value["context_speakers"]),
        "Context_Mapped": " || ".join(context_mapping),
        "Show": value["show"],
        "Sarcasm": int(value["sarcasm"])
    })

# Convert to DataFrame
df = pd.DataFrame(rows)

print("✅ JSON to CSV conversion complete!")


✅ JSON to CSV conversion complete!


Context Summarization

In [18]:
from transformers import pipeline
from datasets import Dataset

# Initialize Summarizer (GPU Optimized)
summarizer = pipeline("summarization", model="knkarthick/MEETING_SUMMARY", device=0)
dataset = Dataset.from_pandas(df)

# Summarization Function
def batch_summarize(batch):
    texts = batch["Context_Mapped"]
    summaries = []

    for text in texts:
        max_len = max(10, int(len(text.split()) * 0.7))  # Ensure max_length < input_length
        summary = summarizer(text, max_length=max_len, min_length=10, do_sample=False)
        summaries.append(summary[0]["summary_text"])

    return {"Context_Summary": summaries}



# Apply Summarization
dataset = dataset.map(batch_summarize, batched=True, batch_size=16)
df = pd.DataFrame(dataset)

print("✅ Summarization complete!")
df.to_csv("sarcasm_data.csv", index=False)


Device set to use cuda:0


Map:   0%|          | 0/690 [00:00<?, ? examples/s]

Your max_length is set to 10, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)
Your max_length is set to 10, but your input_length is only 7. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=3)
Your max_length is set to 10, but your input_length is only 8. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Your max_length is set to 10, but your input_length is only 8. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=4)
Your max_length 

✅ Summarization complete!
