In [13]:
!pip install datasets transformers rouge-score -q
!pip install sentencepiece
!pip install 'accelerate>=0.26.0'





In [14]:
import pandas as pd
from datasets import load_dataset
import evaluate
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq


import matplotlib.pyplot as plt
import os
import csv

import warnings
warnings.filterwarnings("ignore")

In [15]:
dataset = load_dataset("webis/tldr-17", split='train', trust_remote_code=True)

In [16]:
for i, sample in enumerate(dataset):
    print(sample)
    if i >= 4:
        break

{'author': 'raysofdarkmatter', 'body': "I think it should be fixed on either UTC standard or UTC+1 year around, with the current zone offsets.\n\nMoving timescales add a lot of complexity to the implementation of timekeeping systems and have [dubious value]( \n\nI think seasonal shifting time made sense in the pre-electric past, when timekeeping was more flexible and artificial light was inefficient and often dangerous. \n\nNow we have machines that work easily with simple timekeeping rules, and it's more beneficial to spend a small amount on energy for lighting, and save the larger cost of engineering things to work with the complex timekeeping rules, as well as saving the irritation to humans.\n\nLighting has gotten much more efficient over time; we can squeeze out a lot more photons per unit of energy from a 2012 CFL or LED than a candle could in 1780, or a lightbulb could in 1950. \n\nThere's a lot of room for improvement in how we use lights as well; as lighting control gets more 

In [17]:
webis_tldr_subset = dataset.select(range(10000))  # First 10,000 rows
webis_tldr_df = webis_tldr_subset.to_pandas()
webis_tldr_df.head()


Unnamed: 0,author,body,normalizedBody,subreddit,subreddit_id,id,content,summary
0,raysofdarkmatter,I think it should be fixed on either UTC stand...,I think it should be fixed on either UTC stand...,math,t5_2qh0n,c69al3r,I think it should be fixed on either UTC stand...,Shifting seasonal time is no longer worth it.
1,Stork13,Art is about the hardest thing to categorize i...,Art is about the hardest thing to categorize i...,funny,t5_2qh33,c6a9nxd,Art is about the hardest thing to categorize i...,Personal opinions 'n shit.
2,Cloud_dreamer,Ask me what I think about the Wall Street Jour...,Ask me what I think about the Wall Street Jour...,Borderlands,t5_2r8cd,c6acx4l,Ask me what I think about the Wall Street Jour...,insults and slack ass insight. \n Wall Street ...
3,NightlyReaper,"In Mechwarrior Online, I have begun to use a m...","In Mechwarrior Online, I have begun to use a m...",gamingpc,t5_2sq2y,c8onqew,"In Mechwarrior Online, I have begun to use a m...","Yes, Joysticks in modern games have apparently..."
4,NuffZetPand0ra,"You are talking about the Charsi imbue, right?...","You are talking about the Charsi imbue, right?...",Diablo,t5_2qore,c6acxvc,"You are talking about the Charsi imbue, right?...",Class only items dropped from high-lvl monsters.


In [18]:

# webis_tldr_statistical_ml.ipynb

# SECTION 1: Install Dependencies
!pip install datasets scikit-learn rouge-score textblob -q

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from textblob import TextBlob
from rouge_score import rouge_scorer

# SECTION 2: Load Dataset
dataset = load_dataset("webis/tldr-17", split='train', trust_remote_code=True)

# SECTION 3: Preprocessing
# For simplicity, we'll use only the 'selftext' and 'tldr' fields
texts = dataset["normalizedBody"][:5000]  # limit to a subset to speed up processing
summaries = dataset["summary"][:5000]

# Feature extraction
def extract_readability(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity, len(text.split())  # Sentiment and word count

def preprocess(texts):
    # Extract features (sentiment, length, etc.)
    sentiments, lengths = zip(*[extract_readability(text) for text in texts])
    return sentiments, lengths

# Apply feature extraction
sentiments, lengths = preprocess(texts)

# Convert texts to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = vectorizer.fit_transform(texts)

# Combine features
import numpy as np
X = np.hstack([X_tfidf.toarray(), np.array(sentiments).reshape(-1, 1), np.array(lengths).reshape(-1, 1)])

# Target variable (TL;DR length, or you can use a regression target like semantic similarity)
y = [len(summary.split()) for summary in summaries]  # Example target: TL;DR length

# SECTION 4: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SECTION 5: Model Training (Random Forest Regressor)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# SECTION 6: Evaluation
y_pred = model.predict(X_test)

# Print the Mean Squared Error of predictions
print(f"Mean Squared Error: {mean_squared_error(y_test, y_pred)}")

# OPTIONAL: ROUGE Evaluation (if you have generated summaries)
def compute_rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = [scorer.score(ref, pred) for ref, pred in zip(references, predictions)]
    return scores

# Example of comparing a sample prediction to the human-written TL;DR
sample_prediction = ["This is a sample TL;DR summary"]  # Example prediction
sample_reference = ["This is a real TL;DR"]  # Example reference
rouge_scores = compute_rouge(sample_prediction, sample_reference)
print(rouge_scores)


Mean Squared Error: 549.3044686000001
[{'rouge1': Score(precision=0.7142857142857143, recall=0.8333333333333334, fmeasure=0.7692307692307692), 'rouge2': Score(precision=0.5, recall=0.6, fmeasure=0.5454545454545454), 'rougeL': Score(precision=0.7142857142857143, recall=0.8333333333333334, fmeasure=0.7692307692307692)}]
