In [None]:
from pydantic import BaseModel
from transformers import Pipeline, pipeline

In [None]:
class Prediction(BaseModel):
    """One generated sequence, as produced by the text-generation pipeline."""

    # Full text of the sequence as returned by the pipeline.
    generated_text: str


class Predictions(BaseModel):
    """All sequences returned by a single call to the text-generation pipeline."""

    # One entry per returned sequence.
    results: list[Prediction]


class Event(BaseModel):
    """A title/text pair.

    NOTE(review): not referenced by any other cell visible in this notebook -
    confirm whether it is still needed.
    """

    title: str
    text: str

In [None]:
# Shared text-generation pipeline for the "gpt2" model. PromptProcessor.execute
# reads this module-level name directly, so this cell must run before it.
generator: Pipeline = pipeline("text-generation", model="gpt2")

In [None]:
from math import floor


class PromptProcessor:
    """Generate continuations for prompts over a series of growing length budgets.

    The attributes are class-level defaults and may be overridden per instance:

    * ``epochs`` - number of generation rounds run for every prompt.
    * ``max_length`` - number of tokens, on top of the prompt, allowed in the
      final round; earlier rounds get a proportional share of it.
    * ``num_responses`` - number of sequences requested from the pipeline in
      every round.
    """

    epochs: int = 10
    max_length: int = 50
    num_responses: int = 10

    def execute(self, prompts: list[str]) -> list[Predictions]:
        """Run every prompt through the module-level ``generator`` pipeline.

        Round ``i`` (1-based) of a prompt may generate
        ``floor(i / epochs * max_length)`` tokens (at least one) beyond the
        prompt. Progress is printed once per round.

        Args:
            prompts: Prompt strings to continue.

        Returns:
            One ``Predictions`` object per (prompt, round) pair, ordered by
            prompt and then by round.
        """
        results: list[Predictions] = []

        for prompt in prompts:
            # The pipeline's ``max_length`` is measured in tokens and includes
            # the prompt, so the prompt has to be measured in tokens as well.
            # Using ``len(prompt)`` (characters) would inflate the budget by an
            # amount that depends on how the prompt happens to be spelled.
            prompt_tokens: int = len(generator.tokenizer.encode(prompt))

            for i in range(self.epochs):
                # Clamp to one token so a round can never be asked to generate
                # nothing (possible when ``epochs`` exceeds ``max_length``).
                new_tokens: int = max(1, floor(((i + 1) / self.epochs) * self.max_length))
                max_length: int = new_tokens + prompt_tokens

                print(f"epoch: {i+1}, max length: {max_length}")
                predictions_dict = generator(prompt, max_length=max_length, num_return_sequences=self.num_responses)
                predictions: Predictions = Predictions.parse_obj({"results": predictions_dict})
                results.append(predictions)
        return results

In [None]:
# Alternative prompt set, kept for reference:
# prompts: list[str] = ["A robot", "A lone wanderer", "A crashing plane", "A swarm of spiders"]
prompts: list[str] = ["A safe haven", "A promising", "With joy in their hearts"]
# One Predictions object is produced per prompt and epoch.
samples: list[Predictions] = PromptProcessor().execute(prompts=prompts)

In [None]:
# Display the raw generation results.
samples

In [None]:
import json

# Persist the generated samples as JSON under a single "samples" key.
with open(file="samples.json", mode="w", encoding="utf-8") as file:
    # Convert each Predictions model to a plain dict so json can encode it.
    items = [element.dict() for element in samples]
    json.dump({"samples": items}, file)

In [None]:
import traceback
from typing import Any, Optional
import string


class LineScore(BaseModel):
    """Character-class breakdown of one piece of text.

    Only ``text`` has to be supplied. Every other field is derived from it in
    ``__init__`` and overwrites whatever value the caller passed for it.

    Notes:
        * A newline satisfies ``str.isspace``, so it is counted in both
          ``white_space`` and ``newline``; ``total`` therefore counts newlines
          twice.
        * ``lower``, ``upper`` and ``numeric`` only recognise ASCII characters.
        * For an empty ``text`` every ratio (and ``total``) is ``0.0``.
    """

    text: str
    # Explicit ``None`` defaults keep the derived fields optional on pydantic
    # releases where ``Optional[...]`` alone does not imply a default.
    # Number of characters in ``text``.
    length: Optional[int] = None
    # float (0->1) percentage of the string
    lower: Optional[float] = None
    upper: Optional[float] = None
    numeric: Optional[float] = None
    white_space: Optional[float] = None
    punc: Optional[float] = None
    # Sum of the six ratios above and below (see the class notes).
    total: Optional[float] = None
    newline: Optional[float] = None

    def __init__(self, **data: Any):
        """Validate the input and populate every derived statistic.

        Args:
            **data: Field values; only ``text`` is required.
        """
        super().__init__(**data)

        text = self.text
        length = len(text)
        self.length = length
        # An empty string has no characters to classify. Dividing by 1 reports
        # every ratio as 0.0 instead of raising ZeroDivisionError.
        denominator = length or 1

        def share_in(alphabet: str) -> float:
            """Fraction of characters of ``text`` that occur in ``alphabet``."""
            return sum(1 for char in text if char in alphabet) / denominator

        def share_where(matches) -> float:
            """Fraction of characters of ``text`` for which ``matches`` is true."""
            return sum(1 for char in text if matches(char)) / denominator

        self.lower = share_in(string.ascii_lowercase)
        self.upper = share_in(string.ascii_uppercase)
        self.numeric = share_in(string.digits)
        self.white_space = share_where(str.isspace)
        self.punc = share_in(string.punctuation)
        self.newline = share_where(lambda char: char == "\n")

        # All operands were assigned as floats just above, so this sum cannot
        # fail; the operand order is kept so results stay bit-identical.
        self.total = self.lower + self.upper + self.numeric + self.white_space + self.punc + self.newline

In [None]:
import pandas as pd

# Column order of the resulting score table.
columns = ["text", "length", "lower", "upper", "numeric", "white_space", "punc", "total", "newline"]

# Accumulate the scores column-wise: one list per column, one entry per
# generated text, in generation order.
data_dict = {}
for epoch in samples:
    for event in epoch.results:
        score_dict = LineScore(text=event.generated_text).dict()
        for column in columns:
            data_dict.setdefault(column, []).append(score_dict[column])

score_df = pd.DataFrame.from_dict(data_dict)

In [None]:
# Display the per-text score table.
score_df

Excluded due to high occurrence of punctuation

The exclusion thresholds are set by quantile: the 0.3 quantile of each score column is used as the cut-off.

In [None]:
# Despite the name, this is the 0.3 quantile (not a mean) of every column
# except "text"; the result is indexed by column name.
mean_quantile = score_df.loc[:, ~score_df.columns.isin(["text"])].quantile(0.3)

In [None]:
# Display the per-column thresholds.
mean_quantile

In [None]:
# Keep only the rows whose share of upper-case, numeric, punctuation and
# newline characters is at or below the matching threshold in mean_quantile.
# The filters are applied one after another, each narrowing the previous
# result. The length, lower and white_space columns are currently not used
# as filters.
filtered_df = score_df
for column in ("upper", "numeric", "punc", "newline"):
    filtered_df = filtered_df[filtered_df[column] <= mean_quantile[column]]

In [None]:
# Display the rows that passed every filter.
filtered_df

# Trim to sentences

In [None]:
# Write the surviving rows to "filtered.csv" in the working directory.
filtered_df.to_csv("filtered.csv")