In [None]:
import pandas as pd
from pydantic import BaseModel
from transformers import Pipeline, pipeline, set_seed

In [None]:
class Prediction(BaseModel):
    """A single generated sequence, carrying its text in ``generated_text``."""
    generated_text: str


class Predictions(BaseModel):
    """Container holding a list of ``Prediction`` items under ``results``."""
    results: list[Prediction]


class Event(BaseModel):
    """A title paired with a body of text."""
    title: str
    text: str

In [None]:
# Text-generation pipeline backed by the "gpt2" checkpoint.
generator: Pipeline = pipeline("text-generation", model="gpt2")

# Sampling is random; seeding here makes a fresh "Restart & Run All" produce
# the same generations every time. Re-running only a later generation cell
# continues from the current RNG state and will therefore differ.
SEED: int = 42
set_seed(SEED)

In [None]:
# def builder(prompt: str) -> list[Event]:
#     events: list[Event] = []
#
#     titles: Predictions = Predictions.parse_obj({"results": generator(prompt, max_length=10, num_return_sequences=30)})
#
#     for title in titles.results:
#         title_str: str = title.generated_text
#         texts: Predictions = Predictions.parse_obj({"results": generator(title_str, max_length=120, num_return_sequences=10)})
#         for text in texts.results:
#             text_str = text.generated_text
#             event = Event(title=title_str, text=text_str)
#             events.append(event)
#     return events

In [None]:
# events: list[Event] = builder('raccoon horror')
# for event in events:
#     print(
#         event.title + '\n',
#         event.text + '\n',
#         "\n"
#     )

In [None]:
prompts: list[str] = ["raccoon"]

# Generation settings, named so they can be tuned in one place.
MAX_LENGTH: int = 200
NUM_RETURN_SEQUENCES: int = 300

# Gather the generator output for every prompt before parsing, so that
# `samples` covers all prompts (not just the last one) and is still defined
# when `prompts` is empty.
raw_results: list = []
for prompt in prompts:
    raw_results.extend(
        generator(prompt, max_length=MAX_LENGTH, num_return_sequences=NUM_RETURN_SEQUENCES)
    )

samples: Predictions = Predictions.parse_obj({"results": raw_results})
# To eyeball the raw generations, iterate `samples.results` and print each
# item's `generated_text`.

In [None]:
# Show the parsed generations held in `samples`.
samples

In [None]:
import traceback
from typing import Any, Optional
import string


class LineScore(BaseModel):
    """Character-class statistics for a single piece of text.

    Only ``text`` needs to be supplied. Every other field is derived from it
    in ``__init__`` and overwrites any value passed in for that field. Each
    ratio is the number of matching characters divided by ``len(text)``.
    """

    text: str
    # Explicit ``None`` defaults keep the derived fields optional on pydantic
    # v2 as well, where a bare ``Optional[...]`` annotation is a required field.
    length: Optional[int] = None
    # float (0->1) percentage of the string
    lower: Optional[float] = None
    upper: Optional[float] = None
    numeric: Optional[float] = None
    white_space: Optional[float] = None
    punc: Optional[float] = None
    total: Optional[float] = None
    newline: Optional[float] = None

    def __init__(self, **data: Any):
        """Validate the input and compute the derived statistics from ``text``.

        Args:
            **data: Field values for the model; ``text`` is required.

        An empty ``text`` yields ``length == 0`` and ``0.0`` for every ratio
        instead of raising ``ZeroDivisionError``.
        """
        super().__init__(**data)

        content = self.text
        self.length = len(content)
        # Divide by 1 for an empty string: every count is 0, so every ratio is 0.0.
        denominator = self.length or 1

        def ratio(alphabet: str) -> float:
            """Share of characters in ``content`` that appear in ``alphabet``."""
            return sum(1 for ch in content if ch in alphabet) / denominator

        self.lower = ratio(string.ascii_lowercase)
        self.upper = ratio(string.ascii_uppercase)
        self.numeric = ratio(string.digits)
        self.white_space = sum(1 for ch in content if ch.isspace()) / denominator
        self.punc = ratio(string.punctuation)
        self.newline = content.count("\n") / denominator

        # Newline characters are counted in both ``white_space`` and
        # ``newline``, and characters outside the ASCII classes above are not
        # counted at all, so ``total`` is not guaranteed to equal 1.
        self.total = self.lower + self.upper + self.numeric + self.white_space + self.punc + self.newline

In [None]:
import pandas as pd

columns = ["text", "length", "lower", "upper", "numeric", "white_space", "punc", "total", "newline"]

# One dict of LineScore fields per generated sample.
score_dicts = [LineScore(text=event.generated_text).dict() for event in samples.results]

# Column -> list of values. Building it from `columns` (rather than lazily while
# looping over samples) keeps every column present even when there are no
# samples, so the resulting frame always has the expected schema.
data_dict = {column: [score_dict[column] for score_dict in score_dicts] for column in columns}
score_df = pd.DataFrame.from_dict(data_dict)

In [None]:
# Show the per-sample character statistics table.
score_df

# Filter out entries with punctuation above the group median

In [None]:
# filtered_df = score_df[score_df['punc'] <= score_df['punc'].median()]

Excluded due to high occurrence of punctuation

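Filter by quantile: keep only rows at or below the 0.3 quantile for the uppercase, numeric, punctuation, and newline ratios.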

In [None]:
# Column-wise 0.3 quantile of every column except "text".
# Note: despite the variable name, this is a quantile, not a mean.
score_columns_only = score_df.drop(columns=["text"], errors="ignore")
mean_quantile = score_columns_only.quantile(0.3)

In [None]:
# Show the 0.3-quantile threshold computed for each score column.
mean_quantile

In [None]:
# Keep only the rows whose uppercase, digit, punctuation and newline ratios
# are all at or below the matching threshold in `mean_quantile`.
within_thresholds = (
    (score_df["upper"] <= mean_quantile["upper"])
    & (score_df["numeric"] <= mean_quantile["numeric"])
    & (score_df["punc"] <= mean_quantile["punc"])
    & (score_df["newline"] <= mean_quantile["newline"])
)
filtered_df = score_df[within_thresholds]


# filtered_df = filtered_df[score_df.length <= mean_quantile.length]
# filtered_df = filtered_df[filtered_df.lower <= mean_quantile.lower]
# filtered_df = filtered_df[filtered_df.white_space <= mean_quantile.white_space]

In [None]:
# Show the rows that passed the threshold filters.
filtered_df

# Trim to sentences