In [9]:
from summary_eval.data import summary_df, prompts_df
from summary_eval.testing import cross_validate
from summary_eval.settings import TRAIN_SIZE
from tqdm import tqdm
# Register tqdm's pandas integration so the progress_apply / progress_map
# calls used by later cells are available.
tqdm.pandas()
import nltk
# Download the NLTK resources this notebook asks for; the recorded output shows
# all three were already up to date on the authoring machine.
# NOTE(review): 'punkt' is presumably what word_tokenize / sent_tokenize rely on;
# 'maxent_ne_chunker' and 'words' are not referenced by name in the cells shown -
# confirm they are still required.
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\theaw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\theaw\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\theaw\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [10]:
# Attach the prompt columns to every summary by joining on prompt_id.
# NOTE(review): merge() returns a frame with its own fresh index, so join_df
# rows should not be assumed to share index labels (or, on older pandas, row
# order) with summary_df - verify before assigning join_df results back.
join_df = summary_df.merge(prompts_df, on="prompt_id")

# Features

In [11]:
# Column names of the engineered features, grouped by the notebook section
# that computes them. FEATURES is the full model input, in this order.

# Token / sentence counts.
BASIC_FEATURES = ["num_words", "num_sentences", "words_per_sentence"]

# Per-category LanguageTool match counts, normalised by text length.
LT_FEATURES = [
    f"count_{category}_norm"
    for category in (
        "TYPOS",
        "REDUNDANCY",
        "PUNCTUATION",
        "TYPOGRAPHY",
        "STYLE",
        "GRAMMAR",
        "CASING",
        "CONFUSED_WORDS",
    )
]

# Features derived from double-quoted spans in the summary.
QUOTE_FEATURES = [
    "quoteCount",
    "avgQuoteLength",
    "propTextInQuotes",
    "propWordsInQuotes",
    "propQuotationsInPrompt",
]

# Part-of-speech proportions and sentiment scores.
TEXTBLOB_FEATURES = [
    "propSentencesStartedWithConjunctions",
    "propConjunctions",
    "propPronouns",
    "propAdverbs",
    "propVerbs",
    "propNouns",
    "propAdjectives",
    "propAdjectivesRepeated2pTextNormalised",
    "propAdjectivesRepeated2p",
    "subjectivity",
    "polarity",
]

# Syllable-based features.
TEXTSTAT_FEATURES = ["syllables_per_word"]

FEATURES = [
    *BASIC_FEATURES,
    *LT_FEATURES,
    *QUOTE_FEATURES,
    *TEXTBLOB_FEATURES,
    *TEXTSTAT_FEATURES,
]


## NLTK - Basic Features

In [12]:
import nltk

In [13]:
# Token and sentence counts from NLTK's tokenizers, plus their ratio.
summary_df["num_words"] = summary_df["text"].map(lambda text: len(nltk.word_tokenize(text)))
summary_df["num_sentences"] = summary_df["text"].map(lambda text: len(nltk.sent_tokenize(text)))
summary_df["words_per_sentence"] = summary_df["num_words"].div(summary_df["num_sentences"])

## Language Tool
Spelling- and grammar-based features: counts of LanguageTool matches per mistake category.

In [14]:
import language_tool_python
# Single 'en-US' LanguageTool checker, created once here and reused by
# mistake_category_counts for every summary.
tool = language_tool_python.LanguageTool('en-US')

In [15]:
# LanguageTool match categories that are counted per summary; the order here
# fixes the order of the counts returned by mistake_category_counts.
mistake_categories = [
    "TYPOS",
    "REDUNDANCY",
    "PUNCTUATION",
    "TYPOGRAPHY",
    "STYLE",
    "GRAMMAR",
    "CASING",
    "CONFUSED_WORDS",
]

def pre_process_whitespace(text: str) -> str:
    """Collapse every run of whitespace in ``text`` to a single space.

    Splitting on whitespace and re-joining also drops any leading or
    trailing whitespace, so the result never starts or ends with a space.
    """
    tokens = text.split()
    return " ".join(tokens)

def count_dict_field(matches: dict, field: str) -> dict:
    """Tally how often each value of attribute ``field`` occurs in ``matches``.

    ``matches`` is iterated and ``field`` is read from every item with
    ``getattr``. The returned dict maps each distinct attribute value to its
    number of occurrences, in order of first appearance.
    """
    tally = {}
    for item in matches:
        value = getattr(item, field)
        tally[value] = tally.get(value, 0) + 1
    return tally

def mistake_category_counts(text):
    """Count LanguageTool matches in ``text`` for each tracked category.

    The text is whitespace-normalised before being checked. Returns a list of
    counts aligned with ``mistake_categories``; categories with no match
    contribute 0.
    """
    cleaned_text = pre_process_whitespace(text)
    per_category = count_dict_field(tool.check(cleaned_text), 'category')
    counts = []
    for category in mistake_categories:
        counts.append(per_category.get(category, 0))
    return counts

def grammar_mistakes(df):
    """Return a copy of ``df`` with one ``count_<CATEGORY>`` column per category.

    Every value of ``df['text']`` is run through ``mistake_category_counts``
    (with a tqdm progress bar) and the resulting per-row count lists are
    spread into columns named after ``mistake_categories``. The input frame
    is not modified.

    An empty frame is handled: the count columns are still created, just
    with no rows, instead of failing while indexing a transposed empty list.
    """
    new_df = df.copy()
    # One list of counts per row, ordered like mistake_categories.
    per_row_counts = list(new_df['text'].progress_map(mistake_category_counts))
    for position, category in enumerate(mistake_categories):
        new_df[f"count_{category}"] = [counts[position] for counts in per_row_counts]
    return new_df

# Rebind summary_df to the copy returned by grammar_mistakes, which carries the
# extra count_<CATEGORY> columns. This is the slow step of the notebook: the
# recorded run took 07:46 for 7165 summaries.
summary_df = grammar_mistakes(summary_df)

100%|██████████| 7165/7165 [07:46<00:00, 15.35it/s]


In [26]:
# Character length of each summary; used to normalise the mistake counts.
summary_df["text_len"] = summary_df["text"].map(len)

In [27]:
# Express every mistake count as mistakes per character of summary text.
for category in mistake_categories:
    raw_column = f"count_{category}"
    summary_df[f"{raw_column}_norm"] = summary_df[raw_column].div(summary_df["text_len"])

## Quotation Features

In [16]:
import re
from typing import List

In [17]:
def split_quotations(text: str) -> (str, List[str]):
    """Separate double-quoted spans from the rest of ``text``.

    Returns ``(remainder, quoted_parts)``: ``quoted_parts`` holds the contents
    of every ``"..."`` span in order of appearance (without the quote marks),
    and ``remainder`` is ``text`` after each quoted span, quote marks
    included, has been removed by successive string replacement.
    """
    quoted_parts = re.findall('"([^"]*)"', text)
    remainder = text
    for part in quoted_parts:
        # str.replace removes every occurrence of this exact quoted span.
        remainder = remainder.replace('"' + part + '"', "")
    return remainder, quoted_parts

In [18]:
def count_quotations(text: str) -> int:
    """Return how many double-quoted ``"..."`` spans occur in ``text``."""
    return sum(1 for _ in re.finditer('"([^"]*)"', text))

def avg_quote_length(text: str) -> float:
    """Return the mean character length of the double-quoted spans in ``text``.

    Lengths exclude the surrounding quote marks. Returns ``0.0`` when the text
    contains no quoted span, so the result is always a float.
    """
    # Only the quoted contents are needed here, so match them directly rather
    # than also building the quote-free remainder of the text.
    quotations = re.findall('"([^"]*)"', text)
    if not quotations:
        return 0.0
    return sum(len(quotation) for quotation in quotations) / len(quotations)

def prop_text_in_quotes(text: str) -> float:
    """Return the fraction of ``text``'s characters that sit inside quotations.

    The numerator is the number of characters removed by ``split_quotations``
    (quoted spans including their quote marks); the denominator is the full
    text length. An empty ``text`` yields ``0.0`` instead of dividing by zero.
    """
    if not text:
        return 0.0
    no_quote_text, _ = split_quotations(text)
    return (len(text) - len(no_quote_text)) / len(text)

def prop_words_in_quotes(text: str, num_words: int) -> float:
    """Return the fraction of ``text``'s tokens that sit inside quotations.

    ``num_words`` is the caller-supplied token count of the full text. The
    quoted spans are removed with ``split_quotations``, the remainder is
    tokenised with ``nltk.word_tokenize``, and the difference is divided by
    ``num_words``. When ``num_words`` is 0 the result is ``0.0`` instead of a
    division by zero.
    """
    if not num_words:
        return 0.0
    no_quote_text, _ = split_quotations(text)
    words_outside_quotes = len(nltk.word_tokenize(no_quote_text))
    return (num_words - words_outside_quotes) / num_words

def get_alpha(s: str) -> str:
    """Return ``s`` with everything except letters and whitespace removed."""
    kept = []
    for ch in s:
        if ch.isalpha() or ch.isspace():
            kept.append(ch)
    return ''.join(kept)

def process_str(s: str) -> str:
    """Normalise ``s`` for text comparison.

    Keeps only letters and whitespace, lowercases the result, then collapses
    whitespace runs to single spaces.
    """
    lowered_letters = get_alpha(s).lower()
    return pre_process_whitespace(lowered_letters)

def prop_quotations_in_prompt(prompt: str, summary: str):
    """Return the share of the summary's quotations that appear in the prompt.

    The distinct double-quoted spans of ``summary`` are normalised with
    ``process_str`` and each one is looked up as a substring of the equally
    normalised ``prompt``. The result is the matched fraction of the distinct
    normalised quotations, or ``None`` when ``summary`` has no quotation.
    """
    raw_quotes = set(re.findall('"([^"]*)"', summary))
    if not raw_quotes:
        return None

    normalised_prompt = process_str(prompt)
    # Normalising may merge quotations that differ only in case/punctuation.
    normalised_quotes = {process_str(quote) for quote in raw_quotes}

    found = sum(1 for quote in normalised_quotes if quote in normalised_prompt)
    return found / len(normalised_quotes)

summary_df["quoteCount"] = summary_df["text"].apply(count_quotations)
summary_df["avgQuoteLength"] = summary_df["text"].apply(avg_quote_length)
summary_df["propTextInQuotes"] = summary_df["text"].apply(prop_text_in_quotes)
summary_df["propWordsInQuotes"] = summary_df.apply(lambda row: prop_words_in_quotes(row["text"], row["num_words"]), axis=1)

# propQuotationsInPrompt needs the prompt text, which only join_df carries.
# Computing the feature on join_df and assigning it to summary_df would pair
# rows purely by index label, but the merge result has its own fresh index (and
# on older pandas an inner merge may also reorder rows), so values could land on
# the wrong summary. Look the prompt text up by prompt_id instead and compute
# the feature directly on summary_df's own rows.
prompt_text_by_id = join_df.drop_duplicates("prompt_id").set_index("prompt_id")["prompt_text"]


def _prop_quotations_for_row(row):
    """Compute propQuotationsInPrompt for one summary_df row (None if its prompt is unknown)."""
    prompt_text = prompt_text_by_id.get(row["prompt_id"])
    if prompt_text is None:
        return None
    return prop_quotations_in_prompt(prompt_text, row["text"])


summary_df["propQuotationsInPrompt"] = summary_df.progress_apply(_prop_quotations_for_row, axis=1)

100%|██████████| 7165/7165 [00:00<00:00, 7927.61it/s] 


## TextBlob - POS Features

In [19]:
from textblob import TextBlob
from typing import Optional

In [20]:
def get_polarity(text: str) -> float:
    """Return the sentiment polarity TextBlob reports for ``text``."""
    return TextBlob(text).sentiment.polarity

def get_subjectivity(text: str) -> float:
    """Return the sentiment subjectivity TextBlob reports for ``text``."""
    return TextBlob(text).sentiment.subjectivity

def textblob_pos_filter(text: str, pos_tag: str) -> List[str]:
    """Return the words of ``text`` whose TextBlob POS tag starts with ``pos_tag``.

    Matching is by prefix, so a ``pos_tag`` of ``"NN"`` also selects words
    tagged with any longer tag beginning with ``"NN"``.
    """
    selected = []
    for token, tag in TextBlob(text).tags:
        if tag.startswith(pos_tag):
            selected.append(token)
    return selected

def count_duplicates(words: List[str]) -> dict:
    """Map each distinct word in ``words`` to the number of times it occurs.

    Words are compared exactly (case-sensitive); keys keep first-seen order.
    """
    occurrences = {}
    for word in words:
        occurrences[word] = occurrences.get(word, 0) + 1
    return occurrences

def prop_adjectives_repeated(text: str) -> Optional[float]:
    """Return repeated adjectives as a share of all adjective tokens.

    The numerator is the number of distinct adjectives (tag prefix ``"JJ"``)
    that occur at least twice; the denominator is the total number of
    adjective tokens. Returns ``None`` when ``text`` has no adjectives.
    """
    adjectives = textblob_pos_filter(text, "JJ")
    if not adjectives:
        return None
    occurrences = count_duplicates(adjectives)
    repeated = sum(1 for times_seen in occurrences.values() if times_seen >= 2)
    return repeated / len(adjectives)

def prop_adjectives_repeated_text_normalised(text: str) -> Optional[float]:
    """Return the count of repeated adjectives divided by the text length.

    The numerator is the number of distinct adjectives (tag prefix ``"JJ"``)
    that occur at least twice; the denominator is ``len(text)`` in characters.
    Returns ``None`` when ``text`` has no adjectives.
    """
    adjectives = textblob_pos_filter(text, "JJ")
    if not adjectives:
        return None
    occurrences = count_duplicates(adjectives)
    repeated = sum(1 for times_seen in occurrences.values() if times_seen >= 2)
    return repeated / len(text)

def prop_pos_words(text: str, num_words: int, pos: str) -> float:
    """Return the share of ``text``'s words whose POS tag starts with ``pos``.

    ``num_words`` is the caller-supplied token count used as the denominator.
    When it is 0 the result is ``0.0`` rather than a division by zero (and the
    text is not tagged at all).
    """
    if not num_words:
        return 0.0
    words = textblob_pos_filter(text, pos)
    return len(words) / num_words

def prop_sentences_started_with_conjunctions(text: str) -> float:
    """Return the fraction of sentences in ``text`` that open with a conjunction.

    Sentences come from ``nltk.sent_tokenize``; a sentence counts when its
    first whitespace-separated token, lowercased, is one of
    and/but/or/yet/so/for/nor. Empty or whitespace-only text yields ``0.0``
    instead of dividing by zero.
    """
    if not text.strip():
        return 0.0
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return 0.0
    conjunctions = {"and", "but", "or", "yet", "so", "for", "nor"}
    count = 0
    for sentence in sentences:
        words = sentence.split()
        # Guard against a sentence with no tokens before reading the first one.
        if words and words[0].lower() in conjunctions:
            count += 1
    return count / len(sentences)

summary_df["propSentencesStartedWithConjunctions"] = summary_df["text"].progress_apply(prop_sentences_started_with_conjunctions)

# Proportion-of-words features: output column -> POS tag prefix to count.
pos_prefix_by_column = {
    "propConjunctions": "CC",
    "propPronouns": "PRP",
    "propAdverbs": "RB",
    "propVerbs": "VB",
    "propNouns": "NN",
    "propAdjectives": "JJ",
}
for column, pos_prefix in pos_prefix_by_column.items():
    summary_df[column] = summary_df.progress_apply(
        # Bind the prefix as a default so each lambda keeps its own value.
        lambda row, pos_prefix=pos_prefix: prop_pos_words(row["text"], row["num_words"], pos_prefix),
        axis=1,
    )

# Features computed from the text alone: output column -> function.
text_feature_by_column = {
    "propAdjectivesRepeated2pTextNormalised": prop_adjectives_repeated_text_normalised,
    "propAdjectivesRepeated2p": prop_adjectives_repeated,
    "subjectivity": get_subjectivity,
    "polarity": get_polarity,
}
for column, feature_function in text_feature_by_column.items():
    summary_df[column] = summary_df["text"].progress_apply(feature_function)

100%|██████████| 7165/7165 [00:00<00:00, 12311.30it/s]
100%|██████████| 7165/7165 [00:33<00:00, 210.74it/s]
100%|██████████| 7165/7165 [00:34<00:00, 209.55it/s]
100%|██████████| 7165/7165 [00:34<00:00, 209.54it/s]
100%|██████████| 7165/7165 [00:34<00:00, 206.60it/s]
100%|██████████| 7165/7165 [00:34<00:00, 208.64it/s]
100%|██████████| 7165/7165 [00:34<00:00, 206.86it/s]
100%|██████████| 7165/7165 [00:33<00:00, 211.19it/s]
100%|██████████| 7165/7165 [00:34<00:00, 210.28it/s]
100%|██████████| 7165/7165 [00:02<00:00, 2484.42it/s]
100%|██████████| 7165/7165 [00:02<00:00, 2557.15it/s]


## TextStat

In [21]:
import textstat

In [22]:
# Syllable count per summary and the average number of syllables per token.
summary_df["num_syllables"] = summary_df["text"].map(textstat.syllable_count)
summary_df["syllables_per_word"] = summary_df["num_syllables"].div(summary_df["num_words"])

# Evaluation

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

In [28]:
# Model inputs are the engineered feature columns; targets are the two scores.
target_columns = ["content", "wording"]
X = summary_df[FEATURES]
y = summary_df[target_columns]

In [29]:
# Fixed-seed split; the trailing tuple displays the size of each part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=TRAIN_SIZE,
    random_state=42,
)
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(5732, 1433, 5732, 1433)

In [32]:
# One gradient-boosting regressor per target column, scored on the training
# split with the project's cross_validate helper.
base_regressor = HistGradientBoostingRegressor(random_state=0)
model = MultiOutputRegressor(base_regressor)
results_df = cross_validate(model, X_train, y_train)
results_df

2024-03-15 16:32:45,104 - INFO - Using 10x10 cross validation


  0%|          | 0/100 [00:00<?, ?it/s]

Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.47739,0.654725,0.566057,0.358751,0.500674,0.429713,0.789046,0.603164,0.696105
stdev,0.018187,0.026518,0.022352,0.010665,0.017992,0.014329,0.017464,0.031755,0.02461
n_trials,100.0,100.0,2.0,100.0,100.0,2.0,100.0,100.0,2.0


### Save feature_df to csv to avoid re-running the above code

In [34]:
# Write summary_df, including every feature column computed above, to CSV so
# the features can be loaded later without re-running this notebook.
# index=False leaves the DataFrame index out of the file.
summary_df.to_csv("../data/feature_df.csv", index=False)