In [1]:

!pip install -q -U accelerate --no-index --find-links ../input/llm-detect-pip/
!pip install -q -U bitsandbytes --no-index --find-links ../input/llm-detect-pip/
!pip install -q -U transformers --no-index --find-links ../input/llm-detect-pip/
!pip install -q -U --no-index --find-links /kaggle/input/llm-detect-pip peft
!pip install -q /kaggle/input/spacy-offline/en_core_web_lg-3.7.1-py3-none-any.whl
!pip install -q /kaggle/input/sentence-transformers-whl/sentence_transformers-2.5.1-py3-none-any.whl


# imports / misc.

In [2]:
import torch
import random
import numpy as np
import pandas as pd
import gc
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig
from accelerate import Accelerator

from peft import AutoPeftModelForCausalLM, PeftModel
import numpy as np 
import pandas as pd
import torch
import spacy

from sentence_transformers import SentenceTransformer
#https://github.com/Lightning-AI/lit-gpt/issues/327
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)


# Load up Mistral 7B!

In [3]:

model_name = '/kaggle/input/mistral-7b-it-v02'
adapter_model_name ="/kaggle/input/lora-sft-mistral7b-it/checkpoint-7101"
tokenizer = AutoTokenizer.from_pretrained(model_name) 

bnb_config = BitsAndBytesConfig(  
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.float16,
    bnb_4bit_use_double_quant=True,
)
config = AutoConfig.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    config=config,
    torch_dtype=torch.float16,
)

model = PeftModel.from_pretrained(model, adapter_model_name)
print(model)
model.eval()
accelerator = Accelerator()
model = accelerator.prepare(model)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(
                in_features=4096, out_features=1024, bias=False
  

# Define Components for Mistral

In [4]:
TAKE_SENTENCES = 1

In [5]:
instruct_prompt = """
An original text and its rewritten text are given. The original text was rewritten to the rewritten text using a rewrite prompt to Gemma 7b.
Your task is to recover the prompt used for rewriting the original text to the rewritten text.
original text: {original_text}
rewritten text: {rewritten_text}
"""
baseline_prompt = "Please improve this text using the writing style with maintaining the original meaning but altering the tone, the focus, the word choice, the expression, the theme or the voice."

In [6]:
# credit: https://www.kaggle.com/code/richolson/mistral-7b-prompt-recovery?scriptVersionId=166883120
def trim_to_first_x_sentences_or_lf(text, x):
    if x <= 0:
        return "" 

    #any double-spaces dealt with as linefeed
    text = text.replace("  ", "\n")

    text_chunks = text.split('\n', 1)
    first_chunk = text_chunks[0]
    sentences = first_chunk.split('.')

    if len(sentences) - 1 <= x: 
        trimmed_text = first_chunk
    else:
        # Otherwise, return the first x sentences
        trimmed_text = '.'.join(sentences[:x]).strip()

    if not trimmed_text.endswith('.'):
        trimmed_text += '.'  # Add back the final period if the text chunk ended with one and was trimmed

    return trimmed_text




VERBS = [
    "Refine", "Rewrite", "Improve", "Enhance", "Transform", "Convert", "Rephrase", "Revise", "Amend", "Alter",
    "Adjust", "Modify", "Change", "Redraft", "Reform", "Restyle", "Rephrase", "Reconstruct", "Reorganize",
    "Rebuild", "Remodel", "Remake", "Recreate", "Reinvent", "Reimagine", "Reinterpret", "Recontextualize",
    "Simplify", "Clarify", "Elaborate", "Expand", "Develop", "Enhance", "Augment", "Amplify", "Intensify",
    "Strengthen", "Deepen", "Broaden", "Heighten", "Enrich", "Embellish", "Emphasize", "Accentuate", "Highlight",
    "Explain", "Extend", "Enlarge", "Lengthen", "Stretch", "Widen", "Broaden", "Increase", "Grow", "Augment",
    "Summarize", "Condense", "Shorten", "Translate", "Turn", "Imagine", "Frame", "Describe", "Write",
    "Reveal", "Express", "Interpret", "Reframe", "Infuse", "Retell", "Reconstruct", "Recreate", "Reimagine",
    "Reword", "Make", "Revamp", "Paraphrase", "Rephrase", "Reformulate", "Reorganize", "Restructure", "Present",
    "Substitute", "Replace", "Reorder", "Rearrange", "Reposition", "Realign", "Reassign", "Reallocate", "Reassign",
    "Reconfigure", "Exaggerate", "Dramatize", "Envision", "Revert", "Inject", "Add", "Remove", "Delete", "Omit",
    "Annotate", "Apply", "Append", "Attach", "Combine", "Connect", "Consolidate", "Embed", "Incorporate", "Integrate",
    "Shift", "Reframe", "Refinish", "Change", "Recast", "Expand", "Extend", "Highlight", "Narrate", "Please", "Do", "Erase",
    "Innovate", "Invent", "Revolutionize", "Redesign", "Reinforce", "Optimize", "Polish", "Refactor", "Revise",
    "Streamline", "Overhaul", "Tune", "Customize", "Synthesize", "Visualize", "Sketch", "Draft", "Frame", "Illustrate",
    "Conceptualize", "Blueprint", "Upgrade", "Elevate", "Maximize", "Prune", "Minimize", "Distill", "Reengineer",
    "Regenerate", "Rejuvenate", "Recalibrate", "Rearticulate", "Reexpress", "Reprioritize", "Recommit", "Reinitialize",
    "Supercharge", "Empower", "Amplify", "Intensify", "Redefine", "Transform", "Embody", "Craft", "Blend", "Harmonize",
    "Adapt", "Localize", "Reposition", "Converge", "Unify", "Distinguish", "Separate", "Reintegrate", "Embed", "Foster"
]




def post_process_rewrite_prompt(prompt, original_text, rewritten_text):

    if not prompt:
        return ""

    prompt = trim_to_response(prompt)
    prompt = remove_numbered_list(prompt)

    prompt = prompt.replace("**", "")
    prompt = trim_to_first_x_sentences_or_lf(prompt, TAKE_SENTENCES)

    
    # VERB2
    for verb in VERBS:
        split_prompt = prompt.split()
        if verb in split_prompt:
            verb_index = split_prompt.index(verb)
            prompt = " ".join(split_prompt[verb_index:])
            break

    # VERB1
    split_words = prompt.split()
    if len(split_words) > 0 and split_words[0] not in VERBS:
        return get_baseline_prompt(original_text, rewritten_text) if not ADAPTIVE else ""
    

    #default to baseline if empty or unusually short
    if len(prompt) < 15:
        prompt = ""
    return prompt

In [7]:
#credit: https://www.kaggle.com/code/richolson/mistral-7b-prompt-recovery?scriptVersionId=166883120
def remove_numbered_list(text):
    final_text_paragraphs = [] 
    for line in text.split('\n'):
        # Split each line at the first occurrence of '. '
        parts = line.split('. ', 1)
        # If the line looks like a numbered list item, remove the numbering
        if len(parts) > 1 and parts[0].isdigit():
            final_text_paragraphs.append(parts[1])
        else:
            # If it doesn't look like a numbered list item, include the line as is
            final_text_paragraphs.append(line)

    return '  '.join(final_text_paragraphs)

# credit:https://www.kaggle.com/code/richolson/mistral-7b-prompt-recovery?scriptVersionId=166883120
#trims LLM output to just the response
def trim_to_response(text):
    terminate_string = "[/INST]"
    text = text.replace('</s>', '')
    #just in case it puts things in quotes
    text = text.replace('"', '')
    text = text.replace("'", '')

    last_pos = text.rfind(terminate_string)
    return text[last_pos + len(terminate_string):] if last_pos != -1 else text



def get_prompt(original_text, rewritten_text, tokenizer, model):

    messages = [
        {"role": "user", "content": instruct_prompt.format(original_text=original_text, rewritten_text=rewritten_text)},

    ]

    model_inputs = tokenizer.apply_chat_template(messages, return_tensors="pt")
    model_inputs = model_inputs.to("cuda")
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=model_inputs, max_new_tokens=50,
            do_sample=False, pad_token_id=tokenizer.eos_token_id,
        )

    decoded = tokenizer.batch_decode(generated_ids)
    final_text = post_process_rewrite_prompt(decoded[0], original_text, rewritten_text)

    return final_text


# Mean Prompt Customization  

In [8]:


nlp = spacy.load("en_core_web_lg")

for s in nlp.vocab.vectors:
    _ = nlp.vocab[s]
def clean_text(text):
    text = text.replace('\\n\\n', " ")
    text = text.replace('\\n', " ")
    # convert more than two consecutive spaces to one
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

spacy_vocab = set([word for word in nlp.vocab if word.has_vector and word.is_alpha and not word.is_stop and len(word.text) > 1])
spacy_vocab_text = np.array([word.text for word in spacy_vocab])
vocab_vectors = np.array([word.vector for word in spacy_vocab])
vocab_vectors = vocab_vectors / np.linalg.norm(vocab_vectors, axis=1, keepdims=True)

base_rewrite_prompt = "Please improve this text using the writing style with maintaining the original meaning but altering the tone, the focus, the word choice, the expression, the theme or the voice."
base_rewrite_prompt_format = "Please {verb2} this text {verb1} {noun1} with maintaining the original {intact_tone} but altering {noun2}."

encoder = SentenceTransformer("/kaggle/input/sentence-t5-xl/sentence-t5-xl")
encoder = encoder.to("cuda")

TONE_LIST = [
    "engaging", "informative", "persuasive", "entertaining", "inspiring", "motivating", "educational", "thought-provoking",
    "thoughtful", "insightful", "compelling", "captivating", "interesting", "fascinating", "intriguing", "stimulating",
    "provocative", "challenging", "controversial", "original", "creative", "innovative", "imaginative", "visionary",
    "bold", "daring", "courageous", "adventurous", "ambitious", "optimistic", "hopeful", "uplifting", "inspiring",
    "encouraging", "positive", "constructive", "supportive", "affirmative", "reassuring", "comforting", "soothing",
    "calming", "relaxing", "peaceful", "tranquil", "serene", "gentle", "mild", "soft", "quiet", "subtle", "moderate",
    "temperate", "mellow", "laid-back", "easygoing", "casual", "informal", "relaxed", "unhurried", "leisurely",
    "easy", "comfortable", "cozy", "homely", "familiar", "friendly", "welcoming", "inviting", "hospitable", "cordial",
    "warm", "affectionate", "tender", "loving", "caring", "kind", "compassionate", "sympathetic", "empathetic",
    "understanding", "tolerant", "patient", "forgiving", "accepting", "open-minded", "broad-minded", "liberated",
    "free", "independent", "autonomous", "self-reliant", "self-sufficient", "self-governing", "self-determining",
    "self-regulating", "self-sustaining", "self-supporting", "self-satisfied", "self-contented", "self-confident",
    "horrible", "terrible", "awful", "dreadful", "appalling", "frightening", "scary", "horrifying", "shocking",
    "alarming", "disturbing", "upsetting", "unsettling", "troubling", "worrying", "concerning", "distressing",
    "depressing", "disheartening", "discouraging", "dispiriting", "demoralizing", "dismaying", "disappointing",
    "humorous", "funny", "amusing", "comical", "hilarious", "laughable", "ludicrous", "ridiculous",
    "absurd", "preposterous", "nonsensical", "silly", "foolish", "stupid", "idiotic", "moronic", "imbecilic", "poetic",
    "lyrical", "rhythmic", "musical", "harmonious", "melodious", "tuneful", "euphonious", "mellifluous", "symphonic",
    "analytical", "critical", "sincere", "earnest", "dynamic", "reflective", "authoritative", "formal", "scholarly",
    "scientific", "technical", "pragmatic", "factual", "methodical", "logical", "pensive", "witty", "satirical",
    "ironical", "sarcastic", "whimsical", "dreamy", "romantic", "passionate", "enthusiastic", "vibrant", "vivid",
    "graphic", "detailed", "minimalist", "sparse", "cryptic", "esoteric", "mystical", "philosophical", "nostalgic",
    "sentimental", "bittersweet", "melancholic", "solemn", "grave", "stark", "blunt", "direct", "candid", "outspoken",
    "confrontational", "aggressive", "forceful", "spirited", "robust", "strident",
    "exhaustive", "explanatory", "introspective", "journalistic", "provocative", "pioneering", "quirky",
    "stream-of-consciousness", "surreal", "unbiased", "detailed", "objective", "subjective", "instructional",
    "descriptive", "conversational", "academic", "colloquial", "experimental", "intuitive", "speculative",
    "predictive", "analytic", "historical", "futuristic", "mythical", "allegorical", "satirical", "parodic",
    "elegiac", "epic", "tragic", "comic", "dramatic", "didactic", "ironic", "metaphoric", "symbolic",
    "narrative-driven", "expository", "argumentative", "rhetorical question-based", "alliterative",
    "anecdotal", "case study-oriented", "cause and effect", "compare and contrast", "hypothetical",
    "informational", "problem-solution", "sequential", "time lapse", "vignette-style", "flashback",
    "streamlined", "concise", "elaborate", "minimalist", "ornate", "technological", "scientifically detailed",
    "speculative fiction", "cyberpunk", "steampunk", "high fantasy", "magical realism", "realist", "naturalist",
    "modernist", "postmodernist", "neoclassical", "baroque-inspired", "gothic", "renaissance-inspired",
    "satire", "pastoral", "urban", "rural", "realistic", "surrealistic", "abstract", "expressionistic", "eloquent",
    "epistolary", "dialogic", "epigrammatic", "aphoristic", "stream of consciousness", "frame story", "allegorical",
    "cinematic", "non-linear", "pictorial", "monologue", "parable", "fable", "paradoxical", "dialectical", "diary-like",
    "documentary-style", "metafiction", "biographical", "autobiographical", "memoir-style", "fairy tale",
    "mythological", "chivalric", "utopian", "dystopian", "apocalyptic", "post-apocalyptic",
    "experimental", "episodic", "vlog", "podcast", "interview", "fantastical", "relatable", "narrative", "imagery",
    "candid", "incisive", "satirical", "resolute", "playful", "informal", "traditional", "mysterious", "mystical",
    "personified", "celebratory", "neutral",
]

TONE_SUFFIX = " writing style and tone"
TONE_PREFIX = "This text has the "
TONE_LIST = list(set([f"{TONE_PREFIX}{tone}{TONE_SUFFIX}" for tone in TONE_LIST]))


def get_similar_words_from_spacy_vocab(words: list[str], top_n=1, pos_list=None):
    word_vector = np.array([nlp(str(word)).vector for word in words]).mean(axis=0)
    word_vector = word_vector / np.linalg.norm(word_vector)
    similarity = np.dot(vocab_vectors, word_vector)
    top_indices = np.argsort(similarity)
    top_n_words = spacy_vocab_text[top_indices[-top_n:]]
    top_n_words = [
        word.lemma_.lower() for word in nlp(" ".join(top_n_words))
        if (word.lemma_.lower() not in words) and (pos_list is None or word.pos_ in pos_list)
    ]
    return top_n_words

@torch.inference_mode()
def get_sim(tones, original_text, rewritten_text, top_n):
    tones = np.array(tones)
    words_embedding = encoder.encode(tones, convert_to_tensor=True, normalize_embeddings=True)

    ori_text_embedding = encoder.encode(original_text, convert_to_tensor=True, normalize_embeddings=False)
    rew_text_embedding = encoder.encode(rewritten_text, convert_to_tensor=True, normalize_embeddings=False)
    text_embedding = rew_text_embedding - ori_text_embedding
    text_embedding = torch.nn.functional.normalize(text_embedding, p=2, dim=-1)
    sim_vectors = text_embedding @ words_embedding.T
    sorted_indices = torch.argsort(sim_vectors, descending=True).cpu()
    return tones[sorted_indices[:top_n]], sim_vectors[sorted_indices[:top_n]].cpu().numpy()

def take_similar_words(words, original_text, rewritten_text, pos_list=None, top_n=3):

    spacy_words = get_similar_words_from_spacy_vocab(list(words), top_n=10, pos_list=pos_list)
    if len(spacy_words) == 0:
        return []
    words = list(set(list(spacy_words) + list(words)))
    words, _ = get_sim(words, original_text, rewritten_text, top_n=top_n)

    if isinstance(words, str):
        words = [words]
    return words

def generate_adaptive_mean_prompt(original_text, rewritten_text):

    original_text = clean_text(original_text)
    rewritten_text = clean_text(rewritten_text)
    ori_doc = nlp(original_text.lower())
    rew_doc = nlp(rewritten_text.lower())

    ori_words = set([token.lemma_.lower().replace("'s", "").replace('"', '') for token in ori_doc if token.pos_ in ["NOUN", "ADJ"] if token.pos_ != "PROPN" and token.text_with_ws.strip()])
    ori_verbs = set([token.lemma_.lower() for token in ori_doc if token.pos_ == "VERB" if token.pos_ != "PROPN" and token.text_with_ws.strip()])
    rew_words = set([token.lemma_.lower().replace("'s", "").replace('"', '') for token in rew_doc if token.pos_ in ["NOUN", "ADJ"] if token.pos_ != "PROPN" and token.text_with_ws.strip()])
    rew_verbs = set([token.lemma_.lower() for token in rew_doc if token.pos_ == "VERB" if token.pos_ != "PROPN" and token.text_with_ws.strip()])

    rew_diff_words = list(rew_words - ori_words)
    rew_diff_verbs = list(rew_verbs - ori_verbs)

    rewriting_tone_verb = "using"
    rewriting_tone_word = "the writing style"
    rewritten_tone_word = "the tone"
    intact_tone = "meaning"


    rew_diff_words = list(rew_diff_verbs) + list(rew_diff_words)
    if len(rew_diff_words) > 3:
        print(len(TONE_LIST))
        #改
        top_tones, top_tones_sims = get_sim(TONE_LIST, original_text, rewritten_text, top_n=2)
        top_tones = list(map(lambda x: x.replace(TONE_PREFIX, "").replace(TONE_SUFFIX, ""), top_tones))

        rewriting_tone_word = " and ".join(top_tones) + " writing style"
        #改
        verbs = take_similar_words(list(rew_diff_words), original_text, rewritten_text, pos_list=["VERB"], top_n=2)
        if len(verbs) > 0:
            verbs = [verb[:-1] + "ing" if verb.endswith("e") else verb + "ing" for verb in verbs]
            rewriting_tone_verb = " and ".join(verbs)

    new_base_rewrite_prompt = base_rewrite_prompt_format.format(
        noun1=rewriting_tone_word, verb1=rewriting_tone_verb,
        verb2="improve", noun2=rewritten_tone_word, intact_tone=intact_tone
    )
    return new_base_rewrite_prompt


In [9]:

def get_baseline_prompt(original_text, rewritten_text):
    mean_prompt = generate_adaptive_mean_prompt(original_text, rewritten_text)
    return mean_prompt


# Evaluate test data

In [10]:
import re
test_df = pd.read_csv("/kaggle/input/llm-prompt-recovery/test.csv")
from tqdm import tqdm  # 引入 tqdm 庫

magic = " 'it 's ' something Think A Human Plucrarealucrarealucrarealucrarealucrarealucrarealucrarealucrarea"
# 確保 `original_text` 和 `rewritten_text` 的空值處理
test_df['original_text'] = test_df['original_text'].fillna('')
test_df['rewritten_text'] = test_df['rewritten_text'].fillna('')

# 使用 tqdm 包裝迴圈以顯示進度條
for index, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Processing rows"):
    if len(row['original_text']) == 0 or len(row['rewritten_text']) == 0:
        result = get_baseline_prompt(row['original_text'], row['rewritten_text'])
        result += magic
    else:
        try:
            result = get_prompt(row['original_text'], row['rewritten_text'], tokenizer, model)
            result += get_baseline_prompt(row['original_text'], row['rewritten_text'])
            result += magic
        except Exception as e:
            print(e)
            result = get_baseline_prompt(row['original_text'], row['rewritten_text'])
            result += magic
    test_df.at[index, 'rewrite_prompt'] = result

# 保留需要的欄位
test_df = test_df[['id', 'rewrite_prompt']]



Processing rows:   0%|          | 0/1 [00:00<?, ?it/s]2024-12-14 16:57:02.258620: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-14 16:57:02.258756: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-14 16:57:02.393795: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


325


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing rows: 100%|██████████| 1/1 [00:19<00:00, 19.56s/it]


# Submit!

In [11]:
test_df.to_csv('submission.csv', index=False)