# Persona-Based Text Rewriter

This notebook implements a retrieval-based rewriter that can transform core facts into different personas based on the personas.csv dataset.

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the personas dataset
df = pd.read_csv('personas.csv')
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
df.head()

Dataset shape: (20, 6)
Columns: ['Core Fact', 'Shakespearean Bard', 'Gen-Z TikToker', 'Corporate Executive', 'Kind Grandma', 'Sci-Fi Robot Historian (Year 3080)']

First few rows:


Unnamed: 0,Core Fact,Shakespearean Bard,Gen-Z TikToker,Corporate Executive,Kind Grandma,Sci-Fi Robot Historian (Year 3080)
0,"The Eiffel Tower is located in Paris, France.","Lo, the iron maiden doth pierce the skies of f...","Yo, the Eiffel Tower‚Äôs vibin‚Äô right in Paris üá´...","The Eiffel Tower, an established landmark, res...","Sweetheart, that tall, twinkling tower you see...",Geo-structure Eiffel-Twr-75 located at Paris C...
1,Water boils at 100 degrees Celsius at sea level.,"At sea‚Äôs gentle breast, the water doth boil wh...","No cap, water hits boil at 100¬∞C when you chil...",It is a standard physical fact that water reac...,"Honey, when you‚Äôre at the beach or close to th...",Phase Change Event: H‚ÇÇO transitions to vapor a...
2,"The next solar eclipse will occur on April 8, ...","Mark thy calendars, for the sun shall be shrou...",Heads up! Solar eclipse dropping on 4/8/24 üåíüëÄ ...,Please be advised that the forthcoming solar e...,"Remember dear, the sky‚Äôs gonna get dark on Apr...",Celestial Event Solar-Eclipse-2024 set for Ter...
3,You can reset your password using the ‚ÄúForgot ...,"Shouldst thou forget thy secret key, seek the ...","Bruh, just hit that ‚ÄúForgot Password‚Äù link if ...",Password recovery can be initiated by selectin...,"If you ever forget your password, dear, just c...",User Authentication Reset Procedure: Activate ...
4,The company‚Äôs quarterly earnings report will b...,Hear ye! The scroll of quarterly gain shall be...,"Heads up, the company‚Äôs earnings drop on Augus...",The quarterly financial disclosure is schedule...,They‚Äôre gonna tell us how the company did on A...,Fiscal Data Release Q3-Report scheduled for Te...


In [3]:
# Display available personas
personas = df.columns[1:].tolist()  # Exclude 'Core Fact' column
print("Available personas:")
for i, persona in enumerate(personas, 1):
    print(f"{i}. {persona}")

# Show sample core facts
print("\nSample core facts:")
for i, fact in enumerate(df['Core Fact'].head(5), 1):
    print(f"{i}. {fact}")

Available personas:
1. Shakespearean Bard
2. Gen-Z TikToker
3. Corporate Executive
4. Kind Grandma
5. Sci-Fi Robot Historian (Year 3080)

Sample core facts:
1. The Eiffel Tower is located in Paris, France.
2. Water boils at 100 degrees Celsius at sea level.
3. The next solar eclipse will occur on April 8, 2024.
4. You can reset your password using the ‚ÄúForgot Password‚Äù link.
5. The company‚Äôs quarterly earnings report will be released on August 1st.


In [4]:
# Create TF-IDF vectorizer for similarity matching
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=1000
)

# Fit on core facts
core_facts_vectors = vectorizer.fit_transform(df['Core Fact'])
print(f"TF-IDF matrix shape: {core_facts_vectors.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

TF-IDF matrix shape: (20, 185)
Vocabulary size: 185


In [5]:
class PersonaRewriter:
    """
    A retrieval-based rewriter that transforms core facts into different personas.
    """
    
    def __init__(self, df: pd.DataFrame, vectorizer: TfidfVectorizer, core_facts_vectors):
        self.df = df
        self.vectorizer = vectorizer
        self.core_facts_vectors = core_facts_vectors
        self.personas = df.columns[1:].tolist()
        
    def find_most_similar_fact(self, query: str, top_k: int = 3) -> List[Tuple[int, float]]:
        """
        Find the most similar core facts to the given query.
        """
        # Vectorize the query
        query_vector = self.vectorizer.transform([query])
        
        # Calculate similarities
        similarities = cosine_similarity(query_vector, self.core_facts_vectors).flatten()
        
        # Get top-k most similar facts
        top_indices = np.argsort(similarities)[::-1][:top_k]
        
        return [(idx, similarities[idx]) for idx in top_indices]
    
    def rewrite_fact(self, core_fact: str, target_persona: str) -> str:
        """
        Rewrite a core fact using the specified persona.
        """
        if target_persona not in self.personas:
            raise ValueError(f"Persona '{target_persona}' not found. Available personas: {self.personas}")
        
        # Find most similar fact in our dataset
        similar_facts = self.find_most_similar_fact(core_fact, top_k=1)
        
        if not similar_facts:
            return f"No similar fact found for: {core_fact}"
        
        best_match_idx, similarity_score = similar_facts[0]
        
        # Get the persona version of the most similar fact
        persona_version = self.df.iloc[best_match_idx][target_persona]
        
        return persona_version, similarity_score, self.df.iloc[best_match_idx]['Core Fact']
    
    def rewrite_fact_all_personas(self, core_fact: str) -> Dict[str, str]:
        """
        Rewrite a core fact using all available personas.
        """
        results = {}
        
        for persona in self.personas:
            try:
                persona_version, similarity, original_fact = self.rewrite_fact(core_fact, persona)
                results[persona] = {
                    'rewritten': persona_version,
                    'similarity': similarity,
                    'original_match': original_fact
                }
            except Exception as e:
                results[persona] = {
                    'rewritten': f"Error: {str(e)}",
                    'similarity': 0.0,
                    'original_match': ""
                }
        
        return results
    
    def get_persona_style_guide(self) -> Dict[str, str]:
        """
        Get a brief description of each persona's style.
        """
        style_guides = {
            'Shakespearean Bard': 'Uses archaic English, thee/thou, -eth endings, poetic language',
            'Gen-Z TikToker': 'Uses modern slang, emojis, hashtags, casual tone',
            'Corporate Executive': 'Formal, professional language, business terminology',
            'Kind Grandma': 'Warm, caring tone, uses terms of endearment, simple explanations',
            'Sci-Fi Robot Historian (Year 3080)': 'Technical, futuristic terminology, formal reporting style'
        }
        return style_guides

In [6]:
# Initialize the rewriter
rewriter = PersonaRewriter(df, vectorizer, core_facts_vectors)

# Display persona style guides
print("Persona Style Guides:")
print("=" * 50)
for persona, style in rewriter.get_persona_style_guide().items():
    print(f"{persona}:\n{style}\n")

Persona Style Guides:
Shakespearean Bard:
Uses archaic English, thee/thou, -eth endings, poetic language

Gen-Z TikToker:
Uses modern slang, emojis, hashtags, casual tone

Corporate Executive:
Formal, professional language, business terminology

Kind Grandma:
Warm, caring tone, uses terms of endearment, simple explanations

Sci-Fi Robot Historian (Year 3080):
Technical, futuristic terminology, formal reporting style



In [7]:
# Test the rewriter with a sample fact
test_fact = "The weather is sunny today."
print(f"Original fact: {test_fact}")
print("=" * 50)

results = rewriter.rewrite_fact_all_personas(test_fact)

for persona, result in results.items():
    print(f"\n{persona}:")
    print(f"Rewritten: {result['rewritten']}")
    print(f"Similarity: {result['similarity']:.3f}")
    print(f"Matched with: {result['original_match']}")

Original fact: The weather is sunny today.

Shakespearean Bard:
Rewritten: Prepare thyself, for on the morrow heavy rains shall fall.
Similarity: 0.302
Matched with: The weather forecast predicts heavy rain tomorrow.

Gen-Z TikToker:
Rewritten: Weather‚Äôs saying it‚Äôs gonna pour tomorrow ‚Äî bring that drip üåßÔ∏è‚òî
Similarity: 0.302
Matched with: The weather forecast predicts heavy rain tomorrow.

Corporate Executive:
Rewritten: Looks like a rainy day tomorrow, dear ‚Äî don‚Äôt forget your coat!
Similarity: 0.302
Matched with: The weather forecast predicts heavy rain tomorrow.

Kind Grandma:
Rewritten: Meteorological Data: Precipitation levels expected to rise significantly on 2025-07-28.
Similarity: 0.302
Matched with: The weather forecast predicts heavy rain tomorrow.

Sci-Fi Robot Historian (Year 3080):
Rewritten: nan
Similarity: 0.302
Matched with: The weather forecast predicts heavy rain tomorrow.


In [8]:
# Demo with multiple test cases
test_cases = [
    "The temperature is 25 degrees Celsius.",
    "The meeting starts at 3 PM.",
    "The restaurant is closed on Sundays.",
    "The movie was released in 2023.",
    "The library has over 10,000 books."
]

print("Demo: Testing multiple facts with all personas")
print("=" * 60)

for i, test_fact in enumerate(test_cases, 1):
    print(f"\nTest Case {i}: {test_fact}")
    print("-" * 40)
    
    results = rewriter.rewrite_fact_all_personas(test_fact)
    
    for persona, result in results.items():
        print(f"{persona}: {result['rewritten']}")
        print(f"  (Similarity: {result['similarity']:.3f})")

Demo: Testing multiple facts with all personas

Test Case 1: The temperature is 25 degrees Celsius.
----------------------------------------
Shakespearean Bard: At sea‚Äôs gentle breast, the water doth boil when the thermometer greets the hundredth degree.
  (Similarity: 0.480)
Gen-Z TikToker: No cap, water hits boil at 100¬∞C when you chillin‚Äô at sea level üî•üíß
  (Similarity: 0.480)
Corporate Executive: It is a standard physical fact that water reaches its boiling point at 100 degrees Celsius under sea level conditions.
  (Similarity: 0.480)
Kind Grandma: Honey, when you‚Äôre at the beach or close to the sea, water boils at 100 degrees Celsius ‚Äî just like when I make tea!
  (Similarity: 0.480)
Sci-Fi Robot Historian (Year 3080): Phase Change Event: H‚ÇÇO transitions to vapor at 100¬∞C (373.15 K) @1 atm pressure, Sea Level Reference.
  (Similarity: 0.480)

Test Case 2: The meeting starts at 3 PM.
----------------------------------------
Shakespearean Bard: The parcel hath reach

In [None]:
# Interactive function to rewrite facts
def interactive_rewrite():
    """
    Interactive function to test the rewriter with user input.
    """
    print("Persona-Based Text Rewriter")
    print("=" * 30)
    
    while True:
        print("\nAvailable personas:")
        for i, persona in enumerate(rewriter.personas, 1):
            print(f"{i}. {persona}")
        
        print("\nOptions:")
        print("1. Rewrite with specific persona")
        print("2. Rewrite with all personas")
        print("3. Exit")
        
        choice = input("\nEnter your choice (1-3): ").strip()
        
        if choice == '3':
            print("Goodbye!")
            break
        
        fact = input("\nEnter the fact to rewrite: ").strip()
        
        if choice == '1':
            persona_idx = input("Enter persona number: ").strip()
            try:
                persona_idx = int(persona_idx) - 1
                if 0 <= persona_idx < len(rewriter.personas):
                    persona = rewriter.personas[persona_idx]
                    rewritten, similarity, original = rewriter.rewrite_fact(fact, persona)
                    
                    print(f"\nOriginal: {fact}")
                    print(f"Persona: {persona}")
                    print(f"Rewritten: {rewritten}")
                    print(f"Similarity: {similarity:.3f}")
                    print(f"Matched with: {original}")
                else:
                    print("Invalid persona number!")
            except ValueError:
                print("Please enter a valid number!")
        
        elif choice == '2':
            results = rewriter.rewrite_fact_all_personas(fact)
            print(f"\nOriginal: {fact}")
            print("=" * 50)
            
            for persona, result in results.items():
                print(f"\n{persona}:")
                print(f"Rewritten: {result['rewritten']}")
                print(f"Similarity: {result['similarity']:.3f}")
        
        else:
            print("Invalid choice! Please enter 1, 2, or 3.")

# Uncomment the line below to run the interactive mode
interactive_rewrite()

Persona-Based Text Rewriter

Available personas:
1. Shakespearean Bard
2. Gen-Z TikToker
3. Corporate Executive
4. Kind Grandma
5. Sci-Fi Robot Historian (Year 3080)

Options:
1. Rewrite with specific persona
2. Rewrite with all personas
3. Exit


In [1]:
# Analysis: Show similarity distribution
print("Similarity Analysis")
print("=" * 30)

# Test with all core facts in the dataset
similarities = []
for fact in df['Core Fact']:
    results = rewriter.rewrite_fact_all_personas(fact)
    for persona, result in results.items():
        similarities.append(result['similarity'])

print(f"Average similarity: {np.mean(similarities):.3f}")
print(f"Median similarity: {np.median(similarities):.3f}")
print(f"Min similarity: {np.min(similarities):.3f}")
print(f"Max similarity: {np.max(similarities):.3f}")

# Show facts with highest and lowest similarities
print("\nFacts with highest similarity matches:")
for fact in df['Core Fact']:
    results = rewriter.rewrite_fact_all_personas(fact)
    max_sim = max(result['similarity'] for result in results.values())
    if max_sim > 0.8:
        print(f"- {fact} (max similarity: {max_sim:.3f})")

Similarity Analysis


NameError: name 'df' is not defined