In [None]:
%%writefile .env 
OPENAI_ENDPOINT=https://<YOUR-ENDPOINT>.openai.azure.com/
OPENAI_API_KEY=<YOUR-API-KEY>
OPENAI_COMPLETION_MODEL=<YOUR-COMPLETION-MODEL-NAME>
OPENAI_EMBEDDING_MODEL=<YOUR-EMBEDDING-MODEL-NAME>


In [None]:
import dotenv
dotenv.load_dotenv()
import pandas as pd 
import numpy as np
import re
import time
import os 

In [None]:
# Data read
def read_df(path):
    """Load the review CSV at ``path`` and keep only the columns used downstream.

    Args:
        path: Location of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        A new DataFrame holding the product name and the review fields listed
        in ``cols_kept``, plus an ``ml_id`` column that mirrors the row index.

    Raises:
        KeyError: if any of the expected columns is missing from the file.
    """
    cols_kept = [
        'name',
        'reviews.doRecommend','reviews.numHelpful','reviews.rating','reviews.text',
        'reviews.title',
    ]
    # Take an explicit copy of the column subset: adding a column to the bare
    # selection triggers pandas' SettingWithCopyWarning.
    df = pd.read_csv(path)[cols_kept].copy()
    df['ml_id'] = df.index
    return df


In [None]:
# Data cleanup
def normalize_text(s: str, sep_token = " \n "):
    """Flatten a review into a single, tidied line of text.

    The substitutions run in a fixed order: newlines become spaces, commas are
    removed, runs of dots collapse to one dot, repeated ". " sequences collapse
    to a single ". ", and finally every whitespace run becomes one space. The
    result is stripped of leading/trailing whitespace.

    Args:
        s: Raw text to normalise.
        sep_token: Accepted for interface compatibility; not used by the body.

    Returns:
        The normalised string.
    """
    # (pattern, replacement) pairs; order matters because later rules rely on
    # the output of earlier ones.
    substitutions = (
        (r"\n", " "),
        (r",", ""),
        (r'\.+', '.'),
        (r'(\. )+', '. '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        s = re.sub(pattern, replacement, s)
    return s.strip()

def clean_up_text(review_title:pd.Series, review_text:pd.Series)->pd.Series:
    """Combine each review's title and body and normalise the result.

    Missing titles or bodies are treated as empty strings. Without that, the
    concatenation yields NaN for the row and ``normalize_text`` fails on the
    float value.

    Args:
        review_title: Review titles, aligned by index with ``review_text``.
        review_text: Review bodies.

    Returns:
        A Series of cleaned "title body" strings with the same index.
    """
    title = review_title.fillna('').astype(str)
    body = review_text.fillna('').astype(str)
    combined = title + ' ' + body
    return combined.apply(normalize_text)



In [None]:
from transformers import GPT2TokenizerFast
# Pretrained GPT-2 tokenizer, shared module-wide: it is the default tokenizer of
# count_tokens and is used by Persona.process_persona to measure prompt length.
# NOTE(review): the deployed completion/embedding models may tokenize differently,
# so these counts are presumably approximations - confirm against the model in use.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") 

def count_tokens(series:pd.Series, *,tokenizer=tokenizer) -> pd.Series:
    """Return, for every text in ``series``, how many tokens it encodes to.

    Args:
        series: Texts to measure.
        tokenizer: Keyword-only; any object with an ``encode(text)`` method whose
            result supports ``len``. Defaults to the module-level tokenizer.

    Returns:
        A Series of token counts aligned with ``series``.
    """
    def _token_length(text):
        return len(tokenizer.encode(text))

    return series.apply(_token_length)

In [None]:
import openai
# Point the openai client at an Azure OpenAI resource, using the values that
# dotenv loaded from .env.
openai.api_type = 'azure'
openai.api_base = os.getenv('OPENAI_ENDPOINT')
openai.api_key = os.getenv('OPENAI_API_KEY')
# The .env template in this notebook does not define OPENAI_API_VERSION. Leaving
# api_version as None makes Azure requests fail, so fall back to a GA API version
# when the variable is not set. Set OPENAI_API_VERSION to override.
openai.api_version = os.getenv('OPENAI_API_VERSION', '2023-05-15')


In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from typing import Tuple

# Value of OPENAI_EMBEDDING_MODEL from the environment (None if unset); passed as
# `engine` to openai.Embedding.create by get_one_embedding.
embedding_model = os.getenv('OPENAI_EMBEDDING_MODEL')

# In-memory cache keyed by the exact input text; get_one_embedding stores each
# computed embedding here and returns the cached value on repeat inputs.
embedding_cache={}
def get_one_embedding(input: str, max_attempts: int = 3, wait_seconds: float = 10) -> "np.ndarray | None":
    """Return the embedding for ``input``, retrying when the API is rate limited.

    Results are memoised in ``embedding_cache`` so each distinct text is sent to
    the API at most once per kernel session.

    Args:
        input: Text to embed.
        max_attempts: Total number of API calls to try before giving up.
        wait_seconds: Pause between attempts after a rate-limit error.

    Returns:
        The embedding as a NumPy array, or ``None`` if every attempt was rate
        limited. Errors other than ``RateLimitError`` propagate to the caller.
    """
    if input in embedding_cache:
        return embedding_cache[input]

    for attempt in range(1, max_attempts + 1):
        if attempt > 1:
            print(f'try: {attempt}')
        try:
            response = openai.Embedding.create(input=[input], engine=embedding_model)
        except openai.error.RateLimitError:
            if attempt == max_attempts:
                # No retry follows, so do not sleep before giving up.
                print(f'Rate limit... giving up after {max_attempts} attempts ({input[:20]})')
                break
            print(f'Rate limit... waiting for {wait_seconds} seconds ({input[:20]})')
            time.sleep(wait_seconds)
            continue

        vector = np.array(response.data[0].embedding)
        embedding_cache[input] = vector
        return vector

    return None


def get_embedding(data: pd.Series)->Tuple[pd.Series, np.ndarray]:
    """Embed every text in ``data``.

    Args:
        data: Texts to embed; each is passed to ``get_one_embedding``.

    Returns:
        A tuple ``(series, matrix)``: the per-row embeddings as a Series aligned
        with ``data``, and the same embeddings stacked with ``np.stack``.

    Raises:
        ValueError: if any row could not be embedded (``get_one_embedding``
            returned ``None``), or if ``data`` is empty (raised by ``np.stack``).
    """
    series = data.apply(lambda x: get_one_embedding(x))

    # A None entry would otherwise surface as an opaque shape error from np.stack.
    failed = [label for label, value in series.items() if value is None]
    if failed:
        raise ValueError(
            f'Could not compute an embedding for {len(failed)} of {len(series)} rows '
            f'(first failing index: {failed[0]!r})'
        )

    matrix = np.stack(series.to_numpy())
    return series, matrix

def perform_pca(embedding_matrix:np.array, n_components:int=5)->np.ndarray:
    """Project the embeddings onto their first ``n_components`` principal components.

    Args:
        embedding_matrix: Matrix handed to ``PCA.fit_transform``.
        n_components: Number of components to keep.

    Returns:
        Whatever ``PCA.fit_transform`` returns for the input (the reduced matrix).
    """
    # random_state is pinned so repeated runs give the same projection.
    reducer = PCA(n_components=n_components, random_state=0)
    return reducer.fit_transform(embedding_matrix)

def perform_clustering(matrix:np.array, n_clusters:int=3)->pd.Series:
    """Cluster the rows of ``matrix`` with k-means.

    Args:
        matrix: Feature matrix handed to ``KMeans.fit``.
        n_clusters: Number of clusters to form.

    Returns:
        A Series (default integer index) holding the fitted model's ``labels_``.
    """
    # Fixed seed and n_init keep the cluster assignment reproducible.
    model = KMeans(n_clusters=n_clusters, random_state=0, n_init=10)
    fitted = model.fit(matrix)
    return pd.Series(fitted.labels_)



In [None]:
class Persona:
    """Builds a persona description for one cluster of reviews.

    Reviews are fed to the completion model in batches. The first batch asks for
    a fresh persona paragraph; every later batch asks the model to update the
    current paragraph with the additional reviews. A marketing message can then
    be generated from the final description.

    Attributes are plain instance fields:
      cluster_id / name     - identifier and display name of the persona
      unprocessed_data      - reviews not yet sent to the model
      pending_data          - reviews in the batch currently being sent
      processed_data        - reviews already folded into the description
      total_reviews         - number of reviews the persona started with
      description           - latest persona paragraph ('' until processed)
      description_history   - every description produced, oldest first
      marketing_msg         - latest marketing message ('' until created)
      initiated             - True once a first description exists
    """

    # Reviews keep being appended to a prompt while it is below this many tokens.
    PROMPT_TOKEN_BUDGET = 2000
    # max_tokens for the persona-description completion.
    DESCRIPTION_MAX_TOKENS = 2000
    # max_tokens for the marketing-message completion.
    MARKETING_MAX_TOKENS = 500
    # Line width at which __str__ hard-wraps its output.
    WRAP_WIDTH = 100

    def __init__(self, cluster_id:int, data:list[str]):
        """Create a persona for ``cluster_id`` from the review texts in ``data``."""
        self.cluster_id=cluster_id
        self.name = f'Persona {cluster_id}'
        # Copy the input: batching pops from this list and must not empty the
        # caller's list as a side effect.
        self.unprocessed_data = list(data)
        self.total_reviews = len(self.unprocessed_data)
        self.processed_data = []
        self.pending_data = []
        self.description = ''
        self.description_history = []
        # Initialised here so that __str__ works before create_marketing_msg().
        self.marketing_msg = ''
        self.initiated = False

    def get_prompt_command(self):
        """Return the instruction that opens the next prompt.

        Before any description exists this asks for a new persona; afterwards it
        embeds the current description and asks for an update.
        """
        if not self.initiated:
            return 'Write a paragraph describing the persona for the people who wrote the following reviews: \n'
        else:
            return f'Write a paragraph updating the persona below based on the following additional reviews: \n current persona: {self.description} \nReviews: \n'

    def update_description(self, description:str):
        """Record ``description`` as the current one and append it to the history."""
        self.description_history.append(description)
        self.description = description
        return

    def process_persona(self):
        """Consume every unprocessed review, refining the description batch by batch.

        Each batch is built by appending reviews to the prompt while the prompt
        is still under ``PROMPT_TOKEN_BUDGET`` tokens (so the last review added
        may push it over). One completion request is made per batch.

        Raises:
            ValueError: if the prompt is already at the token budget before any
                review could be added.
        """
        # If an earlier call failed during the API request, its batch is still
        # in pending_data; put it back at the front instead of dropping it.
        if self.pending_data:
            self.unprocessed_data = self.pending_data + self.unprocessed_data
        self.pending_data = []

        iteration = 0
        while self.unprocessed_data:
            iteration += 1
            prompt = self.get_prompt_command()
            while self.unprocessed_data and len(tokenizer.encode(prompt)) < self.PROMPT_TOKEN_BUDGET:
                review = self.unprocessed_data.pop(0)
                prompt += f'- {review}\n'
                self.pending_data.append(review)

            if len(self.pending_data)==0:
                raise ValueError('Cannot process any more data because max token limit was reached')

            print(f'Persona {self.cluster_id} - Iteration {iteration} - batch size: {len(self.pending_data)} / {self.total_reviews}')

            res = openai.Completion.create(
                prompt=prompt,
                engine=os.getenv('OPENAI_COMPLETION_MODEL'),
                max_tokens=self.DESCRIPTION_MAX_TOKENS,
                temperature=0.7,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0.6,
            )
            ans = res.choices[0].text
            self.initiated = True
            self.update_description(ans)
            # The batch only counts as processed after the request succeeded.
            self.processed_data += self.pending_data
            self.pending_data = []
        return

    def create_marketing_msg(self):
        """Ask the completion model for a marketing message for this persona.

        Returns:
            The generated message, which is also stored on ``marketing_msg``.
        """
        prompt = f'Write a marketing message for the persona below: \n {self.description}'
        res = openai.Completion.create(
            prompt=prompt,
            engine=os.getenv('OPENAI_COMPLETION_MODEL'),
            max_tokens = self.MARKETING_MAX_TOKENS,
        )
        self.marketing_msg = res.choices[0].text
        return self.marketing_msg

    def __str__(self):
        """Return id, description and marketing message, hard-wrapped at WRAP_WIDTH characters."""
        s = f'Persona {self.cluster_id} \n {self.description} \n {self.marketing_msg}'
        ans = []
        for txt in s.split('\n'):
            # Cut over-long lines into fixed-width chunks (no word-boundary logic).
            while len(txt)> self.WRAP_WIDTH:
                ans.append(txt[:self.WRAP_WIDTH])
                txt = txt[self.WRAP_WIDTH:]
            ans.append(txt)
        return '\n'.join(ans)
        
    
def create_persona(df: pd.DataFrame)->list[Persona]:
    """Build one fully processed Persona per distinct value of ``df.cluster_id``.

    For each cluster (in order of first appearance) the ``clean_text`` values of
    its rows are turned into a Persona, the persona description is generated,
    and a marketing message is created.

    Args:
        df: Frame with at least ``cluster_id`` and ``clean_text`` columns.

    Returns:
        The list of processed personas.
    """
    built = []
    cluster_column = df.cluster_id
    for current_id in cluster_column.unique():
        in_cluster = cluster_column == current_id
        reviews = df[in_cluster]['clean_text'].to_list()
        current = Persona(current_id, reviews)
        current.process_persona()
        current.create_marketing_msg()
        built.append(current)
    return built

In [None]:
# End-to-end run: load reviews -> clean text -> embed -> reduce -> cluster -> personas.
# Load the review CSV (path is relative to the notebook's working directory).
df= read_df('../data/product-fire-hd-10.csv' )
# Merge title and body into one normalised string per review.
df['clean_text'] = clean_up_text(df['reviews.title'], df['reviews.text'])
# Token count per review, stored for inspection; not used by later steps in this cell.
df['n_tokens'] = count_tokens(df['clean_text'])
# print(df['n_tokens'].describe())
# One embedding request per distinct text (results are cached by get_one_embedding).
emb_series, emb_matrix = get_embedding(df['clean_text'])
# Reduce the embeddings with PCA (default component count) before clustering.
reduced_matrix = perform_pca(emb_matrix)
# Labels come back with a default integer index and are aligned to df by index.
df['cluster_id'] = perform_clustering(reduced_matrix)
# Generates a description and a marketing message per cluster via the completion model.
personas = create_persona(df)


In [None]:
# Show every persona; print() uses Persona.__str__, which emits the id,
# description and marketing message hard-wrapped to 100 characters per line.
for p in personas:
    print(p)

