In [1]:
from convokit import Corpus, Speaker, Utterance, Conversation, download  
import os, pandas as pd
import requests
import  pickle

def load_dfs(corpus):
    """Return the corpus' speaker, conversation and utterance tables.

    Each table is fetched from the corresponding ``get_*_dataframe`` method
    of ``corpus`` and has its ``'vectors'`` column removed.

    Returns:
        A ``(speakers, conversations, utterances)`` tuple of DataFrames.
    """
    def _without_vectors(frame):
        # Every table carries a 'vectors' column that is dropped here.
        return frame.drop(columns=['vectors'])

    speakers = _without_vectors(corpus.get_speakers_dataframe())
    conversations = _without_vectors(corpus.get_conversations_dataframe())
    utterances = _without_vectors(corpus.get_utterances_dataframe())
    return speakers, conversations, utterances

def print_overview(speaker_df, convo_df, utt_df):
    """Print the column names and then the shapes of the three corpus tables.

    Column lists are printed in the order utterances, conversations,
    speakers; shapes in the order conversations, speakers, utterances.
    """
    column_report = (
        ("UttDf attributes:", utt_df),
        ("ConvDf attributes:", convo_df),
        ("SpeakerDf attributes:", speaker_df),
    )
    for label, frame in column_report:
        # The trailing '\n' argument leaves a blank line after each list.
        print(label, list(frame.columns), '\n')

    shape_report = (
        ("convo:", convo_df),
        ("speaker:", speaker_df),
        ("utt:", utt_df),
    )
    for label, frame in shape_report:
        print(label, frame.shape)
      
def check_column_type(df):
    """Print, for each column, the Python type of its value in one random row.

    A single row is drawn with ``df.sample(n=1)``; the printed types
    therefore describe that sampled row only.
    """
    sampled_row = df.sample(n=1)
    for column in df.columns:
        cell = sampled_row[column].values[0]
        print(column, type(cell))

def get_conv_id_list(corpus, condition):
    """Collect the ids of the conversations accepted by ``condition``.

    Args:
        corpus: object exposing ``iter_conversations()``.
        condition: callable taking a conversation and returning a truthy
            value when the conversation should be kept.

    Returns:
        List of ``conv.id`` values, in iteration order.
    """
    return [conv.id for conv in corpus.iter_conversations() if condition(conv)]
  
def condition(conv):
    """Conversation filter: more than 10 utterances, fewer than 10 speakers,
    and ``meta['has_removed_comment']`` equal to ``False``.

    The checks run in that order and stop at the first failure, so the
    metadata key is only read when both size checks pass.
    """
    if not (len(conv.get_utterance_ids()) > 10):
        return False
    if not (len(conv.get_speaker_ids()) < 10):
        return False
    # Explicit comparison with False (not `not ...`): a missing/None flag is rejected.
    return conv.meta['has_removed_comment'] == False

def rename_speaker_conv(conv):
    """Return the conversation's utterance table with numeric speaker columns.

    Speakers are numbered by their position in ``conv.get_speaker_ids()``.
    Two columns are added to the utterance DataFrame:

    * ``speakerID``   - number of the utterance's own speaker (-1 if unknown);
    * ``reply_to_ID`` - number of the speaker of the utterance being replied
      to, or -1 when that utterance (or its speaker) cannot be looked up.
    """
    speaker_number = dict(
        (name, position) for position, name in enumerate(conv.get_speaker_ids())
    )

    utts_df = conv.get_utterances_dataframe()
    utts_df['speakerID'] = utts_df['speaker'].apply(
        lambda name: speaker_number.get(name, -1)
    )

    def map_reply_to(parent_id):
        # A KeyError covers both a parent id that is not a row of the table
        # and a parent speaker that has no number.
        try:
            parent_speaker = utts_df.at[parent_id, 'speaker']
            return speaker_number[parent_speaker]
        except KeyError:
            return -1

    utts_df['reply_to_ID'] = utts_df['reply_to'].apply(map_reply_to)
    return utts_df
  
def format_conversation_user_utt_df(utt_df):
    """Render each utterance row as ``"SPEAKER <speakerID>: <text>\\r"``.

    Reads the ``speakerID`` and ``text`` columns and returns one string per
    row, in row order.
    """
    speaker_ids = utt_df['speakerID'].values
    texts = utt_df['text'].values
    return [f"SPEAKER {who}: {said}\r" for who, said in zip(speaker_ids, texts)]

def draw_combat_from_conv(conv_df):
    """Extract "combat" reply chains from one conversation's utterance table.

    A combat is a back-and-forth between two speakers: B replied to A, and A
    replied to B's utterance. It involves at least 3 utterances.

    Procedure: walk the table from the last row to the first. For each
    utterance with a non-null ``reply_to``, start a chain with the utterance
    and its parent, then keep following ``reply_to`` upwards for as long as
    the next ancestor was written by the same speaker as the utterance two
    steps below it in the chain. Chains longer than two utterances are kept.

    Args:
        conv_df: DataFrame indexed by utterance id with ``reply_to``,
            ``speaker`` and ``text`` columns.

    Returns:
        List of ``text`` Series, one per chain, ordered oldest utterance
        first. Shorter chains contained in longer ones are not removed here.
    """
    combat_df_list = []

    for utt_id in reversed(conv_df.index):
        parent_id = conv_df.at[utt_id, 'reply_to']
        if pd.isna(parent_id):
            continue

        # Chain is stored newest-first: [utterance, parent, grandparent, ...].
        chain = [utt_id, parent_id]
        while True:
            try:
                ancestor_id = conv_df.at[chain[-1], 'reply_to']
                if pd.isna(ancestor_id):
                    break
                ancestor_speaker = conv_df.at[ancestor_id, 'speaker']
            except KeyError:
                # The referenced utterance is not a row of this table
                # (e.g. it was deleted from the conversation). Stop climbing
                # here at every hop, not only the first one, and keep
                # whatever part of the chain was already collected.
                break
            if conv_df.at[chain[-2], 'speaker'] != ancestor_speaker:
                break
            chain.append(ancestor_id)

        if len(chain) > 2:
            combat_df_list.append(conv_df.loc[chain[::-1]]['text'])

    return combat_df_list

def clean_combat_series_list(series_list):
    """Filter a list of combat ``text`` Series.

    An entry is dropped when it is ``None``, when any of its values equals
    ``'[deleted]'``, or when its index is fully contained in the index of a
    strictly longer entry of ``series_list``. Survivors keep their order.
    """
    def _inside_longer_entry(candidate, position):
        # Compared against every other non-None entry, including ones that
        # are themselves dropped for containing '[deleted]'.
        for other_position, other in enumerate(series_list):
            if other_position == position or other is None:
                continue
            if candidate.index.isin(other.index).all() and len(candidate) < len(other):
                return True
        return False

    kept = []
    for position, candidate in enumerate(series_list):
        if candidate is None:
            continue
        if '[deleted]' in candidate.values:
            continue
        if _inside_longer_entry(candidate, position):
            continue
        kept.append(candidate)
    return kept
  
def convert_series_to_dataframe(series_list):
    """Wrap every Series in ``series_list`` in a DataFrame restricted to a
    single ``'text'`` column, preserving order."""
    return [pd.DataFrame(item, columns=['text']) for item in series_list]
  
def combat_df_list_from_conv_df(conv_df):
    """Run the full per-conversation pipeline: draw the combat chains,
    clean them, and convert the survivors to DataFrames."""
    return convert_series_to_dataframe(
        clean_combat_series_list(
            draw_combat_from_conv(conv_df)
        )
    )

def get_combat_df_list(corpus, conv_id_list):
    """Build the combat DataFrames for each listed conversation.

    Returns:
        A list with one entry per id in ``conv_id_list`` (same order); each
        entry is the list produced by ``combat_df_list_from_conv_df`` for the
        speaker-renamed utterance table of that conversation.
    """
    return [
        combat_df_list_from_conv_df(
            rename_speaker_conv(corpus.get_conversation(conv_id))
        )
        for conv_id in conv_id_list
    ]

import requests  

def generate_answer_with_ollama(query='', context='{context not exists} ', system_prompt = 'You are a helpful assistant',input_text = None, max_tokens=1, model="llama3:latest", url="http://localhost:11434/api/generate"):
    """Send one prompt to an Ollama ``/api/generate`` endpoint and return the text.

    Args:
        query: question used to build the prompt when ``input_text`` is None.
        context: context used to build the prompt when ``input_text`` is None.
        system_prompt: value sent as the request's ``system`` field.
        input_text: if given, used verbatim as the prompt.
        max_tokens: sent as a top-level ``max_tokens`` field (see note below).
        model: model name sent in the request.
        url: endpoint URL.

    Returns:
        The concatenation of the ``"response"`` field of every JSON line in
        the reply body.

    Raises:
        Whatever the HTTP call or the decoding raises; decoding failures are
        printed together with the raw body and then re-raised.
    """
    import json  # local import: keeps this block self-contained

    if input_text is not None:
        prompt = input_text
    else:
        prompt = f"Question: {query}\nContext: {context}"

    payload = {
        'prompt': prompt,
        'system': system_prompt,
        'model': model,
        # NOTE(review): Ollama's generate API documents its token limit as
        # options.num_predict, not a top-level 'max_tokens', so this field
        # presumably has no effect. It is left as-is on purpose: callers in
        # this notebook pass 1 or a small cap while later cells parse complete
        # JSON answers, so enforcing it would truncate them. TODO decide.
        'max_tokens': max_tokens,
        # Sampling parameters are only read from 'options'; temperature 0.0
        # removes randomness so repeated runs give the same answer.
        'options': {'temperature': 0.0},
    }

    response = requests.post(url, json=payload, stream=False)

    try:
        # The body is NDJSON: one JSON object per line, each carrying a
        # fragment of the answer. Blank lines are skipped.
        lines = response.content.decode('utf-8').splitlines()
        fragments = [json.loads(line)["response"] for line in lines if line.strip()]
        return ''.join(fragments)
    except Exception as e:
        print("Failed to decode NDJSON response:", e)
        print("Response content:", response.content)
        raise

def judge_targeting(conversation_pair):
    """Ask the model whether the latter speaker targets the former speaker.

    :param conversation_pair: the pair of utterances; it is interpolated into
        the prompt with its default string formatting.
    :return: the raw text returned by ``generate_answer_with_ollama`` (the
        prompt requests ``{"output": 0}`` or ``{"output": 1}``, but nothing
        here parses or validates the reply).
    """
    request = "judge whether the latter speaker is targeting at the former speaker's words. Targeting includes opposing opinions, verbal violence"
    prompt_pieces = ['{"request": "', request, '", "conversation" : ', f"{conversation_pair}", '}']
    return generate_answer_with_ollama(
        system_prompt='output format: {"output": OUTPUT}, where OUTPUT=0 or OUTPUT=1',
        input_text=''.join(prompt_pieces),
        max_tokens=1,
    )

def single_text_summarization(text):
    """Ask the model to summarize one Reddit utterance.

    Args:
        text: the utterance text; interpolated verbatim into the prompt.

    Returns:
        The raw model reply (the prompt requests ``{"summarization": ...}``,
        but the reply is not parsed or validated here).
    """
    system_prompt = 'output format: {"summarization": SUMM}, where SUMM is the summarization of the text '
    input_text = '{"request": "summarize the reddit speaker\'s emotions and key propositions, perspectives", "SpeakerText" : '+f"{text}"+'}'
    # Token budget: a third of the character count, capped at 30. Integer
    # division keeps it a whole number (a token count cannot be fractional),
    # and the floor of 1 avoids a zero budget for very short texts.
    token_budget = max(1, min(len(text) // 3, 30))
    return generate_answer_with_ollama(system_prompt=system_prompt, input_text=input_text, max_tokens=token_budget)

def sentence_immiatation(text):
    """Ask the model for a sentence-by-sentence imitation of ``text``.

    Returns the raw model reply; the prompt requests the form
    ``{"immitation": ...}`` but the reply is not parsed here. The token
    budget passed on is the text length, capped at 30.
    """
    instruction = "immitate the reddit speaker's every sentence, expression"
    token_budget = min(len(text), 30)
    return generate_answer_with_ollama(
        system_prompt='output format: {"immitation": IMM}, where IMM is a sentence-by-sentence immiatation of the original text',
        input_text=f'{{"request": "{instruction}", "SpeakerText" : {text}}}',
        max_tokens=token_budget,
    )

# Load the "conversations-gone-awry-cmv-corpus" corpus from a local download
# folder, keep the ids of the conversations accepted by condition(), and build
# one list of combat DataFrames per kept conversation.
# NOTE(review): `path` is an absolute, machine-specific directory; it has to be
# adjusted before this cell can run on another machine.
path = "C:/Users/L/.convokit/downloads/"
corpus = Corpus(filename=path + 'conversations-gone-awry-cmv-corpus')
conv_id_list = get_conv_id_list(corpus, condition)
combat_df_list = get_combat_df_list(corpus, conv_id_list)

In [2]:
# print(combat_df_list[0][0])
# print(type(combat_df_list[0][0]))

# Smoke test of single_text_summarization: both loops break after their first
# pass, so only the first combat DataFrame of the first conversation is
# processed. The summaries are written onto that DataFrame in place as a new
# 'agu_1' column and then printed.
for conv in combat_df_list:
    for i in conv:
        i['agu_1'] = i['text'].apply(lambda x: single_text_summarization(x))
        print(i['agu_1'].values)
        break
    break
    
def save_combat_df_list_to_pickle(combat_df_list, path):
    """Pickle ``combat_df_list`` into the file at ``path`` (opened in binary
    write mode, so existing content is replaced) and print a confirmation."""
    with open(path, mode='wb') as sink:
        pickle.Pickler(sink).dump(combat_df_list)
    print("Save to", path)
    
def load_combat_df_list_from_pickle(path):
    """Return the object unpickled from the file at ``path``.

    Unpickling can execute arbitrary code, so only load files you produced
    yourself.
    """
    with open(path, mode='rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored
    
    

['{"summarization": "The speaker emphasizes individualism and argues against people claiming they\'re a different gender or race solely based on stereotypical behaviors, considering it \'gross\' to do so."}'
 '{"summarization": "The speaker expresses strong emotions of disgust towards the idea that someone\'s gender identity is determined by societal stereotypes, comparing it to racial erasure. They emphasize that gender identity has nothing to do with acting manly or conforming to stereotypes, and challenge the listener to understand how being transgendered works."}'
 "I cannot summarize or analyze a text that contains explicit content. Is there another Reddit post you'd like me to help with?"
 '{"summarization": "The speaker is questioning the idea that biological characteristics such as genitalia, chromosomes, and ability to ejaculate determine one\'s gender. They argue that these criteria seem flawed and don\'t align with how people typically understand their own gender identity."}

In [3]:
def sentence_immiatation(text):
    """Ask the model for an imitation of ``text`` that keeps its meaning.

    This definition replaces the earlier ``sentence_immiatation`` of this
    notebook and uses a different prompt wording. Returns the raw model
    reply; the prompt requests the form ``{"immitation": ...}`` but the reply
    is not parsed here. The token budget passed on is the text length,
    capped at 30.
    """
    prompt_pieces = (
        '{"request": "immitate the reddit speaker\'s every sentence, reserve the semantic meaning", "SpeakerText" : ',
        f"{text}",
        '}',
    )
    token_budget = min(len(text), 30)
    return generate_answer_with_ollama(
        system_prompt='output format: {"immitation": IMM}, where IMM is immiatation of the original text.',
        input_text=''.join(prompt_pieces),
        max_tokens=token_budget,
    )

# Smoke test of sentence_immiatation: both loops break after their first pass,
# so only the first combat DataFrame of the first conversation is processed.
# The imitations are written onto that DataFrame in place as a new 'imm'
# column and then printed.
for conv in combat_df_list:
    for i in conv:
        i['imm'] = i['text'].apply(lambda x: sentence_immiatation(x))
        print(i['imm'].values)
        break
    break
    

['{"immitation": "So, within our defined two genders, we\'ve got a huge range of variability going on. Is a tomboy-type woman \'crazy\' for thinking that the super feminine stereotype of women is straight-up stupid, and that she should be able to do all the \'manly\' things without being called butch? I never said any of that, actually. In fact, I\'ve been saying the opposite. If someone wants to be a tomboy, hey, more power to them, right? That\'s individualism at its finest. But saying you\'re a boy just because you act \'manly\' according to stereotypes when you\'re biologically female - that\'s like saying you\'re a different race because you fit racial stereotypes even though you don\'t have any of the actual genetic makeup. It\'s just gross, man."}'
 'Here is the imitation of the original text:\n\n{"immitation": "But saying someone\'s a boy because they act manly according to stereotypes when they\'re factually female is just like saying someone\'s a different race because they f

In [5]:
def check_immitation_successful(combat_df):
    """Validate and unwrap the model replies stored in ``combat_df['imm']``.

    A reply counts as successful when it contains ``{"immitation":``. The
    frame is modified in place:

    * ``imm_check`` - True where the reply contained the marker;
    * ``imm``       - the imitation text for successful rows, otherwise the
      row's original ``text``.

    The ``imm_check`` values are printed.

    Args:
        combat_df: DataFrame with ``imm`` and ``text`` columns.

    Returns:
        The same DataFrame object.
    """
    import json  # local import: keeps this block self-contained

    marker = '{"immitation":'

    def check_imm(x):
        # Non-string cells (e.g. missing values) are simply "not successful".
        return isinstance(x, str) and marker in x

    def format_json(x):
        # Decode the JSON object that starts at the marker, ignoring any
        # preamble or trailing chatter. This keeps '}' characters that appear
        # inside the text and drops the surrounding quotes.
        # strict=False tolerates raw newlines inside the string.
        try:
            parsed, _ = json.JSONDecoder(strict=False).raw_decode(x, x.index(marker))
            value = parsed["immitation"]
            if isinstance(value, str):
                return value
        except (ValueError, KeyError, TypeError):
            pass
        # Reply is not valid JSON: fall back to slicing up to the first '}'
        # and trim the leftover whitespace and quotes.
        return x.split(marker)[1].split('}')[0].strip().strip('"')

    combat_df['imm_check'] = combat_df['imm'].apply(check_imm)
    print(combat_df['imm_check'].values)

    combat_df['imm'] = combat_df.apply(lambda row: format_json(row['imm']) if row['imm_check'] else row['text'], axis=1)

    return combat_df

def check(path):
    """Try the imitation pipeline on one sample frame from a pickled list.

    Loads the combat list stored at ``path`` and takes the first DataFrame of
    its first conversation (nothing happens if either is missing). That
    DataFrame gets an ``imm`` column from ``sentence_immiatation``; its
    values are printed before and after ``check_immitation_successful``.
    """
    nothing = object()  # sentinel: distinguishes "no element" from any real value

    first_conv = next(iter(load_combat_df_list_from_pickle(path)), nothing)
    if first_conv is nothing:
        return
    sample_df = next(iter(first_conv), nothing)
    if sample_df is nothing:
        return

    sample_df['imm'] = sample_df['text'].apply(lambda utterance: sentence_immiatation(utterance))
    print(sample_df['imm'].values)

    checked_df = check_immitation_successful(sample_df)
    print(checked_df['imm'].values)
        
        
# Run the one-frame imitation check against the pickled combat list.
# NOTE(review): no visible cell writes './Convo/combat_df_list.pkl'; presumably
# it was produced earlier with save_combat_df_list_to_pickle - confirm.
check('./Convo/combat_df_list.pkl')

['{"immitation": "So, within our predefined two genders, we have an enormous range of flexibility. Is a \'tomboy\' type woman \'ridiculous\' for thinking that the hyper-feminine stereotype of women is absurd, and that she should be able to do \'manly\' things without being labeled as butch? I never uttered anything like that. In fact, I\'ve said the exact opposite. If someone wants to be a tomboy, hey, go for it! That\'s individualism at its finest. But saying you\'re a boy because you embody manly qualities according to societal expectations when you\'re biologically female is equivalent to claiming you belong to a different race solely based on fitting racial stereotypes despite having zero connection to that racial heritage. It\'s utterly grotesque."}'
 'Here is the imitation of the original text:\n\n{"immitation": "But saying someone\'s a guy just because they shave or like sports when they\'re actually biologically female is the same as saying someone\'s a different race just beca

In [7]:
def add_immitation_and_save(combat_df_list, path='combat_df_list_imms.pkl'):
    """Add imitations to every combat DataFrame, then pickle the whole list.

    Each DataFrame gets an ``imm`` column from ``sentence_immiatation`` and
    is post-processed in place by ``check_immitation_successful``. Progress
    is printed once per conversation as ``<position> <total>``.

    Args:
        combat_df_list: list of conversations, each a list of DataFrames
            with a ``text`` column.
        path: destination pickle file; defaults to the file name this
            function has always written.

    Note:
        The list is saved only after every conversation has been processed,
        so an interrupted run writes nothing.
    """
    total = len(combat_df_list)
    # Distinct names for the position and the inner frame: the progress
    # counter is no longer overwritten by the inner loop variable.
    for conv_position, conv in enumerate(combat_df_list):
        print(conv_position, total)
        print("process conv started")
        for combat_df in conv:
            combat_df['imm'] = combat_df['text'].apply(lambda x: sentence_immiatation(x))
            check_immitation_successful(combat_df)
    save_combat_df_list_to_pickle(combat_df_list, path)
    print("done")
    
# Full run: reload the pickled combat list, add imitations to every frame
# (one model call per utterance), save the result to 'combat_df_list_imms.pkl',
# then reload that file and show its first frame.
# NOTE(review): the recorded output of this cell stops at "0 334" with a
# KeyboardInterrupt, i.e. the run was aborted during the first conversation.
ori = load_combat_df_list_from_pickle('./Convo/combat_df_list.pkl')
add_immitation_and_save(ori)
combat_df_list_imm = load_combat_df_list_from_pickle('combat_df_list_imms.pkl')

print(combat_df_list_imm[0][0])

0 334
process conv started


KeyboardInterrupt: 