In [1]:
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load the small English spaCy pipeline.
nlp = spacy.load('en_core_web_sm')

# Create the VADER sentiment analyzer shared by the cells below.
analyzer = SentimentIntensityAnalyzer()

# Sentiment-analysis helper
def sentiment_analysis(text):
    """Score ``text`` with the notebook-level VADER ``analyzer``.

    Args:
        text: The string to score.

    Returns:
        Whatever ``analyzer.polarity_scores(text)`` returns, unchanged.
    """
    return analyzer.polarity_scores(text)

# Example text
text = "I love natural language processing. It's fascinating and fun!"

# Run the sentiment analysis and show the score dictionary
result = sentiment_analysis(text)
print(result)


{'neg': 0.0, 'neu': 0.266, 'pos': 0.734, 'compound': 0.9299}


In [2]:
import os

import spacy
from gensim import corpora, models
from convokit import Corpus, download

# Load the spaCy model.
nlp = spacy.load('en_core_web_sm')

# Resolve the ConvoKit download directory from the current user's home
# directory instead of hard-coding one machine's absolute path. The trailing
# slash is kept because the corpus name is appended by plain concatenation.
path = os.path.expanduser("~/.convokit/downloads/")
# Load the conversation corpus.
corpus = Corpus(filename=path + 'conversations-gone-awry-cmv-corpus')

# Dependency-parse example
def analyze_dependency(text):
    """Print one ``token -> dependency label -> head token`` line per token.

    Args:
        text: The string handed to the notebook-level ``nlp`` pipeline.
    """
    for tok in nlp(text):
        relation, governor = tok.dep_, tok.head.text
        print(f"{tok.text} -> {relation} -> {governor}")

# Topic-modelling example
def topic_modeling(texts, num_topics=5, passes=15, random_state=None):
    """Fit an LDA model on tokenised documents and return its printed topics.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
            It is materialised once, so a generator works as well as a list.
        num_topics: Number of topics to fit (default 5, the former constant).
        passes: Number of training passes (default 15, the former constant).
        random_state: Optional seed forwarded to ``LdaModel`` so repeated
            runs can be made reproducible; ``None`` keeps the unseeded
            behaviour.

    Returns:
        The value of ``lda_model.print_topics()``.
    """
    # The documents are traversed twice (dictionary, then bag-of-words), so a
    # one-shot iterator must be captured first.
    documents = list(texts)
    dictionary = corpora.Dictionary(documents)
    # Named bow_corpus rather than corpus to avoid confusion with the
    # notebook-level ConvoKit corpus object.
    bow_corpus = [dictionary.doc2bow(document) for document in documents]
    lda_model = models.LdaModel(
        bow_corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    return lda_model.print_topics()

# Flag "aggressive" language in a conversation
def detect_aggression(text):
    """Print every ``amod`` token whose head is the sentence ``ROOT``.

    Args:
        text: The string handed to the notebook-level ``nlp`` pipeline.
    """
    for tok in nlp(text):
        modifies_root = tok.dep_ == 'amod' and tok.head.dep_ == 'ROOT'
        if not modifies_root:
            continue
        print(f"Aggressive language detected: {tok.text} -> {tok.head.text}")



# Example inputs: pre-tokenised documents for LDA and a raw string for spaCy
texts = [["I", "love", "fuck", "natural", "language", "processing"], ["It's", "fascinating", "and", "fun"]]
text = "I love natural language processing. It's fascinating and fun!"

# Dependency parse
analyze_dependency(text)

# Topic modelling
print(topic_modeling(texts))

# Aggressive-language detection
detect_aggression(text)



I -> nsubj -> love
love -> ROOT -> love
natural -> amod -> language
language -> compound -> processing
processing -> dobj -> love
. -> punct -> love
It -> nsubj -> 's
's -> ROOT -> 's
fascinating -> acomp -> 's
and -> cc -> fascinating
fun -> conj -> fascinating
! -> punct -> 's
[(0, '0.150*"love" + 0.150*"processing" + 0.150*"natural" + 0.150*"fuck" + 0.150*"language" + 0.150*"I" + 0.025*"and" + 0.025*"fun" + 0.025*"It\'s" + 0.025*"fascinating"'), (1, '0.100*"It\'s" + 0.100*"I" + 0.100*"and" + 0.100*"fascinating" + 0.100*"fun" + 0.100*"language" + 0.100*"fuck" + 0.100*"natural" + 0.100*"processing" + 0.100*"love"'), (2, '0.200*"fascinating" + 0.200*"fun" + 0.200*"It\'s" + 0.200*"and" + 0.033*"fuck" + 0.033*"language" + 0.033*"I" + 0.033*"natural" + 0.033*"processing" + 0.033*"love"'), (3, '0.100*"and" + 0.100*"fascinating" + 0.100*"It\'s" + 0.100*"fun" + 0.100*"I" + 0.100*"fuck" + 0.100*"love" + 0.100*"natural" + 0.100*"processing" + 0.100*"language"'), (4, '0.100*"It\'s" + 0.100*"and

In [3]:
import os, pandas as pd
from convokit import Corpus, Speaker, Utterance, Conversation, download

# Resolve the ConvoKit download directory from the current user's home
# directory instead of hard-coding one machine's absolute path. The trailing
# slash is kept because corpus names are appended by plain concatenation.
path = os.path.expanduser("~/.convokit/downloads/")
# Point the http_proxy / https_proxy environment variables at localhost:7890.
os.environ['http_proxy'] = 'http://localhost:7890'
os.environ['https_proxy'] = 'http://localhost:7890'

def load_dfs(corpus):
    """Return the speaker, conversation and utterance dataframes of ``corpus``.

    Each frame has its ``vectors`` column removed when one is present; a frame
    without that column is returned as-is instead of raising ``KeyError``.

    Args:
        corpus: Object exposing ``get_speakers_dataframe()``,
            ``get_conversations_dataframe()`` and ``get_utterances_dataframe()``.

    Returns:
        Tuple ``(speakers, conversations, utterances)`` of dataframes.
    """
    def _without_vectors(frame):
        # errors='ignore' tolerates frames that carry no 'vectors' column.
        return frame.drop(columns=['vectors'], errors='ignore')

    speakers = _without_vectors(corpus.get_speakers_dataframe())
    conversations = _without_vectors(corpus.get_conversations_dataframe())
    utterances = _without_vectors(corpus.get_utterances_dataframe())
    return speakers, conversations, utterances

def print_overview(speaker_df, convo_df, utt_df):
    """Print the column names, then the shapes, of the three corpus dataframes.

    Args:
        speaker_df: Speaker dataframe.
        convo_df: Conversation dataframe.
        utt_df: Utterance dataframe.
    """
    column_report = (
        ("UttDf attributes:", utt_df),
        ("ConvDf attributes:", convo_df),
        ("SpeakerDf attributes:", speaker_df),
    )
    for heading, frame in column_report:
        print(heading, list(frame.columns), '\n')

    shape_report = (
        ("convo:", convo_df),
        ("speaker:", speaker_df),
        ("utt:", utt_df),
    )
    for heading, frame in shape_report:
        print(heading, frame.shape)
    
def check_column_type(df):
    """Print each column name with the Python type of its value in one sampled row.

    Args:
        df: Dataframe to inspect; one row is drawn with ``df.sample(n=1)``.
    """
    sampled_row = df.sample(n=1)
    for column in df.columns:
        first_value = sampled_row[column].values[0]
        print(column, type(first_value))
        
def save_df_pickle(df, path):
    """Pickle ``df`` to ``path`` and print a confirmation line.

    Args:
        df: Object exposing ``to_pickle``.
        path: Destination passed to ``df.to_pickle``.
    """
    df.to_pickle(path)
    confirmation = "Save to " + str(path)
    print(confirmation)
    
def load_df_pickle(path):
    """Load and return the object pickled at ``path`` via ``pd.read_pickle``.

    Args:
        path: Location of the pickle file.
    """
    # Unpickling can execute arbitrary code; only load files you trust.
    return pd.read_pickle(path)


In [4]:
# Load the corpus from the download directory, split it into the three
# dataframes, then print their columns/shapes and the utterance column types.
corpus = Corpus(filename=path+'conversations-gone-awry-cmv-corpus')
speakers, conversations, utterances = load_dfs(corpus)
print_overview(speakers, conversations, utterances)
check_column_type(utterances)

KeyboardInterrupt: 

In [None]:
# Load a previously pickled utterance dataframe and keep only the rows whose
# meta.score is above 20.
test = load_df_pickle('utterancesPunR.pkl')
print(test.shape)
print(test.head())
test_trimmed = test[test['meta.score'] > 20]
print(test_trimmed.shape)

In [None]:
# Draw one random utterance and show its parse, its text and its VADER scores.
sample = test_trimmed.sample(n=1)
text = sample['text'].values[0]
# Positional access: the sampled row keeps its original index label, so a
# label lookup for 0 would not reliably address it.
print(sample['meta.parsed'].values[0])
print(text)
print(sentiment_analysis(text))

In [None]:
# Score the most recently assigned `text` directly with the VADER analyzer.
print(analyzer.polarity_scores(text))

In [None]:
# Keep utterances whose meta.score is above 20. The explicit copy makes the
# result an independent dataframe, so adding a column below does not write
# into a slice of `utterances` (SettingWithCopyWarning).
utterances_trimmed = utterances[utterances['meta.score'] > 20].copy()
print(utterances_trimmed.shape)
# Store the full VADER score dictionary for every utterance text.
utterances_trimmed['meta.sentiment'] = utterances_trimmed['text'].apply(lambda x : analyzer.polarity_scores(x))
print(utterances_trimmed.head())

In [None]:
# Persist the scored utterances so later cells can reload them from disk.
save_df_pickle(utterances_trimmed, 'utterancesTrimmedSentiment.pkl')


In [None]:
import matplotlib.pyplot as plt
x = utterances_trimmed['meta.score']
y = utterances_trimmed['meta.sentiment'].apply(lambda d: d['neg'])

# Scatter the utterance score against the negative VADER component.
plt.figure(figsize=(10, 6))
plt.scatter(x, y, color='blue', alpha=0.6)
# The labels name the column that is actually plotted (meta.sentiment);
# the dataframe has no 'meta.trimmed' field.
plt.title('Scatter Plot of meta.score vs. meta.sentiment[neg]')
plt.xlabel('meta.score')
plt.ylabel('meta.sentiment[neg]')
plt.grid(True)

# Render the figure.
plt.show()

In [None]:
# Scatter the positive VADER component against the negative one.
pos_scores = utterances_trimmed['meta.sentiment'].apply(lambda d: d['pos'])
neg_scores = utterances_trimmed['meta.sentiment'].apply(lambda d: d['neg'])

plt.figure(figsize=(10, 6))
plt.scatter(pos_scores, neg_scores, color='blue', alpha=0.6)
# The labels name the column that is actually plotted (meta.sentiment);
# the dataframe has no 'meta.trimmed' field.
plt.title('Scatter Plot of meta.sentiment[pos] vs. meta.sentiment[neg]')
plt.xlabel('meta.sentiment[pos]')
plt.ylabel('meta.sentiment[neg]')
plt.grid(True)

# Render the figure.
plt.show()

In [None]:
# Reload the scored utterances and derive a datetime column, interpreting the
# 'timestamp' values as seconds (unit='s').
pop_df = load_df_pickle('utterancesTrimmedSentiment.pkl')
pop_df['datetime'] = pd.to_datetime(pop_df['timestamp'], unit='s')
print(pop_df['datetime'].head())


In [None]:
df = pop_df
# Unpack each VADER component of meta.sentiment into its own column.
for component in ('neg', 'neu', 'pos', 'compound'):
    df[component] = df['meta.sentiment'].apply(lambda scores, key=component: scores[key])

# One scatter panel per component on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

scatter_panels = (
    ((0, 0), 'neg', 'red', 'Negative'),
    ((0, 1), 'neu', 'blue', 'Neutral'),
    ((1, 0), 'pos', 'green', 'Positive'),
    ((1, 1), 'compound', 'purple', 'Compound'),
)
for position, component, colour, label in scatter_panels:
    ax = axes[position]
    ax.scatter(df['meta.score'], df[component], c=colour)
    ax.set_title(f'Meta Score vs {label} Sentiment')
    ax.set_xlabel('Meta Score')
    ax.set_ylabel(f'{label} Sentiment')

plt.show()


In [None]:
# Unpack each VADER component of meta.sentiment into its own column.
for component in ('neg', 'neu', 'pos', 'compound'):
    pop_df[component] = pop_df['meta.sentiment'].apply(lambda scores, key=component: scores[key])

# Overlay all four components against meta.score in a single figure.
plt.figure(figsize=(12, 10))

overlay_series = (
    ('neg', 'red', 'Negative Sentiment'),
    ('neu', 'blue', 'Neutral Sentiment'),
    ('pos', 'green', 'Positive Sentiment'),
    ('compound', 'purple', 'Compound Sentiment'),
)
for component, colour, legend_label in overlay_series:
    plt.scatter(pop_df['meta.score'], pop_df[component], c=colour, label=legend_label)

plt.title('Meta Score vs Sentiment Components')
plt.xlabel('Meta Score')
plt.ylabel('Sentiment Value')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Unpack each VADER component of meta.sentiment into its own column.
for component in ('neg', 'neu', 'pos', 'compound'):
    pop_df[component] = pop_df['meta.sentiment'].apply(lambda scores, key=component: scores[key])

# One 30-bin histogram per component on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

histogram_panels = (
    ((0, 0), 'neg', 'red', 'Negative'),
    ((0, 1), 'neu', 'blue', 'Neutral'),
    ((1, 0), 'pos', 'green', 'Positive'),
    ((1, 1), 'compound', 'purple', 'Compound'),
)
for position, component, colour, label in histogram_panels:
    ax = axes[position]
    ax.hist(pop_df[component], bins=30, color=colour, edgecolor='black')
    ax.set_title(f'{label} Sentiment Distribution')
    ax.set_xlabel(f'{label} Sentiment')
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Unpack each VADER component of meta.sentiment into its own column.
for component in ('neg', 'neu', 'pos', 'compound'):
    pop_df[component] = pop_df['meta.sentiment'].apply(lambda scores, key=component: scores[key])

# Average every component within each distinct meta.score value.
mean_spec = dict.fromkeys(['pos', 'neg', 'neu', 'compound'], 'mean')
grouped_df = pop_df.groupby('meta.score').agg(mean_spec).reset_index()

# Draw one line per averaged component.
plt.figure(figsize=(14, 8))

mean_lines = (
    ('pos', 'Average Positive Sentiment', 'green'),
    ('neg', 'Average Negative Sentiment', 'red'),
    ('neu', 'Average Neutral Sentiment', 'blue'),
    ('compound', 'Average Compound Sentiment', 'purple'),
)
for component, legend_label, colour in mean_lines:
    plt.plot(grouped_df['meta.score'], grouped_df[component], marker='o', label=legend_label, color=colour)

plt.title('Average Sentiments by Meta Score')
plt.xlabel('Meta Score')
plt.ylabel('Average Sentiment Value')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Unpack each VADER component of meta.sentiment into its own column.
for component in ('neg', 'neu', 'pos', 'compound'):
    pop_df[component] = pop_df['meta.sentiment'].apply(lambda scores, key=component: scores[key])

# One box plot per component, grouped by meta.score, on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 12))

box_panels = (
    ((0, 0), 'pos', 'Positive Sentiment'),
    ((0, 1), 'neg', 'Negative Sentiment'),
    ((1, 0), 'neu', 'Neutral Sentiment'),
    ((1, 1), 'compound', 'Compound Sentiment'),
)
for position, component, label in box_panels:
    ax = axes[position]
    pop_df.boxplot(column=component, by='meta.score', ax=ax, grid=False)
    ax.set_title(label)
    ax.set_xlabel('Meta Score')
    ax.set_ylabel(label)
    ax.tick_params(axis='x', rotation=90)

plt.suptitle('')  # Suppress the overall title to avoid overlap
plt.tight_layout()
plt.show()


In [None]:
# Reload the corpus and its three dataframes, then print their columns/shapes
# and the utterance column types.
corpus = Corpus(filename=path+'conversations-gone-awry-cmv-corpus')
speakers, conversations, utterances = load_dfs(corpus)
print_overview(speakers, conversations, utterances)
check_column_type(utterances)

In [None]:
# Build a new corpus from the pickled, sentiment-scored utterances together
# with the speaker and conversation dataframes, then print its summary.
utterances_trimmed = load_df_pickle('utterancesTrimmedSentiment.pkl')
new_corpus = Corpus.from_pandas(utterances_df=utterances_trimmed, speakers_df=speakers, conversations_df=conversations)
new_corpus.print_summary_stats()

In [None]:
import matplotlib.pyplot as plt

def plot_utt_df_sentiment(df):
    """Score every utterance in ``df`` with VADER and plot four histograms.

    The dataframe is modified in place: ``meta.sentiment`` receives the score
    dictionary of each ``text`` value and ``neg``/``neu``/``pos``/``compound``
    receive its individual entries.

    Args:
        df: Dataframe with a ``text`` column.
    """
    df['meta.sentiment'] = df['text'].apply(analyzer.polarity_scores)

    # (grid position, score key, title word, bar colour) for each panel.
    panels = (
        ((0, 0), 'neg', 'Negative', 'red'),
        ((0, 1), 'neu', 'Neutral', 'blue'),
        ((1, 0), 'pos', 'Positive', 'green'),
        ((1, 1), 'compound', 'Compound', 'purple'),
    )
    # All component columns are created before any drawing starts.
    for _, key, _, _ in panels:
        df[key] = df['meta.sentiment'].apply(lambda scores, key=key: scores[key])

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
    for position, key, label, colour in panels:
        ax = axes[position]
        ax.hist(df[key], bins=30, color=colour, edgecolor='black')
        ax.set_title(f'{label} Sentiment Distribution')
        ax.set_xlabel(f'{label} Sentiment')
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()
    

In [None]:
# Plot the sentiment histograms of the first conversation that has more than
# 21 utterances and fewer than 10 speakers, then stop.
for conv in corpus.iter_conversations():
    # print(len(conv._utterance_ids))
    if not (len(conv.get_utterance_ids()) > 21 and len(conv.get_speaker_ids()) < 10):
        continue

    df = conv.get_utterances_dataframe()
    print(df.shape)
    print(df.head())
    print(df.columns)
    plot_utt_df_sentiment(df)

    print(conv.print_conversation_stats())
    print(conv.get_speaker_ids())
    break

def utter_df_from_corpus_condition_conv(corpus, condiion = None):
    """Plot the sentiment histograms of the first conversation that qualifies.

    Conversations are visited in ``corpus.iter_conversations()`` order. For
    the first one accepted by the predicate, its utterance dataframe is
    printed (shape, head, columns) and passed to ``plot_utt_df_sentiment``,
    its stats and speaker ids are printed, and the search stops.

    Args:
        corpus: Object exposing ``iter_conversations()``.
        condiion: Optional predicate taking a conversation and returning a
            truthy value to accept it (the misspelled name is kept so existing
            keyword callers keep working). When omitted, a conversation
            qualifies if it has more than 20 utterances and fewer than 10
            speakers.

    Returns:
        None.
    """
    def _default_condition(conv):
        # The length comparisons are applied to each len() result separately;
        # comparing the id list itself against 20 raises TypeError.
        return len(conv.get_utterance_ids()) > 20 and len(conv.get_speaker_ids()) < 10

    predicate = condiion if callable(condiion) else _default_condition

    for conv in corpus.iter_conversations():
        if not predicate(conv):
            continue

        df = conv.get_utterances_dataframe()
        print(df.shape)
        print(df.head())
        print(df.columns)
        plot_utt_df_sentiment(df)

        print(conv.print_conversation_stats())
        print(conv.get_speaker_ids())
        break

In [None]:

# df = utterances_trimmed
# df['meta.sentiment'] = df['text'].apply(lambda x : analyzer.polarity_scores(x))
# # Extracting sentiment components into separate columns
# df['neg'] = df['meta.sentiment'].apply(lambda x: x['neg'])
# df['neu'] = df['meta.sentiment'].apply(lambda x: x['neu'])
# df['pos'] = df['meta.sentiment'].apply(lambda x: x['pos'])
# df['compound'] = df['meta.sentiment'].apply(lambda x: x['compound'])
# # Step 1: List unique values in the 'speaker' column
# unique_speakers = df['speaker'].unique()
# 
# # Step 2: Assign a unique color to each speaker
# colors = plt.cm.get_cmap('tab10', len(unique_speakers))
# 
# # Create a dictionary to map speakers to colors
# speaker_color_map = {speaker: colors(i) for i, speaker in enumerate(unique_speakers)}
# 
# # Step 3: Plotting
# plt.figure(figsize=(10, 6))
# 
# # Iterate over each speaker and plot their data
# for speaker in unique_speakers:
#     speaker_data = df[df['speaker'] == speaker]
#     x = speaker_data['timestamp']
#     y = speaker_data['pos'] - speaker_data['neg']
#     plt.plot(x, y, label=speaker, color=speaker_color_map[speaker])
# 
# # Adding labels and title
# plt.xlabel('Timestamp')
# plt.ylabel('pos - neg')
# plt.title('pos - neg Over Time by Speaker')
# plt.legend(title='Speaker')
# plt.grid(True)
# plt.show()


In [None]:
def get_conv_id_list(corpus, condition):
    """Collect the ids of the conversations in ``corpus`` accepted by ``condition``.

    Args:
        corpus: Object exposing ``iter_conversations()``.
        condition: Callable taking a conversation; truthy results keep its id.

    Returns:
        List of ``conv.id`` values in iteration order.
    """
    return [conv.id for conv in corpus.iter_conversations() if condition(conv)]

def condition(conv):
    """Return True for conversations with more than 15 utterances and fewer than 10 speakers."""
    if len(conv.get_utterance_ids()) <= 15:
        return False
    return len(conv.get_speaker_ids()) < 10

# Ids of every conversation in the corpus that satisfies `condition`.
conv_id_list = get_conv_id_list(corpus, condition)
print(conv_id_list)

In [None]:
def plot_conversation_sentiment_trend(conv):
    """Plot the four VADER scores of a conversation's utterances against their timestamps.

    Args:
        conv: Object exposing ``get_utterances_dataframe()``; the returned
            frame must provide ``text`` and ``timestamp`` columns.
    """
    df = conv.get_utterances_dataframe()
    df['meta.sentiment'] = df['text'].apply(analyzer.polarity_scores)

    # (score key, legend label, line colour) for each plotted series.
    series = (
        ('neg', 'Negative Sentiment', 'red'),
        ('neu', 'Neutral Sentiment', 'blue'),
        ('pos', 'Positive Sentiment', 'green'),
        ('compound', 'Compound Sentiment', 'purple'),
    )
    # All component columns are created before any drawing starts.
    for key, _, _ in series:
        df[key] = df['meta.sentiment'].apply(lambda scores, key=key: scores[key])

    # One line per score, in the row order of the dataframe.
    plt.figure(figsize=(12, 8))
    for key, legend_label, colour in series:
        plt.plot(df['timestamp'], df[key], label=legend_label, color=colour, linewidth=2)
    plt.title('Sentiment Trend in Conversation')
    plt.xlabel('Timestamp')
    plt.ylabel('Sentiment Value')
    plt.legend()
    plt.grid(True)
    plt.show()


# Plot the sentiment trend of every conversation whose id was selected above.
for conv in corpus.iter_conversations():
    if conv.id not in conv_id_list:
        continue
    df = conv.get_utterances_dataframe()
    print(df.shape)
    # print(df.head())
    print(df.columns)
    plot_conversation_sentiment_trend(conv)

In [None]:
# print text for conversations. id:['cwh7b8s', 'd27fgau', 'd61kp2c']
import os, pandas as pd
from convokit import Corpus, Speaker, Utterance, Conversation, download
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
# Resolve the ConvoKit download directory from the current user's home
# directory instead of hard-coding one machine's absolute path. The trailing
# slash is kept because the corpus name is appended by plain concatenation.
path = os.path.expanduser("~/.convokit/downloads/")
corpus = Corpus(filename=path+'conversations-gone-awry-cmv-corpus')

# For each hand-picked conversation, print its metadata and then every
# utterance with its index and VADER scores.
for conv in corpus.iter_conversations():
    if conv.id in ['cwh7b8s', 'd27fgau', 'd61kp2c']:
        print(conv.meta)
        df = conv.get_utterances_dataframe()
        print("$$$$$$$$$$$$$$$\n\n\n\n")
        for i,text in enumerate(df['text'].values):
            print(i,"\n")
            print(analyzer.polarity_scores(text))
            print(text)

In [None]:
# 检查第一个utterance是否是comment

# from convokit import Corpus, download
# 
# # 下载示例语料库
# corpus = Corpus(filename= path + 'subreddit-ADD')
# # 遍历对话
# for conv in corpus.iter_conversations():
#     # 获取对话中的所有utterance
#     utterances = list(conv.iter_utterances())

    # 检查第一个utterance
    # first_utterance = utterances[0]
    # if first_utterance.meta.get('is_post', False):
    #     print(f"Conversation {conv.id} starts with a post.")
    # else:
    #     print(f"Conversation {conv.id} starts with a comment.")

In [None]:
import requests


def generate_answer_with_ollama(query, context=' ', model="llama3:latest", url="http://localhost:11434/api/generate"):
    """POST a question/context prompt to a generate endpoint and return the reply text.

    The reply body is read as newline-delimited JSON; the ``"response"``
    fragment of every object is concatenated in order.

    Args:
        query: Text placed after ``Question:`` in the prompt.
        context: Text placed after ``Context:`` in the prompt.
        model: Model name sent in the request payload.
        url: Endpoint that receives the POST request.

    Returns:
        The concatenation of all ``"response"`` fragments.

    Raises:
        RuntimeError: A reply object carries an ``"error"`` field.
        Exception: Any other failure while checking or decoding the reply.
            In every case the error and the raw body are printed before the
            exception is re-raised.
    """
    import json  # local import keeps this cell runnable on its own

    input_text = f"Question: {query}\nContext: {context}"
    payload = {
        'prompt': input_text,
        'model': model,  # Add the model parameter here
        # NOTE(review): presumably ignored by Ollama, whose documented length
        # limit is options.num_predict — confirm before relying on it.
        'max_tokens': 50  # Set the maximum number of tokens to generate
    }

    response = requests.post(url, json=payload)

    try:
        # Surface HTTP-level failures instead of trying to parse an error page.
        response.raise_for_status()

        fragments = []
        for line in response.content.decode('utf-8').splitlines():
            if not line.strip():
                # Blank separator lines are not JSON documents.
                continue
            item = json.loads(line)
            if "error" in item:
                raise RuntimeError(f"Ollama returned an error: {item['error']}")
            fragments.append(item["response"])
        return ''.join(fragments)
    except Exception as e:
        print("Failed to decode NDJSON response:", e)
        print("Response content:", response.content)
        raise
    
    
# Ask the local model to split a sample utterance into sentences.
generate_answer_with_ollama(
    'Seperate the following text into meaning--complete sentences:\n\n',
    context="""Why then? Call a fucking pharmacist who can tell you what the consequences are, IF ANY. That's their job. Or pay the money to talk to a doctor on a holiday. That is what you do. Don't come on here and post the question to a bunch of people who are unqualified to answer. Get off the fucking computer and call someone. Jesus Christ.""",
)

In [None]:
def _speaker_label(position):
    """Return the spreadsheet-style label for a zero-based position (0 -> A, 25 -> Z, 26 -> AA)."""
    label = ''
    remaining = position + 1
    while remaining > 0:
        remaining, offset = divmod(remaining - 1, 26)
        label = chr(ord('A') + offset) + label
    return label


def rename_speaker_conv(conv):
    """Return the conversation's utterance dataframe with a short ``speakerID`` column.

    Every id from ``conv.get_speaker_ids()`` is mapped, in order, to
    ``A``, ``B``, ... ``Z`` and then ``AA``, ``AB``, ... so conversations with
    more than 26 speakers still receive alphabetic labels.

    Args:
        conv: Object exposing ``get_speaker_ids()`` and
            ``get_utterances_dataframe()``; the frame must have a ``speaker``
            column.

    Returns:
        The utterance dataframe with the added ``speakerID`` column.
    """
    nameMap = {}
    for speaker in conv.get_speaker_ids():
        nameMap[speaker] = _speaker_label(len(nameMap))

    utts_df = conv.get_utterances_dataframe()
    utts_df['speakerID'] = utts_df['speaker'].apply(lambda x: nameMap[x])

    return utts_df

def format_conversation_user_utt_df(utt_df):
    """Render every row as ``<speakerID>: <text>`` followed by a carriage return.

    Args:
        utt_df: Dataframe with ``speakerID`` and ``text`` columns.

    Returns:
        List of formatted strings, one per row, in row order.
    """
    labels = utt_df['speakerID'].values
    bodies = utt_df['text'].values
    return [f"{who}: {said}\r" for who, said in zip(labels, bodies)]

# For each hand-picked conversation, print the anonymised speaker labels, the
# reply structure and the VADER scores of every utterance.
for conv in corpus.iter_conversations():
    if conv.id in ['cwh7b8s', 'd27fgau', 'd61kp2c']:
        print(1)
        # print(conv.meta)
        # rename_speaker_conv builds the utterance dataframe itself, so no
        # separate get_utterances_dataframe() call is needed first.
        df = rename_speaker_conv(conv)
        print(df['speakerID'].values)
        print(df['reply_to'].values)
        print(df['speaker'].values)
        print(df.index)
        print("$$$$$$$$$$$$$$$\n\n\n\n")
        for i,text in enumerate(df['text'].values):
            print(i,"\n")
            print(analyzer.polarity_scores(text))
            # print(text)
            


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Example documents (replaced just below by the formatted conversation texts)
texts = [
    "I love natural language processing. It's fascinating and fun!",
    "Machine learning is a key component of artificial intelligence.",
    "Deep learning models are very powerful for image recognition.",
    "Natural language processing involves understanding and generating text.",
    "Artificial intelligence is transforming many industries."
]

utt_df = rename_speaker_conv(corpus.get_conversation('cwh7b8s'))

texts  = format_conversation_user_utt_df(utt_df)

# Fit a TF-IDF vectorizer on the texts and keep the resulting matrix
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)

# Feature names (the keywords) label the columns of the dense weight table
feature_names = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Print every keyword with a positive weight for each text
for i, text in enumerate(texts):
    print(f"Text {i+1}: {text}")
    print("Keywords and their weights:")
    weights_for_text = df_tfidf.iloc[i]
    for keyword, weight in weights_for_text.items():
        if weight > 0:
            print(f"{keyword}: {weight:.4f}")
    print("\n")