In [None]:
import os, pandas as pd
from convokit import Corpus, Speaker, Utterance, Conversation, download

# Directory that holds the downloaded ConvoKit corpora. Later cells build a
# corpus location with plain concatenation (``path + name``), so the value
# must end with a path separator.
# Set CONVOKIT_DOWNLOADS_DIR to run the notebook on another machine; the
# original hard-coded location stays the default.
path = os.environ.get("CONVOKIT_DOWNLOADS_DIR") or "C:/Users/L/.convokit/downloads/"
if not path.endswith(("/", "\\")):
    path += "/"

# Point both proxy variables at the local proxy listening on port 7890.
os.environ['http_proxy'] = 'http://localhost:7890'
os.environ['https_proxy'] = 'http://localhost:7890'

def load_dfs(corpus):
    """Return the speaker, conversation and utterance tables of ``corpus``.

    Each table comes from the matching ``get_*_dataframe`` method of
    ``corpus`` and has its ``vectors`` column dropped.

    Args:
        corpus: Object exposing ``get_speakers_dataframe``,
            ``get_conversations_dataframe`` and ``get_utterances_dataframe``.

    Returns:
        Tuple ``(speakers, conversations, utterances)``.
    """
    accessor_names = (
        'get_speakers_dataframe',
        'get_conversations_dataframe',
        'get_utterances_dataframe',
    )
    tables = []
    for accessor_name in accessor_names:
        table = getattr(corpus, accessor_name)()
        tables.append(table.drop(columns=['vectors']))
    return tuple(tables)

def print_overview(speaker_df, convo_df, utt_df):
    """Print the column names and the shapes of the three corpus tables.

    Column lists are printed in the order utterances, conversations,
    speakers; shapes in the order conversations, speakers, utterances.

    Args:
        speaker_df: Speaker table.
        convo_df: Conversation table.
        utt_df: Utterance table.
    """
    column_sections = (
        ("UttDf attributes:", utt_df),
        ("ConvDf attributes:", convo_df),
        ("SpeakerDf attributes:", speaker_df),
    )
    for heading, frame in column_sections:
        # The trailing '\n' argument leaves a blank line after each listing.
        print(heading, list(frame.columns), '\n')

    shape_sections = (
        ("convo:", convo_df),
        ("speaker:", speaker_df),
        ("utt:", utt_df),
    )
    for heading, frame in shape_sections:
        print(heading, frame.shape)
    


In [None]:
def optimize_dataframe_int(df, col: str):
    """Downcast column ``col`` of ``df`` to ``int32`` in place and return ``df``.

    A bare ``astype('int32')`` may wrap values that do not fit into 32 bits
    instead of failing, which would silently corrupt the column. The values
    are therefore range-checked first.

    Args:
        df: DataFrame to modify (the column is replaced on this object).
        col: Name of the column to convert.

    Returns:
        The same ``df`` object, with ``df[col]`` stored as ``int32``.

    Raises:
        OverflowError: If any value lies outside the ``int32`` range; ``df``
            is left unchanged in that case.
    """
    int32_min, int32_max = -2**31, 2**31 - 1

    # Go through int64 first so the range check sees the real values.
    as_int64 = df[col].astype('int64')
    if not as_int64.empty and (as_int64.min() < int32_min or as_int64.max() > int32_max):
        raise OverflowError(
            f"column {col!r} holds values outside the int32 range "
            f"[{int32_min}, {int32_max}]"
        )

    df[col] = as_int64.astype('int32')
    return df

def check_column_type(df):
    """Print each column name with the type of its value in one sampled row.

    A single row is drawn with ``df.sample(n=1)``, so the row inspected can
    differ between calls.

    Args:
        df: DataFrame to inspect.
    """
    sampled_row = df.sample(n=1)
    for column in df.columns:
        cell = sampled_row[column].values[0]
        print(column, type(cell))

In [None]:
# Corpora to summarise; each name is appended to ``path`` to form the
# ``filename`` handed to ``Corpus``.
list1 = [
    'conversations-gone-awry-cmv-corpus',
    'subreddit-ADD',
    'subreddit-AmericanPolitics',
    'subreddit-Cornell',
    'subreddit-NSFW_Social',
    'subreddit-POLUG3',
]
for i in list1:
    # Load the corpus, flatten it into DataFrames and print the overview.
    corpus = Corpus(filename=f"{path}{i}")
    speakers, conversations, utterances = load_dfs(corpus)
    print_overview(speakers, conversations, utterances)
    # Visual separator between corpora.
    print("#############################################")

In [None]:
# Load the CMV "conversations gone awry" corpus on its own; this rebinds
# ``corpus``, ``speakers``, ``conversations`` and ``utterances``.
corpus = Corpus(filename=f"{path}conversations-gone-awry-cmv-corpus")
speakers, conversations, utterances = load_dfs(corpus)
print_overview(speakers, conversations, utterances)

# Show the concrete value type held in each utterance column.
check_column_type(utterances)


In [None]:
import gc

# Downcast the integer-valued utterance columns to int32, one after another.
for int_column in ('meta.score', 'timestamp', 'meta.gilded', 'meta.retrieved_on'):
    utterances = optimize_dataframe_int(utterances, int_column)

# Run a garbage-collection pass. As the cell's final expression, its return
# value is what the cell displays.
gc.collect()

In [None]:
import spacy

# Load the 'en_core_web_sm' spaCy pipeline once at module level.
nlp = spacy.load('en_core_web_sm')

# Input text
text = "Apple is looking at buying U.K. startup for $1 billion."

def handle_txt(text):
    """Parse ``text`` with the module-level ``nlp`` pipeline; print and return four views.

    Args:
        text: String to analyse.

    Returns:
        Tuple ``(tokens, pos_tags, entities, dependencies)`` where
        ``tokens`` is a list of token texts, ``pos_tags`` a list of
        ``(token text, pos_)`` pairs, ``entities`` a list of
        ``(entity text, label_)`` pairs taken from ``doc.ents``, and
        ``dependencies`` a list of ``(token text, dep_, head text)`` triples.
    """
    doc = nlp(text)

    # Tokenisation
    tokens = [tok.text for tok in doc]
    # Part-of-speech tagging
    pos_tags = [(tok.text, tok.pos_) for tok in doc]
    # Named-entity recognition
    entities = [(span.text, span.label_) for span in doc.ents]
    # Dependency parse
    dependencies = [(tok.text, tok.dep_, tok.head.text) for tok in doc]

    report = (
        ("Tokens:", tokens),
        ("POS Tags:", pos_tags),
        ("Entities:", entities),
        ("Dependencies:", dependencies),
    )
    for label, values in report:
        print(label, values)

    return tokens, pos_tags, entities, dependencies

# Run the helper on the sample sentence; the returned tuple is this cell's final expression.
handle_txt(text)


In [None]:
# Tally how often each POS tag occurs in the sample sentence.
tagged_tokens = handle_txt(text)[1]
my_list = [pos for _token, pos in tagged_tokens]
my_set = set(my_list)
for i in my_set:
    print(i, my_list.count(i))
    
def count_pos_tags(text):
    """Count the tokens and the POS tags of ``text``.

    ``handle_txt`` is called exactly once per invocation; the previous
    version parsed (and printed) the same text twice.

    Args:
        text: String to analyse.

    Returns:
        Tuple ``(token_length, pos_tag_counts)``. ``pos_tag_counts`` maps
        each POS tag to its number of occurrences, with ``'PUNCT'`` always
        credited one extra occurrence. When ``text`` yields no tokens the
        result is ``(1, {})``, so a caller can divide by the first element
        without hitting a zero denominator.
    """
    from collections import Counter

    parsed = handle_txt(text)
    tokens, tagged_tokens = parsed[0], parsed[1]

    token_length = len(tokens)
    if token_length == 0:
        return 1, {}

    tag_counts = Counter(pair[1] for pair in tagged_tokens)
    tag_counts['PUNCT'] += 1  # +1 count, kept from the original implementation
    return token_length, dict(tag_counts)

In [None]:
def _punct_ratio(text):
    """Return the PUNCT tag count of ``text`` divided by its token count.

    Both numbers come from a single ``count_pos_tags(text)`` call; a missing
    ``'PUNCT'`` entry counts as 0.
    """
    token_count, tag_counts = count_pos_tags(text)
    return float(tag_counts.get('PUNCT', 0)) / token_count


# Analyse each utterance once per row. The earlier lambda called
# count_pos_tags twice for every row, doubling the parsing work.
utterances['PUNCTRatio'] = utterances['text'].apply(_punct_ratio)

In [None]:
def save_df_pickle(df, path):
    """Pickle ``df`` to ``path`` and print a confirmation line.

    Args:
        df: Object providing ``to_pickle`` (a DataFrame in this notebook).
        path: Destination passed straight to ``df.to_pickle``.
    """
    df.to_pickle(path)
    print("Save to", path)
    
def load_df_pickle(path):
    """Read and return the pickled object stored at ``path``.

    Args:
        path: Location handed to ``pd.read_pickle``.

    Returns:
        Whatever ``pd.read_pickle`` reconstructs from the file.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that this notebook (or another trusted source) produced.
    return pd.read_pickle(path)

# save_df_pickle(utterances, 'utterancesPunR.pkl')

In [None]:
# File caching the utterance table together with its PUNCTRatio column.
PUNCT_CACHE_PATH = 'utterancesPunR.pkl'

# The only call that writes this file is commented out in the previous cell,
# so a fresh Restart & Run All used to fail here when the file was missing.
# Create the cache from the in-memory frame when absent, then read it back.
if not os.path.exists(PUNCT_CACHE_PATH):
    save_df_pickle(utterances, PUNCT_CACHE_PATH)

test = load_df_pickle(PUNCT_CACHE_PATH)
print(test.shape)
print(test.head())

# Keep only the utterances whose score is above 20.
test_trimmed = test[test['meta.score'] > 20]
print(test_trimmed.shape)

In [None]:
import altair as alt
alt.data_transformers.enable("vegafusion")

# Scatter of PUNCT ratio against score for every cached utterance.
# A dot inside a field name is read by Vega-Lite as nested-object access, so
# the column 'meta.score' has to be written with the dot escaped.
PUNCTR_scatter = alt.Chart(test).mark_point().encode(
    alt.X(r'meta\.score:Q', title='Score'),
    alt.Y('PUNCTRatio:Q', title='PUNCT Ratio')
).properties(
    title='PUNCT Ratio vs. Score'
)
PUNCTR_scatter.display()

In [None]:
def get_POS_ratios(df):
    """Parse every non-empty ``text`` entry with spaCy and store it in ``df['nlp']``.

    Rows whose text is the empty string receive an empty dict instead of a
    parsed document.

    Args:
        df: DataFrame with a ``text`` column; a ``nlp`` column is added to
            (or overwritten on) this object.

    Returns:
        The same ``df`` object, for convenient chaining.
    """
    nlp = spacy.load('en_core_web_sm')

    # The pipeline is applied by calling it, as ``handle_txt`` does; the
    # earlier ``nlp[x]`` subscripted the pipeline object instead.
    df['nlp'] = df['text'].apply(lambda x: nlp(x) if x != '' else {})
    return df


In [None]:
def check_empty_text_column(df):
    """Report whether the ``text`` column of ``df`` contains an empty string.

    Prints ``'Exist empty text'`` when at least one entry equals ``''`` and
    ``'No empty text'`` otherwise.

    Args:
        df: DataFrame to inspect.

    Raises:
        KeyError: If ``df`` has no ``text`` column.
    """
    if 'text' not in df.columns:
        raise KeyError("DataFrame does not contain a 'text' column")

    # Vectorised comparison; the sum is the number of rows with empty text.
    empty_rows = (df['text'] == '').sum()
    if empty_rows > 0:
        print('Exist empty text')
    else:
        print('No empty text')

def clear_empty_utt(utt_df):
    """Return the rows of ``utt_df`` whose ``text`` is not the empty string.

    The shape before and after filtering is printed. The input frame itself
    is not modified.

    Args:
        utt_df: Utterance table with a ``text`` column.

    Returns:
        The filtered table.
    """
    print("Before:", utt_df.shape)
    kept_rows = utt_df.loc[utt_df['text'] != '']
    print("After:", kept_rows.shape)
    return kept_rows

# Report whether any utterance has empty text, then keep only the non-empty
# rows in ``utt_df`` (``utterances`` itself is left as it is).
check_empty_text_column(utterances)
utt_df = clear_empty_utt(utterances)

In [None]:
# Build a corpus from the three DataFrames, then flatten it again and print
# the overview of the rebuilt tables.
new_corpus = Corpus.from_pandas(
    utterances_df=utt_df,
    speakers_df=speakers,
    conversations_df=conversations,
)
new_speaker_df, new_convo_df, new_utt_df = load_dfs(new_corpus)
print_overview(new_speaker_df, new_convo_df, new_utt_df)

In [None]:
import altair as alt
# alt.data_transformers.enable("vegafusion")
# utt_df = new_utt_df
utt_df = utterances
print(utt_df.head())

# Utterances with a score above 20. Take an independent copy so later cells
# can add columns without writing into a slice of ``utterances``.
pop_utt_df = utt_df.loc[utt_df['meta.score'] > 20].copy()
print(pop_utt_df.shape)
print(pop_utt_df.head())

# Score against gilded count. The Y channel previously encoded 'meta.score'
# again (under the misspelled title 'glided'), plotting the score against
# itself. Dots in field names are escaped because Vega-Lite otherwise reads
# them as nested-object access.
scatter_plot = alt.Chart(pop_utt_df).mark_point().encode(
    alt.X(r'meta\.score:Q', title='Score'),
    alt.Y(r'meta\.gilded:Q', title='Gilded')
).properties(
    title='Message Score Distribution'
)

scatter_plot.display()

In [None]:
# Derive the calendar columns with ``assign`` so a new frame is produced.
# Assigning columns directly on ``pop_utt_df`` (a filtered selection of the
# utterance table) triggers pandas' SettingWithCopy warning, and this form
# also makes the cell safe to re-run.
pop_utt_df = pop_utt_df.assign(
    date=pd.to_datetime(pop_utt_df['timestamp'], unit='s'),
    year_month_day=lambda frame: frame['date'].dt.strftime('%Y-%m-%d'),
)


brush = alt.selection_interval(encodings=['x'])

# Create the time distribution chart with improved x-axis and interactivity
# NOTE(review): nothing in this cell reads the ``brush`` selection, and
# ``.interactive()`` is applied as well; confirm that both are wanted.
time_dist = alt.Chart(pop_utt_df).mark_line().encode(
    alt.X('year_month_day:T', title='Date', axis=alt.Axis(labelAngle=-45, tickCount='day')),
    alt.Y('count()', title='Number of Messages'),
    tooltip=['year_month_day:T', 'count()']
).properties(
    title='Messages Over Time',
    width=800
).add_params(
    brush
).interactive()

# Display the chart
time_dist.display()

In [None]:

# Only the gilded column is needed for this chart.
filtered_df = pop_utt_df[['meta.gilded']]

# Create an Altair bar chart of message counts per gilded value.
# The dot in 'meta.gilded' is escaped because Vega-Lite otherwise reads it as
# nested-object access. The x channel now uses the ordinal type, matching the
# tooltip and the original comment (it was declared nominal before).
bar_chart = alt.Chart(filtered_df).mark_bar().encode(
    x=alt.X(r'meta\.gilded:O', title='meta.gilded'),  # Ordinal type for categorical data
    y='count():Q',      # Quantitative type for count
    tooltip=[alt.Tooltip(r'meta\.gilded:O', title='meta.gilded'), 'count():Q']
).properties(
    title='Count of Messages by Gilded Status'
).interactive()  # Add interactivity

# Display the chart
bar_chart.display()

In [None]:
# Inspect the timestamp column: the type of one value, then the column.
# Only a cell's final expression is echoed, so the type lookup is printed
# explicitly; before, its result was discarded.
# NOTE(review): presumably 'c89t0pe' is an utterance id present in
# ``pop_utt_df``; confirm it, since a missing label would raise here.
print(type(pop_utt_df.loc['c89t0pe', 'timestamp']))
print(pop_utt_df.loc[:, 'timestamp'])


In [None]:
# Toy table: three timestamps (interpreted as seconds) with one value each.
# ``date`` holds the timestamps converted to datetimes, and
# ``year_month_day`` merges year, month and day into one string (optional).
data = pd.DataFrame({
    'timestamp': [1691118896, 1691205296, 1691291696],
    'value': [10, 20, 15]
}).assign(
    date=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'),
    year_month_day=lambda frame: frame['date'].dt.strftime('%Y-%m-%d'),
)

# Print the converted data.
print(data)

# Build an Altair line chart with the year-month-day string on the x axis.
chart = alt.Chart(data).mark_line().encode(
    x='year_month_day:T',
    y='value:Q'
).properties(title='Time Series Data')

# Show the chart.
chart.display()