In [367]:
try:
    import re
    import numpy as np
    import pandas as pd
    from typing import List
    from pandas import DataFrame
    import seaborn as sns
    from tqdm import tqdm
    from time import time
    from convokit import Corpus, download
except Exception as e:
    print(e)

In [368]:
# Directory (relative to the working directory) holding the previously
# transformed corpus that this notebook extends with conversation-level metadata.
TRANSFORMED_CORPUS_DIR = "sanskar_transformed_corpora/tr_fc(12-23)_27p_1"
corpus = Corpus(filename=TRANSFORMED_CORPUS_DIR)

In [371]:
# Utterance-level table for the corpus, without the "vectors" column.
utt_df = corpus.get_utterances_dataframe().drop(columns=["vectors"])

In [372]:
# Conversation-level store: conversation id -> {feature name: aggregated value}.
conv_metadata_agg = dict()

# (Re)build the utterance dataframe from the corpus, dropping the "vectors" column.
utterances_full = corpus.get_utterances_dataframe()
utt_df = utterances_full.drop(columns="vectors")

In [373]:
%%time
utt_df["negative_polarity"] = "None"
utt_df["positive_polarity"] = "None"
utt_df["neutral_polarity"] = "None"
utt_df["gratitude"] = "None"
utt_df["deference"] = "None"
utt_df["greeting"] = "None"
utt_df["positive_lexicon"] = "None"
utt_df["apologizing"] = "None"
utt_df["please"] = "None"


for idx in tqdm(utt_df.index):
    utt_df.loc[idx, "negative_polarity"] = utt_df.loc[idx, "meta.sentiment_polarity"]["neg"]
    utt_df.loc[idx, "positive_polarity"] = utt_df.loc[idx, "meta.sentiment_polarity"]["pos"]
    utt_df.loc[idx, "neutral_polarity"]  = utt_df.loc[idx, "meta.sentiment_polarity"]["neu"]
    utt_df.loc[idx, "gratitude"]         = utt_df.loc[idx, "meta.politeness_markers"]["feature_politeness_==Gratitude=="]
    utt_df.loc[idx, "deference"]         = utt_df.loc[idx, "meta.politeness_markers"]["feature_politeness_==Deference=="]
    utt_df.loc[idx, "greeting"]          = utt_df.loc[idx, "meta.politeness_markers"]["feature_politeness_==Indirect_(greeting)=="]
    utt_df.loc[idx, "positive_lexicon"]  = utt_df.loc[idx, "meta.politeness_markers"]["feature_politeness_==HASPOSITIVE=="]
    utt_df.loc[idx, "apologizing"]       = utt_df.loc[idx, "meta.politeness_markers"]["feature_politeness_==Apologizing=="]
    utt_df.loc[idx, "please"]            = utt_df.loc[idx, "meta.politeness_markers"]["feature_politeness_==Please=="]

100%|███████████████████████████████████████████████████████████████████████████████| 487/487 [00:00<00:00, 581.42it/s]

Wall time: 853 ms





In [374]:
# Flatten the dictionary-valued utterance metadata columns into one scalar
# column per feature.
#
# Fixes relative to the previous version of this cell, which did not parse:
#   * missing "[" after `utt_df` and unbalanced parentheses;
#   * positive/neutral polarity were read from "meta.positive_polarity" /
#     "meta.neutral_polarity"; the "pos"/"neu" scores live in
#     "meta.sentiment_polarity", next to "neg";
#   * gratitude/deference/greeting used `.loc[...]` (a row lookup, in one case
#     with a stale loop index) instead of selecting the
#     "meta.politeness_markers" column.

# output column -> (source metadata column, key inside that column's dict)
UTTERANCE_FEATURE_SOURCES = {
    "negative_polarity": ("meta.sentiment_polarity", "neg"),
    "positive_polarity": ("meta.sentiment_polarity", "pos"),
    "neutral_polarity": ("meta.sentiment_polarity", "neu"),
    "gratitude": ("meta.politeness_markers", "feature_politeness_==Gratitude=="),
    "deference": ("meta.politeness_markers", "feature_politeness_==Deference=="),
    "greeting": ("meta.politeness_markers", "feature_politeness_==Indirect_(greeting)=="),
    "positive_lexicon": ("meta.politeness_markers", "feature_politeness_==HASPOSITIVE=="),
    "apologizing": ("meta.politeness_markers", "feature_politeness_==Apologizing=="),
    "please": ("meta.politeness_markers", "feature_politeness_==Please=="),
    "subjectivity_score": ("meta.subjectivity_score", "avg_subjectivity_score"),
    "subjectivity_clue_count": ("meta.subjectivity_score", "subjective_lexicon_count"),
    "modifier_count": ("meta.modifier_count", "count_mod_tags"),
    "hedge_count": ("meta.hedge_count", "count_hedges"),
    "groupRef_count": ("meta.groupRef_count", "count_group_ref"),
}

for feature, (meta_column, dict_key) in UTTERANCE_FEATURE_SOURCES.items():
    # `dict_key=dict_key` binds the current key at lambda creation time.
    utt_df[feature] = utt_df[meta_column].apply(
        lambda val_dict, dict_key=dict_key: val_dict[dict_key]
    )

In [376]:
# Collapse utterance-level features to one row per conversation.
# Every function is given as a one-element list, so the aggregated columns carry
# a second label level (feature, function); the cell that fills
# `conv_metadata_agg` reads values through that two-level layout.
conversation_agg_spec = {
    "hedge_count": ["mean"],
    "groupRef_count": ["mean"],
    "subjectivity_clue_count": ["mean"],
    "modifier_count": ["mean"],
    "subjectivity_score": ["mean"],
    "negative_polarity": ["mean"],
    "positive_polarity": ["mean"],
    "neutral_polarity": ["mean"],
    "gratitude": ["sum"],
    "deference": ["sum"],
    "greeting": ["sum"],
    "positive_lexicon": ["sum"],
    "apologizing": ["sum"],
    "please": ["sum"],
    "meta.insult": ["sum"],
    "meta.identity_attack": ["sum"],
    "meta.toxicity": ["mean"],
    "meta.severe_toxicity": ["mean"],
    "meta.profanity": ["mean"],
}

utt_df_grouped = (
    utt_df
    .groupby(by="conversation_id", as_index=True)
    .agg(conversation_agg_spec)
    .reset_index()  # bring "conversation_id" back as a regular column
)

In [377]:
# Copy each conversation's aggregated features into `conv_metadata_agg`,
# keyed by the conversation id as a string.
#
# Each entry is (metadata key, aggregated column, round to 3 decimals?).
# NOTE(review): "modifier_count" is aggregated in `utt_df_grouped` but is not
# copied here — confirm whether that omission is intentional.
CONV_FEATURE_SPEC = (
    ("hedge_count", "hedge_count", False),
    ("groupRef_count", "groupRef_count", False),
    ("subjectivity_clue_count", "subjectivity_clue_count", False),
    ("subjectivity_score", "subjectivity_score", True),
    ("negative_polarity", "negative_polarity", True),
    ("positive_polarity", "positive_polarity", True),
    ("neutral_polarity", "neutral_polarity", True),
    ("gratitude", "gratitude", False),
    ("deference", "deference", False),
    ("greeting", "greeting", True),
    ("positive_lexicon", "positive_lexicon", False),
    ("please", "please", False),
    ("apologizing", "apologizing", False),
    ("insult", "meta.insult", True),
    ("identity_attack", "meta.identity_attack", True),
    ("toxicity", "meta.toxicity", True),
    ("severe_toxicity", "meta.severe_toxicity", True),
    ("profanity", "meta.profanity", True),
)

for _, agg_row in tqdm(utt_df_grouped.iterrows()):
    # Selecting a top-level label from the two-level row yields a one-element
    # Series; `.values[0]` unwraps it to the scalar.
    conv_key = str(agg_row["conversation_id"].values[0])
    conv_features = {}
    conv_metadata_agg[conv_key] = conv_features
    for meta_key, source_column, needs_rounding in CONV_FEATURE_SPEC:
        value = agg_row[source_column].values[0]
        if needs_rounding:
            value = round(value, 3)
        conv_features[meta_key] = value

27it [00:00, 416.24it/s]


In [378]:
# Attach the aggregated features to every conversation in the corpus as
# conversation-level metadata. Keys are written in the order listed below.
CONVERSATION_META_KEYS = (
    "hedge_count",
    "groupRef_count",
    "subjectivity_clue_count",
    "subjectivity_score",
    "negative_polarity",
    "positive_polarity",
    "neutral_polarity",
    "gratitude",
    "deference",
    "greeting",
    "positive_lexicon",
    "apologizing",
    "please",
    "insult",
    "identity_attack",
    "toxicity",
    "severe_toxicity",
    "profanity",
)

for conversation_id in tqdm(corpus.get_conversation_ids()):
    conversation = corpus.get_conversation(conversation_id)
    for meta_key in CONVERSATION_META_KEYS:
        # Looked up per key, so a missing conversation id or feature raises KeyError.
        conversation.add_meta(meta_key, conv_metadata_agg[conversation_id][meta_key])

100%|███████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 13886.72it/s]


In [380]:
# Write the corpus, now carrying the conversation-level metadata, to disk.
CORPUS_NAME = "tr_fc(12-23)_27p_1"
# Raw string: the Windows path contains backslashes followed by letters
# ("\S", "\M", "\C", "\D", "\R", "\s"), which are invalid escape sequences in a
# normal string literal. The resulting path value is unchanged.
# NOTE(review): this is a machine-specific absolute path — consider moving it to
# a configurable location.
BASE_PATH = r"C:\Sagar Study\ML and Learning\CP Sem-8\Data\Reddit\saved-corpora\sanskar_transformed_corpora"
corpus.dump(CORPUS_NAME, base_path=BASE_PATH)