In [1]:
!pip install rouge_chinese
!pip install evaluate
!pip install jieba
!pip install rouge_score

Collecting rouge_chinese
  Downloading rouge_chinese-1.0.3-py3-none-any.whl.metadata (7.6 kB)
Downloading rouge_chinese-1.0.3-py3-none-any.whl (21 kB)
Installing collected packages: rouge_chinese
Successfully installed rouge_chinese-1.0.3
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0

In [None]:
from rouge_chinese import Rouge
import jieba # you can use any other word cutting library
import evaluate
import pandas as pd
from scipy.stats import ttest_ind

In [38]:
def calculate_rouge(hypothesis, reference):
    """
    Calculate ROUGE scores between a hypothesis and reference text.

    This function uses the ROUGE metric to compare a generated text (hypothesis)
    with a reference text, providing scores that indicate the similarity between
    the two texts. The hypothesis and reference texts are first segmented using
    Jieba for tokenization, which is commonly used for Chinese text segmentation.

    Args:
        hypothesis (str): The generated text or hypothesis to be evaluated.
        reference (str): The reference text to compare against.

    Returns:
        dict: A dictionary containing the ROUGE scores (ROUGE-1, ROUGE-2, ROUGE-L)
              for the comparison between the hypothesis and reference. For each
              ROUGE score, "f" stands for f1_score, "p" stands for precision,
              "r" stands for recall.
    """
    hypothesis = ' '.join(jieba.cut(hypothesis))
    reference = ' '.join(jieba.cut(reference))
    rouge = Rouge()
    scores = rouge.get_scores(hypothesis, reference)
    # print(scores)
    return scores

def calculate_rouge_english(hypothesis, reference):
    """
    Calculates the ROUGE (Recall-Oriented Understudy for Gisting Evaluation) score between a hypothesis and reference text.

    This function tokenizes the input hypothesis and reference texts using jieba, a Chinese text segmentation library,
    and then computes the ROUGE score. It is designed for evaluating the similarity between a generated text (hypothesis)
    and a ground truth text (reference) in English and tokenized Chinese.

    Parameters:
    -----------
    hypothesis : str
        The generated text (prediction) whose quality is to be evaluated.
    reference : str
        The reference text (ground truth) against which the hypothesis is compared.

    Returns:
    --------
    dict
        A dictionary containing aggregated ROUGE scores (ROUGE-1, ROUGE-2, ROUGE-L, ROUGE-Lsum) that assess the overlap between the hypothesis and reference text.
    """
    hypothesis = ' '.join(jieba.cut(hypothesis))
    reference = ' '.join(jieba.cut(reference))
    rouge = evaluate.load('rouge')
    rough_english = rouge.compute(predictions=[hypothesis], references=[reference])
    # print(rough_english)
    return rough_english


def calculate_BLEU(prediction, reference):
    """
    Calculates the BLEU (Bilingual Evaluation Understudy) score between a prediction and reference text.

    This function tokenizes the input prediction and reference texts using jieba, a Chinese text segmentation library,
    and computes the BLEU score. The BLEU metric is commonly used to evaluate the quality of machine-generated text
    by comparing it to a reference text.

    Parameters:
    -----------
    prediction : str
        The generated text (prediction) to be evaluated.
    reference : str
        The reference text (ground truth) against which the prediction is compared.

    Returns:
    --------
    dict
        A dictionary containing the BLEU score (up to max order 2), indicating the degree of overlap
        between the prediction and reference text. Higher scores indicate a closer match to the reference.
    """
    bleu = evaluate.load('bleu')
    prediction = ' '.join(jieba.cut(prediction))
    reference = ' '.join(jieba.cut(reference))
    results = bleu.compute(predictions=[prediction], references=[reference],
          max_order = 2)
    # print(results)
    return results

def generate_evaluation_df(tool_name_lst, tool_generated_content_lst, rouge_lst, BLEU_lst, English = False):
    data = []
    if English:
      for i in range(len(tool_name_lst)):
        tool_data = {
            'tool_name': tool_name_lst[i],
            'tool_generated_content': tool_generated_content_lst[i],
            'rouge-1': rouge_lst[i]['rouge1'],
            'rouge-2': rouge_lst[i]['rouge2'],
            'rouge-l': rouge_lst[i]['rougeL'],
            'rouge-l_sum': rouge_lst[i]['rougeLsum'],
            'bleu': BLEU_lst[i]['bleu']
            }
        data.append(tool_data)
    else:
      for i in range(len(tool_name_lst)):
            tool_data = {
                'tool_name': tool_name_lst[i],
                'tool_generated_content': tool_generated_content_lst[i],
                'rouge-1-recall': rouge_lst[i][0]['rouge-1']['r'],
                'rouge-1-precsion': rouge_lst[i][0]['rouge-1']['p'],
                'rouge-1-f1': rouge_lst[i][0]['rouge-1']['f'],
                'rouge-2-recall': rouge_lst[i][0]['rouge-2']['r'],
                'rouge-2-precision': rouge_lst[i][0]['rouge-2']['p'],
                'rouge-2-f1': rouge_lst[i][0]['rouge-2']['f'],
                'rouge-l-recall': rouge_lst[i][0]['rouge-l']['r'],
                'rouge-l-precision': rouge_lst[i][0]['rouge-l']['p'],
                'rouge-l-f1': rouge_lst[i][0]['rouge-l']['f'],
                'bleu': BLEU_lst[i]['bleu']
                # 'precision_1': BLEU_lst[i]['precisions'][0],
                # 'precision_2': BLEU_lst[i]['precisions'][1],
                # 'brevity_penalty': BLEU_lst[i]['brevity_penalty'],
                # 'length_ratio': BLEU_lst[i]['length_ratio'],
                # 'translation_length': BLEU_lst[i]['translation_length'],
                # 'reference_length': BLEU_lst[i]['reference_length']
            }
            data.append(tool_data)
    return pd.DataFrame(data)

def t_test(group1, group2, group3, label):
    group1_data = preferred[preferred['label'] == label][group1]
    group2_data = preferred[preferred['label'] == label][group2]
    group3_data = preferred[preferred['label'] == label][group3]
    t_stat, p_value = ttest_ind(group1_data, group2_data)
    print(f"t-test between {group1} and {group2} in {label} dataset: P-value: {p_value}")
    t_stat, p_value = ttest_ind(group1_data, group3_data)
    print(f"t-test between {group1} and {group3} in {label} dataset: P-value: {p_value}")


In [3]:
baseline_output1 = """嗨，小红书的朋友们！今天我要和大家分享一下关于纽约大学上海分校商学硕士项目的一些信息。这个项目真的是一个绝佳的选择，尤其是对于那些想要在国际商业舞台上大展拳脚的小伙伴们。

首先，这个项目结合了东西方的教育精华。在这里，你不仅能接触到全球顶尖的商业理论，还能深入了解中国市场的独特之处。课程设置非常灵活，既有严谨的学术研究，也有丰富的实践机会，让你在学习中不断成长。

其次，NYU Shanghai 的师资力量也是一流的。教授们都拥有深厚的行业背景和丰富的教学经验，他们会为你提供最前沿的商业知识和实用技能。而且，学校还经常邀请各领域的大咖来进行讲座和交流，让你有机会与行业领袖面对面沟通。

另外，这里的校园生活也非常多姿多彩。作为一个国际化的学习环境，你会遇到来自世界各地的小伙伴，一起交流、一起成长。这种多元文化氛围不仅能开阔你的视野，还能提升你的跨文化沟通能力。

最后，不得不提的是，NYU Shanghai 的地理位置真的太棒了！坐落于充满活力的大都市上海，这里有无数的发展机遇等着你去探索。不论是实习还是就业，都有着得天独厚的优势。

总之，如果你正在考虑攻读商学硕士，那么NYU Shanghai 绝对值得列入你的选项清单。希望这篇分享能够帮助到正在做决定的小伙伴们！如果还有什么问题或者想了解更多细节，欢迎在评论区留言哦~ 🌟"""

SFT_output1 = """嘿，小红书的朋友们！今天来聊聊NYU上海的商科硕士项目，真的是超赞的选择哦！

首先，这个项目结合了东西方的商业智慧，课程设计非常国际化。你会接触到全球顶尖教授和行业大咖，学习最前沿的商业理论和实战技能。无论是金融、市场营销还是创业创新，都能找到适合你的方向。

其次，NYU上海位于浦东新区，地理位置优越，让你有机会融入中国这个快速发展的经济体。学校还提供丰富的实习和就业资源，为你的职业发展铺平道路。

而且，这里的校园生活也很精彩！各种社团活动、讲座论坛，让你在学习之余还能结识来自世界各地的小伙伴，一起成长进步。

总之，如果你想在国际舞台上大展拳脚，NYU上海绝对是一个值得考虑的地方哦！快来加入我们吧～"""

DPO_output1 = """嘿，小红书的朋友们！今天来聊聊NYU上海的商科硕士项目，绝对是个让人心动的选择哦！

首先，这个项目结合了东西方的商业智慧，课程设置超级国际化。你可以在这里接触到全球顶尖的教授和行业大咖，学习最新的商业趋势和实战技能。

而且，NYU上海位于浦东新区，地理位置得天独厚。无论是实习机会还是职场资源，都非常丰富。想象一下，在这个充满活力的大都市里，你可以轻松参与各种高端商务活动和社交圈子，是不是很赞？

另外，这里的校园生活也很丰富多彩，各种社团活动、讲座、workshop应有尽有，让你的留学生活不再单调。

总之，如果你想要一个既有深度又有广度的商科学习体验，NYU上海绝对值得考虑哦！快来留言告诉我你的想法吧～"""

In [4]:
# load the preferred SFT dataset and DPO dataset
# SFT: 10 posts that have like count above median
# DPO: rest posts that have like count above median
post_df = pd.read_excel("./Competing Programs12052024.xlsx", sheet_name = "Red account data")
post_df.columns = ["row_number", "social_media", "account", "link", "title", "publication_time", "post_type", "like_count", "text"]
post_df = post_df.dropna()
post_df.shape
post_df_red_nyuNew = pd.read_excel("./Competing Programs12052024.xlsx", sheet_name = "Red account data(2)")
post_df_red_nyuNew.columns = ["row_number", "social_media", "account", "link", "title", "publication_time", "post_type", "like_count", "text", "photo_link", "video_link", "photo_count", "video_count"]
post_df_red_nyuNew = post_df_red_nyuNew[["row_number", "social_media", "account", "link", "title", "publication_time", "post_type", "like_count", "text"]]
post_df = pd.concat([post_df, post_df_red_nyuNew])
# retrieve only NYU's posts
post_df = post_df[post_df['account'] == "NYUSH-NYUStern商科硕士"]
preferred = post_df[post_df['like_count'] >= post_df['like_count'].median()]
preferred_SFT = preferred.iloc[:10,:]
preferred_DPO = preferred.iloc[10:,:]
preferred['label'] = ['SFT'] * 10 + ['DPO'] * (preferred.shape[0] - 10)
preferred.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preferred['label'] = ['SFT'] * 10 + ['DPO'] * (preferred.shape[0] - 10)


Unnamed: 0,row_number,social_media,account,link,title,publication_time,post_type,like_count,text,label
1,2,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/65a...,NYU SH-Stern | 聊聊那些关于实习的事儿🤓,2024-01-19 15:54:53,图文,25,作为一个项目时长为12-20个月的紧凑型商科硕士项目，相信不少同学都会好奇，学生在项目就读期...,SFT
3,4,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/659...,NYU SH-Stern｜2024的新年愿望😆,2024-01-05 09:19:55,图文,24,快来看看NYUSH- NYUStern 商科硕士项目同学们2024年的展望和期待吧[偷笑R]...,SFT
6,7,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/657...,纽大资讯丨纪念品商店好物分享🫡,2023-12-15 10:38:06,图文,49,号外号外！爱购物的友友们有福了！🥳 经过半年的等待，上纽大的纪念品校园店(们）再次开张啦~😋...,SFT
8,9,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/656...,上纽商科硕士项目 | MBTI不完整图鉴🔍,2023-12-01 14:06:58,图文,42,NYU Shanghai - NYU Stern商科硕士项目MBTI专题来啦!🥳 “我是E...,SFT
11,12,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/654...,NYUSH-Stern｜商科硕士的早C晚A😎,2023-11-10 10:56:47,图文,37,🤔在忙碌的期中阶段，除了紧张的上课和实习之外，NYU Shanghai-NYU Stern ...,SFT
13,14,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/653...,NYU Shanghai｜忙碌之余也别忘记身边美景🔍,2023-10-27 13:14:23,图文,28,不知不觉已经来到校园Fav Spot系列的最终章了! 金秋十月🍂，除了逐渐冷却下来的空气和渐...,SFT
14,15,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/653...,NYU SHANGHAI｜最美露台🌆,2023-10-20 09:57:23,图文,47,校园Fav Spot系列又更新啦~这次让我们跟着DABC(Data Analytics an...,SFT
15,16,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/652...,NYU Shanghai ｜超级Chill的南楼一角🤩,2023-10-12 12:35:04,图文,32,学期已经过半了啦，快来跟着OMS （Organization Management and ...,SFT
17,18,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/650...,NYU Shanghai | 如果你也是球类运动爱好者,2023-09-22 18:09:41,图文,34,在上海的秋季学期已经开始一段时间啦🍂在上课之余，同学们也正在探索着校园的空间和设施，寻找自己...,SFT
19,20,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/64f...,24届商科硕士项目秋季学期开始啦🥳,2023-09-08 15:24:51,图文,26,2024届NYUSH - NYU Stern商科硕士的同学结束了在纽约三个月的精彩学习生活，...,SFT


In [56]:
# Generate ROUGE score
preferred['baseline1-rouge-1-recall'] = preferred.apply(lambda x: calculate_rouge(baseline_output1, x['text'])[0]['rouge-1']['r'], axis = 1)
preferred['baseline1-rouge-1-precision'] = preferred.apply(lambda x: calculate_rouge(baseline_output1, x['text'])[0]['rouge-1']['p'], axis = 1)
preferred['baseline1-rouge-1-f1'] = preferred.apply(lambda x: calculate_rouge(baseline_output1, x['text'])[0]['rouge-1']['f'], axis = 1)
preferred['baseline1-rouge-2-recall'] = preferred.apply(lambda x: calculate_rouge(baseline_output1, x['text'])[0]['rouge-2']['r'], axis = 1)
preferred['baseline1-rouge-2-precision'] = preferred.apply(lambda x: calculate_rouge(baseline_output1, x['text'])[0]['rouge-1']['p'], axis = 1)
preferred['baseline1-rouge-2-f1'] = preferred.apply(lambda x: calculate_rouge(baseline_output1, x['text'])[0]['rouge-2']['f'], axis = 1)
preferred['baseline1-rouge-l-recall'] = preferred.apply(lambda x: calculate_rouge(baseline_output1, x['text'])[0]['rouge-l']['r'], axis = 1)
preferred['baseline1-rouge-l-precision'] = preferred.apply(lambda x: calculate_rouge(baseline_output1, x['text'])[0]['rouge-l']['p'], axis = 1)
preferred['baseline1-rouge-l-f1'] = preferred.apply(lambda x: calculate_rouge(baseline_output1, x['text'])[0]['rouge-l']['f'], axis = 1)

preferred['SFT1-rouge-1-recall'] = preferred.apply(lambda x: calculate_rouge(SFT_output1, x['text'])[0]['rouge-1']['r'], axis = 1)
preferred['SFT1-rouge-1-precision'] = preferred.apply(lambda x: calculate_rouge(SFT_output1, x['text'])[0]['rouge-1']['p'], axis = 1)
preferred['SFT1-rouge-1-f1'] = preferred.apply(lambda x: calculate_rouge(SFT_output1, x['text'])[0]['rouge-1']['f'], axis = 1)
preferred['SFT1-rouge-2-recall'] = preferred.apply(lambda x: calculate_rouge(SFT_output1, x['text'])[0]['rouge-2']['r'], axis = 1)
preferred['SFT1-rouge-2-precision'] = preferred.apply(lambda x: calculate_rouge(SFT_output1, x['text'])[0]['rouge-1']['p'], axis = 1)
preferred['SFT1-rouge-2-f1'] = preferred.apply(lambda x: calculate_rouge(SFT_output1, x['text'])[0]['rouge-2']['f'], axis = 1)
preferred['SFT1-rouge-l-recall'] = preferred.apply(lambda x: calculate_rouge(SFT_output1, x['text'])[0]['rouge-l']['r'], axis = 1)
preferred['SFT1-rouge-l-precision'] = preferred.apply(lambda x: calculate_rouge(SFT_output1, x['text'])[0]['rouge-l']['p'], axis = 1)
preferred['SFT1-rouge-l-f1'] = preferred.apply(lambda x: calculate_rouge(SFT_output1, x['text'])[0]['rouge-l']['f'], axis = 1)

preferred['DPO1-rouge-1-recall'] = preferred.apply(lambda x: calculate_rouge(DPO_output1, x['text'])[0]['rouge-1']['r'], axis = 1)
preferred['DPO1-rouge-1-precision'] = preferred.apply(lambda x: calculate_rouge(DPO_output1, x['text'])[0]['rouge-1']['p'], axis = 1)
preferred['DPO1-rouge-1-f1'] = preferred.apply(lambda x: calculate_rouge(DPO_output1, x['text'])[0]['rouge-1']['f'], axis = 1)
preferred['DPO1-rouge-2-recall'] = preferred.apply(lambda x: calculate_rouge(DPO_output1, x['text'])[0]['rouge-2']['r'], axis = 1)
preferred['DPO1-rouge-2-precision'] = preferred.apply(lambda x: calculate_rouge(DPO_output1, x['text'])[0]['rouge-1']['p'], axis = 1)
preferred['DPO1-rouge-2-f1'] = preferred.apply(lambda x: calculate_rouge(DPO_output1, x['text'])[0]['rouge-2']['f'], axis = 1)
preferred['DPO1-rouge-l-recall'] = preferred.apply(lambda x: calculate_rouge(DPO_output1, x['text'])[0]['rouge-l']['r'], axis = 1)
preferred['DPO1-rouge-l-precision'] = preferred.apply(lambda x: calculate_rouge(DPO_output1, x['text'])[0]['rouge-l']['p'], axis = 1)
preferred['DPO1-rouge-l-f1'] = preferred.apply(lambda x: calculate_rouge(DPO_output1, x['text'])[0]['rouge-l']['f'], axis = 1)

# Generate BLEU
preferred['baseline1-BLEU'] = preferred.apply(lambda x: calculate_BLEU(baseline_output1, x['text'])['bleu'], axis = 1)
preferred['SFT1-BLEU'] = preferred.apply(lambda x: calculate_BLEU(SFT_output1, x['text'])['bleu'], axis = 1)
preferred['DPO1-BLEU'] = preferred.apply(lambda x: calculate_BLEU(DPO_output1, x['text'])['bleu'], axis = 1)

preferred.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preferred['baseline1-rouge-1-recall'] = preferred.apply(lambda x: calculate_rouge(baseline_output1, x['text'])[0]['rouge-1']['r'], axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preferred['baseline1-rouge-1-precision'] = preferred.apply(lambda x: calculate_rouge(baseline_output1, x['text'])[0]['rouge-1']['p'], axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.o

Unnamed: 0,row_number,social_media,account,link,title,publication_time,post_type,like_count,text,label,...,DPO2-rouge-1-f1,DPO2-rouge-2-recall,DPO2-rouge-2-precision,DPO2-rouge-2-f1,DPO2-rouge-l-recall,DPO2-rouge-l-precision,DPO2-rouge-l-f1,baseline2-BLEU,SFT2-BLEU,DPO2-BLEU
1,2,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/65a...,NYU SH-Stern | 聊聊那些关于实习的事儿🤓,2024-01-19 15:54:53,图文,25,作为一个项目时长为12-20个月的紧凑型商科硕士项目，相信不少同学都会好奇，学生在项目就读期...,SFT,...,0.212121,0.02439,0.229508,0.026316,0.151659,0.171123,0.160804,0.053959,0.05725,0.079276
3,4,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/659...,NYU SH-Stern｜2024的新年愿望😆,2024-01-05 09:19:55,图文,24,快来看看NYUSH- NYUStern 商科硕士项目同学们2024年的展望和期待吧[偷笑R]...,SFT,...,0.170854,0.02439,0.298246,0.035971,0.066351,0.175,0.09622,0.007797,0.038882,0.028275
6,7,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/657...,纽大资讯丨纪念品商店好物分享🫡,2023-12-15 10:38:06,图文,49,号外号外！爱购物的友友们有福了！🥳 经过半年的等待，上纽大的纪念品校园店(们）再次开张啦~😋...,SFT,...,0.138686,0.009756,0.143939,0.010101,0.090047,0.085202,0.087558,0.045535,0.022917,0.037603
8,9,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/656...,上纽商科硕士项目 | MBTI不完整图鉴🔍,2023-12-01 14:06:58,图文,42,NYU Shanghai - NYU Stern商科硕士项目MBTI专题来啦!🥳 “我是E...,SFT,...,0.207171,0.039024,0.238532,0.043127,0.123223,0.140541,0.131313,0.0553,0.075687,0.09922
11,12,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/654...,NYUSH-Stern｜商科硕士的早C晚A😎,2023-11-10 10:56:47,图文,37,🤔在忙碌的期中阶段，除了紧张的上课和实习之外，NYU Shanghai-NYU Stern ...,SFT,...,0.183575,0.009756,0.292308,0.013559,0.07109,0.154639,0.097403,0.017619,0.041763,0.0268


In [57]:
preferred.columns

Index(['row_number', 'social_media', 'account', 'link', 'title',
       'publication_time', 'post_type', 'like_count', 'text', 'label',
       'baseline1-rouge-1-recall', 'baseline1-rouge-1-precision',
       'baseline1-rouge-1-f1', 'baseline1-rouge-2-recall',
       'baseline1-rouge-2-precision', 'baseline1-rouge-2-f1',
       'baseline1-rouge-l-recall', 'baseline1-rouge-l-precision',
       'baseline1-rouge-l-f1', 'SFT1-rouge-1-recall', 'SFT1-rouge-1-precision',
       'SFT1-rouge-1-f1', 'SFT1-rouge-2-recall', 'SFT1-rouge-2-precision',
       'SFT1-rouge-2-f1', 'SFT1-rouge-l-recall', 'SFT1-rouge-l-precision',
       'SFT1-rouge-l-f1', 'DPO1-rouge-1-recall', 'DPO1-rouge-1-precision',
       'DPO1-rouge-1-f1', 'DPO1-rouge-2-recall', 'DPO1-rouge-2-precision',
       'DPO1-rouge-2-f1', 'DPO1-rouge-l-recall', 'DPO1-rouge-l-precision',
       'DPO1-rouge-l-f1', 'baseline1-BLEU', 'SFT1-BLEU', 'DPO1-BLEU',
       'baseline2-rouge-1-recall', 'baseline2-rouge-1-precision',
       'baseline2-ro

In [58]:
print(preferred.groupby('label')[['baseline1-BLEU', 'SFT1-BLEU', 'DPO1-BLEU']].mean())
t_test("baseline1-BLEU", "SFT1-BLEU", "DPO1-BLEU", "DPO")
t_test("baseline1-BLEU", "SFT1-BLEU", "DPO1-BLEU", "SFT")

       baseline1-BLEU  SFT1-BLEU  DPO1-BLEU
label                                      
DPO          0.026363   0.029721   0.031414
SFT          0.035362   0.049372   0.049875
t-test between baseline1-BLEU and SFT1-BLEU in DPO dataset: P-value: 0.5718926116150247
t-test between baseline1-BLEU and DPO1-BLEU in DPO dataset: P-value: 0.41970276340663737
t-test between baseline1-BLEU and SFT1-BLEU in SFT dataset: P-value: 0.07261602624808279
t-test between baseline1-BLEU and DPO1-BLEU in SFT dataset: P-value: 0.05472374902609936


In [59]:
print(preferred.groupby('label')[['baseline1-rouge-l-f1', 'SFT1-rouge-l-f1', 'DPO1-rouge-l-f1']].mean())
t_test("baseline1-rouge-l-f1", "SFT1-rouge-l-f1", "DPO1-rouge-l-f1", "DPO")
t_test("baseline1-rouge-l-f1", "SFT1-rouge-l-f1", "DPO1-rouge-l-f1", "SFT")

       baseline1-rouge-l-f1  SFT1-rouge-l-f1  DPO1-rouge-l-f1
label                                                        
DPO                0.080012         0.105410         0.107876
SFT                0.092556         0.115373         0.120849
t-test between baseline1-rouge-l-f1 and SFT1-rouge-l-f1 in DPO dataset: P-value: 0.014151833286919269
t-test between baseline1-rouge-l-f1 and DPO1-rouge-l-f1 in DPO dataset: P-value: 0.00734454198012355
t-test between baseline1-rouge-l-f1 and SFT1-rouge-l-f1 in SFT dataset: P-value: 0.08479708195257823
t-test between baseline1-rouge-l-f1 and DPO1-rouge-l-f1 in SFT dataset: P-value: 0.01767911447832468


In [60]:
print(preferred.groupby('label')[['baseline1-rouge-l-recall', 'SFT1-rouge-l-recall', 'DPO1-rouge-l-recall']].mean())
t_test('baseline1-rouge-l-recall', 'SFT1-rouge-l-recall', 'DPO1-rouge-l-recall', "DPO")
t_test('baseline1-rouge-l-recall', 'SFT1-rouge-l-recall', 'DPO1-rouge-l-recall', "SFT")

       baseline1-rouge-l-recall  SFT1-rouge-l-recall  DPO1-rouge-l-recall
label                                                                    
DPO                    0.211797             0.191413             0.194017
SFT                    0.162894             0.140793             0.149195
t-test between baseline1-rouge-l-recall and SFT1-rouge-l-recall in DPO dataset: P-value: 0.2586078156896632
t-test between baseline1-rouge-l-recall and DPO1-rouge-l-recall in DPO dataset: P-value: 0.32450676774970966
t-test between baseline1-rouge-l-recall and SFT1-rouge-l-recall in SFT dataset: P-value: 0.1388582871940507
t-test between baseline1-rouge-l-recall and DPO1-rouge-l-recall in SFT dataset: P-value: 0.3843456844639904


In [61]:
print(preferred.groupby('label')[['baseline1-rouge-l-precision', 'SFT1-rouge-l-precision', 'DPO1-rouge-l-precision']].mean())
t_test('baseline1-rouge-l-precision', 'SFT1-rouge-l-precision', 'DPO1-rouge-l-precision', "DPO")
t_test('baseline1-rouge-l-precision', 'SFT1-rouge-l-precision', 'DPO1-rouge-l-precision', "SFT")

       baseline1-rouge-l-precision  SFT1-rouge-l-precision  \
label                                                        
DPO                       0.054857                0.084603   
SFT                       0.067692                0.105000   

       DPO1-rouge-l-precision  
label                          
DPO                  0.087167  
SFT                  0.109605  
t-test between baseline1-rouge-l-precision and SFT1-rouge-l-precision in DPO dataset: P-value: 0.006132288459567998
t-test between baseline1-rouge-l-precision and DPO1-rouge-l-precision in DPO dataset: P-value: 0.0036100750136554517
t-test between baseline1-rouge-l-precision and SFT1-rouge-l-precision in SFT dataset: P-value: 0.016886996438425818
t-test between baseline1-rouge-l-precision and DPO1-rouge-l-precision in SFT dataset: P-value: 0.003280257050972228


In [62]:
print(preferred.groupby('label')[['baseline1-rouge-2-f1', 'SFT1-rouge-2-f1', 'DPO1-rouge-2-f1']].mean())
t_test('baseline1-rouge-2-f1', 'SFT1-rouge-2-f1', 'DPO1-rouge-2-f1', "DPO")
t_test('baseline1-rouge-2-f1', 'SFT1-rouge-2-f1', 'DPO1-rouge-2-f1', "SFT")

       baseline1-rouge-2-f1  SFT1-rouge-2-f1  DPO1-rouge-2-f1
label                                                        
DPO                0.013162         0.013893         0.015677
SFT                0.016549         0.019861         0.019709
t-test between baseline1-rouge-2-f1 and SFT1-rouge-2-f1 in DPO dataset: P-value: 0.8036417336373033
t-test between baseline1-rouge-2-f1 and DPO1-rouge-2-f1 in DPO dataset: P-value: 0.4293358816845173
t-test between baseline1-rouge-2-f1 and SFT1-rouge-2-f1 in SFT dataset: P-value: 0.4855753095850347
t-test between baseline1-rouge-2-f1 and DPO1-rouge-2-f1 in SFT dataset: P-value: 0.5193475536300038


In [63]:
print(preferred.groupby('label')[['baseline1-rouge-1-f1', 'SFT1-rouge-1-f1', 'DPO1-rouge-1-f1']].mean())
t_test('baseline1-rouge-1-f1', 'SFT1-rouge-1-f1', 'DPO1-rouge-1-f1', "DPO")
t_test('baseline1-rouge-1-f1', 'SFT1-rouge-1-f1', 'DPO1-rouge-1-f1', "SFT")

       baseline1-rouge-1-f1  SFT1-rouge-1-f1  DPO1-rouge-1-f1
label                                                        
DPO                0.123745         0.158384         0.148905
SFT                0.151436         0.170397         0.174234
t-test between baseline1-rouge-1-f1 and SFT1-rouge-1-f1 in DPO dataset: P-value: 0.011283921557443032
t-test between baseline1-rouge-1-f1 and DPO1-rouge-1-f1 in DPO dataset: P-value: 0.05968618728344989
t-test between baseline1-rouge-1-f1 and SFT1-rouge-1-f1 in SFT dataset: P-value: 0.21900432453671292
t-test between baseline1-rouge-1-f1 and DPO1-rouge-1-f1 in SFT dataset: P-value: 0.1640154375090652


In [64]:
baseline_output2 = """🌟 亲爱的小红书朋友们，今天我要为大家介绍的是纽约大学上海分校的商业硕士项目！如果你对商业世界充满热情，并且希望在国际化的环境中学习，那这个项目绝对是你的不二选择哦！

📚 **项目亮点：**
1. **全球视野**：纽约大学上海分校不仅地理位置优越，还汇聚了来自世界各地的顶尖学者。课程内容紧跟全球商业趋势，让你站在最前沿。
2. **实践导向**：这里强调理论与实际结合，通过案例分析、企业实习等方式，让你将课堂知识应用于真实商业环境。
3. **多元文化交流**：在这个多元文化的校园里，你可以结识到来自不同国家和背景的小伙伴，一起分享经验，共同成长。

💼 **就业前景：**
毕业生广受各大国际企业欢迎，无论是进入跨国公司还是创业，都有非常可观的发展机会。而且，学校还提供强大的校友网络支持，为你的职业发展保驾护航。

✨ **生活体验：**
除了学术上的收获，上海这座活力四射的大都市也会让你的留学生活丰富多彩。从美食到文化，从购物到夜生活，这里总有新鲜事等着你去探索。

🔗 总之，如果你想要一个充满挑战和机遇的学习旅程，不妨考虑加入NYU Shanghai的商业硕士项目吧！赶快点击官网了解更多详情哦～

加油，未来属于勇敢追梦的你！💪

#NYUShanghai #商科硕士 #留学生活 #国际视野 #职场进阶"""

SFT_output2 = """嘿，大家好！👋 今天来聊聊NYU上海的商科硕士项目~ 🎓

首先，这个项目真的是超级有国际范儿！🌍 课程设计结合了全球商业视野和本土市场洞察，绝对能让你大开眼界。教授团队也很给力，大多是业界大咖或者学术牛人。

这里的学习氛围特别活跃，讨论课、案例分析、团队合作一个都不少。📚✌️ 同学们来自世界各地，多元文化碰撞，思维火花四溅！

还有哦，学校的位置也是一大亮点，在魔都中心地带，实习机会多到数不过来！💼 无论是投行、咨询还是科技公司，都触手可及。

如果你想提升自己的商业技能，同时获得全球化视野，这里绝对是不二选择！🔥 有问题记得留言哦，我们下次见~ 😄"""

DPO_output2 = """大家好！今天给大家分享一下NYU上海的商科硕士项目，真的超级赞哦！🎓

首先，这个项目融合了全球视野和本土智慧。你可以在国际化环境中学习到最新的商业知识，同时又能接触到中国市场的独特机会。📈

课程设置非常灵活，有丰富多样的选修课，你可以根据自己的兴趣和职业规划来选择。而且，教授们都是业内大咖，课堂上互动性很强，让人感觉既充实又有趣。🧠✨

另外，学校地处上海这座国际大都市，不仅提供了丰富的实习资源，还有各种企业合作项目，可以积累不少职场经验呢！💼🌍

最重要的是，这里汇聚了来自世界各地的小伙伴，大家互相交流、碰撞思维，每天都特别有动力。不管你是想创业还是进入知名企业，这个项目都会让你的未来更加精彩！

如果对商科感兴趣的小伙伴们，一定要考虑一下哦～期待在NYU上海遇见更优秀的你们！🌟"""

In [66]:
# Generate ROUGE score
preferred['baseline2-rouge-1-recall'] = preferred.apply(lambda x: calculate_rouge(baseline_output2, x['text'])[0]['rouge-1']['r'], axis = 1)
preferred['baseline2-rouge-1-precision'] = preferred.apply(lambda x: calculate_rouge(baseline_output2, x['text'])[0]['rouge-1']['p'], axis = 1)
preferred['baseline2-rouge-1-f1'] = preferred.apply(lambda x: calculate_rouge(baseline_output2, x['text'])[0]['rouge-1']['f'], axis = 1)
preferred['baseline2-rouge-2-recall'] = preferred.apply(lambda x: calculate_rouge(baseline_output2, x['text'])[0]['rouge-2']['r'], axis = 1)
preferred['baseline2-rouge-2-precision'] = preferred.apply(lambda x: calculate_rouge(baseline_output2, x['text'])[0]['rouge-1']['p'], axis = 1)
preferred['baseline2-rouge-2-f1'] = preferred.apply(lambda x: calculate_rouge(baseline_output2, x['text'])[0]['rouge-2']['f'], axis = 1)
preferred['baseline2-rouge-l-recall'] = preferred.apply(lambda x: calculate_rouge(baseline_output2, x['text'])[0]['rouge-l']['r'], axis = 1)
preferred['baseline2-rouge-l-precision'] = preferred.apply(lambda x: calculate_rouge(baseline_output2, x['text'])[0]['rouge-l']['p'], axis = 1)
preferred['baseline2-rouge-l-f1'] = preferred.apply(lambda x: calculate_rouge(baseline_output2, x['text'])[0]['rouge-l']['f'], axis = 1)

preferred['SFT2-rouge-1-recall'] = preferred.apply(lambda x: calculate_rouge(SFT_output2, x['text'])[0]['rouge-1']['r'], axis = 1)
preferred['SFT2-rouge-1-precision'] = preferred.apply(lambda x: calculate_rouge(SFT_output2, x['text'])[0]['rouge-1']['p'], axis = 1)
preferred['SFT2-rouge-1-f1'] = preferred.apply(lambda x: calculate_rouge(SFT_output2, x['text'])[0]['rouge-1']['f'], axis = 1)
preferred['SFT2-rouge-2-recall'] = preferred.apply(lambda x: calculate_rouge(SFT_output2, x['text'])[0]['rouge-2']['r'], axis = 1)
preferred['SFT2-rouge-2-precision'] = preferred.apply(lambda x: calculate_rouge(SFT_output2, x['text'])[0]['rouge-1']['p'], axis = 1)
preferred['SFT2-rouge-2-f1'] = preferred.apply(lambda x: calculate_rouge(SFT_output2, x['text'])[0]['rouge-2']['f'], axis = 1)
preferred['SFT2-rouge-l-recall'] = preferred.apply(lambda x: calculate_rouge(SFT_output2, x['text'])[0]['rouge-l']['r'], axis = 1)
preferred['SFT2-rouge-l-precision'] = preferred.apply(lambda x: calculate_rouge(SFT_output2, x['text'])[0]['rouge-l']['p'], axis = 1)
preferred['SFT2-rouge-l-f1'] = preferred.apply(lambda x: calculate_rouge(SFT_output2, x['text'])[0]['rouge-l']['f'], axis = 1)

preferred['DPO2-rouge-1-recall'] = preferred.apply(lambda x: calculate_rouge(DPO_output2, x['text'])[0]['rouge-1']['r'], axis = 1)
preferred['DPO2-rouge-1-precision'] = preferred.apply(lambda x: calculate_rouge(DPO_output2, x['text'])[0]['rouge-1']['p'], axis = 1)
preferred['DPO2-rouge-1-f1'] = preferred.apply(lambda x: calculate_rouge(DPO_output2, x['text'])[0]['rouge-1']['f'], axis = 1)
preferred['DPO2-rouge-2-recall'] = preferred.apply(lambda x: calculate_rouge(DPO_output2, x['text'])[0]['rouge-2']['r'], axis = 1)
preferred['DPO2-rouge-2-precision'] = preferred.apply(lambda x: calculate_rouge(DPO_output2, x['text'])[0]['rouge-1']['p'], axis = 1)
preferred['DPO2-rouge-2-f1'] = preferred.apply(lambda x: calculate_rouge(DPO_output2, x['text'])[0]['rouge-2']['f'], axis = 1)
preferred['DPO2-rouge-l-recall'] = preferred.apply(lambda x: calculate_rouge(DPO_output2, x['text'])[0]['rouge-l']['r'], axis = 1)
preferred['DPO2-rouge-l-precision'] = preferred.apply(lambda x: calculate_rouge(DPO_output2, x['text'])[0]['rouge-l']['p'], axis = 1)
preferred['DPO2-rouge-l-f1'] = preferred.apply(lambda x: calculate_rouge(DPO_output2, x['text'])[0]['rouge-l']['f'], axis = 1)

# Generate BLEU
preferred['baseline2-BLEU'] = preferred.apply(lambda x: calculate_BLEU(x['text'], baseline_output2)['bleu'], axis = 1)
preferred['SFT2-BLEU'] = preferred.apply(lambda x: calculate_BLEU(x['text'], SFT_output2)['bleu'], axis = 1)
preferred['DPO2-BLEU'] = preferred.apply(lambda x: calculate_BLEU(x['text'], DPO_output2)['bleu'], axis = 1)

preferred.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preferred['baseline2-rouge-1-recall'] = preferred.apply(lambda x: calculate_rouge(baseline_output2, x['text'])[0]['rouge-1']['r'], axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preferred['baseline2-rouge-1-precision'] = preferred.apply(lambda x: calculate_rouge(baseline_output2, x['text'])[0]['rouge-1']['p'], axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.o

Unnamed: 0,row_number,social_media,account,link,title,publication_time,post_type,like_count,text,label,...,DPO2-rouge-1-f1,DPO2-rouge-2-recall,DPO2-rouge-2-precision,DPO2-rouge-2-f1,DPO2-rouge-l-recall,DPO2-rouge-l-precision,DPO2-rouge-l-f1,baseline2-BLEU,SFT2-BLEU,DPO2-BLEU
1,2,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/65a...,NYU SH-Stern | 聊聊那些关于实习的事儿🤓,2024-01-19 15:54:53,图文,25,作为一个项目时长为12-20个月的紧凑型商科硕士项目，相信不少同学都会好奇，学生在项目就读期...,SFT,...,0.212121,0.028571,0.197183,0.026316,0.171123,0.151659,0.160804,0.053959,0.05725,0.079276
3,4,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/659...,NYU SH-Stern｜2024的新年愿望😆,2024-01-05 09:19:55,图文,24,快来看看NYUSH- NYUStern 商科硕士项目同学们2024年的展望和期待吧[偷笑R]...,SFT,...,0.170854,0.068493,0.119718,0.035971,0.175,0.066351,0.09622,0.007797,0.038882,0.028275
6,7,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/657...,纽大资讯丨纪念品商店好物分享🫡,2023-12-15 10:38:06,图文,49,号外号外！爱购物的友友们有福了！🥳 经过半年的等待，上纽大的纪念品校园店(们）再次开张啦~😋...,SFT,...,0.138686,0.010471,0.133803,0.010101,0.085202,0.090047,0.087558,0.045535,0.022917,0.037603
8,9,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/656...,上纽商科硕士项目 | MBTI不完整图鉴🔍,2023-12-01 14:06:58,图文,42,NYU Shanghai - NYU Stern商科硕士项目MBTI专题来啦!🥳 “我是E...,SFT,...,0.207171,0.048193,0.183099,0.043127,0.140541,0.123223,0.131313,0.0553,0.075687,0.09922
11,12,小红书,NYUSH-NYUStern商科硕士,https://www.xiaohongshu.com/discovery/item/654...,NYUSH-Stern｜商科硕士的早C晚A😎,2023-11-10 10:56:47,图文,37,🤔在忙碌的期中阶段，除了紧张的上课和实习之外，NYU Shanghai-NYU Stern ...,SFT,...,0.183575,0.022222,0.133803,0.013559,0.154639,0.07109,0.097403,0.017619,0.041763,0.0268


In [67]:
print(preferred.groupby('label')[['baseline2-BLEU', 'SFT2-BLEU', 'DPO2-BLEU']].mean())
t_test('baseline2-BLEU', 'SFT2-BLEU', 'DPO2-BLEU', "DPO")
t_test('baseline2-BLEU', 'SFT2-BLEU', 'DPO2-BLEU', "SFT")

       baseline2-BLEU  SFT2-BLEU  DPO2-BLEU
label                                      
DPO          0.014485   0.025971   0.022270
SFT          0.031002   0.042975   0.043821
t-test between baseline2-BLEU and SFT2-BLEU in DPO dataset: P-value: 0.046295903306933964
t-test between baseline2-BLEU and DPO2-BLEU in DPO dataset: P-value: 0.20881764768260394
t-test between baseline2-BLEU and SFT2-BLEU in SFT dataset: P-value: 0.1998262422534313
t-test between baseline2-BLEU and DPO2-BLEU in SFT dataset: P-value: 0.26125001734977427


In [68]:
print(preferred.groupby('label')[['baseline2-rouge-l-f1', 'SFT2-rouge-l-f1', 'DPO2-rouge-l-f1']].mean())
t_test('baseline2-rouge-l-f1', 'SFT2-rouge-l-f1', 'DPO2-rouge-l-f1', "DPO")
t_test('baseline2-rouge-l-f1', 'SFT2-rouge-l-f1', 'DPO2-rouge-l-f1', "SFT")

       baseline2-rouge-l-f1  SFT2-rouge-l-f1  DPO2-rouge-l-f1
label                                                        
DPO                0.086164         0.091617         0.097466
SFT                0.110445         0.100827         0.109272
t-test between baseline2-rouge-l-f1 and SFT2-rouge-l-f1 in DPO dataset: P-value: 0.5475287832482787
t-test between baseline2-rouge-l-f1 and DPO2-rouge-l-f1 in DPO dataset: P-value: 0.25093871196370654
t-test between baseline2-rouge-l-f1 and SFT2-rouge-l-f1 in SFT dataset: P-value: 0.377922500023269
t-test between baseline2-rouge-l-f1 and DPO2-rouge-l-f1 in SFT dataset: P-value: 0.9201539363481727


In [69]:
print(preferred.groupby('label')[['baseline2-rouge-l-recall', 'SFT2-rouge-l-recall', 'DPO2-rouge-l-recall']].mean())
t_test('baseline2-rouge-l-recall', 'SFT2-rouge-l-recall', 'DPO2-rouge-l-recall', "DPO")
t_test('baseline2-rouge-l-recall', 'SFT2-rouge-l-recall', 'DPO2-rouge-l-recall', "SFT")

       baseline2-rouge-l-recall  SFT2-rouge-l-recall  DPO2-rouge-l-recall
label                                                                    
DPO                    0.238764             0.164035             0.205006
SFT                    0.202621             0.121667             0.146333
t-test between baseline2-rouge-l-recall and SFT2-rouge-l-recall in DPO dataset: P-value: 1.1469979767257981e-05
t-test between baseline2-rouge-l-recall and DPO2-rouge-l-recall in DPO dataset: P-value: 0.02601125152970198
t-test between baseline2-rouge-l-recall and SFT2-rouge-l-recall in SFT dataset: P-value: 0.00011254637030532827
t-test between baseline2-rouge-l-recall and DPO2-rouge-l-recall in SFT dataset: P-value: 0.0029370669771414007


In [70]:
print(preferred.groupby('label')[['baseline2-rouge-l-precision', 'SFT2-rouge-l-precision', 'DPO2-rouge-l-precision']].mean())
t_test('baseline2-rouge-l-precision', 'SFT2-rouge-l-precision', 'DPO2-rouge-l-precision', "DPO")
t_test('baseline2-rouge-l-precision', 'SFT2-rouge-l-precision', 'DPO2-rouge-l-precision', "SFT")

       baseline2-rouge-l-precision  SFT2-rouge-l-precision  \
label                                                        
DPO                       0.058348                0.074252   
SFT                       0.079819                0.092442   

       DPO2-rouge-l-precision  
label                          
DPO                  0.074340  
SFT                  0.092891  
t-test between baseline2-rouge-l-precision and SFT2-rouge-l-precision in DPO dataset: P-value: 0.09185029769613746
t-test between baseline2-rouge-l-precision and DPO2-rouge-l-precision in DPO dataset: P-value: 0.10113000990092577
t-test between baseline2-rouge-l-precision and SFT2-rouge-l-precision in SFT dataset: P-value: 0.29901434956351775
t-test between baseline2-rouge-l-precision and DPO2-rouge-l-precision in SFT dataset: P-value: 0.310279758560454


In [71]:
print(preferred.groupby('label')[['baseline2-rouge-2-f1', 'SFT2-rouge-2-f1', 'DPO2-rouge-2-f1']].mean())
t_test('baseline2-rouge-2-f1', 'SFT2-rouge-2-f1', 'DPO2-rouge-2-f1', "DPO")
t_test('baseline2-rouge-2-f1', 'SFT2-rouge-2-f1', 'DPO2-rouge-2-f1', "SFT")

       baseline2-rouge-2-f1  SFT2-rouge-2-f1  DPO2-rouge-2-f1
label                                                        
DPO                0.016748         0.017333         0.013983
SFT                0.026259         0.019378         0.020225
t-test between baseline2-rouge-2-f1 and SFT2-rouge-2-f1 in DPO dataset: P-value: 0.8484362423380087
t-test between baseline2-rouge-2-f1 and DPO2-rouge-2-f1 in DPO dataset: P-value: 0.31927382851589675
t-test between baseline2-rouge-2-f1 and SFT2-rouge-2-f1 in SFT dataset: P-value: 0.0789809880422389
t-test between baseline2-rouge-2-f1 and DPO2-rouge-2-f1 in SFT dataset: P-value: 0.20021550749987044


In [72]:
print(preferred.groupby('label')[['baseline2-rouge-1-f1', 'SFT2-rouge-1-f1', 'DPO2-rouge-1-f1']].mean())
t_test('baseline2-rouge-1-f1', 'SFT2-rouge-1-f1', 'DPO2-rouge-1-f1', "DPO")
t_test('baseline2-rouge-1-f1', 'SFT2-rouge-1-f1', 'DPO2-rouge-1-f1', "SFT")

       baseline2-rouge-1-f1  SFT2-rouge-1-f1  DPO2-rouge-1-f1
label                                                        
DPO                0.141580         0.146390         0.142964
SFT                0.171088         0.169929         0.168208
t-test between baseline2-rouge-1-f1 and SFT2-rouge-1-f1 in DPO dataset: P-value: 0.7198678951468245
t-test between baseline2-rouge-1-f1 and DPO2-rouge-1-f1 in DPO dataset: P-value: 0.9204479050164079
t-test between baseline2-rouge-1-f1 and SFT2-rouge-1-f1 in SFT dataset: P-value: 0.938202895834164
t-test between baseline2-rouge-1-f1 and DPO2-rouge-1-f1 in SFT dataset: P-value: 0.8580240258881007
