In [1]:
%pip install lexical-diversity

Collecting lexical-diversity
  Downloading lexical_diversity-0.1.1-py3-none-any.whl (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.8/117.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lexical-diversity
Successfully installed lexical-diversity-0.1.1


In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive. All data paths in this notebook are
# written relative to 'drive/MyDrive/...'; presumably they rely on the working
# directory being /content -- confirm when running outside Colab.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
# Fetch the spaCy model that is loaded in the next cell; %%capture keeps the
# downloader's console output out of the notebook.
!python -m spacy download en_core_web_lg

In [None]:
import spacy
from lexical_diversity import lex_div as ld

# Load the spaCy pipeline once at module level; lemmatize() reuses this object
# for every text instead of reloading the model per call.
nlp = spacy.load("en_core_web_lg")

def lemmatize(text):
  """Run `text` through the module-level spaCy pipeline `nlp` and return the
  lemma of every token, in document order, as a list of strings.

  No tokens are filtered out: whatever spaCy tokenises contributes one lemma.
  """
  return [tok.lemma_ for tok in nlp(text)]

In [None]:
def get_diversity_stats(lemmatized_text):
  """Compute the lexical-diversity measures for one tokenised text.

  `lemmatized_text` is the token list to score (callers pass the output of
  lemmatize()). Returns a dict mapping metric name -> value, one entry per
  `lex_div` function called below. MSTTR and MATTR use a 25-token window.

  The Maas TTR value is stored under the key 'mass_ttr' (sic); the reporting
  cells read that exact column name from the saved CSVs, so the spelling is
  kept as is.
  """
  stats = {}
  stats['simple_ttr'] = ld.ttr(lemmatized_text)
  stats['root_ttr'] = ld.root_ttr(lemmatized_text)
  stats['log_ttr'] = ld.log_ttr(lemmatized_text)
  stats['mass_ttr'] = ld.maas_ttr(lemmatized_text)
  stats['msttr'] = ld.msttr(lemmatized_text, window_length=25)
  stats['mattr'] = ld.mattr(lemmatized_text, window_length=25)
  stats['hdd'] = ld.hdd(lemmatized_text)
  stats['mtld'] = ld.mtld(lemmatized_text)
  stats['mtld_ma_wrap'] = ld.mtld_ma_wrap(lemmatized_text)
  stats['mtld_ma_bid'] = ld.mtld_ma_bid(lemmatized_text)
  return stats

## Data - LLaMA-2 (SAME)

In [None]:
import re
import pandas as pd

# LLaMA-2 (SAME) generations are split across two run folders.
# Climate and defense are read in full.
dfs = [
  pd.read_csv(f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/llama2-7b-200-climate-defense/{topic}/data.csv')
  for topic in ['Indian climate', 'Indian defense']
]
# Economy and infrastructure come from the second folder; only the first 500
# rows of each file are kept.
dfs += [
  pd.read_csv(f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/llama2-7b-200-economy-infra/{topic}/data.csv')[:500]
  for topic in ['Indian economy', 'Indian infrastructure']
]

# Labels paired positionally with `dfs` by the scoring cells; 'Combined' names
# the concatenated frame that the next cell appends to `dfs`.
TOPICS = ['Indian climate', 'Indian defense', 'Indian economy', 'Indian infrastructure', 'Combined']

In [None]:
# Stack the per-topic frames into one frame with a fresh 0..n-1 index.
# A single pd.concat replaces the former DataFrame.append loop: append is
# deprecated (it emitted a FutureWarning on every iteration), is removed in
# pandas 2.0, and re-copied the accumulated frame on each pass.
df_combined = pd.concat(dfs, ignore_index=True)

# The combined frame becomes the last entry of `dfs`, lining up with the
# trailing 'Combined' label in TOPICS.
dfs.append(df_combined)
df_combined

  df_combined = df_combined.append(df, ignore_index=True)


Unnamed: 0,Prompt,Generated Text,i,MAX_LEN
0,Indian climate,Indian climate is a tropical one. It is hot an...,0,181
1,Indian climate,Indian climate is tropical in nature. Therefor...,1,189
2,Indian climate,Indian climate is divided into three main seas...,2,199
3,Indian climate,"Indian climate change: Heatwaves, droughts, fl...",3,194
4,Indian climate,Indian climate is a combination of three seaso...,4,182
...,...,...,...,...
1995,Indian infrastructure,Indian infrastructure sector has been growing ...,495,185
1996,Indian infrastructure,Indian infrastructure sector is in the midst o...,496,184
1997,Indian infrastructure,Indian infrastructure companies have been on a...,497,192
1998,Indian infrastructure,Indian infrastructure sector to see 10% growth...,498,186


In [None]:
# Make sure the output folder for the LLaMA-2 (SAME) diversity CSVs exists
# (-p: create parents as needed, no error if it is already there).
!mkdir -p "drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/same/llama2"

In [None]:
# For each (label, frame) pair: score every generated text, attach the metric
# columns to the frame, save the result as <label>.csv and print summary stats.
for theme, df in zip(TOPICS, dfs):
  out_path = f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/same/llama2/{theme}.csv'
  # Returning a Series per row makes apply() yield one column per metric.
  metrics = df['Generated Text'].apply(lambda text: pd.Series(get_diversity_stats(lemmatize(text))))
  scored = pd.concat([df, metrics], axis=1)
  scored.to_csv(out_path, index=False)
  print('THEME:', theme)
  print(scored.describe())

THEME: Indian climate
                i     MAX_LEN  simple_ttr    root_ttr     log_ttr    mass_ttr  \
count  500.000000  500.000000  500.000000  500.000000  500.000000  500.000000   
mean   249.500000  190.276000    0.559554    6.538296    0.881118    0.055696   
std    144.481833    6.185819    0.060061    0.716607    0.018553    0.008245   
min      0.000000  180.000000    0.404624    4.370957    0.822511    0.033601   
25%    124.750000  185.000000    0.514883    6.114954    0.868492    0.050097   
50%    249.500000  191.000000    0.556213    6.624830    0.881040    0.055011   
75%    374.250000  196.000000    0.592416    7.065598    0.892934    0.060798   
max    499.000000  200.000000    0.795455    8.353351    0.939527    0.084086   

            msttr       mattr         hdd        mtld  mtld_ma_wrap  \
count  500.000000  500.000000  500.000000  500.000000    500.000000   
mean     0.832140    0.832076    0.751589   49.155613     48.978330   
std      0.041691    0.039575    0.

In [None]:
# Inspection only: confirm that the [-1:] slice used by the next cell selects
# just the 'Combined' label.
TOPICS[-1:]

['Combined']

In [None]:
# Same scoring as the full loop, restricted to the last (label, frame) pair,
# i.e. the 'Combined' frame.
for theme, df in zip(TOPICS[-1:], dfs[-1:]):
  out_path = f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/same/llama2/{theme}.csv'
  # Returning a Series per row makes apply() yield one column per metric.
  metrics = df['Generated Text'].apply(lambda text: pd.Series(get_diversity_stats(lemmatize(text))))
  scored = pd.concat([df, metrics], axis=1)
  scored.to_csv(out_path, index=False)
  print('THEME:', theme)
  print(scored.describe())

THEME: Combined
                 i      MAX_LEN   simple_ttr     root_ttr      log_ttr  \
count  2000.000000  2000.000000  2000.000000  2000.000000  2000.000000   
mean    249.500000   189.817000     0.602097     6.788698     0.894641   
std     144.373376     6.103781     0.071512     0.674939     0.020585   
min       0.000000   180.000000     0.404624     2.773501     0.822511   
25%     124.750000   184.000000     0.553333     6.386232     0.880959   
50%     249.500000   190.000000     0.594937     6.861745     0.894463   
75%     374.250000   195.000000     0.641412     7.250616     0.907346   
max     499.000000   200.000000     0.906250     8.768776     0.971596   

          mass_ttr        msttr        mattr          hdd         mtld  \
count  2000.000000  2000.000000  2000.000000  2000.000000  2000.000000   
mean      0.049882     0.864871     0.865234     0.781427    61.857932   
std       0.008698     0.041447     0.039050     0.052405    17.547062   
min       0.018871   

## Data - LLaMA-2 (DIFFERENT)

In [None]:
import re
import pandas as pd

# One CSV of LLaMA-2 (DIFFERENT) generations per topic folder, read in full.
TOPICS = ['Indian climate', 'Indian defense', 'Indian economy', 'Indian infrastructure']
dfs = [
  pd.read_csv(f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/llama2-7b-200-google-scholar/{topic}/data.csv')
  for topic in TOPICS
]

# Extend the labels with 'Combined' for the concatenated frame that the next
# cell appends to `dfs`.
TOPICS = [*TOPICS, 'Combined']

In [None]:
# Stack the per-topic frames into one frame with a fresh 0..n-1 index.
# A single pd.concat replaces the former DataFrame.append loop: append is
# deprecated (it emitted a FutureWarning on every iteration), is removed in
# pandas 2.0, and re-copied the accumulated frame on each pass.
df_combined = pd.concat(dfs, ignore_index=True)

# The combined frame becomes the last entry of `dfs`, lining up with the
# trailing 'Combined' label in TOPICS.
dfs.append(df_combined)
df_combined

  df_combined = df_combined.append(df, ignore_index=True)


Unnamed: 0,Category,Prompt,Generated Text,i,MAX_LEN
0,Indian climate,Impact of climate change on Indian agriculture...,Impact of climate change on Indian agriculture...,0,182
1,Indian climate,"The predictive state: science, territory and t...","The predictive state: science, territory and t...",1,193
2,Indian climate,Water resources and climate change: An Indian ...,Water resources and climate change: An Indian ...,2,190
3,Indian climate,Fractal dimensional analysis of Indian climati...,Fractal dimensional analysis of Indian climati...,3,184
4,Indian climate,Temperature and rainfall extremes change under...,Temperature and rainfall extremes change under...,4,180
...,...,...,...,...,...
1978,Indian infrastructure,What Causes Agglomeration—Policy or Infrastruc...,What Causes Agglomeration—Policy or Infrastruc...,496,182
1979,Indian infrastructure,The Scenario of FDI in Infrastructure of India,The Scenario of FDI in Infrastructure of India...,497,187
1980,Indian infrastructure,The Impact Of Gaja Cyclone On Paddy And Rural ...,The Impact Of Gaja Cyclone On Paddy And Rural ...,498,194
1981,Indian infrastructure,South-South investment in infrastructure: the ...,South-South investment in infrastructure: the ...,499,194


In [None]:
# Make sure the output folder for the LLaMA-2 (DIFFERENT) diversity CSVs exists
# (-p: create parents as needed, no error if it is already there).
!mkdir -p "drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/different/llama2"

In [None]:
# For each (label, frame) pair: score every generated text, attach the metric
# columns to the frame, save the result as <label>.csv and print summary stats.
for theme, df in zip(TOPICS, dfs):
  out_path = f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/different/llama2/{theme}.csv'
  # Returning a Series per row makes apply() yield one column per metric.
  metrics = df['Generated Text'].apply(lambda text: pd.Series(get_diversity_stats(lemmatize(text))))
  scored = pd.concat([df, metrics], axis=1)
  scored.to_csv(out_path, index=False)
  print('THEME:', theme)
  print(scored.describe())

THEME: Indian climate
                i     MAX_LEN  simple_ttr    root_ttr     log_ttr    mass_ttr  \
count  501.000000  501.000000  501.000000  501.000000  501.000000  501.000000   
mean   250.000000  190.281437    0.590563    7.007476    0.892987    0.049728   
std    144.770508    5.973494    0.062027    0.683749    0.017741    0.007608   
min      0.000000  180.000000    0.452514    3.400000    0.844503    0.009703   
25%    125.000000  185.000000    0.548913    6.712196    0.880801    0.044747   
50%    250.000000  190.000000    0.582734    7.062970    0.891774    0.049991   
75%    375.000000  196.000000    0.623762    7.428703    0.903803    0.054561   
max    500.000000  200.000000    0.958333    8.433803    0.986608    0.085707   

            msttr       mattr         hdd        mtld  mtld_ma_wrap  \
count  501.000000  501.000000  501.000000  501.000000    501.000000   
mean     0.858986    0.855897    0.772965   60.408203     59.826780   
std      0.035106    0.033135    0.

In [None]:
# Inspection only: confirm that the [-1:] slice used further down selects just
# the 'Combined' label.
TOPICS[-1:]

['Combined']

In [None]:
# Inspection only: show the last entry of `dfs`, which is the combined frame
# appended after concatenating the per-topic frames.
dfs[-1]

Unnamed: 0,Category,Prompt,Generated Text,i,MAX_LEN
0,Indian climate,Impact of climate change on Indian agriculture...,Impact of climate change on Indian agriculture...,0,182
1,Indian climate,"The predictive state: science, territory and t...","The predictive state: science, territory and t...",1,193
2,Indian climate,Water resources and climate change: An Indian ...,Water resources and climate change: An Indian ...,2,190
3,Indian climate,Fractal dimensional analysis of Indian climati...,Fractal dimensional analysis of Indian climati...,3,184
4,Indian climate,Temperature and rainfall extremes change under...,Temperature and rainfall extremes change under...,4,180
...,...,...,...,...,...
1978,Indian infrastructure,What Causes Agglomeration—Policy or Infrastruc...,What Causes Agglomeration—Policy or Infrastruc...,496,182
1979,Indian infrastructure,The Scenario of FDI in Infrastructure of India,The Scenario of FDI in Infrastructure of India...,497,187
1980,Indian infrastructure,The Impact Of Gaja Cyclone On Paddy And Rural ...,The Impact Of Gaja Cyclone On Paddy And Rural ...,498,194
1981,Indian infrastructure,South-South investment in infrastructure: the ...,South-South investment in infrastructure: the ...,499,194


In [None]:
# Same output folder as created earlier in this section; repeating the command
# is harmless because -p does not fail on an existing directory.
!mkdir -p "drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/different/llama2"

In [None]:
# Same scoring as the full loop, restricted to the last (label, frame) pair,
# i.e. the 'Combined' frame.
for theme, df in zip(TOPICS[-1:], dfs[-1:]):
  out_path = f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/different/llama2/{theme}.csv'
  # Returning a Series per row makes apply() yield one column per metric.
  metrics = df['Generated Text'].apply(lambda text: pd.Series(get_diversity_stats(lemmatize(text))))
  scored = pd.concat([df, metrics], axis=1)
  scored.to_csv(out_path, index=False)
  print('THEME:', theme)
  print(scored.describe())

THEME: Combined
                 i      MAX_LEN   simple_ttr     root_ttr      log_ttr  \
count  1983.000000  1983.000000  1983.000000  1983.000000  1983.000000   
mean    247.458396   190.032779     0.582690     6.927642     0.890280   
std     143.288853     6.040933     0.064747     0.743945     0.018376   
min       0.000000   180.000000     0.417112     2.713602     0.832846   
25%     123.500000   185.000000     0.541667     6.631391     0.878850   
50%     247.000000   190.000000     0.576087     7.027819     0.890049   
75%     371.000000   195.000000     0.614379     7.389618     0.901057   
max     500.000000   200.000000     0.958333     9.138115     0.986608   

          mass_ttr        msttr        mattr          hdd         mtld  \
count  1983.000000  1983.000000  1983.000000  1983.000000  1983.000000   
mean      0.050994     0.853084     0.852426     0.762147    57.706087   
std       0.007984     0.039105     0.036247     0.110194    16.868804   
min       0.008241   

## Data - Falcon (SAME)

In [None]:
import re
import pandas as pd

# One CSV of Falcon (SAME) generations per topic folder, read in full.
# Note the folder for this run is spelled 'Indian defence'.
TOPICS = ['Indian climate', 'Indian defence', 'Indian economy', 'Indian infrastructure']
dfs = [
  pd.read_csv(f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/falcon-7b-200-24012024-v3/{topic}/data.csv')
  for topic in TOPICS
]

# Extend the labels with 'Combined' for the concatenated frame that the next
# cell appends to `dfs`.
TOPICS = [*TOPICS, 'Combined']

In [None]:
# Stack the per-topic frames into one frame with a fresh 0..n-1 index.
# A single pd.concat replaces the former DataFrame.append loop: append is
# deprecated (it emitted a FutureWarning on every iteration), is removed in
# pandas 2.0, and re-copied the accumulated frame on each pass.
df_combined = pd.concat(dfs, ignore_index=True)

# The combined frame becomes the last entry of `dfs`, lining up with the
# trailing 'Combined' label in TOPICS.
dfs.append(df_combined)
df_combined

  df_combined = df_combined.append(df, ignore_index=True)


Unnamed: 0,Category,Prompt,Generated Text,i,MAX_LEN
0,Indian climate,Indian climate scientists have long complained...,0,189,
1,Indian climate,Indian climate is characterized by great varia...,1,200,
2,Indian climate,"Indian climate is very hot, dry and humid. Peo...",2,186,
3,Indian climate,"Indian climate\nFrom Wikipedia, the free encyc...",3,193,
4,Indian climate,Indian climate change adaptation strategy\nThe...,4,185,
...,...,...,...,...,...
1995,Indian infrastructure,Indian infrastructure company L&T is all set t...,495,186,
1996,Indian infrastructure,Indian infrastructure is facing an unprecedent...,496,200,
1997,Indian infrastructure,Indian infrastructure giant GVK Power and Infr...,497,190,
1998,Indian infrastructure,Indian infrastructure company GMR Group has be...,498,184,


In [None]:
# Make sure the output folder for the Falcon (SAME) diversity CSVs exists
# (-p: create parents as needed, no error if it is already there).
!mkdir -p "drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/same/falcon"

In [None]:
# For each (label, frame) pair: score every text, attach the metric columns to
# the frame, save the result as <label>.csv and print summary stats.
#
# NOTE: this cell reads the text from 'Prompt', not 'Generated Text'. In the
# Falcon (SAME) frames displayed above the values sit one column to the left of
# their headers: 'Prompt' holds the generated text, 'Generated Text' holds the
# running index, 'i' holds the length values and 'MAX_LEN' is empty. The
# describe() output of this cell shows the same shift.
for theme, df in zip(TOPICS, dfs):
  out_path = f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/same/falcon/{theme}.csv'
  # Returning a Series per row makes apply() yield one column per metric.
  metrics = df['Prompt'].apply(lambda text: pd.Series(get_diversity_stats(lemmatize(text))))
  scored = pd.concat([df, metrics], axis=1)
  scored.to_csv(out_path, index=False)
  print('THEME:', theme)
  print(scored.describe())

THEME: Indian climate
       Generated Text           i  MAX_LEN  simple_ttr    root_ttr  \
count      500.000000  500.000000      0.0  500.000000  500.000000   
mean       249.500000  189.900000      NaN    0.594627    7.031603   
std        144.481833    6.059657      NaN    0.076591    0.879908   
min          0.000000  180.000000      NaN    0.429348    2.666667   
25%        124.750000  184.000000      NaN    0.541496    6.599385   
50%        249.500000  190.000000      NaN    0.584240    7.166477   
75%        374.250000  195.000000      NaN    0.634316    7.613195   
max        499.000000  200.000000      NaN    0.941176    8.768776   

          log_ttr    mass_ttr       msttr       mattr         hdd        mtld  \
count  500.000000  500.000000  500.000000  500.000000  500.000000  500.000000   
mean     0.894201    0.049173    0.862199    0.862297    0.770547   62.653386   
std      0.020281    0.008011    0.039519    0.035628    0.120941   17.553592   
min      0.837872    0.

## Data - Falcon (DIFFERENT)

In [None]:
import re
import pandas as pd

# One CSV of Falcon (DIFFERENT) generations per topic folder, read in full.
TOPICS = ['Indian climate', 'Indian defense', 'Indian economy', 'Indian infrastructure']
dfs = [
  pd.read_csv(f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/falcon-7b-200-google-scholar-17012024/{topic}/data.csv')
  for topic in TOPICS
]

# Extend the labels with 'Combined' for the concatenated frame that the next
# cell appends to `dfs`.
TOPICS = [*TOPICS, 'Combined']

In [None]:
# Stack the per-topic frames into one frame with a fresh 0..n-1 index.
# A single pd.concat replaces the former DataFrame.append loop: append is
# deprecated (it emitted a FutureWarning on every iteration), is removed in
# pandas 2.0, and re-copied the accumulated frame on each pass.
df_combined = pd.concat(dfs, ignore_index=True)

# The combined frame becomes the last entry of `dfs`, lining up with the
# trailing 'Combined' label in TOPICS.
dfs.append(df_combined)
df_combined

  df_combined = df_combined.append(df, ignore_index=True)


Unnamed: 0,Category,Prompt,Generated Text,i,MAX_LEN
0,Indian climate,Impact of climate change on Indian agriculture...,Impact of climate change on Indian agriculture...,0,198
1,Indian climate,"The predictive state: science, territory and t...","The predictive state: science, territory and t...",1,190
2,Indian climate,Water resources and climate change: An Indian ...,Water resources and climate change: An Indian ...,2,180
3,Indian climate,Fractal dimensional analysis of Indian climati...,Fractal dimensional analysis of Indian climati...,3,198
4,Indian climate,Temperature and rainfall extremes change under...,Temperature and rainfall extremes change under...,4,189
...,...,...,...,...,...
1978,Indian infrastructure,What Causes AgglomerationPolicy or Infrastruct...,What Causes AgglomerationPolicy or Infrastruct...,496,183
1979,Indian infrastructure,The Scenario of FDI in Infrastructure of India,The Scenario of FDI in Infrastructure of India...,497,194
1980,Indian infrastructure,The Impact Of Gaja Cyclone On Paddy And Rural ...,The Impact Of Gaja Cyclone On Paddy And Rural ...,498,198
1981,Indian infrastructure,South-South investment in infrastructure: the ...,South-South investment in infrastructure: the ...,499,192


In [None]:
# Make sure the output folder for the Falcon (DIFFERENT) diversity CSVs exists
# (-p: create parents as needed, no error if it is already there).
!mkdir -p "drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/different/falcon"

In [None]:
# For each (label, frame) pair: score every generated text, attach the metric
# columns to the frame, save the result as <label>.csv and print summary stats.
for theme, df in zip(TOPICS, dfs):
  out_path = f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/different/falcon/{theme}.csv'
  # Returning a Series per row makes apply() yield one column per metric.
  metrics = df['Generated Text'].apply(lambda text: pd.Series(get_diversity_stats(lemmatize(text))))
  scored = pd.concat([df, metrics], axis=1)
  scored.to_csv(out_path, index=False)
  print('THEME:', theme)
  print(scored.describe())

THEME: Indian climate
                i     MAX_LEN  simple_ttr    root_ttr     log_ttr    mass_ttr  \
count  501.000000  501.000000  501.000000  501.000000  501.000000  501.000000   
mean   250.000000  190.041916    0.595385    7.155216    0.895250    0.048269   
std    144.770508    6.202922    0.079495    0.899612    0.020679    0.008172   
min      0.000000  180.000000    0.423913    3.000000    0.835429    0.000000   
25%    125.000000  184.000000    0.547872    6.747073    0.882341    0.043692   
50%    250.000000  190.000000    0.581250    7.366131    0.893773    0.048365   
75%    375.000000  196.000000    0.626168    7.763917    0.905435    0.052696   
max    500.000000  200.000000    1.000000    8.820097    1.000000    0.075067   

            msttr       mattr         hdd        mtld  mtld_ma_wrap  \
count  501.000000  501.000000  501.000000  501.000000    501.000000   
mean     0.862673    0.858050    0.767063   62.611663     61.004190   
std      0.037440    0.036168    0.

## Reporting

#### LLaMA-2 SAME

In [5]:
import pandas as pd

# Build one summary row per theme: the column means of the saved LLaMA-2 (SAME)
# diversity CSVs. Rows are collected in a list and turned into a DataFrame once;
# the former per-row DataFrame.append is deprecated (FutureWarning), removed in
# pandas 2.0, and re-copied the frame on every iteration.
TOPICS = ['Indian climate', 'Indian defense', 'Indian economy', 'Indian infrastructure', 'Combined']
report_rows = []
for theme in TOPICS:
  df = pd.read_csv(f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/same/llama2/{theme}.csv')
  report_rows.append({
      'theme': theme,
      'MAX_LEN': df['MAX_LEN'].mean(),
      'simple_ttr': df['simple_ttr'].mean(),
      'root_ttr': df['root_ttr'].mean(),
      'log_ttr': df['log_ttr'].mean(),
      # The CSV column is spelled 'mass_ttr' (as written by get_diversity_stats);
      # the report exposes it under the name 'maas_ttr'.
      'maas_ttr': df['mass_ttr'].mean(),
      'msttr': df['msttr'].mean(),
      'mattr': df['mattr'].mean(),
      'hdd': df['hdd'].mean(),
      'mtld': df['mtld'].mean(),
      'mtld_ma_wrap': df['mtld_ma_wrap'].mean(),
      'mtld_ma_bid': df['mtld_ma_bid'].mean(),
  })

# Columns follow the key order of the row dicts; the index is 0..len(TOPICS)-1.
report_df = pd.DataFrame(report_rows)
report_df

  report_df = report_df.append({
  report_df = report_df.append({
  report_df = report_df.append({
  report_df = report_df.append({
  report_df = report_df.append({


Unnamed: 0,theme,MAX_LEN,simple_ttr,root_ttr,log_ttr,maas_ttr,msttr,mattr,hdd,mtld,mtld_ma_wrap,mtld_ma_bid
0,Indian climate,190.276,0.559554,6.538296,0.881118,0.055696,0.83214,0.832076,0.751589,49.155613,48.97833,44.926328
1,Indian defense,189.452,0.636783,6.800674,0.904111,0.046392,0.883602,0.881899,0.798909,70.202863,68.688557,53.336758
2,Indian economy,189.674,0.599143,6.8412,0.894397,0.049711,0.869656,0.873268,0.786984,62.82381,62.592828,54.541403
3,Indian infrastructure,189.866,0.61291,6.974624,0.898939,0.047731,0.874087,0.873692,0.788224,65.249444,64.880428,57.222297
4,Combined,189.817,0.602097,6.788698,0.894641,0.049882,0.864871,0.865234,0.781427,61.857932,61.285036,52.506696


#### FALCON SAME

In [6]:
# Build one summary row per theme: the column means of the saved Falcon (SAME)
# diversity CSVs. Rows are collected in a list and turned into a DataFrame once;
# the former per-row DataFrame.append is deprecated (FutureWarning), removed in
# pandas 2.0, and re-copied the frame on every iteration.
TOPICS = ['Indian climate', 'Indian defence', 'Indian economy', 'Indian infrastructure', 'Combined']
report_rows = []
for theme in TOPICS:
  df = pd.read_csv(f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/same/falcon/{theme}.csv')
  report_rows.append({
      'theme': theme,
      # Deliberately read from 'i': in the Falcon (SAME) CSVs the values sit one
      # column to the left of their headers, so the length values are stored
      # under 'i' and the 'MAX_LEN' column is empty (see the describe() output
      # of the Falcon (SAME) scoring cell).
      'MAX_LEN': df['i'].mean(),
      'simple_ttr': df['simple_ttr'].mean(),
      'root_ttr': df['root_ttr'].mean(),
      'log_ttr': df['log_ttr'].mean(),
      # The CSV column is spelled 'mass_ttr' (as written by get_diversity_stats);
      # the report exposes it under the name 'maas_ttr'.
      'maas_ttr': df['mass_ttr'].mean(),
      'msttr': df['msttr'].mean(),
      'mattr': df['mattr'].mean(),
      'hdd': df['hdd'].mean(),
      'mtld': df['mtld'].mean(),
      'mtld_ma_wrap': df['mtld_ma_wrap'].mean(),
      'mtld_ma_bid': df['mtld_ma_bid'].mean(),
  })

# Columns follow the key order of the row dicts; the index is 0..len(TOPICS)-1.
report_df = pd.DataFrame(report_rows)
report_df

  report_df = report_df.append({
  report_df = report_df.append({
  report_df = report_df.append({
  report_df = report_df.append({
  report_df = report_df.append({


Unnamed: 0,theme,MAX_LEN,simple_ttr,root_ttr,log_ttr,maas_ttr,msttr,mattr,hdd,mtld,mtld_ma_wrap,mtld_ma_bid
0,Indian climate,189.9,0.594627,7.031603,0.894201,0.049173,0.862199,0.862297,0.770547,62.653386,61.972035,55.223165
1,Indian defence,190.252,0.638263,7.119058,0.906615,0.044221,0.890433,0.884411,0.796046,74.724666,72.325245,56.988385
2,Indian economy,190.182,0.611832,7.030286,0.899077,0.047207,0.879628,0.880359,0.778959,68.079342,66.663638,58.476953
3,Indian infrastructure,190.11,0.640589,7.166018,0.907461,0.043737,0.890693,0.885652,0.80184,75.111651,72.449151,56.42145
4,Combined,190.111,0.621328,7.086741,0.901839,0.046085,0.880738,0.87818,0.786848,70.142261,68.352517,56.777488


#### LLaMA-2 DIFFERENT

In [7]:
# Build one summary row per theme: the column means of the saved LLaMA-2
# (DIFFERENT) diversity CSVs. Rows are collected in a list and turned into a
# DataFrame once; the former per-row DataFrame.append is deprecated
# (FutureWarning), removed in pandas 2.0, and re-copied the frame on every
# iteration.
TOPICS = ['Indian climate', 'Indian defense', 'Indian economy', 'Indian infrastructure', 'Combined']
report_rows = []
for theme in TOPICS:
  df = pd.read_csv(f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/different/llama2/{theme}.csv')
  report_rows.append({
      'theme': theme,
      'MAX_LEN': df['MAX_LEN'].mean(),
      'simple_ttr': df['simple_ttr'].mean(),
      'root_ttr': df['root_ttr'].mean(),
      'log_ttr': df['log_ttr'].mean(),
      # The CSV column is spelled 'mass_ttr' (as written by get_diversity_stats);
      # the report exposes it under the name 'maas_ttr'.
      'maas_ttr': df['mass_ttr'].mean(),
      'msttr': df['msttr'].mean(),
      'mattr': df['mattr'].mean(),
      'hdd': df['hdd'].mean(),
      'mtld': df['mtld'].mean(),
      'mtld_ma_wrap': df['mtld_ma_wrap'].mean(),
      'mtld_ma_bid': df['mtld_ma_bid'].mean(),
  })

# Columns follow the key order of the row dicts; the index is 0..len(TOPICS)-1.
report_df = pd.DataFrame(report_rows)
report_df

  report_df = report_df.append({
  report_df = report_df.append({
  report_df = report_df.append({
  report_df = report_df.append({
  report_df = report_df.append({


Unnamed: 0,theme,MAX_LEN,simple_ttr,root_ttr,log_ttr,maas_ttr,msttr,mattr,hdd,mtld,mtld_ma_wrap,mtld_ma_bid
0,Indian climate,190.281437,0.590563,7.007476,0.892987,0.049728,0.858986,0.855897,0.772965,60.408203,59.82678,53.006217
1,Indian defense,190.235529,0.590145,6.984259,0.892696,0.049993,0.857717,0.857009,0.758233,59.251547,59.207386,51.684288
2,Indian economy,189.847917,0.565967,6.771099,0.884628,0.053485,0.838526,0.84127,0.740491,52.530323,52.049035,46.557558
3,Indian infrastructure,189.758483,0.583385,6.941172,0.890574,0.050873,0.856497,0.855059,0.775992,58.417324,58.104515,51.976117
4,Combined,190.032779,0.58269,6.927642,0.89028,0.050994,0.853084,0.852426,0.762147,57.706087,57.352504,50.851037


#### FALCON DIFFERENT

In [8]:
# Build one summary row per theme: the column means of the saved Falcon
# (DIFFERENT) diversity CSVs. Rows are collected in a list and turned into a
# DataFrame once; the former per-row DataFrame.append is deprecated
# (FutureWarning), removed in pandas 2.0, and re-copied the frame on every
# iteration.
TOPICS = ['Indian climate', 'Indian defense', 'Indian economy', 'Indian infrastructure', 'Combined']
report_rows = []
for theme in TOPICS:
  df = pd.read_csv(f'drive/MyDrive/DUCSResearch/Bias in LLMs/colab_notebooks/bias_llm/data/diversity/different/falcon/{theme}.csv')
  report_rows.append({
      'theme': theme,
      'MAX_LEN': df['MAX_LEN'].mean(),
      'simple_ttr': df['simple_ttr'].mean(),
      'root_ttr': df['root_ttr'].mean(),
      'log_ttr': df['log_ttr'].mean(),
      # The CSV column is spelled 'mass_ttr' (as written by get_diversity_stats);
      # the report exposes it under the name 'maas_ttr'.
      'maas_ttr': df['mass_ttr'].mean(),
      'msttr': df['msttr'].mean(),
      'mattr': df['mattr'].mean(),
      'hdd': df['hdd'].mean(),
      'mtld': df['mtld'].mean(),
      'mtld_ma_wrap': df['mtld_ma_wrap'].mean(),
      'mtld_ma_bid': df['mtld_ma_bid'].mean(),
  })

# Columns follow the key order of the row dicts; the index is 0..len(TOPICS)-1.
report_df = pd.DataFrame(report_rows)
report_df

  report_df = report_df.append({
  report_df = report_df.append({
  report_df = report_df.append({
  report_df = report_df.append({
  report_df = report_df.append({


Unnamed: 0,theme,MAX_LEN,simple_ttr,root_ttr,log_ttr,maas_ttr,msttr,mattr,hdd,mtld,mtld_ma_wrap,mtld_ma_bid
0,Indian climate,190.041916,0.595385,7.155216,0.89525,0.048269,0.862673,0.85805,0.767063,62.611663,61.00419,53.257405
1,Indian defense,190.54491,0.611065,7.031694,0.898516,0.047722,0.8573,0.855074,0.746862,62.045965,60.348726,49.998728
2,Indian economy,190.05,0.604452,6.666449,0.894857,0.049839,0.849814,0.850891,0.711976,54.782784,53.328238,44.716949
3,Indian infrastructure,189.626747,0.592762,6.966247,0.893476,0.049475,0.859429,0.858115,0.750161,59.396287,58.270978,50.315895
4,Combined,190.066062,0.600879,6.957956,0.895532,0.048815,0.857383,0.855582,0.744355,59.761345,58.290028,49.623662
