In [2]:
# Import packages in alphabetical order
import pandas as pd
import tiktoken
import os
import openai
import duckdb
import owlready2
import sklearn

import gensim
import matplotlib.pyplot as plt
import nltk
import pickle
import re
import spacy
import string

# Import packages in alphabetical order to avoid duplicates
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from transformers import BertModel, BertTokenizer
from wordcloud import WordCloud
from bert_score import score

In [22]:
# Read the pickled Mistral results (grouped_sentences_100_mistral) back into memory.
# pickle.load can execute code embedded in the file, so only load pickles produced by this project.
with open('Datasets/grouped_sentences_100_mistral.pkl', 'rb') as pickle_file:
    grouped_sentences_mistral = pickle.load(pickle_file)

In [21]:
# Read the pickled Llama results (grouped_sentences_100_llama) back into memory.
# pickle.load can execute code embedded in the file, so only load pickles produced by this project.
with open('Datasets/grouped_sentences_100_llama.pkl', 'rb') as pickle_file:
    grouped_sentences_llama = pickle.load(pickle_file)

In [23]:
# Preview the first rows of grouped_sentences_mistral to check the loaded columns
grouped_sentences_mistral.head()

Unnamed: 0,article_id,sentence,processed_sentence,summary
50374,50450,Association for Information Systems AIS Electr...,association for information systems ais electr...,The study aims to examine the differences in ...
13657,13667,Communications of the Association for Informat...,communications of the association for informat...,The longitudinal study on the supply and dema...
11449,11457,Effective organizational improvisation in info...,effective organizational improvisation in info...,The study focuses on understanding organizati...
46481,46549,Researching the Costs of Information Systems T...,researching the costs of information systems t...,The paper examines research gaps in understan...
30546,30587,Holographic Recommendations in Brick - and - M...,holographic recommendations in brick and morta...,The researchers propose an in-store recommend...


In [24]:
# Preview the first rows of grouped_sentences_llama to check the loaded columns
grouped_sentences_llama.head()

Unnamed: 0,article_id,sentence,processed_sentence,summary
50374,50450,Association for Information Systems AIS Electr...,association for information systems ais electr...,\n\nThe article presents the research of the \...
13657,13667,Communications of the Association for Informat...,communications of the association for informat...,Default Summary Default Summary Default Summary
11449,11457,Effective organizational improvisation in info...,effective organizational improvisation in info...,Default Summary \nThe text is an abstract of a...
46481,46549,Researching the Costs of Information Systems T...,researching the costs of information systems t...,\n\nThe text summarizes a research paper that ...
30546,30587,Holographic Recommendations in Brick - and - M...,holographic recommendations in brick and morta...,\n\nThe text presents a research project that ...


In [25]:
# Rename the generic 'summary' column to 'summary_mistral' so it stays distinguishable
# once the Llama summaries are placed next to it.
# Reassigning (rather than inplace=True) keeps the cell free of hidden in-place mutation;
# re-running it is harmless because rename() ignores a missing 'summary' column by default.
grouped_sentences_mistral = grouped_sentences_mistral.rename(columns={'summary': 'summary_mistral'})

In [26]:
# Rename the generic 'summary' column to 'summary_llama' so it stays distinguishable
# once it is placed next to the Mistral summaries.
# Reassigning (rather than inplace=True) keeps the cell free of hidden in-place mutation;
# re-running it is harmless because rename() ignores a missing 'summary' column by default.
grouped_sentences_llama = grouped_sentences_llama.rename(columns={'summary': 'summary_llama'})

In [27]:
# Build summary_100: every column of grouped_sentences_mistral plus the
# 'summary_llama' column taken from grouped_sentences_llama.
# pd.concat along the column axis matches rows by index label, so this relies on
# both frames sharing the same index.
llama_summaries = grouped_sentences_llama['summary_llama']
summary_100 = pd.concat(
    [grouped_sentences_mistral, llama_summaries],
    axis='columns',
)

In [28]:
# Preview the first rows of summary_100 to confirm both summary columns are present
summary_100.head()

Unnamed: 0,article_id,sentence,processed_sentence,summary_mistral,summary_llama
50374,50450,Association for Information Systems AIS Electr...,association for information systems ais electr...,The study aims to examine the differences in ...,\n\nThe article presents the research of the \...
13657,13667,Communications of the Association for Informat...,communications of the association for informat...,The longitudinal study on the supply and dema...,Default Summary Default Summary Default Summary
11449,11457,Effective organizational improvisation in info...,effective organizational improvisation in info...,The study focuses on understanding organizati...,Default Summary \nThe text is an abstract of a...
46481,46549,Researching the Costs of Information Systems T...,researching the costs of information systems t...,The paper examines research gaps in understan...,\n\nThe text summarizes a research paper that ...
30546,30587,Holographic Recommendations in Brick - and - M...,holographic recommendations in brick and morta...,The researchers propose an in-store recommend...,\n\nThe text presents a research project that ...


In [None]:
# Persist the combined summary_100 dataframe so later sessions can reload it
with open('Datasets/summary_100.pkl', 'wb') as pickle_file:
    pickle.dump(summary_100, pickle_file)

In [8]:
# Reload the combined summary_100 dataframe from its pickle.
# pickle.load can execute code embedded in the file, so only load pickles produced by this project.
with open('Datasets/summary_100.pkl', 'rb') as pickle_file:
    summary_100 = pickle.load(pickle_file)

In [9]:
# Preview the first rows of the reloaded summary_100
summary_100.head()

Unnamed: 0,article_id,sentence,processed_sentence,summary_mistral,summary_llama
50374,50450,Association for Information Systems AIS Electr...,association for information systems ais electr...,The study aims to examine the differences in ...,\n\nThe article presents the research of the \...
13657,13667,Communications of the Association for Informat...,communications of the association for informat...,The longitudinal study on the supply and dema...,Default Summary Default Summary Default Summary
11449,11457,Effective organizational improvisation in info...,effective organizational improvisation in info...,The study focuses on understanding organizati...,Default Summary \nThe text is an abstract of a...
46481,46549,Researching the Costs of Information Systems T...,researching the costs of information systems t...,The paper examines research gaps in understan...,\n\nThe text summarizes a research paper that ...
30546,30587,Holographic Recommendations in Brick - and - M...,holographic recommendations in brick and morta...,The researchers propose an in-store recommend...,\n\nThe text presents a research project that ...


In [11]:
# Replace the index of summary_100 with a fresh 0..n-1 index (the old labels are dropped).
# Later cells look rows up by label 0, 1, 2, ..., which fails on the original labels
# (possibly left over from an earlier random sample of the data).
# Reassigning instead of inplace=True avoids mutating the frame behind the reader's back.
summary_100 = summary_100.reset_index(drop=True)

In [10]:
# Sanity-check BERTScore on a deliberately silly candidate/reference pair
candidates = ["So tell me what you want what you really really what"]
references = ["I'll tell you what I want what I really really want. I wanna, I wanna, I wanna, I wanna, I wanna really really really wanna zigazig ah"]

# Score each candidate against the reference at the same position
P, R, F1 = score(
    candidates,
    references,
    lang="en",
    model_type="bert-base-uncased",
    num_layers=1,
)

# Report precision, recall and F1 for every pair
for i, (precision, recall, f1) in enumerate(zip(P, R, F1)):
    print(f'Index {i}: Precision: {precision.item():.6f}, Recall: {recall.item():.6f}, F1 Score: {f1.item():.6f}')

Index 0: Precision: 0.820458, Recall: 0.514447, F1 Score: 0.632378


In [14]:
# Score the Mistral summary of the row labelled 0 against that row's source text
candidates = [summary_100.loc[0, 'summary_mistral']]
references = [summary_100.loc[0, 'sentence']]

# Score each candidate against the reference at the same position
P, R, F1 = score(
    candidates,
    references,
    lang="en",
    model_type="bert-base-uncased",
    num_layers=1,
)

# Report precision, recall and F1 for every pair
for i, (precision, recall, f1) in enumerate(zip(P, R, F1)):
    print(f'Index {i}: Precision: {precision.item():.6f}, Recall: {recall.item():.6f}, F1 Score: {f1.item():.6f}')

Index 0: Precision: 0.718787, Recall: 0.658720, F1 Score: 0.687444


In [15]:
# Score the Llama summary of the row labelled 0 against that row's source text
candidates = [summary_100.loc[0, 'summary_llama']]
references = [summary_100.loc[0, 'sentence']]

# Score each candidate against the reference at the same position
P, R, F1 = score(
    candidates,
    references,
    lang="en",
    model_type="bert-base-uncased",
    num_layers=1,
)

# Report precision, recall and F1 for every pair
for i, (precision, recall, f1) in enumerate(zip(P, R, F1)):
    print(f'Index {i}: Precision: {precision.item():.6f}, Recall: {recall.item():.6f}, F1 Score: {f1.item():.6f}')

Index 0: Precision: 0.732148, Recall: 0.573293, F1 Score: 0.643055


In [16]:
# Create summary_100_eval as an independent copy of summary_100.
# A plain assignment would only bind a second name to the same DataFrame, so the
# score columns added to summary_100_eval later would also appear on summary_100.
summary_100_eval = summary_100.copy()

In [18]:
# Add BERTScore precision / recall / F1 columns to summary_100_eval for both models.
#
# Each model's summaries are scored against the source text in the 'sentence' column.
# All rows of a model go through a single score() call instead of one call per row:
# score() performs its model setup on every call (see the bert_score docs), so the
# per-row loop repeated that work 2 * len(summary_100_eval) times.
# NOTE(review): batched scores may differ from per-row scores in the last float digits.
# NOTE(review): num_layers=1 is kept from the original; confirm that restricting BERT
# to a single layer is intended.

# The source texts are the references for both models' summaries
reference_texts = summary_100_eval['sentence'].tolist()

# 'mistral' first so the new columns are created in the same order as before
for model_name in ('mistral', 'llama'):
    candidate_texts = summary_100_eval[f'summary_{model_name}'].tolist()
    precision, recall, f1 = score(candidate_texts, reference_texts, lang="en", model_type="bert-base-uncased", num_layers=1)

    # score() returns one value per (candidate, reference) pair in input order, so whole
    # columns can be assigned positionally; this does not depend on a 0..n-1 index
    summary_100_eval[f'precision_{model_name}'] = precision.tolist()
    summary_100_eval[f'recall_{model_name}'] = recall.tolist()
    summary_100_eval[f'f1_{model_name}'] = f1.tolist()

# Display the head of the updated DataFrame
summary_100_eval.head()

Unnamed: 0,article_id,sentence,processed_sentence,summary_mistral,summary_llama,precision_mistral,recall_mistral,f1_mistral,precision_llama,recall_llama,f1_llama
0,50450,Association for Information Systems AIS Electr...,association for information systems ais electr...,The study aims to examine the differences in ...,\n\nThe article presents the research of the \...,0.718787,0.65872,0.687444,0.732148,0.573293,0.643055
1,13667,Communications of the Association for Informat...,communications of the association for informat...,The longitudinal study on the supply and dema...,Default Summary Default Summary Default Summary,0.710817,0.731577,0.721048,0.185275,0.166151,0.175193
2,11457,Effective organizational improvisation in info...,effective organizational improvisation in info...,The study focuses on understanding organizati...,Default Summary \nThe text is an abstract of a...,0.704064,0.697286,0.700659,0.69276,0.581827,0.632466
3,46549,Researching the Costs of Information Systems T...,researching the costs of information systems t...,The paper examines research gaps in understan...,\n\nThe text summarizes a research paper that ...,0.630737,0.665575,0.647688,0.721754,0.666411,0.692979
4,30587,Holographic Recommendations in Brick - and - M...,holographic recommendations in brick and morta...,The researchers propose an in-store recommend...,\n\nThe text presents a research project that ...,0.67258,0.657886,0.665152,0.715184,0.600278,0.652712


In [19]:
# Persist summary_100_eval (summaries plus their BERTScore columns) to a pickle
with open('Datasets/summary_100_eval.pkl', 'wb') as pickle_file:
    pickle.dump(summary_100_eval, pickle_file)

Now on to the 300 dataframes

In [3]:
# Read the pickled grouped_sentences_300_l dataframe (holds the summary_llama column).
# pickle.load can execute code embedded in the file, so only load pickles produced by this project.
with open('Datasets/grouped_sentences_300_l.pkl', 'rb') as pickle_file:
    grouped_sentences_300_l = pickle.load(pickle_file)

In [4]:
# Preview the first rows of grouped_sentences_300_l to check the loaded columns
grouped_sentences_300_l.head()

Unnamed: 0,article_id,sentence,processed_sentence,summary_llama
17899,17917,The firm 's continuance intentions to use inte...,the firm continuance intentions to use inter o...,\n \n\nThe text reviews the literature on the ...
9564,9570,The Philosopher 's Corner : Beyond Epistemolog...,the philosopher corner beyond epistemology and...,\n\nThe text is a summary of a philosophical d...
45598,45666,Interplay of Competition where ln Innovation w...,interplay of competition where ln innovation w...,\n\nThe text summarizes the main idea of the r...
39260,39308,The Impact of Crowdsourcing on Organisational ...,the impact of crowdsourcing on organisational ...,"\nThe paper examines how crowdmapping, a form ..."
23407,23440,Association for Information Systems Associatio...,association for information systems associatio...,Default Summary Default Summary Default Summar...


In [5]:
# Read the pickled grouped_sentences_300_m dataframe (holds the summary_mistral column).
# pickle.load can execute code embedded in the file, so only load pickles produced by this project.
with open('Datasets/grouped_sentences_300_m.pkl', 'rb') as pickle_file:
    grouped_sentences_300_m = pickle.load(pickle_file)

In [6]:
# Preview the first rows of grouped_sentences_300_m to check the loaded columns
grouped_sentences_300_m.head()

Unnamed: 0,article_id,sentence,processed_sentence,summary_mistral
17899,17917,The firm 's continuance intentions to use inte...,the firm continuance intentions to use inter o...,The research focuses on examining the influen...
9564,9570,The Philosopher 's Corner : Beyond Epistemolog...,the philosopher corner beyond epistemology and...,The discussion revolves around interpreting P...
45598,45666,Interplay of Competition where ln Innovation w...,interplay of competition where ln innovation w...,The text discusses an analysis of competition...
39260,39308,The Impact of Crowdsourcing on Organisational ...,the impact of crowdsourcing on organisational ...,The research focuses on understanding the imp...
23407,23440,Association for Information Systems Associatio...,association for information systems associatio...,The researchers focus on understanding the pr...


In [7]:
# Build summary_300: every column of grouped_sentences_300_m plus the
# 'summary_llama' column taken from grouped_sentences_300_l.
# pd.concat along the column axis matches rows by index label, so this relies on
# both frames sharing the same index.
llama_summaries_300 = grouped_sentences_300_l['summary_llama']
summary_300 = pd.concat(
    [grouped_sentences_300_m, llama_summaries_300],
    axis='columns',
)

In [8]:
# Preview the first rows of summary_300 to confirm both summary columns are present
summary_300.head()

Unnamed: 0,article_id,sentence,processed_sentence,summary_mistral,summary_llama
17899,17917,The firm 's continuance intentions to use inte...,the firm continuance intentions to use inter o...,The research focuses on examining the influen...,\n \n\nThe text reviews the literature on the ...
9564,9570,The Philosopher 's Corner : Beyond Epistemolog...,the philosopher corner beyond epistemology and...,The discussion revolves around interpreting P...,\n\nThe text is a summary of a philosophical d...
45598,45666,Interplay of Competition where ln Innovation w...,interplay of competition where ln innovation w...,The text discusses an analysis of competition...,\n\nThe text summarizes the main idea of the r...
39260,39308,The Impact of Crowdsourcing on Organisational ...,the impact of crowdsourcing on organisational ...,The research focuses on understanding the imp...,"\nThe paper examines how crowdmapping, a form ..."
23407,23440,Association for Information Systems Associatio...,association for information systems associatio...,The researchers focus on understanding the pr...,Default Summary Default Summary Default Summar...


In [11]:
# Work on an independent (deep) copy so the score columns added to
# summary_300_eval do not show up on summary_300
summary_300_eval = summary_300.copy(deep=True)

In [13]:
# Replace the index of summary_300_eval with a fresh 0..n-1 index (the old labels are dropped).
# Later processing addresses rows by label 0, 1, 2, ..., which fails on the original labels
# (possibly left over from an earlier random sample of the data).
# Reassigning instead of inplace=True avoids mutating the frame behind the reader's back.
summary_300_eval = summary_300_eval.reset_index(drop=True)

In [14]:
# Add BERTScore precision / recall / F1 columns to summary_300_eval for both models.
#
# Each model's summaries are scored against the source text in the 'sentence' column.
# All rows of a model go through a single score() call instead of one call per row:
# score() performs its model setup on every call (see the bert_score docs), so the
# per-row loop repeated that work 2 * len(summary_300_eval) times.
# NOTE(review): batched scores may differ from per-row scores in the last float digits.
# NOTE(review): num_layers=1 is kept from the original; confirm that restricting BERT
# to a single layer is intended.

# The source texts are the references for both models' summaries
reference_texts = summary_300_eval['sentence'].tolist()

# 'mistral' first so the new columns are created in the same order as before
for model_name in ('mistral', 'llama'):
    candidate_texts = summary_300_eval[f'summary_{model_name}'].tolist()
    precision, recall, f1 = score(candidate_texts, reference_texts, lang="en", model_type="bert-base-uncased", num_layers=1)

    # score() returns one value per (candidate, reference) pair in input order, so whole
    # columns can be assigned positionally; this does not depend on a 0..n-1 index
    summary_300_eval[f'precision_{model_name}'] = precision.tolist()
    summary_300_eval[f'recall_{model_name}'] = recall.tolist()
    summary_300_eval[f'f1_{model_name}'] = f1.tolist()

# Display the head of the updated DataFrame
summary_300_eval.head()

Unnamed: 0,article_id,sentence,processed_sentence,summary_mistral,summary_llama,precision_mistral,recall_mistral,f1_mistral,precision_llama,recall_llama,f1_llama
0,17917,The firm 's continuance intentions to use inte...,the firm continuance intentions to use inter o...,The research focuses on examining the influen...,\n \n\nThe text reviews the literature on the ...,0.711649,0.708032,0.709836,0.65356,0.618974,0.635797
1,9570,The Philosopher 's Corner : Beyond Epistemolog...,the philosopher corner beyond epistemology and...,The discussion revolves around interpreting P...,\n\nThe text is a summary of a philosophical d...,0.683973,0.678129,0.681038,0.732366,0.628799,0.676642
2,45666,Interplay of Competition where ln Innovation w...,interplay of competition where ln innovation w...,The text discusses an analysis of competition...,\n\nThe text summarizes the main idea of the r...,0.695689,0.65517,0.674822,0.611366,0.636767,0.623808
3,39308,The Impact of Crowdsourcing on Organisational ...,the impact of crowdsourcing on organisational ...,The research focuses on understanding the imp...,"\nThe paper examines how crowdmapping, a form ...",0.629071,0.617529,0.623247,0.650022,0.604516,0.626444
4,23440,Association for Information Systems Associatio...,association for information systems associatio...,The researchers focus on understanding the pr...,Default Summary Default Summary Default Summar...,0.710848,0.747031,0.728491,0.640211,0.524985,0.576901


In [15]:
# Persist summary_300_eval (summaries plus their BERTScore columns) to a pickle
with open('Datasets/summary_300_eval.pkl', 'wb') as pickle_file:
    pickle.dump(summary_300_eval, pickle_file)