## All imports

In [1]:
import pandas as pd
import pandoc
import pickle 
import re
from collections import Counter
from collections import defaultdict
import numpy as np
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from rouge_score import rouge_scorer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import pearsonr
import statsmodels.api as sm
import ast

[nltk_data] Downloading package punkt to
[nltk_data]     /home/sh.aubakirov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Dataset

In [2]:
def convert_string_to_list(string):  # for dataset of 17038 article texts
    """Parse a stringified Python literal (e.g. ``"['a', 'b']"``) into an object.

    Parameters
    ----------
    string : str
        Value to parse with ``ast.literal_eval``.

    Returns
    -------
    object
        The evaluated literal, or ``string`` itself, unchanged, when it is
        not a valid Python literal.
    """
    try:
        return ast.literal_eval(string)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval signals malformed input with several exception types:
        # ValueError for non-literal nodes, SyntaxError for text that is not
        # valid Python at all (e.g. plain prose), TypeError for literals such
        # as a dict with an unhashable key. In every case fall back to the
        # original value.
        return string

In [3]:
# Load the merged dataset and turn the stringified text columns back into
# Python objects in a single chained expression.
dataset = pd.read_csv('ds_merged_5_methods.csv', sep=';').assign(
    article_text=lambda frame: frame['article_text'].apply(convert_string_to_list),
    abstract_text=lambda frame: frame['abstract_text'].apply(convert_string_to_list),
)
dataset.head()

Unnamed: 0,article_id,article_text,abstract_text,section_names,sections,len_text,len_abstract,sent_ext,word_ext,word_inf,...,len_sum_vot,len_sum_vns_init_greed,best_summary_gen,best_rouge1_gen,best_rouge2_gen,len_sum_gen,best_summary_gen_greedinit,best_rouge1_gen_greedinit,best_rouge2_gen_greedinit,len_sum_gen_greedinit
0,astro-ph0202198,[the study of supernovae ( sne ) has greatly a...,[large numbers of supernovae ( sne ) have been...,"['introduction', 'method', 'results', 'future ...",[['the study of supernovae ( sne ) has greatly...,341,12,0.0,0.897436,0.076004,...,15,5,"['once discovered , the study of a particular ...",0.565217,0.157718,12,['the method presented here can become signifi...,0.568182,0.153094,12
1,0905.2691,[it is believed that solar magnetic fields are...,[we investigate the emergence of magnetic flux...,"['introduction', 'observations and data reduct...",[['it is believed that solar magnetic fields a...,329,11,0.0,0.884892,0.073258,...,9,9,"[', the process of flux emergence occurs on ve...",0.598753,0.2881,11,['an important result is that 23% of the loops...,0.598778,0.282209,11
2,1305.7010,[the origin - destination ( od ) matrix is imp...,[the estimation of the number of passengers wi...,"['introduction', 'the passenger model', 'the o...",[['the origin - destination ( od ) matrix is i...,225,10,0.0,0.740157,0.074367,...,19,14,['+ the purpose of this paper is then to devel...,0.528455,0.130612,10,['the ad - hoc estimation is plotted in green ...,0.524109,0.113684,10
3,1402.0371,[multiscale dynamics is present in many phenom...,[the classical structure - function ( sf ) met...,"['introduction', 'detrending analysis and detr...",[['multiscale dynamics is present in many phen...,216,15,0.066667,0.949045,0.171659,...,11,11,['power - law behavior is observed on a large ...,0.623656,0.354391,15,['this procedure is designated as detrending a...,0.736508,0.487261,15
4,1701.00774,[the @xmath3-transformation has been extensive...,"[given a real number @xmath0 , we study the as...","['introduction', 'coded negative beta-shift', ...",[['the @xmath3-transformation has been extensi...,353,11,0.0,0.867925,0.059547,...,4,4,['we understand the closure of the set of expa...,0.599496,0.222785,11,['the previous theorem can be proved also just...,0.560185,0.2,11


In [None]:
# Show the article_text column after it has been parsed by convert_string_to_list.
dataset['article_text']

## Functions required

In [None]:
def clean_text(text):
    """Remove every character that is neither a word character nor whitespace, then lower-case the result."""
    stripped = re.sub(r'[^\w\s]', '', text)
    lowered = stripped.lower()
    return lowered
    
def split_into_sentences(text):
    """Return ``sent_tokenize(text)``, using the tokenizer imported from ``nltk.tokenize``."""
    return sent_tokenize(text)

def distance_to_line(x0, y0, b, a):
    """Perpendicular distance from the point (x0, y0) to the line y = b*x + a."""
    numerator = np.abs(b*x0 - y0 + a)
    denominator = np.sqrt(b**2+1)
    return numerator/denominator

def compute_line_params(x_data, y_data):
    """Return (slope, intercept) of the straight line through the first and last data points."""
    x_first, x_last = x_data[0], x_data[-1]
    y_first, y_last = y_data[0], y_data[-1]
    slope = (y_last - y_first) / (x_last - x_first)
    intercept = y_first - slope * x_first
    return slope, intercept

def exp_decreasing(x, a, b, c):
    """Evaluate the exponential model a * exp(-b * x) + c at x."""
    decay = np.exp(-b * x)
    return a * decay + c

## Keep the article texts and abstract texts separately 

In [None]:
# Index-keyed copies of the two text columns (key = dataset row index).
article_texts = {paper_id: text for paper_id, text in dataset['article_text'].items()}
abstract_texts = {paper_id: text for paper_id, text in dataset['abstract_text'].items()}

# One empty list per article key.
keys = {paper_id: [] for paper_id in article_texts}
#keys

In [None]:
# Inspect the text stored for the article at dataset index 4741.
article_texts[4741]

#### The function below processes a dictionary of article texts: it joins each article's text segments, splits the result into sentences, and cleans each sentence to obtain its unique words. For every word it counts the number of sentences in which the word appears and stores those sentences. The results are returned as two nested dictionaries (a "dictionary of dictionaries"), both keyed by article index: one holds, per article, the sentence count of each word; the other maps each word to the sentences it appears in.

In [None]:
def count_sentences_with_unique_words_per_article_fixed(article_texts):
    """Count, for every article, in how many sentences each word occurs.

    Parameters
    ----------
    article_texts : object with ``.items()``
        Maps an article key to its text: either an iterable of text segments
        (joined with single spaces) or one plain string.

    Returns
    -------
    tuple of (dict, dict)
        ``counts[key][word]`` is the number of sentences of article ``key``
        that contain ``word`` (a word repeated inside one sentence counts
        once). ``sets[key][word]`` is the set of cleaned sentences that
        contain ``word``. The inner mappings are ``defaultdict`` instances.
    """
    article_word_sentence_counts = {}
    article_word_sentence_sets = {}

    for index, text_list in article_texts.items():
        word_sentence_counts = defaultdict(int)
        word_sentence_sets = defaultdict(set)

        # A text that is still one plain string must not go through join():
        # joining a str iterates its characters and would insert a space
        # between every pair of them.
        if isinstance(text_list, str):
            combined_text = text_list
        else:
            combined_text = ' '.join(text_list)

        for sentence in split_into_sentences(combined_text):
            cleaned_sentence = clean_text(sentence)
            # set(): each word is counted at most once per sentence.
            # str.split() never yields empty strings, so no extra filtering
            # of tokens is required here.
            for word in set(cleaned_sentence.split()):
                word_sentence_counts[word] += 1
                word_sentence_sets[word].add(cleaned_sentence)

        article_word_sentence_counts[index] = word_sentence_counts
        article_word_sentence_sets[index] = word_sentence_sets
    return article_word_sentence_counts, article_word_sentence_sets
# Build the per-article word statistics. The Series is passed directly; the function only needs .items().
dict_id_words_counts, dict_id_words_sentences = count_sentences_with_unique_words_per_article_fixed(dataset['article_text'])

### Here we track the number of sentences in each article text. At first we computed these counts ourselves; we then decided to use the sentence counts already provided in the dataset (`len_text`) instead.

In [None]:
# First pass: number of distinct cleaned sentences collected for each article.
number_of_sentences_per_article = {
    article_id: len(set().union(*word_sentence_sets.values()))
    for article_id, word_sentence_sets in dict_id_words_sentences.items()
}

number_of_sentences_article_texts_real = dataset['len_text'].to_numpy()
number_of_sentences_abstract_texts_real = dataset['len_abstract'].to_numpy()

# Second pass: overwrite the computed counts, position by position, with the
# sentence counts stored in the dataset's len_text column.
number_of_sentences_per_article.update(
    zip(list(number_of_sentences_per_article), number_of_sentences_article_texts_real)
)

#number_of_sentences_per_article

### Function that finds optimal min_df values for each document

In [None]:
# One empty list per article key. NOTE(review): this name is rebound to a plain list in the later ROUGE-vs-min_df cell.
min_df_values = {key: [] for key in keys.keys()}

def find_min_df(key, threshold):
    """Derive a ``min_df`` proportion for one article from its word-frequency curve.

    The sentence counts of all words of article ``key`` that occur in at
    least ``threshold`` sentences are sorted in decreasing order and fitted
    with ``exp_decreasing``. For every rank, the distance between the fitted
    point and the chord through the first and last fitted points is
    computed; ranks where ``fitted value - distance`` changes sign are
    collected. The fitted value at the second such rank, divided by the
    article's sentence count, is returned.

    Parameters
    ----------
    key : hashable
        Article key in ``dict_id_words_counts`` and
        ``number_of_sentences_per_article``.
    threshold : int
        Minimum sentence count a word needs in order to be kept.

    Returns
    -------
    float or int
        The ``min_df`` proportion, or the sentinel ``-1`` when it cannot be
        determined (fewer than three words left, the fit does not converge,
        or fewer than two sign changes are found).
    """
    # Only the counts matter below, so sort them directly (most frequent first).
    sorted_counts = sorted(
        (count for count in dict_id_words_counts[key].values() if count >= threshold),
        reverse=True,
    )

    # Three parameters (a, b, c) cannot be fitted to fewer than three points,
    # and an empty list has no max/min for the initial guess.
    if len(sorted_counts) < 3:
        return -1

    words = np.arange(len(sorted_counts))
    counts = np.array(sorted_counts)

    initial_params = [counts.max(), 0.1, counts.min()]
    try:
        params, _ = curve_fit(exp_decreasing, words, counts, p0=initial_params, maxfev=5000)
    except RuntimeError:
        # curve_fit raises RuntimeError when the least-squares fit fails.
        return -1

    exp_values = [exp_decreasing(x, *params) for x in words]
    m, c = compute_line_params(words, exp_values)
    # Reuse the fitted values instead of evaluating the model a second time.
    distances_from_exp = [distance_to_line(x, y, m, c) for x, y in zip(words, exp_values)]

    # Ranks at which (fitted value - distance to chord) changes sign.
    intersection_points = [
        i
        for i in range(1, len(exp_values))
        if (exp_values[i-1] - distances_from_exp[i-1]) * (exp_values[i] - distances_from_exp[i]) < 0
    ]

    try:
        min_df = exp_values[intersection_points[1]] / number_of_sentences_per_article[key]
    except (IndexError, KeyError, ZeroDivisionError):
        # No second sign change, unknown article key, or a zero sentence count.
        min_df = -1

    return min_df

### Function that generates summaries for each article text

In [None]:
def greed_sum(text, num_sent, min_df, max_df=1.0):
    """Greedy extractive summary based on a TF-IDF sentence/term matrix.

    Repeatedly picks the sentence with the largest remaining row sum and
    removes every term column that sentence uses, until no weight is left.
    The first ``num_sent`` picks are then returned in document order.

    Parameters
    ----------
    text : sequence of str
        Sentences of one document; each is one TF-IDF "document".
    num_sent : int
        Maximum number of sentences to keep (applied as ``idx[:num_sent]``).
    min_df, max_df : float or int
        Passed through to ``TfidfVectorizer``.

    Returns
    -------
    list of str
        The selected sentences. If a ``ValueError`` is raised while
        building or using the vectorizer, a one-element list
        ``["Error: <message>"]`` is returned instead, so callers must check
        for that marker before treating the result as a summary.
    """
    try:
        # Fit a TFIDF vectorizer
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
        vectorizer.fit(text)

        # Get the matrix
        X = vectorizer.transform(text).toarray()

        # Get the sentence indices
        idx = []
        # Vectorized total instead of the interpreter-level sum(sum(X)).
        while X.sum() != 0:
            ind = np.argmax(X.sum(axis=1))
            idx.append(ind)

            # Drop the columns of the terms used by the chosen sentence:
            # keep only the columns whose weight in that row is not > 0.
            X = X[:, ~(X[ind] > 0)]

        idx = idx[:num_sent]
        idx.sort()

        summary = [text[i] for i in idx]
        return summary
    except ValueError as e:
        return ["Error: " + str(e)]

### Function that compares the generated summary with the reference (etalon) text

In [None]:
def compute_rouge_score(generated_text, etalon_text):
    """Return the ROUGE-1 F-measure of a generated summary against a reference.

    Parameters
    ----------
    generated_text : str or iterable of str
        Generated summary; an iterable of sentences is joined with spaces.
    etalon_text : str or iterable of str
        Reference text; an iterable of sentences is joined with spaces.

    Returns
    -------
    float
        ``fmeasure`` of the ``rouge1`` score.
    """
    # Only ROUGE-1 is returned, so only ROUGE-1 is computed.
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

    # Joining a plain string would put a space between every character.
    if not isinstance(generated_text, str):
        generated_text = ' '.join(generated_text)
    if not isinstance(etalon_text, str):
        etalon_text = ' '.join(etalon_text)

    # RougeScorer.score expects (target, prediction): reference text first.
    scores = scorer.score(etalon_text, generated_text)
    rouge1_fmeasure = scores['rouge1'].fmeasure
    return rouge1_fmeasure

### The main part of the code. Here we run a grid search over the word-frequency thresholds 0–19, using all the functions defined above, to find the optimal min_df value for each document. Each document has an optimal minimum word frequency (threshold), which in turn yields the min_df value that gives the highest ROUGE score obtained for that document.

In [None]:
# Per-document results of the threshold search, keyed by document id.
optimal_thresholds = {}
rouge1_fmeasure_scores = {}
summary_texts = {}
min_df_values_dict = {}
for document_id in article_texts.keys():
    # A document for which no candidate scores above 0 keeps score 0 and
    # None for its threshold, summary and min_df.
    best_rouge_score = 0
    optimal_threshold = None
    optimal_generated_text = None
    optimal_min_df = None
    reference_text = abstract_texts[document_id]

    for threshold in range(0, 20):
        min_df = find_min_df(document_id, threshold)
        if min_df == -1:
            # Sentinel from find_min_df: no usable min_df for this threshold.
            continue

        # NOTE(review): num_sent is the article's own sentence count here, so
        # the idx[:num_sent] cap in greed_sum presumably never truncates.
        # Confirm whether the abstract length (len_abstract) was intended.
        generated_text = greed_sum(article_texts[document_id], number_of_sentences_per_article[document_id], min_df)

        # greed_sum reports a vectorizer failure in-band as ["Error: ..."];
        # that message is not a summary and must not be scored.
        summary_failed = (
            len(generated_text) == 1
            and isinstance(generated_text[0], str)
            and generated_text[0].startswith("Error: ")
        )
        if summary_failed:
            continue

        current_rouge_score = compute_rouge_score(generated_text, reference_text)
        if current_rouge_score > best_rouge_score:
            best_rouge_score = current_rouge_score
            optimal_threshold = threshold
            optimal_generated_text = generated_text
            optimal_min_df = min_df
    rouge1_fmeasure_scores[document_id] = best_rouge_score
    min_df_values_dict[document_id] = optimal_min_df
    summary_texts[document_id] = optimal_generated_text

    optimal_thresholds[document_id] = optimal_threshold
    print(f"Optimal threshold for document {document_id}: {optimal_threshold}")

### Example of calculating the ROUGE score for document 4741

In [None]:
# Best summary found for document 4741 (None if no threshold scored above 0).
summary_texts[4741]

In [None]:
# Abstract of document 4741, used as the reference text for the ROUGE comparison.
abstract_texts[4741]

In [None]:
# ROUGE-1 F-measure of the stored summary of document 4741 against its abstract.
compute_rouge_score(summary_texts[4741],abstract_texts[4741])

In [None]:
# Replace the per-document score dict with a plain list of its values.
# The isinstance guard keeps the cell re-runnable: on a second run the name
# already refers to the list, which has no .values().
if isinstance(rouge1_fmeasure_scores, dict):
    rouge1_fmeasure_scores = list(rouge1_fmeasure_scores.values())
mean_rouge1_fmeasure = np.mean(rouge1_fmeasure_scores)
print(mean_rouge1_fmeasure)

### Distribution of optimal thresholds 

In [None]:
# Documents for which no threshold scored above 0 are stored as None by the
# search loop; they have no threshold to plot and would break max()/min().
optimal_thresholds_list = [
    threshold_value
    for threshold_value in optimal_thresholds.values()
    if threshold_value is not None
]
if optimal_thresholds_list:
    max_value = max(optimal_thresholds_list)
    min_value = min(optimal_thresholds_list)
    bins = range(min_value, max_value + 2)  # +2 ensures the last bin includes the max value
    plt.figure(figsize=(10, 6))
    plt.hist(optimal_thresholds_list, bins=bins, align='left', rwidth=0.8)  # align='left' centers the bars over the integers
    plt.title('Distribution of optimal thresholds')
    plt.xlabel('Optimal Thresholds')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()
else:
    print("No document has an optimal threshold; nothing to plot.")

In [None]:
data_series = pd.Series(optimal_thresholds_list)

# Draw the boxplot of the optimal thresholds on a fresh 10x6 figure
plt.figure(figsize=(10, 6))
data_series.plot.box()
plt.gca().set(title='Boxplot of Data', ylabel='Values')
plt.show()

***Describing relationship between ROUGE scores and min_df values***

In [None]:
# The scores are a list once the mean-score cell has run; accept the
# original dict form as well so this cell does not depend on that.
scores_in_order = (
    list(rouge1_fmeasure_scores.values())
    if isinstance(rouge1_fmeasure_scores, dict)
    else list(rouge1_fmeasure_scores)
)

# Scores and min_df values were stored in the same document order, so they
# are paired by position. Documents without an optimal min_df (None) are
# dropped from both sides; np.corrcoef cannot handle None.
valid_pairs = [
    (score, value)
    for score, value in zip(scores_in_order, min_df_values_dict.values())
    if value is not None
]
min_df_values = [value for _, value in valid_pairs]
rouge_for_min_df = [score for score, _ in valid_pairs]

correlation_coefficient = np.corrcoef(rouge_for_min_df, min_df_values)[0, 1]
print("Correlation Coefficient:", correlation_coefficient)

plt.figure(figsize=(10, 6))
plt.scatter(min_df_values, rouge_for_min_df, color='blue', alpha=0.7)
plt.title('Scatter Plot of Two Variables')
plt.xlabel('Min_df values')
plt.ylabel('ROUGE1 F-measure scores')
plt.show()

**Distribution of ROUGE scores**

In [None]:
# Histogram of the best ROUGE-1 F-measure per document, followed by the mean.
plt.hist(rouge1_fmeasure_scores, bins=20, alpha=0.5)
plt.gca().set(xlabel='ROUGE-1 F-measure scores', ylabel='Frequency')
plt.show()
print(np.mean(rouge1_fmeasure_scores))