# Measures

In [33]:
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from scipy.stats import pearsonr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Function: preprocess_text 
Parameters: text (type: string).
This function preprocesses the text of the highlights (titles) and the articles by removing punctuation, converting all text to lowercase, and removing stopwords.

In [37]:
# Load the existing data from CSV; later cells read its 'id', 'highlights' and 'articles' columns
df = pd.read_csv('articles.csv')

# Function to preprocess text
def preprocess_text(text, stop_words=None):
    """Normalise one piece of text for the similarity and length measures.

    Punctuation is dropped (only alphanumeric and whitespace characters are
    kept), the text is lowercased, and stopwords are removed. Tokens are
    re-joined with single spaces.

    Parameters
    ----------
    text : str
        The raw text. ``None`` and float NaN (what ``pd.read_csv`` produces
        for an empty cell) are treated as empty text; any other non-string
        value is converted with ``str()`` first.
    stop_words : collection of str, optional
        Words to remove after lowercasing. Defaults to scikit-learn's
        ``ENGLISH_STOP_WORDS``.

    Returns
    -------
    str
        The cleaned text; ``''`` when the input is missing or nothing is left.
    """
    # Missing cells arrive as None / NaN; iterating over them would raise TypeError
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS

    # Remove punctuation and convert to lowercase
    cleaned = ''.join(char.lower() for char in text if char.isalnum() or char.isspace())
    # Remove stopwords (punctuation is already gone, so "don't" is compared as "dont")
    return ' '.join(word for word in cleaned.split() if word not in stop_words)

# Run the same cleaning routine element-wise over both raw text columns,
# storing the results in new 'preprocessed_*' columns
df['preprocessed_highlights'] = df['highlights'].map(preprocess_text)
df['preprocessed_articles'] = df['articles'].map(preprocess_text)

Cosine Similarity: In the context of TF-IDF vectors, it is used to measure how similar the documents are irrespective of their size.

In [38]:
# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit a single vectorizer on highlights followed by articles so that both
# halves share the same vocabulary / feature space
combined_texts = df['preprocessed_highlights'].tolist() + df['preprocessed_articles'].tolist()
tfidf_matrix = vectorizer.fit_transform(combined_texts)

# Rows [0, half_n) are the highlights and rows [half_n, 2 * half_n) are the
# matching articles. Read the row count from the matrix shape: calling
# .toarray() only to count rows would materialise the whole dense
# (documents x vocabulary) array in memory.
half_n = tfidf_matrix.shape[0] // 2
tfidf_highlights = tfidf_matrix[:half_n]
tfidf_articles = tfidf_matrix[half_n:]

# Calculate the cosine similarity for each (highlight, article) pair;
# each call returns a 1x1 matrix, hence the [0][0]
cosine_similarities = [
    cosine_similarity(tfidf_highlights[i], tfidf_articles[i])[0][0]
    for i in range(half_n)
]

# Add the cosine similarity to the DataFrame
df['cosine_similarity'] = cosine_similarities

Compression Ratio: Calculates the ratio of the length (in characters) of the preprocessed highlights to the length of the preprocessed articles for each pair. This is typically used to measure how much shorter the highlights are compared to the articles. The mean is 0.03, which indicates a high level of condensation, where the summaries are very brief compared to the original articles.

Pearson Correlation Coefficient: Measures the linear correlation between the lengths of the articles and their highlights. A correlation coefficient closer to 1 or -1 implies a stronger linear relationship. The result is 0.05. This implies that as the length of the articles increases, there is a slight tendency for the summaries to be longer, but this tendency is not strong. 

P-value: Provides the probability of observing a correlation at least as strong as the one measured if the null hypothesis (no association) is true. Small p-values (typically ≤ 0.05) suggest strong evidence against the null hypothesis. The result is 0.34158127783660464, which is well above 0.05, so there is no statistically significant evidence of a linear correlation between the lengths of the articles and their summaries.


In [40]:
# Character lengths of the preprocessed texts, computed once and reused for
# both the compression ratio and the correlation
highlight_lengths = df['preprocessed_highlights'].str.len()
article_lengths = df['preprocessed_articles'].str.len()

# Compression ratio for each row = highlight length / article length.
# Rows whose article is empty after preprocessing get 0 rather than inf/NaN.
df['compression_ratio'] = (highlight_lengths / article_lengths).where(article_lengths > 0, 0.0)

# Calculate correlation and p-value once for the entire dataset
correlation, p_value = pearsonr(article_lengths, highlight_lengths)

# These are dataset-level scalars; assigning them broadcasts the same value
# to every row so they travel with the exported table
df['correlation'] = correlation
df['p_value'] = p_value

# Raw fields first, then the cleaned text, then the computed measures.
# Columns that are not listed here are dropped from the exported table.
column_order = [
    'id',
    'articles',
    'highlights',
    'preprocessed_highlights',
    'preprocessed_articles',
    'cosine_similarity',
    'compression_ratio',
    'correlation',
    'p_value',
]
df = df.loc[:, column_order]

# Write the enriched table to disk (without the index) and confirm on stdout
df.to_csv('articles_with_preprocessed_and_measures.csv', index=False)
print("Data saved to 'articles_with_preprocessed_and_measures.csv'")

Data saved to 'articles_with_preprocessed_and_measures.csv'
