In [None]:
import os
from collections import Counter
from itertools import combinations

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from nltk import ngrams, word_tokenize
from scipy.optimize import curve_fit
# Download the 'punkt_tab' NLTK resource (presumably the tokenizer data that
# word_tokenize relies on -- confirm against the installed NLTK version).
nltk.download('punkt_tab')

In [None]:
# Location of the ChiSCor master CSV. Set the CHISCOR_MASTER_CSV environment
# variable to point at your own copy of the file; when it is unset, the
# original author's local path is used so existing runs keep working.
path_name = os.environ.get(
    'CHISCOR_MASTER_CSV',
    '/Users/sperdijk/Documents/PhD/Datasets/Pretraining/chisor_dataset_all/ChiSCor_CoNLL_paper/csv/ChiSCor_master_df_password/ChiSCor_master_df.csv',
)

In [None]:
# Load the master table; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv(path_name, index_col=0)
# Bare expression: renders the frame as this cell's output.
df

In [None]:
# Base colormap resampled to 512 entries. `plt.get_cmap` is used instead of
# `cm.get_cmap`, which was deprecated in Matplotlib 3.7 and removed in 3.9.
oldcmp = plt.get_cmap('RdYlBu', 512)

# Colour at the very bottom of the base colormap; used for the fitted line.
line_color = oldcmp(0)
# Colormap for the hexbin density: 256 colours sampled from the 0.23-1.0 range
# of the base map, i.e. the lowest ~23% of the map is cut off.
newcmp = ListedColormap(oldcmp(np.linspace(0.23, 1., 256)))

### Lemmatized

In [None]:
def zipf(x, a, c):
    """Evaluate the power-law (Zipf) model ``c * x**(-a)``.

    Parameters
    ----------
    x : scalar or array-like
        Rank(s) at which to evaluate the model.
    a : float
        Exponent of the power law (applied with a negative sign).
    c : float
        Multiplicative scale factor.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Model value(s), with the same shape as ``x``.
    """
    # Cast to float first: NumPy raises ValueError when an integer array
    # (such as the rank array) is raised to a negative integer power.
    x = np.asarray(x, dtype=float)
    return c * x ** (-a)

def calculate_and_plot_zipf_theirs(data):
    """Plot ranked 4-gram frequencies on log-log axes with a fitted Zipf curve.

    Parameters
    ----------
    data : iterable of str
        Texts to analyse (a pandas Series or a plain list). Missing entries
        (``None``/``NaN``) are skipped.

    Notes
    -----
    Every text is lower-cased and tokenized with ``word_tokenize``; the tokens
    of all texts are concatenated into one stream before the 4-grams are
    built, so a 4-gram can span the boundary between two consecutive texts.
    The plot is drawn on a new matplotlib figure and nothing is returned.
    """
    # Combine all text and tokenize, skipping missing entries
    tokens = []
    for entry in data:
        if pd.isna(entry):
            continue
        tokens.extend(word_tokenize(entry.lower()))

    # Generate 4-grams
    four_grams = list(ngrams(tokens, 4))

    # Count frequencies
    freq_dist = Counter(four_grams)

    # Rank by frequency (rank 1 = most frequent)
    freqs = np.array(sorted(freq_dist.values(), reverse=True))
    ranks = np.arange(1, len(freqs) + 1)

    # Least-squares fit of c * x**(-a); param is (a, c)
    param, param_cov = curve_fit(zipf, ranks, freqs)
    zipf_fit = zipf(ranks, *param)

    # Plot
    plt.figure(figsize=(8, 6))
    plt.loglog(ranks, freqs, 'o', label='4-gram frequencies')
    # The model is x**(-a), so the exponent is shown with its minus sign.
    plt.loglog(ranks, zipf_fit, label=f'Zipf Fit, ∝ x^-{param[0]:.2f}')
    plt.xlabel('Frequency Rank')
    plt.ylabel('Frequency')
    plt.title('4-gram Frequency Distribution with Zipf Fit')
    plt.legend()
    plt.grid(True, which="both", ls="--", linewidth=0.5)
    plt.tight_layout()

def calculate_and_plot_zipf_mine(data, line_color='red', cmap='Blues'):
    """Plot ranked 4-gram frequencies as a log10 hexbin density with a Zipf fit.

    Parameters
    ----------
    data : iterable of str
        Texts to analyse (a pandas Series or a plain list). Missing entries
        (``None``/``NaN``) are skipped.
    line_color : matplotlib colour, default 'red'
        Colour of the fitted Zipf line.
    cmap : str or matplotlib Colormap, default 'Blues'
        Colormap of the hexbin density.

    Notes
    -----
    Every text is lower-cased and tokenized with ``word_tokenize``; the tokens
    of all texts are concatenated into one stream before the 4-grams are
    built, so a 4-gram can span the boundary between two consecutive texts.
    The plot is drawn on a new matplotlib figure and nothing is returned.
    """
    # Combine all text and tokenize, skipping missing entries
    tokens = []
    for entry in data:
        if pd.isna(entry):
            continue
        tokens.extend(word_tokenize(entry.lower()))

    # Generate 4-grams
    four_grams = list(ngrams(tokens, 4))

    # Count frequencies
    freq_dist = Counter(four_grams)

    # Rank by frequency (rank 1 = most frequent)
    freqs = np.array(sorted(freq_dist.values(), reverse=True))
    ranks = np.arange(1, len(freqs) + 1)

    # Least-squares fit of c * x**(-a); param is (a, c)
    param, param_cov = curve_fit(zipf, ranks, freqs)
    zipf_fit = zipf(ranks, *param)

    # Hexbin plot in log10 space; honour the caller's colormap instead of
    # reaching for a notebook-level global.
    plt.figure(figsize=(8, 6))
    hb = plt.hexbin(np.log10(ranks), np.log10(freqs), gridsize=50, cmap=cmap, mincnt=1)
    cb = plt.colorbar(hb)
    cb.set_label('Counts per hexbin')

    # Plot fitted Zipf line
    plt.plot(np.log10(ranks), np.log10(zipf_fit), color=line_color,
             label=f'Zipf Fit, ∝ {round(param[1], 2)} * x^-{param[0]:.2f}')

    plt.xlabel('log10(Frequency Rank)')
    plt.ylabel('log10(Frequency)')
    plt.title('4-gram Frequency Distribution with Zipf Fit')
    plt.legend()
    plt.grid(True, which="both", ls="--", linewidth=0.5)
    plt.tight_layout()

In [None]:
# Fit and plot the Zipf curve for the 'story_lemmatized' column (log-log scatter).
data = df['story_lemmatized']
calculate_and_plot_zipf_theirs(data)


In [None]:
# Same column, drawn as a log10 hexbin density with the notebook's line colour and colormap.
calculate_and_plot_zipf_mine(data, line_color=line_color, cmap=newcmp)

### Not lemmatized

In [None]:
# Repeat the hexbin Zipf plot on the 'story_raw_no_newlines' column.
data_raw = df['story_raw_no_newlines']
calculate_and_plot_zipf_mine(data_raw, line_color=line_color, cmap=newcmp)

### SimpleStories

In [None]:
from datasets import load_dataset

In [None]:
# Load the 'SimpleStories/SimpleStories' dataset via `datasets.load_dataset`
# (presumably fetched from the Hugging Face Hub or a local cache -- confirm).
simplystories = load_dataset('SimpleStories/SimpleStories')

In [None]:
# Keep a 10% sample of the 'train' split; seed=42 makes the split repeatable.
subset = simplystories['train'].train_test_split(train_size=0.1, seed=42)['train']


In [None]:
# Hexbin Zipf plot for the 'story' field of the sampled subset.
calculate_and_plot_zipf_mine(subset['story'], line_color=line_color, cmap=newcmp)

In [None]:
def _select_non_overlapping_counts(ranked_ngrams, n):
    """Greedily keep n-grams that share at most ``n - 2`` distinct words with every kept one.

    ``ranked_ngrams`` is an iterable of ``(ngram, count)`` pairs and is visited
    in the given order. Returns the counts of the kept n-grams in that order.

    Two n-grams share at least ``n - 1`` distinct words exactly when some
    ``(n - 1)``-element subset of one word set is also a subset of the other.
    Remembering those subsets for every kept n-gram therefore replaces a scan
    over all kept n-grams with a handful of hash lookups per candidate.
    """
    subset_size = n - 1
    seen_subsets = set()
    kept_counts = []
    for ngram, count in ranked_ngrams:
        # Word position does not matter, so work on the set of distinct words.
        subsets = [frozenset(combo) for combo in combinations(set(ngram), subset_size)]
        if any(subset in seen_subsets for subset in subsets):
            continue
        kept_counts.append(count)
        seen_subsets.update(subsets)
    return kept_counts


def calculate_and_plot_zipf_greedy(data, line_color='red', cmap='Blues'):
    """Plot a Zipf fit over 4-grams after greedy removal of near-duplicates.

    Parameters
    ----------
    data : iterable of str
        Texts to analyse (a pandas Series or a plain list). Missing entries
        (``None``/``NaN``) are skipped.
    line_color : matplotlib colour, default 'red'
        Colour of the fitted Zipf line.
    cmap : str or matplotlib Colormap, default 'Blues'
        Colormap of the hexbin density.

    Notes
    -----
    4-grams are visited from most to least frequent; one is kept only if it
    shares at most 2 distinct words with every 4-gram kept so far. The kept
    frequencies are ranked, fitted with ``zipf`` and drawn as a log10 hexbin
    density on a new matplotlib figure. Nothing is returned.
    """
    # Tokenize all text, skipping missing entries
    tokens = []
    for entry in data:
        if pd.isna(entry):
            continue
        tokens.extend(word_tokenize(entry.lower()))

    # Generate 4-grams
    n = 4
    freq_dist = Counter(ngrams(tokens, n))

    # Sort by frequency descending (stable: ties keep first-seen order)
    sorted_ngrams = sorted(freq_dist.items(), key=lambda item: item[1], reverse=True)

    # Greedy filtering: only keep n-grams with <=(n-2) word overlap
    selected_freqs = _select_non_overlapping_counts(sorted_ngrams, n)

    # Prepare ranks and frequencies; the kept counts are already in
    # descending order because the candidates were visited that way.
    freqs = np.array(selected_freqs)
    ranks = np.arange(1, len(freqs) + 1)

    # Least-squares fit of c * x**(-a); param is (a, c)
    param, _ = curve_fit(zipf, ranks, freqs)
    zipf_fit = zipf(ranks, *param)

    # Plot
    plt.figure(figsize=(8, 6))
    hb = plt.hexbin(np.log10(ranks), np.log10(freqs), gridsize=50, cmap=cmap, mincnt=1)
    cb = plt.colorbar(hb)
    cb.set_label('Counts per hexbin')

    plt.plot(np.log10(ranks), np.log10(zipf_fit), color=line_color,
             label=f'Zipf Fit, ∝ {round(param[1], 2)} * x^-{param[0]:.2f}')

    plt.xlabel('log10(Frequency Rank)')
    plt.ylabel('log10(Frequency)')
    plt.title('Filtered 4-gram Frequency Distribution with Zipf Fit')
    plt.legend()
    plt.grid(True, which="both", ls="--", linewidth=0.5)
    plt.tight_layout()

In [None]:
# Overlap-filtered Zipf plot for the raw (non-lemmatized) story column.
calculate_and_plot_zipf_greedy(data_raw, line_color=line_color, cmap=newcmp)

In [None]:
# Overlap-filtered Zipf plot for the 'story' field of the sampled subset.
calculate_and_plot_zipf_greedy(subset['story'], line_color=line_color, cmap=newcmp)