-
Notifications
You must be signed in to change notification settings - Fork 0
/
jbdelta_average.py
163 lines (131 loc) · 6.09 KB
/
jbdelta_average.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from faststylometry import tokenise_remove_pronouns_en
class Corpus:
    """A collection of texts with per-text author and title metadata.

    A corpus is built one of two ways: pre-tokenised, by passing aligned
    ``authors``/``books``/``tokens`` lists to the constructor, or
    incrementally, by starting empty, adding raw texts via
    :meth:`add_book`, and tokenising later via :meth:`tokenise`.
    """

    def __init__(self, authors=None, books=None, tokens=None):
        if authors and books and tokens:
            # Pre-tokenised construction: the three lists must align 1:1.
            assert len(authors) == len(books) == len(tokens), "Length of authors, books, and tokens must be equal"
            self.authors = authors
            self.books = books
            self.tokens = tokens
            # Raw texts were never supplied in this mode.
            self.texts = None
        else:
            # Incremental construction: start empty, fill via add_book().
            self.authors, self.books = [], []
            self.tokens, self.texts = [], []

    def add_book(self, author, book, text):
        """Append one raw text with its metadata (only before tokenisation)."""
        assert not self.tokens, "Cannot add more books if tokens are already initialized"
        self.authors.append(author)
        self.books.append(book)
        self.texts.append(text)

    def tokenise(self, tokenise):
        """Tokenise every stored text with the given tokeniser callable."""
        self.tokens = [tokenise(text) for text in self.texts]
def calculate_word_frequencies(tokens):
    """Count occurrences of each word in a flat list of word tokens.

    Every call site in this file passes a flat list of word strings
    (either one book's token list or the whole corpus flattened).  The
    previous implementation iterated one level too deep
    (``for token_list in tokens: for word in token_list``), which walked
    the *characters* of each word string and therefore returned
    character frequencies instead of word frequencies.  Counting the
    elements of ``tokens`` directly fixes that; the ``token.count(word)``
    usage in ``calculate_z_scores`` confirms word-level counts were the
    intended contract.

    Parameters
    ----------
    tokens : list[str]
        Flat list of word tokens.

    Returns
    -------
    dict[str, int]
        Mapping of word -> number of occurrences (empty dict for empty input).
    """
    word_freq = {}
    for word in tokens:
        # dict.get with a default replaces the if/else membership test.
        word_freq[word] = word_freq.get(word, 0) + 1
    return word_freq
def calculate_z_scores(corpus):
    """Per-document z-scores of word counts relative to the whole corpus.

    For every word that appears anywhere in the corpus, compute the mean
    and standard deviation of its raw per-book counts, then express each
    book's count for that word as a z-score against those statistics.

    Parameters
    ----------
    corpus : Corpus
        Tokenised corpus (``corpus.tokens`` is a list of token lists).

    Returns
    -------
    list[dict]
        One ``{word: z-score}`` dict per book, in corpus order.
    """
    flattened = [tok for book in corpus.tokens for tok in book]
    total_word_freq = calculate_word_frequencies(flattened)

    word_means = {}
    word_stdevs = {}
    for word in total_word_freq:
        counts = [book.count(word) for book in corpus.tokens]
        word_means[word] = np.mean(counts)
        # Substitute a small constant for zero stdev so the division
        # below is always defined.
        word_stdevs[word] = np.std(counts) or 0.01

    z_scores = []
    for book in corpus.tokens:
        doc_freqs = calculate_word_frequencies(book)
        z_scores.append({
            word: (doc_freqs.get(word, 0) - word_means[word]) / word_stdevs[word]
            for word in total_word_freq
        })
    return z_scores
def calculate_burrows_delta(z_scores, test_index):
    """Burrows' delta between one document and every other document.

    Delta is the mean absolute difference of z-scores over the words the
    two documents' score dicts have in common (NaN when they share none).

    Parameters
    ----------
    z_scores : list[dict]
        One ``{word: z-score}`` dict per document.
    test_index : int
        Index of the document to compare against all the others.

    Returns
    -------
    list[tuple[int, float]]
        ``(document_index, delta)`` pairs sorted by ascending delta.
    """
    reference = z_scores[test_index]
    deltas = []
    for idx, other in enumerate(z_scores):
        if idx == test_index:
            continue
        shared = set(reference.keys()) & set(other.keys())
        if shared:
            delta = np.mean([abs(reference[w] - other[w]) for w in shared])
        else:
            # No vocabulary overlap: delta is undefined.
            delta = np.nan
        deltas.append((idx, delta))
    deltas.sort(key=lambda pair: pair[1])
    return deltas
def calculate_corpus_average(corpus):
    """Relative frequency of every word across the whole corpus.

    Flattens all books into one token stream, counts each word, and
    divides by the total count to get corpus-wide relative frequencies.

    Parameters
    ----------
    corpus : Corpus
        Tokenised corpus (``corpus.tokens`` is a list of token lists).

    Returns
    -------
    dict[str, float]
        Mapping of word -> corpus-wide relative frequency.
    """
    flattened = [tok for book in corpus.tokens for tok in book]
    counts = calculate_word_frequencies(flattened)
    total = sum(counts.values())
    return {word: count / total for word, count in counts.items()}
def calculate_standard_deviation(corpus, corpus_average):
    """Per-word standard deviation of relative frequency across books.

    For each word in ``corpus_average``, collect the squared deviation
    of its per-book relative frequency from the corpus-wide average,
    then return the square root of the mean of those squared deviations.
    Books with no tokens are skipped (they would otherwise cause a zero
    division); words left with no observations are omitted from the
    result.

    Fix: the original function ended with a second, unreachable
    ``return`` statement directly after the first — dead code, removed.

    Parameters
    ----------
    corpus : Corpus
        Tokenised corpus (``corpus.tokens`` is a list of token lists).
    corpus_average : dict[str, float]
        Corpus-wide relative frequency per word
        (as produced by ``calculate_corpus_average``).

    Returns
    -------
    dict[str, float]
        Mapping of word -> standard deviation of its relative frequency.
    """
    word_variances = {word: [] for word in corpus_average.keys()}
    for tokens in corpus.tokens:
        book_freq = calculate_word_frequencies(tokens)
        book_size = sum(book_freq.values())
        if book_size == 0:
            # An empty book contributes no observations; skipping avoids
            # dividing by zero below.
            continue
        for word, avg_freq in corpus_average.items():
            freq = book_freq.get(word, 0) / book_size
            word_variances[word].append((freq - avg_freq) ** 2)
    # Square root of the mean squared deviation; drop words that
    # collected no observations (possible only if every book was empty).
    return {word: np.sqrt(np.mean(variances))
            for word, variances in word_variances.items() if variances}
def calculate_text_deviations(corpus, corpus_average, corpus_stdev):
    """Mean absolute z-score of each book against the corpus norm.

    Each book's per-word relative frequencies are turned into z-scores
    using the corpus-wide average and standard deviation, and the mean
    of the absolute z-scores is recorded.  Empty books yield NaN.

    Parameters
    ----------
    corpus : Corpus
        Tokenised corpus (``corpus.tokens`` is a list of token lists).
    corpus_average : dict[str, float]
        Corpus-wide relative frequency per word.
    corpus_stdev : dict[str, float]
        Standard deviation of relative frequency per word.

    Returns
    -------
    list[float]
        One mean-absolute-z-score per book, in corpus order.
    """
    deviations = []
    for tokens in corpus.tokens:
        book_freq = calculate_word_frequencies(tokens)
        book_size = sum(book_freq.values())
        if book_size == 0:
            # No tokens: nothing to score — record NaN as a placeholder.
            deviations.append(np.nan)
            continue
        z_scores = {}
        for word in corpus_average.keys():
            stdev = corpus_stdev[word]
            if stdev == 0:
                # No variation across the corpus -> neutral score.
                z_scores[word] = 0
            else:
                rel_freq = book_freq.get(word, 0) / book_size
                z_scores[word] = (rel_freq - corpus_average[word]) / stdev
        deviations.append(np.mean([abs(z) for z in z_scores.values()]))
    return deviations
# --- Script entry: build a corpus from CSV files and plot each text's
# --- deviation from the corpus norm.

# Directory of input CSVs; column 0 of each file holds the text.
folder = 'csv_files'
corpus = Corpus()
for filename in os.listdir(folder):
    if filename.endswith(".csv"):
        filepath = os.path.join(folder, filename)
        # Files have no header row; join column 0 into one text string.
        df = pd.read_csv(filepath, header=None)
        text = ' '.join(df.iloc[:, 0].astype(str).dropna())
        # The filename (minus extension) doubles as the book title.
        title = filename.replace('.csv', '')
        author = "N/A"  # no author metadata is available in the CSVs
        corpus.add_book(author, title, text)

# Tokenise with faststylometry's English tokeniser (removes pronouns).
corpus.tokenise(tokenise_remove_pronouns_en)

# Corpus-wide statistics, then each text's deviation from the norm.
corpus_average = calculate_corpus_average(corpus)
corpus_stdev = calculate_standard_deviation(corpus, corpus_average)
deviations = calculate_text_deviations(corpus, corpus_average, corpus_stdev)

print("Deviations from Corpus Norm:")
for i, deviation in enumerate(deviations, 1):
    print(f"Response {i}: Mean Absolute Z-Score = {deviation}")

book_titles = [title for title in corpus.books]  # Extract book titles from the corpus
# Plotting the deviations with book titles as x-axis labels
plt.figure(figsize=(15, 10))  # Increase figure size for better readability
plt.bar(book_titles, deviations, color='skyblue')
# NOTE(review): x-axis label says 'Schools' — presumably each CSV is one
# school's text; confirm against the data source.
plt.xlabel('Schools')
plt.ylabel('Mean Absolute Z-Score')
plt.title('Deviations from Corpus Norm')
plt.xticks(rotation=90)  # Rotate x-axis labels to avoid overlap
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()  # Adjust layout
plt.show()