# Preprocess
## Number of words, number of sentences
## Average sentence length, average word length

In [1]:
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize, sent_tokenize

nltk.download('punkt')

tokenizer_regex = RegexpTokenizer(r'\w+')


def preprocess_text(text):
    # remove punctuation for tokenized text
    tokens_without_punctuation = tokenizer_regex.tokenize(text)
    return ' '.join(tokens_without_punctuation)

#count words
def count_words(text):
    words = word_tokenize(text)
    return len(words)

#count sentences
def count_sentences(text):
    sentences = sent_tokenize(text)
    return len(sentences)


[nltk_data] Downloading package punkt to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Find averae word length
def average_word_length(text):
    words = word_tokenize(text)
    total_characters = sum(len(word) for word in words)
    total_words = len(words)
    return (total_characters / total_words)

## Allan Poe

In [3]:
# Allan Poe
file_path_Poe = 'Allan Poe - Full Corpus.txt'

with open(file_path_Poe, 'r') as file:
    text_Poe = file.read()

preprocessed_text_Poe = preprocess_text(text_Poe)

num_words_after_preprocess_Poe = count_words(preprocessed_text_Poe)

print("Number of words after preprocessing:", num_words_after_preprocess_Poe)


Number of words after preprocessing: 199831


In [4]:
num_sentences_Poe = count_sentences(text_Poe)
print("Number of sentences:", num_sentences_Poe)

Number of sentences: 7571


In [5]:
average_sentence_length_Poe = num_words_after_preprocess_Poe / num_sentences_Poe
print("Average sentence length:", average_sentence_length_Poe)

Average sentence length: 26.39426760005283


In [6]:
average_word_length_Poe = average_word_length(preprocessed_text_Poe)
print("Average word length:", average_word_length_Poe)

Average word length: 4.470207325189786


In [7]:
# Split the text by whitespace to get individual words
words_Poe = text_Poe.split()

# Count the number of words
num_words_Poe = len(words_Poe)

print("Number of words in the Allan Poe corpus:", num_words_Poe)


Number of words in the Allan Poe corpus: 196261


In [9]:
# Calculate the number of words in each subset
subset_size = num_words_Poe // 5

# Create 5 subsets (a list of subsets which each subset is list of words)
subsets_Poe = []
for i in range(5):
    subset_start = i * subset_size
    subset_end = (i + 1) * subset_size
    subset = words_Poe[subset_start:subset_end]
    subsets_Poe.append(subset)

# Print the size of each subset
for i, subset in enumerate(subsets_Poe):
    print(f"Subset {i + 1} size:", len(subset))


Subset 1 size: 39252
Subset 2 size: 39252
Subset 3 size: 39252
Subset 4 size: 39252
Subset 5 size: 39252


In [10]:
# Function to calculate average word length in a subset
def average_word_length(subset):
    total_length = sum(len(word) for word in subset)
    return total_length / len(subset)

# Calculate and print average word length for each subset
for i, subset in enumerate(subsets_Poe):
    avg_length = average_word_length(subset)
    print(f"Average word length in Subset {i + 1}: {avg_length:.3f}")


Average word length in Subset 1: 4.746
Average word length in Subset 2: 4.757
Average word length in Subset 3: 4.648
Average word length in Subset 4: 4.655
Average word length in Subset 5: 4.801


In [28]:
# number of sentences in each subset
import re

# Function to split text into sentences
def split_into_sentences(text):
    # Use regular expression to split text into sentences
    # Assumes sentences end with '.', '!', or '?' followed by a space or end of line
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return sentences

# Calculate the number of sentences in each subset
for i, subset in enumerate(subsets_Poe):
    # Join subset words back into a string
    subset_text = ' '.join(subset)
    # Split subset text into sentences
    subset_sentences = split_into_sentences(subset_text)
    print(f"Number of sentences in Subset {i + 1}: {len(subset_sentences)}")


Number of sentences in Subset 1: 1661
Number of sentences in Subset 2: 1809
Number of sentences in Subset 3: 1556
Number of sentences in Subset 4: 1352
Number of sentences in Subset 5: 1467


In [29]:
#average sentence length in each subset
# Function to calculate average sentence length in a subset
def average_sentence_length(subset_sentences):
    total_words = sum(len(sentence.split()) for sentence in subset_sentences)
    return total_words / len(subset_sentences)

# Calculate and print average sentence length for each subset
for i, subset in enumerate(subsets_Poe):
    # Join subset words back into a string
    subset_text = ' '.join(subset)
    # Split subset text into sentences
    subset_sentences = split_into_sentences(subset_text)
    avg_length = average_sentence_length(subset_sentences)
    print(f"Average sentence length in Subset {i + 1}: {avg_length:.3f} words")


Average sentence length in Subset 1: 23.632 words
Average sentence length in Subset 2: 21.698 words
Average sentence length in Subset 3: 25.226 words
Average sentence length in Subset 4: 29.033 words
Average sentence length in Subset 5: 26.757 words


## Sheridan Le Fanu

In [87]:
#Sheridan Le Fanu
file_path_Fanu = 'Full Corpus - Sheridan Le Fanu.txt'

with open(file_path_Fanu, 'r') as file:
    text_Fanu = file.read()

preprocessed_text_Fanu = preprocess_text(text_Fanu)

num_words_after_preprocess_Faun = count_words(preprocessed_text_Fanu)

print("Number of words after preprocessing:", num_words_after_preprocess_Faun)

Number of words after preprocessing: 152056


In [12]:
num_sentences_Fanu = count_sentences(text_Fanu)
print("Number of sentences:", num_sentences_Fanu)

Number of sentences: 6285


In [13]:
average_sentence_length_Fanu = num_words_after_preprocess_Faun / num_sentences_Fanu
print("Average sentence length:", average_sentence_length_Fanu)

Average sentence length: 24.193476531424025


In [14]:
average_word_length_Fanu = average_word_length(preprocessed_text_Fanu)
print("Average word length:", average_word_length_Fanu)

Average word length: 1.0


In [15]:
# Split the text by whitespace to get individual words
words_Fanu = text_Fanu.split()

# Count the number of words
num_words_Fanu = len(words_Fanu)

print("Number of words in the Sheridan Le Fanu corpus:", num_words_Fanu)


Number of words in the Sheridan Le Fanu corpus: 148482


In [16]:
# Calculate the number of words in each subset
subset_size = num_words_Fanu // 5

# Create 5 subsets (a list of subsets which each subset is list of words)
subsets_Fanu = []
for i in range(5):
    subset_start = i * subset_size
    subset_end = (i + 1) * subset_size
    subset = words_Fanu[subset_start:subset_end]
    subsets_Fanu.append(subset)

# Print the size of each subset
for i, subset in enumerate(subsets_Fanu):
    print(f"Subset {i + 1} size:", len(subset))


Subset 1 size: 29696
Subset 2 size: 29696
Subset 3 size: 29696
Subset 4 size: 29696
Subset 5 size: 29696


In [17]:
# Calculate and print average word length for each subset
for i, subset in enumerate(subsets_Fanu):
    avg_length = average_word_length(subset)
    print(f"Average word length in Subset {i + 1}: {avg_length:.3f}")


Average word length in Subset 1: 4.654
Average word length in Subset 2: 4.462
Average word length in Subset 3: 4.515
Average word length in Subset 4: 4.574
Average word length in Subset 5: 4.507


In [39]:
# Calculate the number of sentences in each subset
for i, subset in enumerate(subsets_Fanu):
    # Join subset words back into a string
    subset_text = ' '.join(subset)
    # Split subset text into sentences
    subset_sentences = split_into_sentences(subset_text)
    print(f"Number of sentences in Subset {i + 1}: {len(subset_sentences)}")


Number of sentences in Subset 1: 951
Number of sentences in Subset 2: 1245
Number of sentences in Subset 3: 1144
Number of sentences in Subset 4: 1014
Number of sentences in Subset 5: 1134


In [44]:
# Calculate and print average sentence length for each subset
for i, subset in enumerate(subsets_Fanu):
    # Join subset words back into a string
    subset_text = ' '.join(subset)
    # Split subset text into sentences
    subset_sentences = split_into_sentences(subset_text)
    avg_length = average_sentence_length(subset_sentences)
    print(f"Average sentence length in Subset {i + 1}: {avg_length:.3f} words")


Average sentence length in Subset 1: 31.226 words
Average sentence length in Subset 2: 23.852 words
Average sentence length in Subset 3: 25.958 words
Average sentence length in Subset 4: 29.286 words
Average sentence length in Subset 5: 26.187 words


## Gothic corpus

In [88]:
# Gothic corpus
file_path_Gothic = 'Full Gothic corpus.txt'

with open(file_path_Gothic, 'r') as file:
    text_Gothic = file.read()

preprocessed_text_Gothic = preprocess_text(text_Gothic)

num_words_after_preprocess_Gothic = count_words(preprocessed_text_Gothic)

print("Number of words after preprocessing:", num_words_after_preprocess_Gothic)

Number of words after preprocessing: 3005036


In [19]:
num_sentences_Gothic = count_sentences(text_Gothic)
print("Number of sentences:", num_sentences_Gothic)

Number of sentences: 120849


In [20]:
average_sentence_length_Gothic = num_words_after_preprocess_Gothic / num_sentences_Gothic
print("Average sentence length:", average_sentence_length_Gothic)

Average sentence length: 24.866039437645327


In [21]:
average_word_length_Gothic = average_word_length(preprocessed_text_Gothic)
print("Average word length:", average_word_length_Gothic)

Average word length: 1.0


In [22]:
# Split the text by whitespace to get individual words
words_Gothic = text_Gothic.split()

# Count the number of words
num_words_Gothic = len(words_Gothic)

print("Number of words in the Gothic corpus:", num_words_Gothic)


Number of words in the Gothic corpus: 2943122


In [23]:
#subset for the comparison with Allan Poe

# Calculate the number of words in each subset
subset_size = 39252

# Create 5 subsets (a list of subsets which each subset is list of words)
subsets_Gothic_vs_Poe = []
for i in range(25):
    subset_start = i * subset_size
    subset_end = (i + 1) * subset_size
    subset = words_Gothic[subset_start:subset_end]
    subsets_Gothic_vs_Poe.append(subset)

# Print the size of each subset
for i, subset in enumerate(subsets_Gothic_vs_Poe):
    print(f"Subset {i + 1} size:", len(subset))


Subset 1 size: 39252
Subset 2 size: 39252
Subset 3 size: 39252
Subset 4 size: 39252
Subset 5 size: 39252
Subset 6 size: 39252
Subset 7 size: 39252
Subset 8 size: 39252
Subset 9 size: 39252
Subset 10 size: 39252
Subset 11 size: 39252
Subset 12 size: 39252
Subset 13 size: 39252
Subset 14 size: 39252
Subset 15 size: 39252
Subset 16 size: 39252
Subset 17 size: 39252
Subset 18 size: 39252
Subset 19 size: 39252
Subset 20 size: 39252
Subset 21 size: 39252
Subset 22 size: 39252
Subset 23 size: 39252
Subset 24 size: 39252
Subset 25 size: 39252


In [24]:
# Calculate and print average word length for each subset
for i, subset in enumerate(subsets_Gothic_vs_Poe):
    avg_length = average_word_length(subset)
    print(f"Average word length in Subset {i + 1}: {avg_length:.3f}")


Average word length in Subset 1: 4.598
Average word length in Subset 2: 4.538
Average word length in Subset 3: 4.517
Average word length in Subset 4: 4.654
Average word length in Subset 5: 4.651
Average word length in Subset 6: 4.635
Average word length in Subset 7: 4.682
Average word length in Subset 8: 4.638
Average word length in Subset 9: 4.516
Average word length in Subset 10: 4.456
Average word length in Subset 11: 4.501
Average word length in Subset 12: 4.580
Average word length in Subset 13: 4.468
Average word length in Subset 14: 4.444
Average word length in Subset 15: 4.460
Average word length in Subset 16: 4.407
Average word length in Subset 17: 4.571
Average word length in Subset 18: 4.500
Average word length in Subset 19: 4.475
Average word length in Subset 20: 4.499
Average word length in Subset 21: 4.486
Average word length in Subset 22: 4.468
Average word length in Subset 23: 4.562
Average word length in Subset 24: 4.450
Average word length in Subset 25: 4.560


In [31]:
# Calculate the number of sentences in each subset
for i, subset in enumerate(subsets_Gothic_vs_Poe):
    # Join subset words back into a string
    subset_text = ' '.join(subset)
    # Split subset text into sentences
    subset_sentences = split_into_sentences(subset_text)
    print(f"Number of sentences in Subset {i + 1}: {len(subset_sentences)}")


Number of sentences in Subset 1: 1838
Number of sentences in Subset 2: 1941
Number of sentences in Subset 3: 1941
Number of sentences in Subset 4: 1672
Number of sentences in Subset 5: 1754
Number of sentences in Subset 6: 1759
Number of sentences in Subset 7: 1614
Number of sentences in Subset 8: 2036
Number of sentences in Subset 9: 1389
Number of sentences in Subset 10: 1339
Number of sentences in Subset 11: 1286
Number of sentences in Subset 12: 1223
Number of sentences in Subset 13: 1263
Number of sentences in Subset 14: 1218
Number of sentences in Subset 15: 1199
Number of sentences in Subset 16: 1001
Number of sentences in Subset 17: 1672
Number of sentences in Subset 18: 2011
Number of sentences in Subset 19: 1892
Number of sentences in Subset 20: 2143
Number of sentences in Subset 21: 1947
Number of sentences in Subset 22: 1926
Number of sentences in Subset 23: 1740
Number of sentences in Subset 24: 1598
Number of sentences in Subset 25: 1261


In [32]:
# Calculate and print average sentence length for each subset
for i, subset in enumerate(subsets_Gothic_vs_Poe):
    # Join subset words back into a string
    subset_text = ' '.join(subset)
    # Split subset text into sentences
    subset_sentences = split_into_sentences(subset_text)
    avg_length = average_sentence_length(subset_sentences)
    print(f"Average sentence length in Subset {i + 1}: {avg_length:.3f} words")


Average sentence length in Subset 1: 21.356 words
Average sentence length in Subset 2: 20.223 words
Average sentence length in Subset 3: 20.223 words
Average sentence length in Subset 4: 23.476 words
Average sentence length in Subset 5: 22.379 words
Average sentence length in Subset 6: 22.315 words
Average sentence length in Subset 7: 24.320 words
Average sentence length in Subset 8: 19.279 words
Average sentence length in Subset 9: 28.259 words
Average sentence length in Subset 10: 29.314 words
Average sentence length in Subset 11: 30.523 words
Average sentence length in Subset 12: 32.095 words
Average sentence length in Subset 13: 31.078 words
Average sentence length in Subset 14: 32.227 words
Average sentence length in Subset 15: 32.737 words
Average sentence length in Subset 16: 39.213 words
Average sentence length in Subset 17: 23.476 words
Average sentence length in Subset 18: 19.519 words
Average sentence length in Subset 19: 20.746 words
Average sentence length in Subset 20: 18

In [25]:
#subset for the comparison with Fanu
# Calculate the number of words in each subset
subset_size = 29696

# Create 5 subsets (a list of subsets which each subset is list of words)
subsets_Gothic_vs_Fanu = []
for i in range(25):
    subset_start = i * subset_size
    subset_end = (i + 1) * subset_size
    subset = words_Gothic[subset_start:subset_end]
    subsets_Gothic_vs_Fanu.append(subset)

# Print the size of each subset
for i, subset in enumerate(subsets_Gothic_vs_Fanu):
    print(f"Subset {i + 1} size:", len(subset))


Subset 1 size: 29696
Subset 2 size: 29696
Subset 3 size: 29696
Subset 4 size: 29696
Subset 5 size: 29696
Subset 6 size: 29696
Subset 7 size: 29696
Subset 8 size: 29696
Subset 9 size: 29696
Subset 10 size: 29696
Subset 11 size: 29696
Subset 12 size: 29696
Subset 13 size: 29696
Subset 14 size: 29696
Subset 15 size: 29696
Subset 16 size: 29696
Subset 17 size: 29696
Subset 18 size: 29696
Subset 19 size: 29696
Subset 20 size: 29696
Subset 21 size: 29696
Subset 22 size: 29696
Subset 23 size: 29696
Subset 24 size: 29696
Subset 25 size: 29696


In [26]:
# Calculate and print average word length for each subset
for i, subset in enumerate(subsets_Gothic_vs_Fanu):
    avg_length = average_word_length(subset)
    print(f"Average word length in Subset {i + 1}: {avg_length:.3f}")


Average word length in Subset 1: 4.591
Average word length in Subset 2: 4.544
Average word length in Subset 3: 4.537
Average word length in Subset 4: 4.538
Average word length in Subset 5: 4.645
Average word length in Subset 6: 4.666
Average word length in Subset 7: 4.650
Average word length in Subset 8: 4.633
Average word length in Subset 9: 4.672
Average word length in Subset 10: 4.665
Average word length in Subset 11: 4.562
Average word length in Subset 12: 4.530
Average word length in Subset 13: 4.467
Average word length in Subset 14: 4.435
Average word length in Subset 15: 4.587
Average word length in Subset 16: 4.555
Average word length in Subset 17: 4.460
Average word length in Subset 18: 4.465
Average word length in Subset 19: 4.444
Average word length in Subset 20: 4.445
Average word length in Subset 21: 4.403
Average word length in Subset 22: 4.565
Average word length in Subset 23: 4.520
Average word length in Subset 24: 4.503
Average word length in Subset 25: 4.492


In [30]:
# Calculate the number of sentences in each subset
for i, subset in enumerate(subsets_Gothic_vs_Fanu):
    # Join subset words back into a string
    subset_text = ' '.join(subset)
    # Split subset text into sentences
    subset_sentences = split_into_sentences(subset_text)
    print(f"Number of sentences in Subset {i + 1}: {len(subset_sentences)}")


Number of sentences in Subset 1: 1308
Number of sentences in Subset 2: 1567
Number of sentences in Subset 3: 1399
Number of sentences in Subset 4: 1500
Number of sentences in Subset 5: 1282
Number of sentences in Subset 6: 1334
Number of sentences in Subset 7: 1278
Number of sentences in Subset 8: 1305
Number of sentences in Subset 9: 1240
Number of sentences in Subset 10: 1425
Number of sentences in Subset 11: 1413
Number of sentences in Subset 12: 1008
Number of sentences in Subset 13: 1043
Number of sentences in Subset 14: 967
Number of sentences in Subset 15: 1009
Number of sentences in Subset 16: 877
Number of sentences in Subset 17: 973
Number of sentences in Subset 18: 921
Number of sentences in Subset 19: 867
Number of sentences in Subset 20: 899
Number of sentences in Subset 21: 767
Number of sentences in Subset 22: 1124
Number of sentences in Subset 23: 1469
Number of sentences in Subset 24: 1492
Number of sentences in Subset 25: 1414


In [33]:
# Calculate and print average sentence length for each subset
for i, subset in enumerate(subsets_Gothic_vs_Fanu):
    # Join subset words back into a string
    subset_text = ' '.join(subset)
    # Split subset text into sentences
    subset_sentences = split_into_sentences(subset_text)
    avg_length = average_sentence_length(subset_sentences)
    print(f"Average sentence length in Subset {i + 1}: {avg_length:.3f} words")


Average sentence length in Subset 1: 22.703 words
Average sentence length in Subset 2: 18.951 words
Average sentence length in Subset 3: 21.227 words
Average sentence length in Subset 4: 19.797 words
Average sentence length in Subset 5: 23.164 words
Average sentence length in Subset 6: 22.261 words
Average sentence length in Subset 7: 23.236 words
Average sentence length in Subset 8: 22.756 words
Average sentence length in Subset 9: 23.948 words
Average sentence length in Subset 10: 20.839 words
Average sentence length in Subset 11: 21.016 words
Average sentence length in Subset 12: 29.460 words
Average sentence length in Subset 13: 28.472 words
Average sentence length in Subset 14: 30.709 words
Average sentence length in Subset 15: 29.431 words
Average sentence length in Subset 16: 33.861 words
Average sentence length in Subset 17: 30.520 words
Average sentence length in Subset 18: 32.243 words
Average sentence length in Subset 19: 34.251 words
Average sentence length in Subset 20: 33

# finding number of samples required

# Poe

In [162]:
words = preprocessed_text_Poe.split()
pilot_poe = ' '.join(words[:1000])

In [197]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = pilot_poe

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
verb_count = Counter(tag for word, tag in tagged_words if tag.startswith('VB'))

# Calculate the total number of verbs
total_verbs = sum(verb_count.values())

# Calculate the total number of words
total_words = len(words)

# Calculate the mean score
mean_score = total_verbs / total_words if total_words > 0 else 0

# Calculate the standard deviation
verb_frequencies = np.array(list(verb_count.values()))
variance = ((total_verbs*((1-mean_score)**2))+((total_words-total_verbs)*((0-mean_score)**2)))/(total_words-1)
std_deviation = np.sqrt(variance)

print("Mean score of verbs in the entire corpus:", mean_score)
print("Standard deviation of verb frequency:", std_deviation)


Mean score of verbs in the entire corpus: 0.15
Standard deviation of verb frequency: 0.3572500911513216


In [198]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 0.1575
CI min: 0.1425
CI: 0.015000000000000013


In [204]:
t = 1.96
n = (std_deviation**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 8716.343009676326


In [205]:
8716.343009676326 < 199831

True

In [206]:
#nouns
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = pilot_poe

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
noun_count = Counter(tag for word, tag in tagged_words if tag.startswith('NN'))

# Calculate the total number of verbs
total_nouns = sum(noun_count.values())

# Calculate the total number of words
total_words = len(words)

# Calculate the mean score
mean_score = total_nouns / total_words if total_words > 0 else 0

# Calculate the standard deviation
verb_frequencies = np.array(list(noun_count.values()))
variance = ((total_nouns*((1-mean_score)**2))+((total_words-total_nouns)*((0-mean_score)**2)))/(total_words-1)
std_deviation = np.sqrt(variance)

print("Mean score of nouns in the entire corpus:", mean_score)
print("Standard deviation of noun frequency:", std_deviation)


Mean score of nouns in the entire corpus: 0.251
Standard deviation of noun frequency: 0.43380547159664457


In [207]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 0.26355
CI min: 0.23845
CI: 0.02510000000000001


In [208]:
t = 1.96
n = (std_deviation**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 4590.021734882289


In [209]:
4590.021734882289 < 199831

True

In [210]:
#adj
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = pilot_poe

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
adj_count = Counter(tag for word, tag in tagged_words if tag.startswith('JJ'))

# Calculate the total number of verbs
total_adjs = sum(adj_count.values())

# Calculate the total number of words
total_words = len(words)

# Calculate the mean score
mean_score = total_adjs / total_words if total_words > 0 else 0

# Calculate the standard deviation
verb_frequencies = np.array(list(noun_count.values()))
variance = ((total_adjs*((1-mean_score)**2))+((total_words-total_adjs)*((0-mean_score)**2)))/(total_words-1)
std_deviation = np.sqrt(variance)

print("Mean score of adjs in the entire corpus:", mean_score)
print("Standard deviation of adj frequency:", std_deviation)


Mean score of adjs in the entire corpus: 0.086
Standard deviation of adj frequency: 0.2805043362992499


In [211]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 0.09029999999999999
CI min: 0.0817
CI: 0.008599999999999997


In [212]:
t = 1.96
n = (std_deviation**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 16347.614591335538


In [213]:
16347.614591335538 < 199831

True

In [214]:
#adv
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = pilot_poe

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
adv_count = Counter(tag for word, tag in tagged_words if tag.startswith('RB'))

# Calculate the total number of verbs
total_advs = sum(adv_count.values())

# Calculate the total number of words
total_words = len(words)

# Calculate the mean score
mean_score = total_advs / total_words if total_words > 0 else 0

# Calculate the standard deviation
verb_frequencies = np.array(list(adv_count.values()))
variance = ((total_advs*((1-mean_score)**2))+((total_words-total_advs)*((0-mean_score)**2)))/(total_words-1)
std_deviation = np.sqrt(variance)

print("Mean score of advs in the entire corpus:", mean_score)
print("Standard deviation of adv frequency:", std_deviation)


Mean score of advs in the entire corpus: 0.073
Standard deviation of adv frequency: 0.2602666685127751


In [215]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 0.07665
CI min: 0.06935
CI: 0.007300000000000001


In [216]:
t = 1.96
n = (std_deviation**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 19532.755769468095


In [217]:
19532.755769468095 < 199831

True

In [50]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = preprocessed_text_Poe

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
verb_count = Counter(tag for word, tag in tagged_words if tag.startswith('VB'))

# Calculate the mean score
total_words = len(words)
mean_score = (verb_count['VB'] / total_words) * 1000 if 'VB' in verb_count else 0

# Calculate the standard deviation for 1000 words
std_deviation_1000 = np.sqrt(variance * (1000 / total_words))

print("Mean score of verbs per 1000 words:", mean_score)
print("Standard deviation of verb frequency per 1000 words:", std_deviation_1000)


Mean score of verbs per 1000 words: 29.89526149596409
Standard deviation of verb frequency per 1000 words: 460.64053711217315


In [51]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 31.390024570762293
CI min: 28.400498421165885
CI: 2.989526149596408


In [66]:
t =   1.282
n = (std_deviation_1000**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 156083.01169825267


In [67]:
156083.01169825267 < 199831

True

In [68]:
# noun
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = preprocessed_text_Poe

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
nouns_count = Counter(tag for word, tag in tagged_words if tag.startswith('NN'))

# Calculate the mean score
total_words = len(words)
mean_score = (nouns_count['NN'] / total_words) * 1000 if 'NN' in nouns_count else 0

# Calculate the standard deviation for 1000 words
std_deviation_1000 = np.sqrt(variance * (1000 / total_words))

print("Mean score of nouns per 1000 words:", mean_score)
print("Standard deviation of noun frequency per 1000 words:", std_deviation_1000)


Mean score of nouns per 1000 words: 159.60486611186454
Standard deviation of noun frequency per 1000 words: 460.64053711217315


In [43]:
nouns_count

Counter({'NNP': 5229, 'NN': 31894, 'NNS': 7827, 'NNPS': 31})

In [69]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 167.58510941745777
CI min: 151.6246228062713
CI: 15.960486611186468


In [74]:
t =  3.291
n = (std_deviation_1000**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 36086.75302766323


In [75]:
36086.75302766323 < 199831

True

In [76]:
# adj
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = preprocessed_text_Poe

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
adjs_count = Counter(tag for word, tag in tagged_words if tag.startswith('JJ'))

# Calculate the mean score
total_words = len(words)
mean_score = (adjs_count['JJ'] / total_words) * 1000 if 'JJ' in adjs_count else 0

# Calculate the standard deviation for 1000 words
std_deviation_1000 = np.sqrt(variance * (1000 / total_words))

print("Mean score of nouns per 1000 words:", mean_score)
print("Standard deviation of noun frequency per 1000 words:", std_deviation_1000)


Mean score of nouns per 1000 words: 70.51958905274958
Standard deviation of noun frequency per 1000 words: 460.64053711217315


In [77]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 74.04556850538705
CI min: 66.99360960011211
CI: 7.051958905274944


In [78]:
t =  3.291
n = (std_deviation_1000**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 184850.45254368294


In [79]:
184850.45254368294 < 199831

True

In [80]:
# adv
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = preprocessed_text_Poe

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
advs_count = Counter(tag for word, tag in tagged_words if tag.startswith('RB'))

# Calculate the mean score
total_words = len(words)
mean_score = (advs_count['RB'] / total_words) * 1000 if 'RB' in advs_count else 0

# Calculate the standard deviation for 1000 words
std_deviation_1000 = np.sqrt(variance * (1000 / total_words))

print("Mean score of nouns per 1000 words:", mean_score)
print("Standard deviation of noun frequency per 1000 words:", std_deviation_1000)


Mean score of nouns per 1000 words: 60.716305277959876
Standard deviation of noun frequency per 1000 words: 460.64053711217315


In [81]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 63.75212054185787
CI min: 57.68049001406188
CI: 6.071630527795989


In [85]:
t =  2.576
n = (std_deviation_1000**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 152779.5915911607


In [86]:
152779.5915911607 < 199831

True

# Fanu

In [218]:
words = preprocessed_text_Fanu.split()
pilot_Fanu = ' '.join(words[:1000])

In [219]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = pilot_Fanu

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
verb_count = Counter(tag for word, tag in tagged_words if tag.startswith('VB'))

# Calculate the total number of verbs
total_verbs = sum(verb_count.values())

# Calculate the total number of words
total_words = len(words)

# Calculate the mean score
mean_score = total_verbs / total_words if total_words > 0 else 0

# Calculate the standard deviation
verb_frequencies = np.array(list(verb_count.values()))
variance = ((total_verbs*((1-mean_score)**2))+((total_words-total_verbs)*((0-mean_score)**2)))/(total_words-1)
std_deviation = np.sqrt(variance)

print("Mean score of verbs in the entire corpus:", mean_score)
print("Standard deviation of verb frequency:", std_deviation)


Mean score of verbs in the entire corpus: 0.158
Standard deviation of verb frequency: 0.3649235113954281


In [220]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 0.1659
CI min: 0.1501
CI: 0.01579999999999998


In [221]:
t = 1.96
n = (std_deviation**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 8197.126747000182


In [222]:
8197.126747000182 < 152056

True

In [223]:
#nouns
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = pilot_Fanu

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
noun_count = Counter(tag for word, tag in tagged_words if tag.startswith('NN'))

# Calculate the total number of verbs
total_nouns = sum(noun_count.values())

# Calculate the total number of words
total_words = len(words)

# Calculate the mean score
mean_score = total_nouns / total_words if total_words > 0 else 0

# Calculate the standard deviation
verb_frequencies = np.array(list(noun_count.values()))
variance = ((total_nouns*((1-mean_score)**2))+((total_words-total_nouns)*((0-mean_score)**2)))/(total_words-1)
std_deviation = np.sqrt(variance)

print("Mean score of nouns in the entire corpus:", mean_score)
print("Standard deviation of noun frequency:", std_deviation)


Mean score of nouns in the entire corpus: 0.231
Standard deviation of noun frequency: 0.42168331341993703


In [224]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 0.24255000000000002
CI min: 0.21945
CI: 0.02310000000000001


In [225]:
t = 1.96
n = (std_deviation**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 5120.6018139351445


In [226]:
5120.6018139351445 < 152056

True

In [227]:
#adj
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = pilot_Fanu

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
adj_count = Counter(tag for word, tag in tagged_words if tag.startswith('JJ'))

# Calculate the total number of verbs
total_adjs = sum(adj_count.values())

# Calculate the total number of words
total_words = len(words)

# Calculate the mean score
mean_score = total_adjs / total_words if total_words > 0 else 0

# Calculate the standard deviation
verb_frequencies = np.array(list(noun_count.values()))
variance = ((total_adjs*((1-mean_score)**2))+((total_words-total_adjs)*((0-mean_score)**2)))/(total_words-1)
std_deviation = np.sqrt(variance)

print("Mean score of adjs in the entire corpus:", mean_score)
print("Standard deviation of adj frequency:", std_deviation)


Mean score of adjs in the entire corpus: 0.107
Standard deviation of adj frequency: 0.3092679204939411


In [228]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 0.11235
CI min: 0.10164999999999999
CI: 0.010700000000000015


In [229]:
t = 1.96
n = (std_deviation**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 12837.318814141208


In [230]:
12837.318814141208 < 152056

True

In [231]:
#adv
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = pilot_Fanu

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
adv_count = Counter(tag for word, tag in tagged_words if tag.startswith('RB'))

# Calculate the total number of verbs
total_advs = sum(adv_count.values())

# Calculate the total number of words
total_words = len(words)

# Calculate the mean score
mean_score = total_advs / total_words if total_words > 0 else 0

# Calculate the standard deviation
verb_frequencies = np.array(list(adv_count.values()))
variance = ((total_advs*((1-mean_score)**2))+((total_words-total_advs)*((0-mean_score)**2)))/(total_words-1)
std_deviation = np.sqrt(variance)

print("Mean score of advs in the entire corpus:", mean_score)
print("Standard deviation of adv frequency:", std_deviation)


Mean score of advs in the entire corpus: 0.062
Standard deviation of adv frequency: 0.24127621974453722


In [232]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 0.0651
CI min: 0.0589
CI: 0.006200000000000004


In [233]:
t = 1.96
n = (std_deviation**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 23271.14727630853


In [234]:
23271.14727630853 < 152056

True

# Gothic

In [235]:
words = preprocessed_text_Gothic.split()
pilot_Gothic = ' '.join(words[:1000])

In [236]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = pilot_Gothic

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
verb_count = Counter(tag for word, tag in tagged_words if tag.startswith('VB'))

# Calculate the total number of verbs
total_verbs = sum(verb_count.values())

# Calculate the total number of words
total_words = len(words)

# Calculate the mean score
mean_score = total_verbs / total_words if total_words > 0 else 0

# Calculate the standard deviation
verb_frequencies = np.array(list(verb_count.values()))
variance = ((total_verbs*((1-mean_score)**2))+((total_words-total_verbs)*((0-mean_score)**2)))/(total_words-1)
std_deviation = np.sqrt(variance)

print("Mean score of verbs in the entire corpus:", mean_score)
print("Standard deviation of verb frequency:", std_deviation)


Mean score of verbs in the entire corpus: 0.164
Standard deviation of verb frequency: 0.370460849809047


In [237]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 0.17220000000000002
CI min: 0.1558
CI: 0.016400000000000026


In [238]:
t = 1.96
n = (std_deviation**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 7840.957054615566


In [239]:
7840.957054615566 < 3005036

True

In [240]:
#nouns
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = pilot_Gothic

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
noun_count = Counter(tag for word, tag in tagged_words if tag.startswith('NN'))

# Calculate the total number of verbs
total_nouns = sum(noun_count.values())

# Calculate the total number of words
total_words = len(words)

# Calculate the mean score
mean_score = total_nouns / total_words if total_words > 0 else 0

# Calculate the standard deviation
verb_frequencies = np.array(list(noun_count.values()))
variance = ((total_nouns*((1-mean_score)**2))+((total_words-total_nouns)*((0-mean_score)**2)))/(total_words-1)
std_deviation = np.sqrt(variance)

print("Mean score of nouns in the entire corpus:", mean_score)
print("Standard deviation of noun frequency:", std_deviation)


Mean score of nouns in the entire corpus: 0.27
Standard deviation of noun frequency: 0.44418160396092193


In [241]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 0.28350000000000003
CI min: 0.2565
CI: 0.027000000000000024


In [242]:
t = 1.96
n = (std_deviation**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 4158.778037296549


In [243]:
4158.778037296549 < 3005036

True

In [244]:
#adj
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = pilot_Gothic

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
adj_count = Counter(tag for word, tag in tagged_words if tag.startswith('JJ'))

# Calculate the total number of verbs
total_adjs = sum(adj_count.values())

# Calculate the total number of words
total_words = len(words)

# Calculate the mean score
mean_score = total_adjs / total_words if total_words > 0 else 0

# Calculate the standard deviation
verb_frequencies = np.array(list(noun_count.values()))
variance = ((total_adjs*((1-mean_score)**2))+((total_words-total_adjs)*((0-mean_score)**2)))/(total_words-1)
std_deviation = np.sqrt(variance)

print("Mean score of adjs in the entire corpus:", mean_score)
print("Standard deviation of adj frequency:", std_deviation)


Mean score of adjs in the entire corpus: 0.08
Standard deviation of adj frequency: 0.27142894774447635


In [245]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 0.084
CI min: 0.076
CI: 0.008000000000000007


In [246]:
t = 1.96
n = (std_deviation**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 17689.049049049016


In [247]:
17689.049049049016 < 3005036

True

In [248]:
#adv
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter
import numpy as np

# Assuming you have your preprocessed_text_Poe stored as a string
corpus = pilot_Gothic

# Tokenize the text into words
words = word_tokenize(corpus)

# Tag each word with its part of speech
tagged_words = pos_tag(words)

# Count occurrences of verbs
adv_count = Counter(tag for word, tag in tagged_words if tag.startswith('RB'))

# Calculate the total number of verbs
total_advs = sum(adv_count.values())

# Calculate the total number of words
total_words = len(words)

# Calculate the mean score
mean_score = total_advs / total_words if total_words > 0 else 0

# Calculate the standard deviation
verb_frequencies = np.array(list(adv_count.values()))
variance = ((total_advs*((1-mean_score)**2))+((total_words-total_advs)*((0-mean_score)**2)))/(total_words-1)
std_deviation = np.sqrt(variance)

print("Mean score of advs in the entire corpus:", mean_score)
print("Standard deviation of adv frequency:", std_deviation)


Mean score of advs in the entire corpus: 0.05
Standard deviation of adv frequency: 0.21805400144814482


In [249]:
CI_Max = mean_score + (5/100*mean_score)
CI_Min = mean_score - (5/100*mean_score)
CI = CI_Max - CI_Min
print("CI max:", CI_Max)
print("CI min:", CI_Min)
print("CI:", CI)

CI max: 0.052500000000000005
CI min: 0.0475
CI: 0.0050000000000000044


In [250]:
t = 1.96
n = (std_deviation**2)/((0.5*CI)/t)**2
print("required sample size:", n)

required sample size: 29225.38538538533


In [251]:
29225.38538538533 < 3005036

True

## POS Frequency classes

In [31]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from collections import Counter
import pandas as pd
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def pos_frequency(text):
    #tokenize into words
    words = word_tokenize(text)
    # Tag each word with its part of speech
    tagged_words = pos_tag(words)
    # POS classes
    pos_classes = [tag[1] for tag in tagged_words]
    # Count the frequency of each POS class
    pos_freq = Counter(pos_classes)
    #total number of POS tags
    total_pos_tags = sum(pos_freq.values())
    #relative frequency of each POS class
    relative_freq = {pos_class: freq / total_pos_tags for pos_class, freq in pos_freq.items()}
    return pos_freq, relative_freq

[nltk_data] Downloading package punkt to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Allan Poe

In [32]:
pos_freq_Poe, relative_freq_Poe = pos_frequency(preprocessed_text_Poe)

POS_Poe = pd.DataFrame({"POS Class": list(pos_freq_Poe.keys()),
                   "Raw Frequency": list(pos_freq_Poe.values()),
                   "Relative Frequency": list(relative_freq_Poe.values())})

POS_Poe = POS_Poe.sort_values(by='Relative Frequency', ascending=False)

In [33]:
POS_Poe

Unnamed: 0,POS Class,Raw Frequency,Relative Frequency
4,NN,31894,0.159605
8,IN,30575,0.153004
2,DT,24809,0.12415
10,JJ,14092,0.07052
1,VBD,12328,0.061692
12,RB,12133,0.060716
7,PRP,12019,0.060146
5,CC,8003,0.040049
9,NNS,7827,0.039168
13,VBN,6559,0.032823


In [34]:
# Filter out rows where POS Class is not "NOUN"
filtered_df = POS_Poe[POS_Poe['POS Class'] != 'NN']

# Sum the values in the Raw Frequency column of the filtered DataFrame
sum_except_noun = filtered_df['Raw Frequency'].sum()

print("Sum of Raw Frequency except NOUN:", sum_except_noun)


Sum of Raw Frequency except NOUN: 167937


In [27]:
#spacey version
import spacy
from collections import Counter
import pandas as pd

# Load the English language model in SpaCy
nlp = spacy.load("en_core_web_sm")

# Increase the max_length limit
nlp.max_length = 1500000  # or any value that suits your text length

def pos_frequency_spacy(text):
    # Process the text with SpaCy
    doc = nlp(text)
    # Get POS tags and count their frequencies
    pos_freq = Counter(token.pos_ for token in doc)
    # Calculate total number of POS tags
    total_pos_tags = sum(pos_freq.values())
    # Calculate relative frequency of each POS class
    relative_freq = {pos: freq / total_pos_tags for pos, freq in pos_freq.items()}
    return pos_freq, relative_freq

# Example text
pos_freq, relative_freq = pos_frequency_spacy(preprocessed_text_Poe)

# Create a DataFrame to store the results
POS_df = pd.DataFrame({"POS Class": list(pos_freq.keys()),
                       "Raw Frequency": list(pos_freq.values()),
                       "Relative Frequency": list(relative_freq.values())})

# Sort the DataFrame by relative frequency in descending order
POS_df = POS_df.sort_values(by='Relative Frequency', ascending=False)



In [28]:
POS_df

Unnamed: 0,POS Class,Raw Frequency,Relative Frequency
1,NOUN,38550,0.192167
7,ADP,27952,0.139337
2,DET,24074,0.120006
4,VERB,23264,0.115968
0,PRON,21731,0.108326
11,ADJ,14759,0.073572
12,ADV,12427,0.061947
8,AUX,11953,0.059584
5,CCONJ,8091,0.040333
6,SCONJ,5837,0.029097


In [29]:
# Filter out rows where POS Class is not "NOUN"
filtered_df = POS_df[POS_df['POS Class'] != 'NOUN']

# Sum the values in the Raw Frequency column of the filtered DataFrame
sum_except_noun = filtered_df['Raw Frequency'].sum()

print("Sum of Raw Frequency except NOUN:", sum_except_noun)


Sum of Raw Frequency except NOUN: 162057


## Sheridan Le Fanu

In [126]:
pos_freq_Fanu, relative_freq_Fanu = pos_frequency(preprocessed_text_Fanu)

POS_Fanu = pd.DataFrame({"POS Class": list(pos_freq_Fanu.keys()),
                   "Raw Frequency": list(pos_freq_Fanu.values()),
                   "Relative Frequency": list(relative_freq_Fanu.values())})

POS_Fanu = POS_Fanu.sort_values(by='Relative Frequency', ascending=False)

In [127]:
POS_Fanu

Unnamed: 0,POS Class,Raw Frequency,Relative Frequency
0,NN,24101,0.158501
1,IN,19136,0.125848
5,DT,16228,0.106724
2,PRP,11827,0.077781
23,VBD,11188,0.073578
14,JJ,11000,0.072342
4,RB,8567,0.056341
9,CC,7945,0.05225
12,NNP,5648,0.037144
8,VB,4901,0.032232


## Gothic corpus

In [128]:
pos_freq_Gothic, relative_freq_Gothic = pos_frequency(preprocessed_text_Gothic)

POS_Gothic = pd.DataFrame({"POS Class": list(pos_freq_Gothic.keys()),
                   "Raw Frequency": list(pos_freq_Gothic.values()),
                   "Relative Frequency": list(relative_freq_Gothic.values())})

POS_Gothic = POS_Gothic.sort_values(by='Relative Frequency', ascending=False)

In [129]:
POS_Gothic

Unnamed: 0,POS Class,Raw Frequency,Relative Frequency
7,NN,425028,0.141439
5,IN,373937,0.124437
6,DT,288298,0.095938
1,PRP,286779,0.095433
16,VBD,210191,0.069946
10,JJ,185413,0.061701
3,RB,173611,0.057773
15,CC,128019,0.042601
12,VB,125237,0.041676
14,NNP,124532,0.041441


# Preposition/Subordinating conjunction 

In [34]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')

def IN_finder(text):
    words = word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    IN = [word for word, tag in tagged_words if tag.startswith('IN')]
    return IN

def IN_counter(text):
    words = word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    IN = [word for word, tag in tagged_words if tag.startswith('IN')]
    return len(IN)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Poe

In [27]:
n_IN_Poe = IN_counter(preprocessed_text_Poe)
print('Number of IN:', n_IN_Poe)

Number of IN: 30575


In [28]:
rel_freq_IN_Poe = rel_frequency(30575, 199831)
print("realative frequency of IN:", rel_freq_IN_Poe)

realative frequency of IN: 0.15300428862388718


In [35]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import re

# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Function to split text into sentences
def split_into_sentences(text):
    # Use regular expression to split text into sentences
    # Assumes sentences end with '.', '!', or '?' followed by a space or end of line
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return sentences

# Function to find prepositions in text
def IN_finder(text):
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    IN = [word for word, tag in tagged_words if tag.startswith('IN')]
    return IN

# Read the Poe corpus text from file
file_path_Poe = 'Allan Poe - Full Corpus.txt'
with open(file_path_Poe, 'r') as file:
    text_Poe = file.read()

# Split the text into sentences
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of prepositions in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    prepositions = IN_finder(subset_text)
    print(f"Number of prepositions in Subset {i + 1}: {len(prepositions)}")


[nltk_data] Downloading package punkt to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


1568
Number of prepositions in Subset 1: 5580
Number of prepositions in Subset 2: 5301
Number of prepositions in Subset 3: 5501
Number of prepositions in Subset 4: 7196
Number of prepositions in Subset 5: 6518


# Sheridan Le Fanu

In [29]:
n_IN_Fanu = IN_counter(preprocessed_text_Fanu)
print('Number of IN:', n_IN_Fanu)

Number of IN: 19136


In [30]:
rel_freq_IN_Fanu = rel_frequency(19136, 152056)
print("realative frequency of IN:", rel_freq_IN_Fanu)

realative frequency of IN: 0.12584837165254906


In [56]:
# Split the text into sentences
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of prepositions in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    prepositions = IN_finder(subset_text)
    print(f"Number of prepositions in Subset {i + 1}: {len(prepositions)}")


1097
Number of prepositions in Subset 1: 4512
Number of prepositions in Subset 2: 2966
Number of prepositions in Subset 3: 3490
Number of prepositions in Subset 4: 4091
Number of prepositions in Subset 5: 3801


# Gothic corpus

In [31]:
n_IN_Gothic = IN_counter(preprocessed_text_Gothic)
print('Number of IN:', n_IN_Gothic)

Number of IN: 373937


In [32]:
rel_freq_IN_Gothic = rel_frequency(373937, 3005036)
print("realative frequency of IN:", rel_freq_IN_Gothic)

realative frequency of IN: 0.12443677879399781


In [36]:
# Gothic vs Poe

sentences = split_into_sentences(text_Gothic)
subset_size = 1568


# Find the number of prepositions in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    prepositions = IN_finder(subset_text)
    print(f"Number of prepositions in Subset {i + 1}: {len(prepositions)}")


Number of prepositions in Subset 1: 3845
Number of prepositions in Subset 2: 3437
Number of prepositions in Subset 3: 3342
Number of prepositions in Subset 4: 3535
Number of prepositions in Subset 5: 4425
Number of prepositions in Subset 6: 3992
Number of prepositions in Subset 7: 3937
Number of prepositions in Subset 8: 4171
Number of prepositions in Subset 9: 3272
Number of prepositions in Subset 10: 4955
Number of prepositions in Subset 11: 5121
Number of prepositions in Subset 12: 5655
Number of prepositions in Subset 13: 6399
Number of prepositions in Subset 14: 6004
Number of prepositions in Subset 15: 6426
Number of prepositions in Subset 16: 6959
Number of prepositions in Subset 17: 4174
Number of prepositions in Subset 18: 3925
Number of prepositions in Subset 19: 4361
Number of prepositions in Subset 20: 3806
Number of prepositions in Subset 21: 3550
Number of prepositions in Subset 22: 4573
Number of prepositions in Subset 23: 4421
Number of prepositions in Subset 24: 4867
N

In [37]:
# Gothic vs Fanu

sentences = split_into_sentences(text_Gothic)
subset_size = 1097


# Find the number of prepositions in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    prepositions = IN_finder(subset_text)
    print(f"Number of prepositions in Subset {i + 1}: {len(prepositions)}")


Number of prepositions in Subset 1: 2871
Number of prepositions in Subset 2: 2190
Number of prepositions in Subset 3: 2624
Number of prepositions in Subset 4: 2439
Number of prepositions in Subset 5: 2201
Number of prepositions in Subset 6: 2629
Number of prepositions in Subset 7: 3240
Number of prepositions in Subset 8: 2614
Number of prepositions in Subset 9: 2792
Number of prepositions in Subset 10: 2889
Number of prepositions in Subset 11: 2921
Number of prepositions in Subset 12: 2571
Number of prepositions in Subset 13: 2192
Number of prepositions in Subset 14: 3527
Number of prepositions in Subset 15: 3549
Number of prepositions in Subset 16: 3887
Number of prepositions in Subset 17: 3988
Number of prepositions in Subset 18: 4766
Number of prepositions in Subset 19: 3971
Number of prepositions in Subset 20: 4142
Number of prepositions in Subset 21: 4680
Number of prepositions in Subset 22: 4382
Number of prepositions in Subset 23: 4804
Number of prepositions in Subset 24: 3013
N

# Adj

In [38]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')

def adjectives_finder(text):
    words = word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    adjectives = [word for word, tag in tagged_words if tag.startswith('JJ')]
    return adjectives

def adjectives_counter(text):
    words = word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    adjectives = [word for word, tag in tagged_words if tag.startswith('JJ')]
    return len(adjectives)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Allan Poe

In [62]:
n_adj_Poe = adjectives_counter(preprocessed_text_Poe)
print('Number of adjectives:', n_adj_Poe)

Number of adjectives: 15089


In [68]:
# move it up
def rel_frequency (n_POS, n_words):
    return n_POS/n_words

In [52]:
rel_freq_adj_Poe = rel_frequency(15089, 199831)
print("realative frequency of adjective:", rel_freq_adj_Poe)

realative frequency of adjective: 0.07550880494017445


In [75]:
def adjectives_counter_spacy(text):
    # Split the text into smaller chunks
    chunk_size = 100000  # Adjust the chunk size as needed
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    
    adjectives = []
    for chunk in chunks:
        # Process each chunk with spaCy
        doc = nlp(chunk)
        # Extract adjectives from the chunk
        adjectives.extend([token.text for token in doc if token.pos_ == 'ADJ'])
    
    return len(adjectives)

In [76]:
#spacey mode
adjectives_counter_spacy(preprocessed_text_Poe)

14761

In [77]:
rel_freq_adj_Poe = rel_frequency(14761, 199831)
print("realative frequency of adjective:", rel_freq_adj_Poe)

realative frequency of adjective: 0.07386741796818312


In [63]:
# Split the text into sentences
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of prepositions in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = adjectives_finder(subset_text)
    print(f"Number of adjectives in Subset {i + 1}: {len(adjectives)}")

1568
Number of adjectives in Subset 1: 2621
Number of adjectives in Subset 2: 3025
Number of adjectives in Subset 3: 2971
Number of adjectives in Subset 4: 3296
Number of adjectives in Subset 5: 3627


# Sheridan Le Fanu

In [45]:
n_adj_Fanu = adjectives_counter(preprocessed_text_Fanu)
print('Number of adjectives:', n_adj_Fanu)

Number of adjectives: 11652


In [54]:
rel_freq_adj_Fanu = rel_frequency(11652, 152056)
print("realative frequency of adjective:", rel_freq_adj_Fanu)

realative frequency of adjective: 0.0766296627558268


In [78]:
#spacey mode
adjectives_counter_spacy(preprocessed_text_Fanu)

11438

In [79]:
rel_freq_adj_Fanu = rel_frequency(11438, 152056)
print("realative frequency of adjective:", rel_freq_adj_Fanu)

realative frequency of adjective: 0.07522228652601673


In [64]:
# Split the text into sentences
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of prepositions in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = adjectives_finder(subset_text)
    print(f"Number of adjectives in Subset {i + 1}: {len(adjectives)}")

1097
Number of adjectives in Subset 1: 2749
Number of adjectives in Subset 2: 1767
Number of adjectives in Subset 3: 1954
Number of adjectives in Subset 4: 2586
Number of adjectives in Subset 5: 2373


# Gothic corpus

In [85]:
n_adj_Gothic = adjectives_counter(preprocessed_text_Gothic)
print('Number of adjectives:', n_adj_Gothic)

Number of adjectives: 198201


In [86]:
rel_freq_adj_Gothic = rel_frequency(198201, 3005036)
print("realative frequency of adjective:", rel_freq_adj_Gothic)

realative frequency of adjective: 0.06595628138897504


In [87]:
#spacey mode
adjectives_counter_spacy(preprocessed_text_Gothic)

190307

In [88]:
rel_freq_adj_Gothicothic = rel_frequency(190307, 3005036)
print("realative frequency of adjective:", rel_freq_adj_Gothic)

realative frequency of adjective: 0.06595628138897504


In [39]:
# Gothic vs Poe

sentences = split_into_sentences(text_Gothic)
subset_size = 1568


# Find the number of prepositions in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = adjectives_finder(subset_text)
    print(f"Number of adjectives in Subset {i + 1}: {len(adjectives)}")

Number of adjectives in Subset 1: 2637
Number of adjectives in Subset 2: 2322
Number of adjectives in Subset 3: 2541
Number of adjectives in Subset 4: 2542
Number of adjectives in Subset 5: 3427
Number of adjectives in Subset 6: 3190
Number of adjectives in Subset 7: 3029
Number of adjectives in Subset 8: 3411
Number of adjectives in Subset 9: 2473
Number of adjectives in Subset 10: 2574
Number of adjectives in Subset 11: 2807
Number of adjectives in Subset 12: 3081
Number of adjectives in Subset 13: 3158
Number of adjectives in Subset 14: 2963
Number of adjectives in Subset 15: 3043
Number of adjectives in Subset 16: 3433
Number of adjectives in Subset 17: 2550
Number of adjectives in Subset 18: 2183
Number of adjectives in Subset 19: 2545
Number of adjectives in Subset 20: 2060
Number of adjectives in Subset 21: 1972
Number of adjectives in Subset 22: 2318
Number of adjectives in Subset 23: 2179
Number of adjectives in Subset 24: 2509
Number of adjectives in Subset 25: 3061


In [40]:
# Gothic vs Poe

sentences = split_into_sentences(text_Gothic)
subset_size = 1097


# Find the number of prepositions in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = adjectives_finder(subset_text)
    print(f"Number of adjectives in Subset {i + 1}: {len(adjectives)}")

Number of adjectives in Subset 1: 1933
Number of adjectives in Subset 2: 1542
Number of adjectives in Subset 3: 1765
Number of adjectives in Subset 4: 1871
Number of adjectives in Subset 5: 1565
Number of adjectives in Subset 6: 1971
Number of adjectives in Subset 7: 2477
Number of adjectives in Subset 8: 2091
Number of adjectives in Subset 9: 2246
Number of adjectives in Subset 10: 2211
Number of adjectives in Subset 11: 2364
Number of adjectives in Subset 12: 2042
Number of adjectives in Subset 13: 1670
Number of adjectives in Subset 14: 1798
Number of adjectives in Subset 15: 1939
Number of adjectives in Subset 16: 2068
Number of adjectives in Subset 17: 2162
Number of adjectives in Subset 18: 2373
Number of adjectives in Subset 19: 2003
Number of adjectives in Subset 20: 2031
Number of adjectives in Subset 21: 2183
Number of adjectives in Subset 22: 2164
Number of adjectives in Subset 23: 2446
Number of adjectives in Subset 24: 1833
Number of adjectives in Subset 25: 1525


# Adj -ly

In [41]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')

def adjectives_counter_with_ly_filter(text):
    words = word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    adjectives_ly = [word for word, tag in tagged_words if tag.startswith('JJ') and word.endswith('ly')]
    return len(adjectives_ly)

def adjectives_with_ly_filter(text):
    words = word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    adjectives_ly = [word for word, tag in tagged_words if tag.startswith('JJ') and word.endswith('ly')]
    return (adjectives_ly)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Allan Poe

In [59]:
count_adjectives_ly_Poe = adjectives_counter_with_ly_filter(preprocessed_text_Poe)
print("Number of adjectives ending with '-ly':", count_adjectives_ly_Poe)


Number of adjectives ending with '-ly': 196


In [60]:
rel_freq_adj_ly_Poe = rel_frequency(196, 199831)
print("realative frequency of adjectives end with -ly:", rel_freq_adj_ly_Poe)

realative frequency of adjectives end with -ly: 0.0009808288003362842


In [70]:
# Split the text into sentences
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of prepositions in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = adjectives_with_ly_filter(subset_text)
    print(f"Number of adjectives in Subset {i + 1}: {len(adjectives)}")

1568
Number of adjectives in Subset 1: 19
Number of adjectives in Subset 2: 42
Number of adjectives in Subset 3: 31
Number of adjectives in Subset 4: 44
Number of adjectives in Subset 5: 39


# Sheridan Le Fanu

In [61]:
count_adjectives_ly_Fanu = adjectives_counter_with_ly_filter(preprocessed_text_Fanu)
print("Number of adjectives ending with '-ly':", count_adjectives_ly_Fanu)

Number of adjectives ending with '-ly': 198


In [62]:
rel_freq_adj_ly_Fanu = rel_frequency(198, 152056)
print("realative frequency of adjective end with -ly:", rel_freq_adj_ly_Fanu)

realative frequency of adjective end with -ly: 0.0013021518387962329


In [71]:
# Split the text into sentences
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of prepositions in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = adjectives_with_ly_filter(subset_text)
    print(f"Number of adjectives in Subset {i + 1}: {len(adjectives)}")

1097
Number of adjectives in Subset 1: 43
Number of adjectives in Subset 2: 41
Number of adjectives in Subset 3: 30
Number of adjectives in Subset 4: 33
Number of adjectives in Subset 5: 56


# Gothic corpus

In [63]:
count_adjectives_ly_Gothic = adjectives_counter_with_ly_filter(preprocessed_text_Gothic)
print("Number of adjectives ending with '-ly':", count_adjectives_ly_Gothic)

Number of adjectives ending with '-ly': 3224


In [64]:
rel_freq_adj_ly_Gothic = rel_frequency(3224, 3005036)
print("realative frequency of adjective end with -ly:", rel_freq_adj_ly_Gothic)

realative frequency of adjective end with -ly: 0.0010728656828071278


In [43]:
# Gothic vs Poe

sentences = split_into_sentences(text_Gothic)
subset_size = 1568


# Find the number of prepositions in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = adjectives_with_ly_filter(subset_text)
    print(f"Number of adjectives in Subset {i + 1}: {len(adjectives)}")

Number of adjectives in Subset 1: 28
Number of adjectives in Subset 2: 35
Number of adjectives in Subset 3: 32
Number of adjectives in Subset 4: 27
Number of adjectives in Subset 5: 35
Number of adjectives in Subset 6: 30
Number of adjectives in Subset 7: 36
Number of adjectives in Subset 8: 37
Number of adjectives in Subset 9: 21
Number of adjectives in Subset 10: 53
Number of adjectives in Subset 11: 32
Number of adjectives in Subset 12: 68
Number of adjectives in Subset 13: 51
Number of adjectives in Subset 14: 61
Number of adjectives in Subset 15: 40
Number of adjectives in Subset 16: 45
Number of adjectives in Subset 17: 40
Number of adjectives in Subset 18: 39
Number of adjectives in Subset 19: 49
Number of adjectives in Subset 20: 34
Number of adjectives in Subset 21: 35
Number of adjectives in Subset 22: 52
Number of adjectives in Subset 23: 37
Number of adjectives in Subset 24: 48
Number of adjectives in Subset 25: 38


In [44]:
# Gothic vs Fanu

sentences = split_into_sentences(text_Gothic)
subset_size = 1097


# Find the number of prepositions in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = adjectives_with_ly_filter(subset_text)
    print(f"Number of adjectives in Subset {i + 1}: {len(adjectives)}")

Number of adjectives in Subset 1: 25
Number of adjectives in Subset 2: 10
Number of adjectives in Subset 3: 30
Number of adjectives in Subset 4: 26
Number of adjectives in Subset 5: 19
Number of adjectives in Subset 6: 21
Number of adjectives in Subset 7: 24
Number of adjectives in Subset 8: 15
Number of adjectives in Subset 9: 29
Number of adjectives in Subset 10: 24
Number of adjectives in Subset 11: 24
Number of adjectives in Subset 12: 21
Number of adjectives in Subset 13: 14
Number of adjectives in Subset 14: 41
Number of adjectives in Subset 15: 30
Number of adjectives in Subset 16: 22
Number of adjectives in Subset 17: 47
Number of adjectives in Subset 18: 50
Number of adjectives in Subset 19: 36
Number of adjectives in Subset 20: 38
Number of adjectives in Subset 21: 27
Number of adjectives in Subset 22: 27
Number of adjectives in Subset 23: 35
Number of adjectives in Subset 24: 32
Number of adjectives in Subset 25: 23


# Adjective before noun

In [45]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def count_adj_before_noun(text):
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    count = 0
    total_nouns = 0
    for i in range(1, len(tagged_words)):
        if tagged_words[i][1] in ['NN', 'NNS', 'NNP', 'NNPS']:
            total_nouns += 1
            if tagged_words[i - 1][1] == 'JJ':
                count += 1
    if total_nouns == 0:
        return 0
    else:
        #return (count / total_nouns), count  # Return relative frequency
        print("Total number of adjectives before nouns:", count)
        print("Relative frequency of adjectives before nouns:", (count / total_nouns))


[nltk_data] Downloading package punkt to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [47]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def count_adj_before_noun2(text):
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    count = 0
    total_nouns = 0
    for i in range(1, len(tagged_words)):
        if tagged_words[i][1] in ['NN', 'NNS', 'NNP', 'NNPS']:
            total_nouns += 1
            if tagged_words[i - 1][1] == 'JJ':
                count += 1
    if total_nouns == 0:
        return 0
    else:
        return count


[nltk_data] Downloading package punkt to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Allan Poe

In [78]:
count_adj_before_noun(preprocessed_text_Poe)

Total number of adjectives before nouns: 8934
Relative frequency of adjectives before nouns: 0.19861719392632446


In [81]:
# Split the text into sentences
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of adjectives before nouns in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives_count = count_adj_before_noun2(subset_text)
    print(f"Number of adjectives before nouns in Subset {i + 1}: {adjectives_count}")


1568
Number of adjectives before nouns in Subset 1: 1531
Number of adjectives before nouns in Subset 2: 1792
Number of adjectives before nouns in Subset 3: 1725
Number of adjectives before nouns in Subset 4: 1878
Number of adjectives before nouns in Subset 5: 2220


# Sheridan Le Fanu

In [75]:
count_adj_before_noun(preprocessed_text_Fanu)

Total number of adjectives before nouns: 6796
Relative frequency of adjectives before nouns: 0.19800134020918916


In [82]:
# Split the text into sentences
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of adjectives before nouns in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives_count = count_adj_before_noun2(subset_text)
    print(f"Number of adjectives before nouns in Subset {i + 1}: {adjectives_count}")


1097
Number of adjectives before nouns in Subset 1: 1625
Number of adjectives before nouns in Subset 2: 977
Number of adjectives before nouns in Subset 3: 1117
Number of adjectives before nouns in Subset 4: 1375
Number of adjectives before nouns in Subset 5: 1418


# Gothic corpus

In [76]:
count_adj_before_noun(preprocessed_text_Gothic)

Total number of adjectives before nouns: 112205
Relative frequency of adjectives before nouns: 0.17382837614040741


In [48]:
#Gothic vs Poe
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1568

# Find the number of adjectives before nouns in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives_count = count_adj_before_noun2(subset_text)
    print(f"Number of adjectives before nouns in Subset {i + 1}: {adjectives_count}")


Number of adjectives before nouns in Subset 1: 1488
Number of adjectives before nouns in Subset 2: 1319
Number of adjectives before nouns in Subset 3: 1378
Number of adjectives before nouns in Subset 4: 1450
Number of adjectives before nouns in Subset 5: 1877
Number of adjectives before nouns in Subset 6: 1729
Number of adjectives before nouns in Subset 7: 1693
Number of adjectives before nouns in Subset 8: 1857
Number of adjectives before nouns in Subset 9: 1374
Number of adjectives before nouns in Subset 10: 1355
Number of adjectives before nouns in Subset 11: 1410
Number of adjectives before nouns in Subset 12: 1494
Number of adjectives before nouns in Subset 13: 1589
Number of adjectives before nouns in Subset 14: 1427
Number of adjectives before nouns in Subset 15: 1435
Number of adjectives before nouns in Subset 16: 1757
Number of adjectives before nouns in Subset 17: 1474
Number of adjectives before nouns in Subset 18: 1273
Number of adjectives before nouns in Subset 19: 1500
Nu

In [49]:
#Gothic vs Fanu
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1097

# Find the number of adjectives before nouns in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives_count = count_adj_before_noun2(subset_text)
    print(f"Number of adjectives before nouns in Subset {i + 1}: {adjectives_count}")


Number of adjectives before nouns in Subset 1: 1108
Number of adjectives before nouns in Subset 2: 859
Number of adjectives before nouns in Subset 3: 1000
Number of adjectives before nouns in Subset 4: 994
Number of adjectives before nouns in Subset 5: 902
Number of adjectives before nouns in Subset 6: 1107
Number of adjectives before nouns in Subset 7: 1352
Number of adjectives before nouns in Subset 8: 1128
Number of adjectives before nouns in Subset 9: 1258
Number of adjectives before nouns in Subset 10: 1214
Number of adjectives before nouns in Subset 11: 1271
Number of adjectives before nouns in Subset 12: 1139
Number of adjectives before nouns in Subset 13: 920
Number of adjectives before nouns in Subset 14: 951
Number of adjectives before nouns in Subset 15: 994
Number of adjectives before nouns in Subset 16: 1015
Number of adjectives before nouns in Subset 17: 1069
Number of adjectives before nouns in Subset 18: 1228
Number of adjectives before nouns in Subset 19: 936
Number of

# Noun

In [50]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')

def noun_finder(text):
    words = word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    nouns = [word for word, tag in tagged_words if tag.startswith('NN')]
    return nouns

def noun_counter(text):
    words = word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    nouns = [word for word, tag in tagged_words if tag.startswith('NN')]
    return len(nouns)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Poe

In [19]:
n_nouns_Poe = noun_counter(preprocessed_text_Poe)
print('Number of nouns:', n_nouns_Poe)

Number of nouns: 44981


In [21]:
rel_freq_noun_Poe = rel_frequency(44981, 199831)
print("realative frequency of verb:", rel_freq_noun_Poe)

realative frequency of verb: 0.22509520544860406


In [87]:
# Split the text into sentences
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of prepositions in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = noun_finder(subset_text)
    print(f"Number of nouns in Subset {i + 1}: {len(adjectives)}")

1568
Number of adjectives in Subset 1: 9045
Number of adjectives in Subset 2: 8679
Number of adjectives in Subset 3: 7960
Number of adjectives in Subset 4: 9811
Number of adjectives in Subset 5: 10421


# Le Fanu

In [22]:
n_nouns_Fanu = noun_counter(preprocessed_text_Fanu)
print('Number of nouns:', n_nouns_Fanu)

Number of verbs: 34324


In [23]:
rel_freq_nouns_Fanu = rel_frequency(34324, 152056)
print("realative frequency of nouns:", rel_freq_nouns_Fanu)

realative frequency of nouns: 0.22573262482243384


In [88]:
# Split the text into sentences
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of prepositions in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = noun_finder(subset_text)
    print(f"Number of nouns in Subset {i + 1}: {len(adjectives)}")

1097
Number of adjectives in Subset 1: 7820
Number of adjectives in Subset 2: 5917
Number of adjectives in Subset 3: 6400
Number of adjectives in Subset 4: 6376
Number of adjectives in Subset 5: 6756


# Gothic corpus

In [24]:
n_nouns_Gothic = noun_counter(preprocessed_text_Gothic)
print('Number of nouns:', n_nouns_Gothic)

Number of nouns: 645493


In [25]:
rel_freq_nouns_Gothic = rel_frequency(645493, 3005036)
print("realative frequency of nouns:", rel_freq_nouns_Gothic)

realative frequency of nouns: 0.2148037494392746


In [51]:
# Gothic vs Poe
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1568

# Find the number of prepositions in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = noun_finder(subset_text)
    print(f"Number of nouns in Subset {i + 1}: {len(adjectives)}")

Number of nouns in Subset 1: 8749
Number of nouns in Subset 2: 7489
Number of nouns in Subset 3: 7772
Number of nouns in Subset 4: 7921
Number of nouns in Subset 5: 9659
Number of nouns in Subset 6: 8807
Number of nouns in Subset 7: 9157
Number of nouns in Subset 8: 9657
Number of nouns in Subset 9: 7778
Number of nouns in Subset 10: 8284
Number of nouns in Subset 11: 9323
Number of nouns in Subset 12: 9585
Number of nouns in Subset 13: 10368
Number of nouns in Subset 14: 9378
Number of nouns in Subset 15: 9950
Number of nouns in Subset 16: 11056
Number of nouns in Subset 17: 7525
Number of nouns in Subset 18: 7597
Number of nouns in Subset 19: 8106
Number of nouns in Subset 20: 7411
Number of nouns in Subset 21: 6958
Number of nouns in Subset 22: 8394
Number of nouns in Subset 23: 7469
Number of nouns in Subset 24: 8658
Number of nouns in Subset 25: 8124


In [52]:
# Gothic vs Fanu
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1097

# Find the number of prepositions in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = noun_finder(subset_text)
    print(f"Number of nouns in Subset {i + 1}: {len(adjectives)}")

Number of nouns in Subset 1: 6462
Number of nouns in Subset 2: 5119
Number of nouns in Subset 3: 5560
Number of nouns in Subset 4: 5618
Number of nouns in Subset 5: 5056
Number of nouns in Subset 6: 5750
Number of nouns in Subset 7: 7013
Number of nouns in Subset 8: 5988
Number of nouns in Subset 9: 6314
Number of nouns in Subset 10: 6630
Number of nouns in Subset 11: 6652
Number of nouns in Subset 12: 6172
Number of nouns in Subset 13: 5295
Number of nouns in Subset 14: 5787
Number of nouns in Subset 15: 6444
Number of nouns in Subset 16: 6614
Number of nouns in Subset 17: 6705
Number of nouns in Subset 18: 7486
Number of nouns in Subset 19: 6603
Number of nouns in Subset 20: 6528
Number of nouns in Subset 21: 7233
Number of nouns in Subset 22: 6740
Number of nouns in Subset 23: 7821
Number of nouns in Subset 24: 5479
Number of nouns in Subset 25: 5226


# Verb

In [53]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')

def verbs_finder(text):
    words = word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    verbs = [word for word, tag in tagged_words if tag.startswith('VB')]
    return verbs

def verbs_counter(text):
    words = word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    verbs = [word for word, tag in tagged_words if tag.startswith('VB')]
    return len(verbs)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
def verbs_counter_spacy(text):
    # Split the text into smaller chunks
    chunk_size = 100000  # Adjust the chunk size as needed
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
    
    adjectives = []
    for chunk in chunks:
        # Process each chunk with spaCy
        doc = nlp(chunk)
        # Extract adjectives from the chunk
        adjectives.extend([token.text for token in doc if token.pos_ == 'ADJ'])
    
    return len(adjectives)

# Allan Poe

In [57]:
n_verbs_Poe = verbs_counter(preprocessed_text_Poe)
print('Number of verbs:', n_verbs_Poe)

Number of adjectives: 32906


In [60]:
rel_freq_verb_Poe = rel_frequency(32906, 199831)
print("realative frequency of verb:", rel_freq_verb_Poe)

realative frequency of verb: 0.16466914542788658


In [92]:
# Split the text into sentences
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of prepositions in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = verbs_finder(subset_text)
    print(f"Number of verbs in Subset {i + 1}: {len(adjectives)}")

1568
Number of verbs in Subset 1: 6472
Number of verbs in Subset 2: 5861
Number of verbs in Subset 3: 6118
Number of verbs in Subset 4: 7552
Number of verbs in Subset 5: 6601


# Sheridan Le Fanu

In [59]:
n_verbs_Fanu = verbs_counter(preprocessed_text_Fanu)
print('Number of verbs:', n_verbs_Fanu)

Number of adjectives: 27124


In [92]:
rel_freq_verb_Fanu = rel_frequency(27124, 152056)
print("realative frequency of verb:", rel_freq_verb_Fanu)

realative frequency of verb: 0.1783816488662072


In [93]:
# Split the text into sentences
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of prepositions in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = verbs_finder(subset_text)
    print(f"Number of verbs in Subset {i + 1}: {len(adjectives)}")

1097
Number of verbs in Subset 1: 5808
Number of verbs in Subset 2: 4605
Number of verbs in Subset 3: 5069
Number of verbs in Subset 4: 5996
Number of verbs in Subset 5: 4869


# Gothic corpus

In [89]:
n_verbs_Gothic = verbs_counter(preprocessed_text_Gothic)
print('Number of verbs:', n_verbs_Gothic)

Number of verbs: 571994


In [93]:
rel_freq_verb_Gothic = rel_frequency(571994, 3005036)
print("realative frequency of verb:", rel_freq_verb_Gothic)

realative frequency of verb: 0.19034514062393929


In [54]:
# Poe vs Gothic
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1568

# Find the number of prepositions in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = verbs_finder(subset_text)
    print(f"Number of verbs in Subset {i + 1}: {len(adjectives)}")

Number of verbs in Subset 1: 7429
Number of verbs in Subset 2: 6866
Number of verbs in Subset 3: 7280
Number of verbs in Subset 4: 7028
Number of verbs in Subset 5: 7048
Number of verbs in Subset 6: 6501
Number of verbs in Subset 7: 6839
Number of verbs in Subset 8: 7219
Number of verbs in Subset 9: 5891
Number of verbs in Subset 10: 7838
Number of verbs in Subset 11: 8716
Number of verbs in Subset 12: 9250
Number of verbs in Subset 13: 9666
Number of verbs in Subset 14: 9909
Number of verbs in Subset 15: 10452
Number of verbs in Subset 16: 10915
Number of verbs in Subset 17: 5650
Number of verbs in Subset 18: 6065
Number of verbs in Subset 19: 6107
Number of verbs in Subset 20: 5941
Number of verbs in Subset 21: 5333
Number of verbs in Subset 22: 6457
Number of verbs in Subset 23: 6045
Number of verbs in Subset 24: 6498
Number of verbs in Subset 25: 7459


In [55]:
# Fanu vs Gothic
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1097

# Find the number of prepositions in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = verbs_finder(subset_text)
    print(f"Number of verbs in Subset {i + 1}: {len(adjectives)}")

Number of verbs in Subset 1: 5429
Number of verbs in Subset 2: 4596
Number of verbs in Subset 3: 5004
Number of verbs in Subset 4: 5289
Number of verbs in Subset 5: 4864
Number of verbs in Subset 6: 4711
Number of verbs in Subset 7: 5146
Number of verbs in Subset 8: 4477
Number of verbs in Subset 9: 4518
Number of verbs in Subset 10: 4934
Number of verbs in Subset 11: 5022
Number of verbs in Subset 12: 4655
Number of verbs in Subset 13: 3912
Number of verbs in Subset 14: 5460
Number of verbs in Subset 15: 5824
Number of verbs in Subset 16: 6719
Number of verbs in Subset 17: 6338
Number of verbs in Subset 18: 6985
Number of verbs in Subset 19: 6643
Number of verbs in Subset 20: 6827
Number of verbs in Subset 21: 7499
Number of verbs in Subset 22: 7724
Number of verbs in Subset 23: 6817
Number of verbs in Subset 24: 3912
Number of verbs in Subset 25: 4175


# juxtapose verbs

In [124]:
import spacy

def count_juxtaposed_verbs(text):
    # Load the English model
    nlp = spacy.load("en_core_web_sm")

    # Initialize set for verbs
    verbs_set = set()

    # List of conjunctions, punctuation marks, and space to consider
    conj_punct_space = ['and', 'or', ',']

    # Split text into chunks of maximum length allowed by SpaCy
    max_length = 1000000
    chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]

    # Process each chunk with SpaCy and count juxtaposed verbs
    for chunk in chunks:
        doc = nlp(chunk)
        tagged_words = [(token.text, token.pos_) for token in doc]
        for i in range(len(tagged_words) - 2):
            if tagged_words[i][1] == 'VERB':
                if tagged_words[i + 1][0] in conj_punct_space:
                    if tagged_words[i + 2][1] == 'VERB':
                        verbs_set.update([tagged_words[i][0], tagged_words[i + 2][0]])

    return len(verbs_set)


# Allan Poe

In [66]:
count_juxtaposed_verbs(preprocessed_text_Poe)


450

In [69]:
rel_freq_juxtaposed_verbs_Poe = rel_frequency(450, 32906)
print("realative frequency of juxtaposed verb:", rel_freq_juxtaposed_verbs_Poe)


realative frequency of juxtaposed verb: 0.0136753175712636


In [79]:
import spacy
import re

# Function to split text into sentences
def split_into_sentences(text):
    # Use regular expression to split text into sentences
    # Assumes sentences end with '.', '!', or '?' followed by a space or end of line
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return sentences

# Function to count juxtaposed verbs in a subset of text
def count_juxtaposed_verbs_subset(subset_text):
    # Load the English model
    nlp = spacy.load("en_core_web_sm")

    # Initialize set for verbs
    verbs_set = set()

    # List of conjunctions, punctuation marks, and space to consider
    conj_punct_space = ['and', 'or', ',']

    # Process the subset with SpaCy and count juxtaposed verbs
    doc = nlp(subset_text)
    tagged_words = [(token.text, token.pos_) for token in doc]
    for i in range(len(tagged_words) - 2):
        if tagged_words[i][1] == 'VERB':
            if tagged_words[i + 1][0] in conj_punct_space:
                if tagged_words[i + 2][1] == 'VERB':
                    verbs_set.update([tagged_words[i][0], tagged_words[i + 2][0]])

    return len(verbs_set)


# Split the text into sentences
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5

# Find the number of juxtaposed verbs in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    verbs_count_subset = count_juxtaposed_verbs_subset(subset_text)
    print(f"Number of Juxtaposed Verbs in Subset {i + 1}: {verbs_count_subset}")


Number of Juxtaposed Verbs in Subset 1: 77
Number of Juxtaposed Verbs in Subset 2: 128
Number of Juxtaposed Verbs in Subset 3: 102
Number of Juxtaposed Verbs in Subset 4: 87
Number of Juxtaposed Verbs in Subset 5: 105


# Sheridan Le Fanu

In [70]:
count_juxtaposed_verbs(preprocessed_text_Fanu)


492

In [71]:
rel_freq_juxtaposed_verbs_Fanu = rel_frequency(492, 27124)
print("realative frequency of juxtaposed verb:", rel_freq_juxtaposed_verbs_Fanu)


realative frequency of juxtaposed verb: 0.018138917563781155


In [126]:
# Split the text into sentences
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5

# Find the number of juxtaposed verbs in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    verbs_count_subset = count_juxtaposed_verbs_subset(subset_text)
    print(f"Number of Juxtaposed Verbs in Subset {i + 1}: {verbs_count_subset}")


Number of Juxtaposed Verbs in Subset 1: 124
Number of Juxtaposed Verbs in Subset 2: 82
Number of Juxtaposed Verbs in Subset 3: 98
Number of Juxtaposed Verbs in Subset 4: 147
Number of Juxtaposed Verbs in Subset 5: 125


# Gothic corpus

In [72]:
count_juxtaposed_verbs(preprocessed_text_Gothic)

3373

In [73]:
rel_freq_juxtaposed_verbs_Gothic = rel_frequency(3373, 571994)
print("realative frequency of juxtaposed verb:", rel_freq_juxtaposed_verbs_Gothic)


realative frequency of juxtaposed verb: 0.005896915002604922


In [80]:
#Poe Vs Gothic
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1568

# Find the number of juxtaposed verbs in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    verbs_count_subset = count_juxtaposed_verbs_subset(subset_text)
    print(f"Number of Juxtaposed Verbs in Subset {i + 1}: {verbs_count_subset}")


Number of Juxtaposed Verbs in Subset 1: 162
Number of Juxtaposed Verbs in Subset 2: 133
Number of Juxtaposed Verbs in Subset 3: 147
Number of Juxtaposed Verbs in Subset 4: 163
Number of Juxtaposed Verbs in Subset 5: 162
Number of Juxtaposed Verbs in Subset 6: 144
Number of Juxtaposed Verbs in Subset 7: 181
Number of Juxtaposed Verbs in Subset 8: 199
Number of Juxtaposed Verbs in Subset 9: 144
Number of Juxtaposed Verbs in Subset 10: 102
Number of Juxtaposed Verbs in Subset 11: 89
Number of Juxtaposed Verbs in Subset 12: 96
Number of Juxtaposed Verbs in Subset 13: 94
Number of Juxtaposed Verbs in Subset 14: 104
Number of Juxtaposed Verbs in Subset 15: 136
Number of Juxtaposed Verbs in Subset 16: 134
Number of Juxtaposed Verbs in Subset 17: 95
Number of Juxtaposed Verbs in Subset 18: 106
Number of Juxtaposed Verbs in Subset 19: 98
Number of Juxtaposed Verbs in Subset 20: 82
Number of Juxtaposed Verbs in Subset 21: 81
Number of Juxtaposed Verbs in Subset 22: 81
Number of Juxtaposed Verbs 

In [81]:
#Fanu Vs Gothic
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1097

# Find the number of juxtaposed verbs in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    verbs_count_subset = count_juxtaposed_verbs_subset(subset_text)
    print(f"Number of Juxtaposed Verbs in Subset {i + 1}: {verbs_count_subset}")


Number of Juxtaposed Verbs in Subset 1: 136
Number of Juxtaposed Verbs in Subset 2: 93
Number of Juxtaposed Verbs in Subset 3: 102
Number of Juxtaposed Verbs in Subset 4: 112
Number of Juxtaposed Verbs in Subset 5: 112
Number of Juxtaposed Verbs in Subset 6: 118
Number of Juxtaposed Verbs in Subset 7: 122
Number of Juxtaposed Verbs in Subset 8: 119
Number of Juxtaposed Verbs in Subset 9: 89
Number of Juxtaposed Verbs in Subset 10: 146
Number of Juxtaposed Verbs in Subset 11: 125
Number of Juxtaposed Verbs in Subset 12: 143
Number of Juxtaposed Verbs in Subset 13: 99
Number of Juxtaposed Verbs in Subset 14: 76
Number of Juxtaposed Verbs in Subset 15: 49
Number of Juxtaposed Verbs in Subset 16: 83
Number of Juxtaposed Verbs in Subset 17: 57
Number of Juxtaposed Verbs in Subset 18: 76
Number of Juxtaposed Verbs in Subset 19: 74
Number of Juxtaposed Verbs in Subset 20: 72
Number of Juxtaposed Verbs in Subset 21: 94
Number of Juxtaposed Verbs in Subset 22: 101
Number of Juxtaposed Verbs in 

# Adverb

In [56]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')

def adverbs_finder(text):
    words = word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    adjectives = [word for word, tag in tagged_words if tag.startswith('RB')]
    return adjectives

def adverbs_counter(text):
    words = word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    adjectives = [word for word, tag in tagged_words if tag.startswith('RB')]
    return len(adjectives)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Allan Poe

In [28]:
n_adv_Poe = adverbs_counter(preprocessed_text_Poe)
print('Number of adverbs:', n_adv_Poe)

Number of verbs: 12824


In [31]:
rel_freq_adv_Poe = rel_frequency(12824, 199831)
print("realative frequency of adverbs:", rel_freq_adv_Poe)

realative frequency of verb: 0.0641742272220026


In [97]:
# Split the text into sentences
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of prepositions in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = adverbs_finder(subset_text)
    print(f"Number of adverb in Subset {i + 1}: {len(adjectives)}")

1568
Number of adverb in Subset 1: 2264
Number of adverb in Subset 2: 2276
Number of adverb in Subset 3: 2496
Number of adverb in Subset 4: 3001
Number of adverb in Subset 5: 2734


# Sheridan Le Fanu

In [32]:
n_adv_Fanu = adverbs_counter(preprocessed_text_Fanu)
print('Number of adverbs:', n_adv_Fanu)

Number of verbs: 8966


In [33]:
rel_freq_adv_Fanu = rel_frequency(8966, 152056)
print("realative frequency of adverbs:", rel_freq_adv_Fanu)

realative frequency of adverbs: 0.05896511811437891


In [98]:
# Split the text into sentences
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of prepositions in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = adverbs_finder(subset_text)
    print(f"Number of adverb in Subset {i + 1}: {len(adjectives)}")

1097
Number of adverb in Subset 1: 2161
Number of adverb in Subset 2: 1588
Number of adverb in Subset 3: 1794
Number of adverb in Subset 4: 2155
Number of adverb in Subset 5: 1597


# Gothic corpus

In [34]:
n_adv_Gothic = adverbs_counter(preprocessed_text_Gothic)
print('Number of adverbs:', n_adv_Gothic)

Number of adverbs: 180929


In [35]:
rel_freq_adverb_Gothic = rel_frequency(180929, 3005036)
print("realative frequency of adverb:", rel_freq_adverb_Gothic)

realative frequency of adverb: 0.06020859650267085


In [57]:
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1568

# Find the number of prepositions in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = adverbs_finder(subset_text)
    print(f"Number of adverb in Subset {i + 1}: {len(adjectives)}")

Number of adverb in Subset 1: 2077
Number of adverb in Subset 2: 1841
Number of adverb in Subset 3: 2031
Number of adverb in Subset 4: 2123
Number of adverb in Subset 5: 2407
Number of adverb in Subset 6: 2313
Number of adverb in Subset 7: 2356
Number of adverb in Subset 8: 2363
Number of adverb in Subset 9: 1866
Number of adverb in Subset 10: 2881
Number of adverb in Subset 11: 3250
Number of adverb in Subset 12: 3451
Number of adverb in Subset 13: 3404
Number of adverb in Subset 14: 3840
Number of adverb in Subset 15: 3761
Number of adverb in Subset 16: 3870
Number of adverb in Subset 17: 1815
Number of adverb in Subset 18: 1829
Number of adverb in Subset 19: 2028
Number of adverb in Subset 20: 1732
Number of adverb in Subset 21: 1572
Number of adverb in Subset 22: 1828
Number of adverb in Subset 23: 1711
Number of adverb in Subset 24: 1736
Number of adverb in Subset 25: 2725


In [58]:
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1097

# Find the number of prepositions in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = adverbs_finder(subset_text)
    print(f"Number of adverb in Subset {i + 1}: {len(adjectives)}")

Number of adverb in Subset 1: 1499
Number of adverb in Subset 2: 1230
Number of adverb in Subset 3: 1366
Number of adverb in Subset 4: 1534
Number of adverb in Subset 5: 1336
Number of adverb in Subset 6: 1577
Number of adverb in Subset 7: 1744
Number of adverb in Subset 8: 1498
Number of adverb in Subset 9: 1625
Number of adverb in Subset 10: 1733
Number of adverb in Subset 11: 1675
Number of adverb in Subset 12: 1430
Number of adverb in Subset 13: 1258
Number of adverb in Subset 14: 2046
Number of adverb in Subset 15: 2163
Number of adverb in Subset 16: 2524
Number of adverb in Subset 17: 2367
Number of adverb in Subset 18: 2505
Number of adverb in Subset 19: 2328
Number of adverb in Subset 20: 2705
Number of adverb in Subset 21: 2719
Number of adverb in Subset 22: 2810
Number of adverb in Subset 23: 2342
Number of adverb in Subset 24: 1277
Number of adverb in Subset 25: 1259


In [24]:
#Kruskal–Wallis test
from scipy.stats import kruskal
from nltk.tag import pos_tag


# Count adjectives before nouns for each corpus
Adv_Poe = [adverbs_counter(text) for text in preprocessed_text_Poe]
Adv_Gothic = [adverbs_counter(text) for text in preprocessed_text_Gothic]
Adv_Fanu = [adverbs_counter(text) for text in preprocessed_text_Fanu]

# Perform Kruskal-Wallis H test
statistic, p_value = kruskal(Adv_Poe, Adv_Gothic, Adv_Fanu)

# Print the results
print("Kruskal-Wallis H statistic:", statistic)
print("p-value:", p_value)

# Interpret the results
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis. There is a statistically significant difference in the number of adjectives before nouns among the three corpora.")
else:
    print("Fail to reject the null hypothesis. There is no statistically significant difference in the number of adjectives before nouns among the three corpora.")

Kruskal-Wallis H statistic: 0.0
p-value: 1.0
Fail to reject the null hypothesis. There is no statistically significant difference in the number of adjectives before nouns among the three corpora.


# Content words (Noun, Verb, Adj, Adv)

In [59]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger')

def ContenW_finder(text):
    words = word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    Content_words = [word for word, tag in tagged_words if tag.startswith(('NN', 'JJ', 'RB', 'VB'))]
    return Content_words

def ContenW_counter(text):
    words = word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    Content_words = [word for word, tag in tagged_words if tag.startswith(('NN', 'JJ', 'RB', 'VB'))]
    return len(Content_words)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Allan Poe

In [65]:
n_ContenW_Poe = ContenW_counter(preprocessed_text_Poe)
print('Number of content words:', n_ContenW_Poe)

Number of content words: 105800


In [66]:
rel_freq_ContenW_Poe = rel_frequency(105800, 199831)
print("realative frequency of content words:", rel_freq_ContenW_Poe)

realative frequency of content words: 0.5294473830386677


In [102]:
# Split the text into sentences
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of prepositions in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = ContenW_finder(subset_text)
    print(f"Number of content words in Subset {i + 1}: {len(adjectives)}")

1568
Number of content words in Subset 1: 20402
Number of content words in Subset 2: 19841
Number of content words in Subset 3: 19545
Number of content words in Subset 4: 23660
Number of content words in Subset 5: 23383


# Sheridan Le Fanu

In [67]:
n_ContenW_Fanu = ContenW_counter(preprocessed_text_Fanu)
print('Number of content words:', n_ContenW_Fanu)

Number of content words: 82066


In [68]:
rel_freq_ContenW_Fanu = rel_frequency(82066, 152056)
print("realative frequency of content words:", rel_freq_ContenW_Fanu)

realative frequency of content words: 0.5397090545588468


In [103]:
# Split the text into sentences
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5
print(subset_size)

# Find the number of prepositions in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = ContenW_finder(subset_text)
    print(f"Number of content words in Subset {i + 1}: {len(adjectives)}")

1097
Number of content words in Subset 1: 18538
Number of content words in Subset 2: 13877
Number of content words in Subset 3: 15217
Number of content words in Subset 4: 17113
Number of content words in Subset 5: 15595


# Gothic corpus

In [94]:
n_ContenW_Gothic = ContenW_counter(preprocessed_text_Gothic)
print('Number of content words:', n_ContenW_Gothic)

Number of content words: 1596617


In [95]:
rel_freq_ContenW_Gothic = rel_frequency(1596617, 3005036)
print("realative frequency of content words:", rel_freq_ContenW_Gothic)

realative frequency of content words: 0.5313137679548597


In [60]:
#Poe vs Gothic
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1568

# Find the number of prepositions in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = ContenW_finder(subset_text)
    print(f"Number of content words in Subset {i + 1}: {len(adjectives)}")

Number of content words in Subset 1: 20892
Number of content words in Subset 2: 18518
Number of content words in Subset 3: 19624
Number of content words in Subset 4: 19614
Number of content words in Subset 5: 22541
Number of content words in Subset 6: 20811
Number of content words in Subset 7: 21381
Number of content words in Subset 8: 22650
Number of content words in Subset 9: 18008
Number of content words in Subset 10: 21577
Number of content words in Subset 11: 24096
Number of content words in Subset 12: 25367
Number of content words in Subset 13: 26596
Number of content words in Subset 14: 26090
Number of content words in Subset 15: 27206
Number of content words in Subset 16: 29274
Number of content words in Subset 17: 17540
Number of content words in Subset 18: 17674
Number of content words in Subset 19: 18786
Number of content words in Subset 20: 17144
Number of content words in Subset 21: 15835
Number of content words in Subset 22: 18997
Number of content words in Subset 23: 174

In [61]:
#Fanu vs Gothic
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1097

# Find the number of prepositions in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    adjectives = ContenW_finder(subset_text)
    print(f"Number of content words in Subset {i + 1}: {len(adjectives)}")

Number of content words in Subset 1: 15323
Number of content words in Subset 2: 12487
Number of content words in Subset 3: 13695
Number of content words in Subset 4: 14312
Number of content words in Subset 5: 12821
Number of content words in Subset 6: 14009
Number of content words in Subset 7: 16380
Number of content words in Subset 8: 14054
Number of content words in Subset 9: 14703
Number of content words in Subset 10: 15508
Number of content words in Subset 11: 15713
Number of content words in Subset 12: 14299
Number of content words in Subset 13: 12135
Number of content words in Subset 14: 15091
Number of content words in Subset 15: 16370
Number of content words in Subset 16: 17925
Number of content words in Subset 17: 17572
Number of content words in Subset 18: 19349
Number of content words in Subset 19: 17577
Number of content words in Subset 20: 18091
Number of content words in Subset 21: 19634
Number of content words in Subset 22: 19438
Number of content words in Subset 23: 194

# POV

In [67]:
# Define a function to count the frequency of points of view (POV) in the corpus
def count_pov_frequency(text):
    pov_list = ['first-person', 'second-person', 'third-person']
    pov_frequency = {pov: 0 for pov in pov_list}
    
    # Define keywords for identifying each point of view
    first_person_keywords = ['I', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours']
    second_person_keywords = ['you', 'your', 'yours']
    third_person_keywords = ['he', 'him', 'his', 'she', 'her', 'hers', 'it', 'its', 'they', 'them', 'their', 'theirs']

    # Tokenize the text into words
    words = word_tokenize(text)

    # Count the frequency of each point of view
    for word in words:
        if word.lower() in first_person_keywords:
            pov_frequency['first-person'] += 1
        elif word.lower() in second_person_keywords:
            pov_frequency['second-person'] += 1
        elif word.lower() in third_person_keywords:
            pov_frequency['third-person'] += 1

    return pov_frequency


# Allan Poe

In [70]:
pov_frequency_Poe = count_pov_frequency(preprocessed_text_Poe)

print("Frequency of first-person POV:", pov_frequency_Poe['first-person']/(pov_frequency_Poe['first-person']+pov_frequency_Poe['second-person']+pov_frequency_Poe['third-person']))
print("Frequency of second-person POV:", pov_frequency_Poe['second-person']/(pov_frequency_Poe['first-person']+pov_frequency_Poe['second-person']+pov_frequency_Poe['third-person']))
print("Frequency of third-person POV:", pov_frequency_Poe['third-person']/(pov_frequency_Poe['first-person']+pov_frequency_Poe['second-person']+pov_frequency_Poe['third-person']))

Frequency of first-person POV: 0.3474550441938433
Frequency of second-person POV: 0.03093569033831149
Frequency of third-person POV: 0.6216092654678451


In [68]:
# Define the function to count the frequency of points of view (POV) in a subset
def count_pov_frequency_subset(subset_text):
    pov_list = ['first-person', 'second-person', 'third-person']
    pov_frequency = {pov: 0 for pov in pov_list}
    
    # Define keywords for identifying each point of view
    first_person_keywords = ['I', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours']
    second_person_keywords = ['you', 'your', 'yours']
    third_person_keywords = ['he', 'him', 'his', 'she', 'her', 'hers', 'it', 'its', 'they', 'them', 'their', 'theirs']

    # Tokenize the subset text into words
    words = word_tokenize(subset_text)

    # Count the frequency of each point of view
    for word in words:
        if word.lower() in first_person_keywords:
            pov_frequency['first-person'] += 1
        elif word.lower() in second_person_keywords:
            pov_frequency['second-person'] += 1
        elif word.lower() in third_person_keywords:
            pov_frequency['third-person'] += 1

    return pov_frequency

# Split the text into sentences (assuming you have already defined the function split_into_sentences)
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5
print(subset_size)

# Find the occurrence of each POV in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    pov_frequency_subset = count_pov_frequency_subset(subset_text)
    print(f"Point of View Frequency in Subset {i + 1}: {pov_frequency_subset}")


1568
Point of View Frequency in Subset 1: {'first-person': 458, 'second-person': 156, 'third-person': 1524}
Point of View Frequency in Subset 2: {'first-person': 793, 'second-person': 125, 'third-person': 1430}
Point of View Frequency in Subset 3: {'first-person': 1068, 'second-person': 46, 'third-person': 1485}
Point of View Frequency in Subset 4: {'first-person': 1369, 'second-person': 2, 'third-person': 1776}
Point of View Frequency in Subset 5: {'first-person': 812, 'second-person': 65, 'third-person': 1790}


# Sheridan Le Fanu

In [71]:
pov_frequency_Fanu = count_pov_frequency(preprocessed_text_Fanu)

print("Frequency of first-person POV:", pov_frequency_Fanu['first-person']/(pov_frequency_Fanu['first-person']+pov_frequency_Fanu['second-person']+pov_frequency_Fanu['third-person']))
print("Frequency of second-person POV:", pov_frequency_Fanu['second-person']/(pov_frequency_Fanu['first-person']+pov_frequency_Fanu['second-person']+pov_frequency_Fanu['third-person']))
print("Frequency of third-person POV:", pov_frequency_Fanu['third-person']/(pov_frequency_Fanu['first-person']+pov_frequency_Fanu['second-person']+pov_frequency_Fanu['third-person']))

Frequency of first-person POV: 0.1494061413673233
Frequency of second-person POV: 0.09639339513325608
Frequency of third-person POV: 0.7542004634994206


In [110]:
# Split the text into sentences (assuming you have already defined the function split_into_sentences)
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5
print(subset_size)

# Find the occurrence of each POV in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    pov_frequency_subset = count_pov_frequency_subset(subset_text)
    print(f"Point of View Frequency in Subset {i + 1}: {pov_frequency_subset}")


1097
Point of View Frequency in Subset 1: {'first-person': 464, 'second-person': 199, 'third-person': 2185}
Point of View Frequency in Subset 2: {'first-person': 217, 'second-person': 392, 'third-person': 1732}
Point of View Frequency in Subset 3: {'first-person': 199, 'second-person': 306, 'third-person': 2244}
Point of View Frequency in Subset 4: {'first-person': 934, 'second-person': 310, 'third-person': 2088}
Point of View Frequency in Subset 5: {'first-person': 245, 'second-person': 110, 'third-person': 2149}


# Gothic corpus

In [96]:
pov_frequency_Gothic = count_pov_frequency(preprocessed_text_Gothic)

print("Frequency of first-person POV:", pov_frequency_Gothic['first-person']/(pov_frequency_Gothic['first-person']+pov_frequency_Gothic['second-person']+pov_frequency_Gothic['third-person']))
print("Frequency of second-person POV:", pov_frequency_Gothic['second-person']/(pov_frequency_Gothic['first-person']+pov_frequency_Gothic['second-person']+pov_frequency_Gothic['third-person']))
print("Frequency of third-person POV:", pov_frequency_Gothic['third-person']/(pov_frequency_Gothic['first-person']+pov_frequency_Gothic['second-person']+pov_frequency_Gothic['third-person']))

Frequency of first-person POV: 0.22246757200737388
Frequency of second-person POV: 0.14130474785291058
Frequency of third-person POV: 0.6362276801397155


In [65]:
#Poe vs Gothic
# Split the text into sentences (assuming you have already defined the function split_into_sentences)
sentences = split_into_sentences(text_Gothic)
subset_size = 1568

# Find the occurrence of each POV in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    pov_frequency_subset = count_pov_frequency_subset(subset_text)
    print(f"Point of View Frequency in Subset {i + 1}: {pov_frequency_subset}")


Point of View Frequency in Subset 1: {'first-person': 785, 'second-person': 517, 'third-person': 2604}
Point of View Frequency in Subset 2: {'first-person': 770, 'second-person': 614, 'third-person': 2440}
Point of View Frequency in Subset 3: {'first-person': 783, 'second-person': 782, 'third-person': 2700}
Point of View Frequency in Subset 4: {'first-person': 680, 'second-person': 492, 'third-person': 2785}
Point of View Frequency in Subset 5: {'first-person': 775, 'second-person': 224, 'third-person': 2229}
Point of View Frequency in Subset 6: {'first-person': 762, 'second-person': 481, 'third-person': 2066}
Point of View Frequency in Subset 7: {'first-person': 807, 'second-person': 422, 'third-person': 2251}
Point of View Frequency in Subset 8: {'first-person': 877, 'second-person': 420, 'third-person': 2408}
Point of View Frequency in Subset 9: {'first-person': 692, 'second-person': 305, 'third-person': 2031}
Point of View Frequency in Subset 10: {'first-person': 658, 'second-perso

In [66]:
#Fanu vs Gothic
# Split the text into sentences (assuming you have already defined the function split_into_sentences)
sentences = split_into_sentences(text_Gothic)
subset_size = 1097

# Find the occurrence of each POV in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    pov_frequency_subset = count_pov_frequency_subset(subset_text)
    print(f"Point of View Frequency in Subset {i + 1}: {pov_frequency_subset}")


Point of View Frequency in Subset 1: {'first-person': 593, 'second-person': 362, 'third-person': 1874}
Point of View Frequency in Subset 2: {'first-person': 452, 'second-person': 433, 'third-person': 1653}
Point of View Frequency in Subset 3: {'first-person': 568, 'second-person': 378, 'third-person': 1830}
Point of View Frequency in Subset 4: {'first-person': 586, 'second-person': 543, 'third-person': 1930}
Point of View Frequency in Subset 5: {'first-person': 528, 'second-person': 474, 'third-person': 1883}
Point of View Frequency in Subset 6: {'first-person': 500, 'second-person': 260, 'third-person': 1690}
Point of View Frequency in Subset 7: {'first-person': 505, 'second-person': 160, 'third-person': 1722}
Point of View Frequency in Subset 8: {'first-person': 568, 'second-person': 317, 'third-person': 1284}
Point of View Frequency in Subset 9: {'first-person': 528, 'second-person': 307, 'third-person': 1453}
Point of View Frequency in Subset 10: {'first-person': 532, 'second-perso

# Sentence Structure

In [34]:
import string 
from collections import Counter

# Define a function to split the corpus into sentences using relative punctuation marks
def split_into_sentences(text):
    sentences = []
    current_sentence = ""
    for char in text:
        current_sentence += char
        if char in ['.', ',', ':', ';', '?', '!']:
            sentences.append(current_sentence.strip())
            current_sentence = ""
    if current_sentence:
        sentences.append(current_sentence.strip())
    return sentences

# Define a function to remove punctuation marks from a sentence
def remove_punctuation(sentence):
    return ''.join([char for char in sentence if char not in string.punctuation])

# Define a function to count sentence structures in the corpus
def count_sentence_structures(text):
    # Split the corpus into sentences using relative punctuation marks
    sentences = split_into_sentences(text)
    
    # Initialize a Counter to store the frequency of each sentence structure
    structure_counter = Counter()
    
    for sentence in sentences:
        # Remove punctuation marks from the sentence
        sentence = remove_punctuation(sentence)
        # Perform POS tagging on the sentence
        tagged_words = nltk.pos_tag(word_tokenize(sentence))
        # Extract the POS tags only
        tags = [tag for _, tag in tagged_words]
        # Convert the tags list into a string to represent the sentence structure
        structure = ' '.join(tags)
        # Increment the frequency count for this sentence structure
        structure_counter[structure] += 1
    
    return structure_counter


# Allan Poe

In [101]:
# Allan Poe
import pandas as pd


structure_frequency_Poe = count_sentence_structures(text_Poe)

# Calculate the total number of combinations in the corpus
total_combinations = sum(structure_frequency_Poe.values())

# Create a DataFrame from the structure_frequency dictionary
Allen_Poe = pd.DataFrame(list(structure_frequency_Poe.items()), columns=['Structure', 'Frequency'])

# Calculate the relative frequency for each combination
Allen_Poe['Relative Frequency'] = Allen_Poe['Frequency'] / total_combinations

# Filter out instances with frequency equal to 1
Allen_Poe = Allen_Poe[Allen_Poe['Frequency'] > 1]

# Sort the DataFrame by frequency in descending order
Allen_Poe = Allen_Poe.sort_values(by='Relative Frequency', ascending=False)

In [102]:
Allen_Poe.head(20)

Unnamed: 0,Structure,Frequency,Relative Frequency
28,RB,823,0.030585
32,NN,540,0.020068
80,CC,411,0.015274
24,IN NN,248,0.009216
43,IN DT NN,234,0.008696
34,DT NN,209,0.007767
306,IN DT JJ NN,147,0.005463
148,PRP VBD,142,0.005277
152,NNS,131,0.004868
719,IN,122,0.004534


In [35]:
# Define a function to count sentence structures in the corpus
def count_sentence_structures2(text):
    # Split the corpus into sentences using relative punctuation marks
    sentences = split_into_sentences(text)
    
    # Initialize a Counter to store the frequency of each sentence structure
    structure_counter = Counter()
    
    for sentence in sentences:
        # Remove punctuation marks from the sentence
        #sentence = remove_punctuation(sentence)
        # Perform POS tagging on the sentence
        tagged_words = nltk.pos_tag(word_tokenize(sentence))
        # Extract the POS tags only
        tags = [tag for _, tag in tagged_words]
        # Convert the tags list into a string to represent the sentence structure
        structure = ' '.join(tags)
        # Increment the frequency count for this sentence structure
        structure_counter[structure] += 1
    
    return structure_counter


In [37]:
# with punct
# Allan Poe
import pandas as pd


structure_frequency_Poe = count_sentence_structures2(text_Poe)

# Calculate the total number of combinations in the corpus
total_combinations = sum(structure_frequency_Poe.values())

# Create a DataFrame from the structure_frequency dictionary
Allen_Poe2 = pd.DataFrame(list(structure_frequency_Poe.items()), columns=['Structure', 'Frequency'])

# Calculate the relative frequency for each combination
Allen_Poe2['Relative Frequency'] = Allen_Poe2['Frequency'] / total_combinations

# Filter out instances with frequency equal to 1
Allen_Poe2 = Allen_Poe2[Allen_Poe2['Frequency'] > 1]

# Sort the DataFrame by frequency in descending order
Allen_Poe2 = Allen_Poe2.sort_values(by='Relative Frequency', ascending=False)

In [40]:
Allen_Poe2.head(20)

Unnamed: 0,Structure,Frequency,Relative Frequency
29,"RB ,",812,0.030176
80,"CC ,",410,0.015237
114,"NN ,",262,0.009737
25,"IN NN ,",247,0.009179
44,"IN DT NN ,",204,0.007581
35,"DT NN ,",196,0.007284
313,"IN DT JJ NN ,",138,0.005128
149,"PRP VBD ,",131,0.004868
256,"NNP ,",126,0.004682
33,NN .,117,0.004348


In [43]:
# using sentence tokenizer, it is empty because no 2 sentences are identically the same
from collections import Counter
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Define a function to count sentence structures in the corpus
def count_sentence_structures(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Initialize a Counter to store the frequency of each sentence structure
    structure_counter = Counter()
    
    for sentence in sentences:
        # Perform POS tagging on the sentence
        tagged_words = nltk.pos_tag(word_tokenize(sentence))
        # Extract the POS tags only
        tags = [tag for _, tag in tagged_words]
        # Convert the tags list into a string to represent the sentence structure
        structure = ' '.join(tags)
        # Increment the frequency count for this sentence structure
        structure_counter[structure] += 1
    
    return structure_counter


# Calculate sentence structures frequency
structure_frequency_Poe = count_sentence_structures(text_Poe)

# Calculate the total number of combinations in the corpus
total_combinations = sum(structure_frequency_Poe.values())

# Create a DataFrame from the structure_frequency dictionary
df_structure_frequency_Poe = pd.DataFrame(list(structure_frequency_Poe.items()), columns=['Structure', 'Frequency'])

# Calculate the relative frequency for each combination
df_structure_frequency_Poe['Relative Frequency'] = df_structure_frequency_Poe['Frequency'] / total_combinations

# Filter out instances with frequency equal to 1
df_structure_frequency_Poe = df_structure_frequency_Poe[df_structure_frequency_Poe['Frequency'] > 1]

# Sort the DataFrame by frequency in descending order
df_structure_frequency_Poe = df_structure_frequency_Poe.sort_values(by='Relative Frequency', ascending=False)

# Display the DataFrame
print(df_structure_frequency_Poe)


                                              Structure  Frequency  \
4676                                           NNP CD .         23   
1448                                               NN .         23   
570                                                UH .         11   
2412                                               IN .          9   
1746                                           JJ NNP .          8   
2104                                              NNP .          6   
2413                                          JJ . NN .          6   
3010                                 PRP VBD DT JJ NN .          4   
1986                                       PRP VBD RB .          4   
844                                     PRP VBD IN NN .          4   
1644                                     DT NN VBD JJ .          4   
2507                                              PRP .          4   
1449                                          NN . NN .          4   
2440                

In [44]:
df_structure_frequency_Poe

Unnamed: 0,Structure,Frequency,Relative Frequency
4676,NNP CD .,23,0.003038
1448,NN .,23,0.003038
570,UH .,11,0.001453
2412,IN .,9,0.001189
1746,JJ NNP .,8,0.001057
2104,NNP .,6,0.000792
2413,JJ . NN .,6,0.000792
3010,PRP VBD DT JJ NN .,4,0.000528
1986,PRP VBD RB .,4,0.000528
844,PRP VBD IN NN .,4,0.000528


In [32]:
from collections import Counter
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams

# Define a function to count sentence structures using 3-grams of POS tags
def count_sentence_structures_3grams(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Initialize a Counter to store the frequency of each 3-gram of POS tags
    structure_counter = Counter()
    
    for sentence in sentences:
        # Perform POS tagging on the sentence
        tagged_words = nltk.pos_tag(word_tokenize(sentence))
        # Extract POS tags only
        tags = [tag for _, tag in tagged_words]
        # Generate 3-grams of POS tags
        pos_3grams = ngrams(tags, 3)
        # Increment the frequency count for each 3-gram
        structure_counter.update(pos_3grams)
    
    return structure_counter


# Calculate sentence structures frequency using 3-grams
structure_frequency_3grams = count_sentence_structures_3grams(text_Poe)

# Create a DataFrame from the structure_frequency dictionary
df_structure_frequency_3grams = pd.DataFrame(list(structure_frequency_3grams.items()), columns=['Structure', 'Frequency'])

# Sort the DataFrame by frequency in descending order
df_structure_frequency_3grams = df_structure_frequency_3grams.sort_values(by='Frequency', ascending=False)

# Display the DataFrame
print(df_structure_frequency_3grams)


           Structure  Frequency
185     (IN, DT, NN)       7468
68      (DT, NN, IN)       4968
69      (NN, IN, DT)       4261
73      (DT, JJ, NN)       3531
34      (IN, DT, JJ)       2771
...              ...        ...
6002   (JJ, WDT, RB)          1
6004   (RP, NNS, DT)          1
6006   (WDT, NN, WP)          1
6007   (RB, VBD, MD)          1
8738  (VBZ, NN, RBR)          1

[8739 rows x 2 columns]


In [33]:
df_structure_frequency_3grams

Unnamed: 0,Structure,Frequency
185,"(IN, DT, NN)",7468
68,"(DT, NN, IN)",4968
69,"(NN, IN, DT)",4261
73,"(DT, JJ, NN)",3531
34,"(IN, DT, JJ)",2771
...,...,...
6002,"(JJ, WDT, RB)",1
6004,"(RP, NNS, DT)",1
6006,"(WDT, NN, WP)",1
6007,"(RB, VBD, MD)",1


In [36]:
from collections import Counter
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams

# Define a function to count sentence structures using 3-grams of POS tags
def count_sentence_structures_3grams(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Initialize a Counter to store the frequency of each 3-gram of POS tags
    structure_counter = Counter()
    
    for sentence in sentences:
        # Perform POS tagging on the sentence
        tagged_words = nltk.pos_tag(word_tokenize(sentence))
        # Extract POS tags only
        tags = [tag for _, tag in tagged_words]
        # Generate 3-grams of POS tags
        pos_3grams = ngrams(tags, 3)
        # Increment the frequency count for each 3-gram
        structure_counter.update(pos_3grams)
    
    return structure_counter


# Calculate sentence structures frequency using 3-grams
structure_frequency_3grams = count_sentence_structures_3grams(text_Poe)

# Calculate the total number of 3-grams in the corpus
total_3grams = sum(structure_frequency_3grams.values())

# Create a DataFrame from the structure_frequency dictionary
df_structure_frequency_3grams = pd.DataFrame(list(structure_frequency_3grams.items()), columns=['Structure', 'Frequency'])

# Calculate the relative frequency for each 3-gram
df_structure_frequency_3grams['Relative Frequency'] = df_structure_frequency_3grams['Frequency'] / total_3grams

# Sort the DataFrame by frequency in descending order
df_structure_frequency_3grams = df_structure_frequency_3grams.sort_values(by='Frequency', ascending=False)

# Display the DataFrame
print(df_structure_frequency_3grams)


           Structure  Frequency  Relative Frequency
185     (IN, DT, NN)       7468            0.035338
68      (DT, NN, IN)       4968            0.023508
69      (NN, IN, DT)       4261            0.020163
73      (DT, JJ, NN)       3531            0.016708
34      (IN, DT, JJ)       2771            0.013112
...              ...        ...                 ...
6002   (JJ, WDT, RB)          1            0.000005
6004   (RP, NNS, DT)          1            0.000005
6006   (WDT, NN, WP)          1            0.000005
6007   (RB, VBD, MD)          1            0.000005
8738  (VBZ, NN, RBR)          1            0.000005

[8739 rows x 3 columns]


In [37]:
df_structure_frequency_3grams

Unnamed: 0,Structure,Frequency,Relative Frequency
185,"(IN, DT, NN)",7468,0.035338
68,"(DT, NN, IN)",4968,0.023508
69,"(NN, IN, DT)",4261,0.020163
73,"(DT, JJ, NN)",3531,0.016708
34,"(IN, DT, JJ)",2771,0.013112
...,...,...,...
6002,"(JJ, WDT, RB)",1,0.000005
6004,"(RP, NNS, DT)",1,0.000005
6006,"(WDT, NN, WP)",1,0.000005
6007,"(RB, VBD, MD)",1,0.000005


In [38]:
df_structure_frequency_3grams.head(25).to_csv('Poe_structure_frequency_3grams.csv', index=False)

In [75]:
from collections import Counter
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
import re

# Define a function to count sentence structures using 3-grams of POS tags for a subset
def count_sentence_structures_3grams_subset(subset_text):
    # Tokenize the subset into sentences
    sentences = sent_tokenize(subset_text)
    
    # Initialize a Counter to store the frequency of each 3-gram of POS tags
    structure_counter = Counter()
    
    for sentence in sentences:
        # Perform POS tagging on the sentence
        tagged_words = nltk.pos_tag(word_tokenize(sentence))
        # Extract POS tags only
        tags = [tag for _, tag in tagged_words]
        # Generate 3-grams of POS tags
        pos_3grams = ngrams(tags, 3)
        # Increment the frequency count for each 3-gram
        structure_counter.update(pos_3grams)
    
    return structure_counter

# Function to split text into sentences
def split_into_sentences(text):
    # Use regular expression to split text into sentences
    # Assumes sentences end with '.', '!', or '?' followed by a space or end of line
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return sentences


# Split the text into sentences
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5
print(subset_size)

# Find sentence structures frequency using 3-grams in each subset
# Find sentence structures frequency using 3-grams in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    structure_frequency_3grams_subset = count_sentence_structures_3grams_subset(subset_text)
    
    # Sort the subset by frequency in descending order
    sorted_subset = sorted(structure_frequency_3grams_subset.items(), key=lambda x: x[1], reverse=True)
    
    print(f"Subset {i + 1}:")
    for structure, frequency in sorted_subset[:5]:  # Print only the first 5 structures
        print(f"Structure: {structure}, Frequency: {frequency}")
    print()  # Print an empty line to separate subsets



1568
Subset 1:
Structure: ('IN', 'DT', 'NN'), Frequency: 1510
Structure: ('DT', 'NN', 'IN'), Frequency: 1020
Structure: ('NN', 'IN', 'DT'), Frequency: 903
Structure: ('DT', 'JJ', 'NN'), Frequency: 630
Structure: ('IN', 'DT', 'JJ'), Frequency: 487

Subset 2:
Structure: ('IN', 'DT', 'NN'), Frequency: 1274
Structure: ('DT', 'NN', 'IN'), Frequency: 841
Structure: ('NN', 'IN', 'DT'), Frequency: 767
Structure: ('DT', 'JJ', 'NN'), Frequency: 646
Structure: ('IN', 'DT', 'JJ'), Frequency: 542

Subset 3:
Structure: ('IN', 'DT', 'NN'), Frequency: 1262
Structure: ('DT', 'NN', 'IN'), Frequency: 860
Structure: ('NN', 'IN', 'DT'), Frequency: 677
Structure: ('DT', 'JJ', 'NN'), Frequency: 627
Structure: ('IN', 'DT', 'JJ'), Frequency: 503

Subset 4:
Structure: ('IN', 'DT', 'NN'), Frequency: 1820
Structure: ('DT', 'NN', 'IN'), Frequency: 1152
Structure: ('NN', 'IN', 'DT'), Frequency: 913
Structure: ('DT', 'JJ', 'NN'), Frequency: 810
Structure: ('DT', 'NN', ','), Frequency: 602

Subset 5:
Structure: ('IN'

In [39]:
from collections import Counter
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams

# Define a function to count sentence structures using 4-grams of POS tags
def count_sentence_structures_4grams(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Initialize a Counter to store the frequency of each 4-gram of POS tags
    structure_counter = Counter()
    
    for sentence in sentences:
        # Perform POS tagging on the sentence
        tagged_words = nltk.pos_tag(word_tokenize(sentence))
        # Extract POS tags only
        tags = [tag for _, tag in tagged_words]
        # Generate 4-grams of POS tags
        pos_4grams = ngrams(tags, 4)
        # Increment the frequency count for each 3-gram
        structure_counter.update(pos_4grams)
    
    return structure_counter


# Calculate sentence structures frequency using 3-grams
structure_frequency_4grams = count_sentence_structures_4grams(text_Poe)

# Calculate the total number of 3-grams in the corpus
total_4grams = sum(structure_frequency_4grams.values())

# Create a DataFrame from the structure_frequency dictionary
df_structure_frequency_4grams = pd.DataFrame(list(structure_frequency_4grams.items()), columns=['Structure', 'Frequency'])

# Calculate the relative frequency for each 3-gram
df_structure_frequency_4grams['Relative Frequency'] = df_structure_frequency_4grams['Frequency'] / total_4grams

# Sort the DataFrame by frequency in descending order
df_structure_frequency_4grams = df_structure_frequency_4grams.sort_values(by='Frequency', ascending=False)


In [40]:
df_structure_frequency_4grams

Unnamed: 0,Structure,Frequency,Relative Frequency
434,"(NN, IN, DT, NN)",2634,0.012923
212,"(IN, DT, NN, IN)",2460,0.012070
69,"(DT, NN, IN, DT)",2242,0.011000
73,"(IN, DT, JJ, NN)",1833,0.008993
272,"(IN, DT, NN, ,)",1517,0.007443
...,...,...,...
8394,"(WP, VBZ, NNP, NN)",1,0.000005
20986,"(JJ, VBD, RB, TO)",1,0.000005
20987,"(,, DT, RB, NNS)",1,0.000005
20988,"(DT, RB, NNS, TO)",1,0.000005


In [41]:
df_structure_frequency_4grams.head(25).to_csv('Poe_structure_frequency_4grams.csv', index=False)

In [7]:
from collections import Counter
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
import re

# Define a function to count sentence structures using 3-grams of POS tags for a subset
def count_sentence_structures_4grams_subset(subset_text):
    # Tokenize the subset into sentences
    sentences = sent_tokenize(subset_text)
    
    # Initialize a Counter to store the frequency of each 3-gram of POS tags
    structure_counter = Counter()
    
    for sentence in sentences:
        # Perform POS tagging on the sentence
        tagged_words = nltk.pos_tag(word_tokenize(sentence))
        # Extract POS tags only
        tags = [tag for _, tag in tagged_words]
        # Generate 3-grams of POS tags
        pos_4grams = ngrams(tags, 4)
        # Increment the frequency count for each 3-gram
        structure_counter.update(pos_4grams)
    
    return structure_counter

# Function to split text into sentences
def split_into_sentences(text):
    # Use regular expression to split text into sentences
    # Assumes sentences end with '.', '!', or '?' followed by a space or end of line
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return sentences


# Split the text into sentences
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5
print(subset_size)

# Find sentence structures frequency using 4-grams in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    structure_frequency_4grams_subset = count_sentence_structures_4grams_subset(subset_text)
    
    # Sort the subset by frequency in descending order
    sorted_subset = sorted(structure_frequency_4grams_subset.items(), key=lambda x: x[1], reverse=True)
    
    print(f"Subset {i + 1}:")
    for structure, frequency in sorted_subset[:5]:  # Print only the first 5 structures
        print(f"Structure: {structure}, Frequency: {frequency}")
    print()  # Print an empty line to separate subsets



1568
Subset 1:
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 558
Structure: ('DT', 'NN', 'IN', 'DT'), Frequency: 522
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 509
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 330
Structure: ('IN', 'DT', 'NN', ','), Frequency: 259

Subset 2:
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 450
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 398
Structure: ('DT', 'NN', 'IN', 'DT'), Frequency: 389
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 328
Structure: ('IN', 'DT', 'NN', ','), Frequency: 259

Subset 3:
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 430
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 393
Structure: ('DT', 'NN', 'IN', 'DT'), Frequency: 348
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 335
Structure: ('DT', 'JJ', 'NN', 'IN'), Frequency: 247

Subset 4:
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 616
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 615
Structure: ('DT', 'NN', 'IN', 'DT'), Frequency: 479
Structure: ('IN', 

In [42]:
from collections import Counter
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams

# Define a function to count sentence structures using 4-grams of POS tags
def count_sentence_structures_5grams(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Initialize a Counter to store the frequency of each 4-gram of POS tags
    structure_counter = Counter()
    
    for sentence in sentences:
        # Perform POS tagging on the sentence
        tagged_words = nltk.pos_tag(word_tokenize(sentence))
        # Extract POS tags only
        tags = [tag for _, tag in tagged_words]
        # Generate 5-grams of POS tags
        pos_5grams = ngrams(tags, 5)
        # Increment the frequency count for each 3-gram
        structure_counter.update(pos_5grams)
    
    return structure_counter


# Calculate sentence structures frequency using 3-grams
structure_frequency_5grams = count_sentence_structures_5grams(text_Poe)

# Calculate the total number of 3-grams in the corpus
total_5grams = sum(structure_frequency_5grams.values())

# Create a DataFrame from the structure_frequency dictionary
df_structure_frequency_5grams = pd.DataFrame(list(structure_frequency_5grams.items()), columns=['Structure', 'Frequency'])

# Calculate the relative frequency for each 3-gram
df_structure_frequency_5grams['Relative Frequency'] = df_structure_frequency_5grams['Frequency'] / total_5grams

# Sort the DataFrame by frequency in descending order
df_structure_frequency_5grams = df_structure_frequency_5grams.sort_values(by='Frequency', ascending=False)


In [43]:
df_structure_frequency_5grams

Unnamed: 0,Structure,Frequency,Relative Frequency
589,"(DT, NN, IN, DT, NN)",1406,0.007160
692,"(IN, DT, NN, IN, DT)",1151,0.005862
691,"(NN, IN, DT, NN, IN)",672,0.003422
855,"(NN, IN, DT, NN, ,)",629,0.003203
275,"(IN, DT, JJ, NN, IN)",613,0.003122
...,...,...,...
34615,"((, WDT, VBZ, RB, NNP)",1,0.000005
34616,"(WDT, VBZ, RB, NNP, ,)",1,0.000005
34617,"(VBZ, RB, NNP, ,, ))",1,0.000005
34618,"(RB, NNP, ,, ), IN)",1,0.000005


In [44]:
df_structure_frequency_5grams.head(25).to_csv('Poe_structure_frequency_5grams.csv', index=False)

In [14]:
from collections import Counter
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
import re

# Define a function to count sentence structures using 5-grams of POS tags for a subset
def count_sentence_structures_5grams_subset(subset_text):
    # Tokenize the subset into sentences
    sentences = sent_tokenize(subset_text)
    
    # Initialize a Counter to store the frequency of each 3-gram of POS tags
    structure_counter = Counter()
    
    for sentence in sentences:
        # Perform POS tagging on the sentence
        tagged_words = nltk.pos_tag(word_tokenize(sentence))
        # Extract POS tags only
        tags = [tag for _, tag in tagged_words]
        # Generate 3-grams of POS tags
        pos_5grams = ngrams(tags, 5)
        # Increment the frequency count for each 3-gram
        structure_counter.update(pos_5grams)
    
    return structure_counter

# Function to split text into sentences
def split_into_sentences(text):
    # Use regular expression to split text into sentences
    # Assumes sentences end with '.', '!', or '?' followed by a space or end of line
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return sentences


# Split the text into sentences
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5
print(subset_size)

# Find sentence structures frequency using 4-grams in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    structure_frequency_5grams_subset = count_sentence_structures_5grams_subset(subset_text)
    
    # Sort the subset by frequency in descending order
    sorted_subset = sorted(structure_frequency_5grams_subset.items(), key=lambda x: x[1], reverse=True)
    
    print(f"Subset {i + 1}:")
    for structure, frequency in sorted_subset[:5]:  # Print only the first 5 structures
        print(f"Structure: {structure}, Frequency: {frequency}")
    print()  # Print an empty line to separate subsets



1568
Subset 1:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 329
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 282
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 163
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 114
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 107

Subset 2:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 232
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 183
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 115
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 110
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 110

Subset 3:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 214
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 181
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 118
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 103
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 101

Subset 4:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 332
Structure: ('IN', 'DT', 'NN'

# Sheridan Le Fanu

In [103]:
# Sheridan Le Fanu
import pandas as pd


structure_frequency_Fanu = count_sentence_structures(text_Fanu)

# Calculate the total number of combinations in the corpus
total_combinations_Fanu = sum(structure_frequency_Fanu.values())

# Create a DataFrame from the structure_frequency dictionary
Le_Fanu = pd.DataFrame(list(structure_frequency_Fanu.items()), columns=['Structure', 'Frequency'])

# Calculate the relative frequency for each combination
Le_Fanu['Relative Frequency'] = Le_Fanu['Frequency'] / total_combinations_Fanu

# Filter out instances with frequency equal to 1
Le_Fanu = Le_Fanu[Le_Fanu['Frequency'] > 1]

# Sort the DataFrame by frequency in descending order
Le_Fanu = Le_Fanu.sort_values(by='Relative Frequency', ascending=False)

In [104]:
Le_Fanu.head(20)

Unnamed: 0,Structure,Frequency,Relative Frequency
200,NN,802,0.035079
3,RB,502,0.021957
360,PRP VBD,268,0.011722
27,CC,205,0.008966
7,IN DT NN,172,0.007523
21,DT NN,152,0.006648
50,PRP VBP,128,0.005599
115,IN DT JJ NN,106,0.004636
193,NNP NNP,100,0.004374
263,IN NN,93,0.004068


In [45]:
# Calculate sentence structures frequency using 3-grams
structure_frequency_3grams_Fanu = count_sentence_structures_3grams(text_Fanu)

# Calculate the total number of 3-grams in the corpus
total_3grams_Fanu = sum(structure_frequency_3grams_Fanu.values())

# Create a DataFrame from the structure_frequency dictionary
df_structure_frequency_3grams_Fanu = pd.DataFrame(list(structure_frequency_3grams_Fanu.items()), columns=['Structure', 'Frequency'])

# Calculate the relative frequency for each 3-gram
df_structure_frequency_3grams_Fanu['Relative Frequency'] = df_structure_frequency_3grams_Fanu['Frequency'] / total_3grams_Fanu

# Sort the DataFrame by frequency in descending order
df_structure_frequency_3grams_Fanu = df_structure_frequency_3grams_Fanu.sort_values(by='Frequency', ascending=False)

In [46]:
df_structure_frequency_3grams_Fanu

Unnamed: 0,Structure,Frequency,Relative Frequency
46,"(IN, DT, NN)",4335,0.026095
53,"(DT, JJ, NN)",2924,0.017602
6,"(DT, NN, IN)",2670,0.016073
70,"(NN, IN, DT)",2305,0.013875
109,"(IN, DT, JJ)",1932,0.011630
...,...,...,...
6537,"(WP, VBZ, VBG)",1,0.000006
6539,"(MD, PRP, NN)",1,0.000006
6544,"(WRB, PRP$, VBG)",1,0.000006
3503,"(NN, RB, VBP)",1,0.000006


In [47]:
df_structure_frequency_3grams_Fanu.head(25).to_csv('Fanu_structure_frequency_3grams.csv', index=False)

In [76]:
# Split the text into sentences
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5
print(subset_size)

# Find sentence structures frequency using 3-grams in each subset
# Find sentence structures frequency using 3-grams in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    structure_frequency_3grams_subset = count_sentence_structures_3grams_subset(subset_text)
    
    # Sort the subset by frequency in descending order
    sorted_subset = sorted(structure_frequency_3grams_subset.items(), key=lambda x: x[1], reverse=True)
    
    print(f"Subset {i + 1}:")
    for structure, frequency in sorted_subset[:5]:  # Print only the first 5 structures
        print(f"Structure: {structure}, Frequency: {frequency}")
    print()  # Print an empty line to separate subsets



1097
Subset 1:
Structure: ('IN', 'DT', 'NN'), Frequency: 1073
Structure: ('DT', 'JJ', 'NN'), Frequency: 716
Structure: ('DT', 'NN', 'IN'), Frequency: 654
Structure: ('NN', 'IN', 'DT'), Frequency: 615
Structure: ('IN', 'DT', 'JJ'), Frequency: 500

Subset 2:
Structure: ('IN', 'DT', 'NN'), Frequency: 679
Structure: ('DT', 'NN', 'IN'), Frequency: 429
Structure: ('DT', 'JJ', 'NN'), Frequency: 428
Structure: ('NN', 'IN', 'DT'), Frequency: 344
Structure: ('IN', 'DT', 'JJ'), Frequency: 282

Subset 3:
Structure: ('IN', 'DT', 'NN'), Frequency: 814
Structure: ('DT', 'JJ', 'NN'), Frequency: 521
Structure: ('DT', 'NN', 'IN'), Frequency: 509
Structure: ('NN', 'IN', 'DT'), Frequency: 449
Structure: ('IN', 'DT', 'JJ'), Frequency: 313

Subset 4:
Structure: ('IN', 'DT', 'NN'), Frequency: 849
Structure: ('DT', 'JJ', 'NN'), Frequency: 597
Structure: ('DT', 'NN', 'IN'), Frequency: 552
Structure: ('NN', 'IN', 'DT'), Frequency: 424
Structure: ('IN', 'DT', 'JJ'), Frequency: 404

Subset 5:
Structure: ('IN', 'D

In [48]:
# Calculate sentence structures frequency using 3-grams
structure_frequency_4grams_Fanu = count_sentence_structures_4grams(text_Fanu)

# Calculate the total number of 3-grams in the corpus
total_4grams_Fanu = sum(structure_frequency_4grams_Fanu.values())

# Create a DataFrame from the structure_frequency dictionary
df_structure_frequency_4grams_Fanu = pd.DataFrame(list(structure_frequency_4grams_Fanu.items()), columns=['Structure', 'Frequency'])

# Calculate the relative frequency for each 3-gram
df_structure_frequency_4grams_Fanu['Relative Frequency'] = df_structure_frequency_4grams_Fanu['Frequency'] / total_4grams_Fanu

# Sort the DataFrame by frequency in descending order
df_structure_frequency_4grams_Fanu = df_structure_frequency_4grams_Fanu.sort_values(by='Frequency', ascending=False)

In [49]:
df_structure_frequency_4grams_Fanu

Unnamed: 0,Structure,Frequency,Relative Frequency
75,"(NN, IN, DT, NN)",1383,0.008651
117,"(IN, DT, JJ, NN)",1342,0.008395
206,"(IN, DT, NN, IN)",1229,0.007688
105,"(DT, NN, IN, DT)",1087,0.006800
48,"(IN, DT, NN, ,)",1058,0.006618
...,...,...,...
19040,"(PRP$, VBG, PRP, TO)",1,0.000006
19039,"(,, PRP$, VBG, PRP)",1,0.000006
19036,"(VBZ, IN, PRP, VBZ)",1,0.000006
19033,"(CC, DT, WDT, VBZ)",1,0.000006


In [50]:
df_structure_frequency_4grams_Fanu.head(25).to_csv('Fanu_structure_frequency_4grams.csv', index=False)

In [8]:
# Split the text into sentences
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5
print(subset_size)

# Find sentence structures frequency using 4-grams in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    structure_frequency_4grams_subset = count_sentence_structures_4grams_subset(subset_text)
    
    # Sort the subset by frequency in descending order
    sorted_subset = sorted(structure_frequency_4grams_subset.items(), key=lambda x: x[1], reverse=True)
    
    print(f"Subset {i + 1}:")
    for structure, frequency in sorted_subset[:5]:  # Print only the first 5 structures
        print(f"Structure: {structure}, Frequency: {frequency}")
    print()  # Print an empty line to separate subsets



1097
Subset 1:
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 370
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 345
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 323
Structure: ('DT', 'NN', 'IN', 'DT'), Frequency: 279
Structure: ('IN', 'DT', 'NN', ','), Frequency: 253

Subset 2:
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 201
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 192
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 176
Structure: ('DT', 'NN', 'IN', 'DT'), Frequency: 171
Structure: ('IN', 'DT', 'NN', ','), Frequency: 160

Subset 3:
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 281
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 221
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 217
Structure: ('DT', 'NN', 'IN', 'DT'), Frequency: 206
Structure: ('IN', 'DT', 'NN', ','), Frequency: 189

Subset 4:
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 289
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 262
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 253
Structure: ('IN', '

In [51]:
# Calculate sentence structures frequency using 3-grams
structure_frequency_5grams_Fanu = count_sentence_structures_5grams(text_Fanu)

# Calculate the total number of 3-grams in the corpus
total_5grams_Fanu = sum(structure_frequency_5grams_Fanu.values())

# Create a DataFrame from the structure_frequency dictionary
df_structure_frequency_5grams_Fanu = pd.DataFrame(list(structure_frequency_5grams_Fanu.items()), columns=['Structure', 'Frequency'])

# Calculate the relative frequency for each 3-gram
df_structure_frequency_5grams_Fanu['Relative Frequency'] = df_structure_frequency_5grams_Fanu['Frequency'] / total_5grams_Fanu

# Sort the DataFrame by frequency in descending order
df_structure_frequency_5grams_Fanu = df_structure_frequency_5grams_Fanu.sort_values(by='Frequency', ascending=False)

In [52]:
df_structure_frequency_5grams_Fanu

Unnamed: 0,Structure,Frequency,Relative Frequency
103,"(DT, NN, IN, DT, NN)",651,0.004235
270,"(IN, DT, NN, IN, DT)",511,0.003324
116,"(IN, DT, JJ, NN, ,)",399,0.002596
312,"(NN, IN, DT, JJ, NN)",394,0.002563
271,"(NN, IN, DT, NN, ,)",394,0.002563
...,...,...,...
31680,"(IN, JJ, ,, VBD, NNP)",1,0.000007
31682,"(,, VBD, NNP, NNP, :)",1,0.000007
31684,"(NNP, :, CC, RB, VBG)",1,0.000007
31685,"(:, CC, RB, VBG, PRP$)",1,0.000007


In [53]:
df_structure_frequency_5grams_Fanu.head(25).to_csv('Fanu_structure_frequency_5grams.csv', index=False)

In [15]:
# Split the text into sentences
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5
print(subset_size)

# Find sentence structures frequency using 4-grams in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    structure_frequency_5grams_subset = count_sentence_structures_5grams_subset(subset_text)
    
    # Sort the subset by frequency in descending order
    sorted_subset = sorted(structure_frequency_5grams_subset.items(), key=lambda x: x[1], reverse=True)
    
    print(f"Subset {i + 1}:")
    for structure, frequency in sorted_subset[:5]:  # Print only the first 5 structures
        print(f"Structure: {structure}, Frequency: {frequency}")
    print()  # Print an empty line to separate subsets



1097
Subset 1:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 169
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 132
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 112
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 105
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 98

Subset 2:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 101
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 76
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 61
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 58
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 48

Subset 3:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 121
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 93
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 79
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 67
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 65

Subset 4:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 130
Structure: ('IN', 'DT', 'NN', 'IN', 'DT

# Gothic corpus

In [105]:
# Gothic corpus
import pandas as pd


structure_frequency_Gothic = count_sentence_structures(text_Gothic)

# Calculate the total number of combinations in the corpus
total_combinations_Gothic = sum(structure_frequency_Gothic.values())

# Create a DataFrame from the structure_frequency dictionary
Gothic = pd.DataFrame(list(structure_frequency_Gothic.items()), columns=['Structure', 'Frequency'])

# Calculate the relative frequency for each combination
Gothic['Relative Frequency'] = Gothic['Frequency'] / total_combinations_Gothic

# Filter out instances with frequency equal to 1
Gothic = Gothic[Gothic['Frequency'] > 1]

# Sort the DataFrame by frequency in descending order
Gothic = Gothic.sort_values(by='Relative Frequency', ascending=False)

In [106]:
Gothic.head(20)

Unnamed: 0,Structure,Frequency,Relative Frequency
14,NN,16452,0.035136
46,RB,9706,0.020729
102,CC,4591,0.009805
15,NN PRP VBD,3539,0.007558
157,IN DT NN,2804,0.005988
434,PRP VBD,2588,0.005527
259,DT NN,2442,0.005215
860,NNS,2243,0.00479
296,NNP,2226,0.004754
173,NN VBD NNP,2087,0.004457


In [54]:
# Calculate sentence structures frequency using 3-grams
structure_frequency_3grams_Gothic = count_sentence_structures_3grams(text_Gothic)

# Calculate the total number of 3-grams in the corpus
total_3grams_Gothic = sum(structure_frequency_3grams_Gothic.values())

# Create a DataFrame from the structure_frequency dictionary
df_structure_frequency_3grams_Gothic = pd.DataFrame(list(structure_frequency_3grams_Gothic.items()), columns=['Structure', 'Frequency'])

# Calculate the relative frequency for each 3-gram
df_structure_frequency_3grams_Gothic['Relative Frequency'] = df_structure_frequency_3grams_Gothic['Frequency'] / total_3grams_Gothic

# Sort the DataFrame by frequency in descending order
df_structure_frequency_3grams_Gothic = df_structure_frequency_3grams_Gothic.sort_values(by='Frequency', ascending=False)

In [55]:
df_structure_frequency_3grams_Gothic

Unnamed: 0,Structure,Frequency,Relative Frequency
4,"(IN, DT, NN)",72970,2.194610e-02
33,"(DT, NN, IN)",50940,1.532046e-02
20,"(DT, JJ, NN)",42276,1.271472e-02
328,"(NN, IN, DT)",37488,1.127471e-02
179,"(DT, NN, ,)",30687,9.229270e-03
...,...,...,...
16915,"(NNP, UH, NN)",1,3.007550e-07
16914,"(PRP, NNP, UH)",1,3.007550e-07
16910,"(JJ, ), NNP)",1,3.007550e-07
16908,"(DT, RP, WRB)",1,3.007550e-07


In [56]:
df_structure_frequency_3grams_Gothic.head(25).to_csv('Gothic_structure_frequency_3grams.csv', index=False)

In [77]:
# Poe vs Gothic

# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1568

# Find sentence structures frequency using 3-grams in each subset
# Find sentence structures frequency using 3-grams in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    structure_frequency_3grams_subset = count_sentence_structures_3grams_subset(subset_text)
    
    # Sort the subset by frequency in descending order
    sorted_subset = sorted(structure_frequency_3grams_subset.items(), key=lambda x: x[1], reverse=True)
    
    print(f"Subset {i + 1}:")
    for structure, frequency in sorted_subset[:5]:  # Print only the first 5 structures
        print(f"Structure: {structure}, Frequency: {frequency}")
    print()  # Print an empty line to separate subsets



Subset 1:
Structure: ('IN', 'DT', 'NN'), Frequency: 726
Structure: ('DT', 'NN', 'IN'), Frequency: 538
Structure: ('DT', 'JJ', 'NN'), Frequency: 449
Structure: ('NN', 'IN', 'DT'), Frequency: 362
Structure: ('NN', ',', 'CC'), Frequency: 347

Subset 2:
Structure: ('IN', 'DT', 'NN'), Frequency: 609
Structure: ('DT', 'NN', 'IN'), Frequency: 445
Structure: ('DT', 'JJ', 'NN'), Frequency: 360
Structure: ('NN', 'IN', 'DT'), Frequency: 295
Structure: ('NN', ',', 'CC'), Frequency: 295

Subset 3:
Structure: ('IN', 'DT', 'NN'), Frequency: 531
Structure: ('DT', 'NN', 'IN'), Frequency: 347
Structure: ('DT', 'JJ', 'NN'), Frequency: 329
Structure: ('NN', ',', 'CC'), Frequency: 315
Structure: ('PRP', 'MD', 'VB'), Frequency: 305

Subset 4:
Structure: ('IN', 'DT', 'NN'), Frequency: 631
Structure: ('DT', 'JJ', 'NN'), Frequency: 438
Structure: ('DT', 'NN', 'IN'), Frequency: 408
Structure: ('NN', ',', 'CC'), Frequency: 310
Structure: ('DT', 'NN', ','), Frequency: 305

Subset 5:
Structure: ('IN', 'DT', 'NN'),

In [78]:
# Fanu vs Gothic

# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1097

# Find sentence structures frequency using 3-grams in each subset
# Find sentence structures frequency using 3-grams in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    structure_frequency_3grams_subset = count_sentence_structures_3grams_subset(subset_text)
    
    # Sort the subset by frequency in descending order
    sorted_subset = sorted(structure_frequency_3grams_subset.items(), key=lambda x: x[1], reverse=True)
    
    print(f"Subset {i + 1}:")
    for structure, frequency in sorted_subset[:5]:  # Print only the first 5 structures
        print(f"Structure: {structure}, Frequency: {frequency}")
    print()  # Print an empty line to separate subsets



Subset 1:
Structure: ('IN', 'DT', 'NN'), Frequency: 572
Structure: ('DT', 'NN', 'IN'), Frequency: 418
Structure: ('DT', 'JJ', 'NN'), Frequency: 348
Structure: ('NN', 'IN', 'DT'), Frequency: 284
Structure: ('NN', ',', 'CC'), Frequency: 272

Subset 2:
Structure: ('IN', 'DT', 'NN'), Frequency: 367
Structure: ('DT', 'NN', 'IN'), Frequency: 267
Structure: ('DT', 'JJ', 'NN'), Frequency: 214
Structure: ('NN', ',', 'CC'), Frequency: 185
Structure: ('NN', 'IN', 'DT'), Frequency: 172

Subset 3:
Structure: ('IN', 'DT', 'NN'), Frequency: 462
Structure: ('DT', 'NN', 'IN'), Frequency: 339
Structure: ('DT', 'JJ', 'NN'), Frequency: 299
Structure: ('NN', 'IN', 'DT'), Frequency: 240
Structure: ('IN', 'PRP$', 'NN'), Frequency: 222

Subset 4:
Structure: ('IN', 'DT', 'NN'), Frequency: 395
Structure: ('DT', 'NN', 'IN'), Frequency: 268
Structure: ('DT', 'JJ', 'NN'), Frequency: 230
Structure: ('NN', ',', 'CC'), Frequency: 227
Structure: ('PRP', 'MD', 'VB'), Frequency: 208

Subset 5:
Structure: ('IN', 'DT', 'N

In [57]:
# Calculate sentence structures frequency using 4-grams
structure_frequency_4grams_Gothic = count_sentence_structures_4grams(text_Gothic)

# Calculate the total number of 3-grams in the corpus
total_4grams_Gothic = sum(structure_frequency_4grams_Gothic.values())

# Create a DataFrame from the structure_frequency dictionary
df_structure_frequency_4grams_Gothic = pd.DataFrame(list(structure_frequency_4grams_Gothic.items()), columns=['Structure', 'Frequency'])

# Calculate the relative frequency for each 3-gram
df_structure_frequency_4grams_Gothic['Relative Frequency'] = df_structure_frequency_4grams_Gothic['Frequency'] / total_4grams_Gothic

# Sort the DataFrame by frequency in descending order
df_structure_frequency_4grams_Gothic = df_structure_frequency_4grams_Gothic.sort_values(by='Frequency', ascending=False)

In [58]:
df_structure_frequency_4grams_Gothic

Unnamed: 0,Structure,Frequency,Relative Frequency
445,"(NN, IN, DT, NN)",22172,6.917192e-03
33,"(IN, DT, NN, IN)",21508,6.710038e-03
90,"(IN, DT, JJ, NN)",18695,5.832442e-03
431,"(DT, NN, IN, DT)",17623,5.498001e-03
436,"(IN, DT, NN, ,)",17194,5.364162e-03
...,...,...,...
91898,"(VBN, RP, JJ, :)",1,3.119787e-07
91901,"(NN, (, JJ, PRP)",1,3.119787e-07
91902,"((, JJ, PRP, ))",1,3.119787e-07
91903,"(JJ, PRP, ), RBS)",1,3.119787e-07


In [59]:
df_structure_frequency_4grams_Gothic.head(25).to_csv('Gothic_structure_frequency_4grams.csv', index=False)

In [11]:
#Gothic vs Poe
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1568

# Find sentence structures frequency using 4-grams in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    structure_frequency_4grams_subset = count_sentence_structures_4grams_subset(subset_text)
    
    # Sort the subset by frequency in descending order
    sorted_subset = sorted(structure_frequency_4grams_subset.items(), key=lambda x: x[1], reverse=True)
    
    print(f"Subset {i + 1}:")
    for structure, frequency in sorted_subset[:5]:  # Print only the first 5 structures
        print(f"Structure: {structure}, Frequency: {frequency}")
    print()  # Print an empty line to separate subsets



Subset 1:
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 235
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 193
Structure: ('IN', 'DT', 'NN', ','), Frequency: 180
Structure: ('DT', 'NN', 'IN', 'DT'), Frequency: 166
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 164

Subset 2:
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 207
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 148
Structure: ('IN', 'DT', 'NN', ','), Frequency: 127
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 117
Structure: ('DT', 'NN', 'IN', 'DT'), Frequency: 112

Subset 3:
Structure: ('IN', 'DT', 'NN', ','), Frequency: 152
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 151
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 120
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 119
Structure: ('PRP', 'VBP', 'JJ', 'NN'), Frequency: 116

Subset 4:
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 189
Structure: ('IN', 'DT', 'NN', ','), Frequency: 157
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 152
Structure: ('IN', 'DT',

In [13]:
#Gothic vs Fanu
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1097

# Find sentence structures frequency using 4-grams in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    structure_frequency_4grams_subset = count_sentence_structures_4grams_subset(subset_text)
    
    # Sort the subset by frequency in descending order
    sorted_subset = sorted(structure_frequency_4grams_subset.items(), key=lambda x: x[1], reverse=True)
    
    print(f"Subset {i + 1}:")
    for structure, frequency in sorted_subset[:10]:  # Print only the first 5 structures
        print(f"Structure: {structure}, Frequency: {frequency}")
    print()  # Print an empty line to separate subsets



Subset 1:
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 185
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 153
Structure: ('IN', 'DT', 'NN', ','), Frequency: 145
Structure: ('DT', 'NN', 'IN', 'DT'), Frequency: 130
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 125
Structure: ('DT', 'NN', ',', 'CC'), Frequency: 89
Structure: ('DT', 'JJ', 'NN', 'IN'), Frequency: 78
Structure: ('PRP', 'VBD', 'DT', 'NN'), Frequency: 77
Structure: ('DT', 'JJ', 'NN', ','), Frequency: 76
Structure: ('IN', 'DT', 'NN', '.'), Frequency: 76

Subset 2:
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 117
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 87
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 84
Structure: ('IN', 'DT', 'NN', ','), Frequency: 83
Structure: ('DT', 'NN', 'IN', 'DT'), Frequency: 74
Structure: ('PRP', 'VBP', 'JJ', 'NN'), Frequency: 68
Structure: ('DT', 'NN', ',', 'CC'), Frequency: 65
Structure: ('PRP', 'VBD', 'DT', 'NN'), Frequency: 64
Structure: ('NN', 'IN', 'PRP$', 'NN'), Frequency: 64
St

Subset 17:
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 233
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 223
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 219
Structure: ('DT', 'NN', 'IN', 'DT'), Frequency: 209
Structure: (',', "''", 'VBD', 'NNP'), Frequency: 165
Structure: ('DT', 'JJ', 'NN', 'IN'), Frequency: 152
Structure: ('IN', 'DT', 'NN', ','), Frequency: 150
Structure: ('DT', 'NN', 'IN', 'NN'), Frequency: 112
Structure: ('PRP', 'MD', 'RB', 'VB'), Frequency: 110
Structure: ('DT', 'JJ', 'NN', ','), Frequency: 99

Subset 18:
Structure: ('NN', 'IN', 'DT', 'NN'), Frequency: 331
Structure: ('IN', 'DT', 'NN', 'IN'), Frequency: 330
Structure: ('DT', 'NN', 'IN', 'DT'), Frequency: 324
Structure: ('IN', 'DT', 'JJ', 'NN'), Frequency: 258
Structure: ('IN', 'DT', 'NN', ','), Frequency: 240
Structure: ('DT', 'JJ', 'NN', 'IN'), Frequency: 174
Structure: ('DT', 'JJ', 'NN', ','), Frequency: 151
Structure: ('IN', 'DT', 'NN', '.'), Frequency: 133
Structure: ('DT', 'NN', ',', 'CC'), Frequency

In [60]:
# Calculate sentence structures frequency using 5-grams
structure_frequency_5grams_Gothic = count_sentence_structures_5grams(text_Gothic)

# Calculate the total number of 3-grams in the corpus
total_5grams_Gothic = sum(structure_frequency_5grams_Gothic.values())

# Create a DataFrame from the structure_frequency dictionary
df_structure_frequency_5grams_Gothic = pd.DataFrame(list(structure_frequency_5grams_Gothic.items()), columns=['Structure', 'Frequency'])

# Calculate the relative frequency for each 3-gram
df_structure_frequency_5grams_Gothic['Relative Frequency'] = df_structure_frequency_5grams_Gothic['Frequency'] / total_5grams_Gothic

# Sort the DataFrame by frequency in descending order
df_structure_frequency_5grams_Gothic = df_structure_frequency_5grams_Gothic.sort_values(by='Frequency', ascending=False)

In [61]:
df_structure_frequency_5grams_Gothic

Unnamed: 0,Structure,Frequency,Relative Frequency
467,"(DT, NN, IN, DT, NN)",10351,3.352479e-03
466,"(IN, DT, NN, IN, DT)",7967,2.580350e-03
1080,"(NN, IN, DT, NN, ,)",5920,1.917368e-03
88,"(IN, DT, JJ, NN, ,)",5328,1.725631e-03
457,"(IN, DT, JJ, NN, IN)",5208,1.686766e-03
...,...,...,...
268351,"(VBP, VB, DT, JJ, CD)",1,3.238797e-07
268354,"(JJ, CD, ., NN, IN)",1,3.238797e-07
268355,"(CD, ., NN, IN, VBG)",1,3.238797e-07
268357,"(JJ, VBG, VBN, RB, RB)",1,3.238797e-07


In [62]:
df_structure_frequency_5grams_Gothic.head(25).to_csv('Gothic_structure_frequency_5grams.csv', index=False)

In [25]:
# Poe vs Gothic
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1568

# Find sentence structures frequency using 4-grams in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    structure_frequency_5grams_subset = count_sentence_structures_5grams_subset(subset_text)
    
    # Sort the subset by frequency in descending order
    sorted_subset = sorted(structure_frequency_5grams_subset.items(), key=lambda x: x[1], reverse=True)
    
    print(f"Subset {i + 1}:")
    for structure, frequency in sorted_subset[:100]:  # Print only the first 5 structures
        print(f"Structure: {structure}, Frequency: {frequency}")
    print()  # Print an empty line to separate subsets



Subset 1:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 110
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 76
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 65
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 57
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 55
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 52
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 46
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 42
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 36
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 36
Structure: ('NN', 'IN', 'PRP$', 'NN', ','), Frequency: 36
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 34
Structure: ('.', 'NN', 'PRP', 'VBD', ','), Frequency: 34
Structure: ('TO', 'VB', 'DT', 'NN', 'IN'), Frequency: 33
Structure: ('NN', '.', 'NN', 'PRP', 'VBD'), Frequency: 33
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 32
Structure: ('PRP', 'VBD', 'DT', 'NN', 'IN'), Frequency: 32
Structure: ('NN', 

Subset 3:
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 52
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 51
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 46
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 46
Structure: ('PRP', 'VBP', 'JJ', 'NN', 'NN'), Frequency: 36
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 34
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 34
Structure: ('IN', 'PRP$', 'NN', ',', 'CC'), Frequency: 33
Structure: ('DT', 'NN', ',', 'CC', 'VBD'), Frequency: 33
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 32
Structure: ('NN', 'IN', 'PRP$', 'NN', ','), Frequency: 32
Structure: ('NNP', ',', 'NNP', 'PRP', 'VBD'), Frequency: 31
Structure: ('NN', ':', 'CC', 'PRP', 'VBD'), Frequency: 31
Structure: ('.', 'NN', 'PRP', 'VBD', '.'), Frequency: 31
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 30
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 30
Structure: ('.', 'NN', 'PRP', 'VBD', ','), Frequency: 29
Structure: ('

Subset 5:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 100
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 79
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 71
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 62
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 61
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 59
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 56
Structure: ('IN', 'DT', 'NN', 'IN', 'NN'), Frequency: 55
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 53
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 50
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 47
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 43
Structure: (',', 'IN', 'DT', 'JJ', 'NN'), Frequency: 42
Structure: ('DT', 'NN', 'IN', 'NN', ','), Frequency: 32
Structure: ('PRP', 'VBD', 'IN', 'DT', 'NN'), Frequency: 32
Structure: ('PRP', 'VBD', 'DT', 'JJ', 'NN'), Frequency: 32
Structure: ('DT', 'NN', ',', 'PRP', 'VBD'), Frequency: 32
Structure: ('NN',

Subset 7:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 87
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 63
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 52
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 50
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 47
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 47
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 46
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 45
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 43
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 42
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 42
Structure: ('NN', 'IN', 'PRP$', 'NN', ','), Frequency: 40
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 37
Structure: ('IN', 'DT', 'NN', 'IN', 'NN'), Frequency: 36
Structure: ('IN', 'DT', 'JJ', 'NN', '.'), Frequency: 35
Structure: ('PRP', 'VBD', 'DT', 'JJ', 'NN'), Frequency: 34
Structure: ('DT', 'JJ', 'NN', 'IN', 'NN'), Frequency: 33
Structure: (',', 'IN

Subset 9:
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 58
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 53
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 47
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 46
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 45
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 42
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 41
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 39
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 38
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 34
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 32
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 31
Structure: ('DT', 'NN', ',', 'DT', 'NN'), Frequency: 30
Structure: ('IN', 'DT', 'JJ', 'NN', '.'), Frequency: 26
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 26
Structure: ('IN', 'DT', 'NN', 'IN', 'NN'), Frequency: 26
Structure: ('DT', 'JJ', 'NN', 'IN', 'NN'), Frequency: 25
Structure: ('DT', 'NN', '

Subset 11:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 165
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 126
Structure: (',', "''", 'VBD', 'NNP', ','), Frequency: 99
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 94
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 76
Structure: ('NN', ',', "''", 'VBD', 'NNP'), Frequency: 70
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 70
Structure: (',', "''", 'VBD', 'NNP', '.'), Frequency: 69
Structure: ('IN', 'DT', 'NN', '.', "''"), Frequency: 67
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 62
Structure: ("''", 'VBD', 'NNP', ',', '``'), Frequency: 61
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 60
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 59
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 56
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 52
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 49
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 49
Structure: ('PRP', 

Subset 13:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 251
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 196
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 120
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 112
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 111
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 107
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 100
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 98
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 94
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 90
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 78
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 76
Structure: ('IN', 'DT', 'NN', '.', "''"), Frequency: 71
Structure: ('IN', 'DT', 'NN', 'IN', 'NN'), Frequency: 61
Structure: (',', 'IN', 'DT', 'JJ', 'NN'), Frequency: 59
Structure: ('DT', 'NNS', 'IN', 'DT', 'NN'), Frequency: 55
Structure: ('IN', 'DT', 'NN', ',', 'IN'), Frequency: 54
Structure: (',', "'

Subset 15:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 194
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 135
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 125
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 104
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 85
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 76
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 73
Structure: (',', "''", 'VBD', 'DT', 'NN'), Frequency: 73
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 71
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 70
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 70
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 69
Structure: ('IN', 'DT', 'NN', '.', "''"), Frequency: 62
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 60
Structure: ('IN', 'DT', 'NN', ',', 'IN'), Frequency: 58
Structure: (',', 'IN', 'DT', 'JJ', 'NN'), Frequency: 55
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 54
Structure: (',', 'IN'

Subset 17:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 133
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 101
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 99
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 82
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 81
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 69
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 63
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 63
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 52
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 51
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 45
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 45
Structure: ('IN', 'DT', 'NN', 'IN', 'PRP$'), Frequency: 41
Structure: (',', 'IN', 'DT', 'JJ', 'NN'), Frequency: 40
Structure: ('DT', 'JJ', 'NN', 'IN', 'PRP$'), Frequency: 40
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 40
Structure: ('NN', ',', 'IN', 'DT', 'NN'), Frequency: 37
Structure: ('NN', 

Subset 19:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 119
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 99
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 82
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 71
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 61
Structure: ('IN', 'DT', 'NN', 'IN', 'PRP$'), Frequency: 60
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 54
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 54
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 53
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 53
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 53
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 48
Structure: ('DT', 'JJ', 'NN', 'IN', 'PRP$'), Frequency: 44
Structure: (',', 'IN', 'DT', 'JJ', 'NN'), Frequency: 44
Structure: (',', 'IN', 'DT', 'NN', 'IN'), Frequency: 43
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 42
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 40
Structure: ('NN', '

Subset 21:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 99
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 87
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 63
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 59
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 59
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 51
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 51
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 50
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 46
Structure: (',', 'IN', 'DT', 'NN', 'IN'), Frequency: 44
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 41
Structure: ('IN', 'DT', 'NN', 'IN', 'PRP$'), Frequency: 40
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 40
Structure: ('DT', 'NN', 'IN', 'NNP', 'NNP'), Frequency: 36
Structure: ('IN', 'DT', 'NN', ',', 'IN'), Frequency: 32
Structure: ('IN', 'NNP', 'NNP', 'NNP', 'VBD'), Frequency: 32
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 32
Structure: ('PRP

Subset 23:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 183
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 135
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 99
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 91
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 82
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 82
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 78
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 72
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 69
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 66
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 60
Structure: ('IN', 'DT', 'NN', 'IN', 'PRP$'), Frequency: 60
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 49
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 41
Structure: ('PRP', 'VBD', 'IN', 'DT', 'NN'), Frequency: 39
Structure: ('DT', 'JJ', 'NN', 'IN', 'PRP$'), Frequency: 39
Structure: ('PRP', 'VBD', 'DT', 'JJ', 'NN'), Frequency: 37
Structure: (

Subset 25:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 149
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 106
Structure: (',', 'IN', 'DT', 'NN', ','), Frequency: 103
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 92
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 90
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 68
Structure: ('NN', ',', 'IN', 'DT', 'NN'), Frequency: 65
Structure: (',', 'IN', 'DT', 'JJ', 'NN'), Frequency: 65
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 65
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 63
Structure: ('IN', 'DT', 'NN', 'IN', 'PRP$'), Frequency: 62
Structure: ('IN', 'DT', 'NN', ',', 'IN'), Frequency: 62
Structure: (',', 'IN', 'DT', 'NN', 'IN'), Frequency: 61
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 61
Structure: ('PRP', 'VBD', 'IN', 'DT', 'NN'), Frequency: 58
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 58
Structure: ('DT', 'JJ', 'NN', 'IN', 'PRP$'), Frequency: 46
Structure: ('NN'

In [23]:
# Fanu vs Gothic
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1097

# Find sentence structures frequency using 4-grams in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    structure_frequency_5grams_subset = count_sentence_structures_5grams_subset(subset_text)
    
    # Sort the subset by frequency in descending order
    sorted_subset = sorted(structure_frequency_5grams_subset.items(), key=lambda x: x[1], reverse=True)
    
    print(f"Subset {i + 1}:")
    for structure, frequency in sorted_subset[:100]:  # Print only the first 5 structures
        print(f"Structure: {structure}, Frequency: {frequency}")
    print()  # Print an empty line to separate subsets



Subset 1:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 86
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 57
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 52
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 46
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 40
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 37
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 35
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 31
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 30
Structure: ('NN', '.', 'NN', 'PRP', 'VBD'), Frequency: 30
Structure: ('TO', 'VB', 'DT', 'NN', 'IN'), Frequency: 29
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 28
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 27
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 25
Structure: ('IN', 'PRP$', 'NN', ',', 'CC'), Frequency: 25
Structure: ('PRP', 'VBD', 'DT', 'NN', 'IN'), Frequency: 24
Structure: ('DT', 'JJ', 'NN', ',', 'CC'), Frequency: 24
Structure: ('.', 'NN

Subset 3:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 52
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 39
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 38
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 30
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 30
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 30
Structure: ('IN', 'PRP$', 'NN', ',', 'CC'), Frequency: 30
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 29
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 27
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 27
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 26
Structure: ('JJ', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 26
Structure: ('PRP$', 'NN', 'IN', 'DT', 'NN'), Frequency: 25
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 24
Structure: ('VBD', 'DT', 'JJ', 'NN', 'IN'), Frequency: 23
Structure: ('NN', 'IN', 'PRP$', 'NN', ','), Frequency: 22
Structure: ('DT', 'JJ', 'NN', 'IN', 'PRP$'), Frequency: 22
Structure: ('

Subset 5:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 36
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 34
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 30
Structure: ('PRP', 'VBP', 'JJ', 'NN', 'NN'), Frequency: 26
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 26
Structure: ('NN', ',', 'CC', 'PRP', 'VBD'), Frequency: 25
Structure: ('NN', '.', 'NN', 'PRP', 'VBD'), Frequency: 25
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 22
Structure: ('RB', ',', 'IN', 'PRP', 'VBD'), Frequency: 20
Structure: ('.', 'NN', 'PRP', 'VBD', '.'), Frequency: 20
Structure: ('IN', 'PRP$', 'NN', ',', 'CC'), Frequency: 19
Structure: ('NN', 'IN', 'PRP$', 'NN', '.'), Frequency: 19
Structure: ('NN', ':', 'CC', 'PRP', 'VBD'), Frequency: 19
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 18
Structure: (',', 'NNP', 'PRP', 'VBD', ','), Frequency: 18
Structure: ('IN', 'DT', 'NN', 'IN', 'PRP$'), Frequency: 18
Structure: ('NN', 'IN', 'PRP$', 'NN', ','), Frequency: 18
Structure

Subset 7:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 77
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 57
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 55
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 52
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 46
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 46
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 45
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 44
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 43
Structure: ('IN', 'DT', 'NN', 'IN', 'NN'), Frequency: 39
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 37
Structure: (',', 'IN', 'DT', 'JJ', 'NN'), Frequency: 31
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 29
Structure: ('DT', 'NN', 'IN', 'NN', ','), Frequency: 26
Structure: ('PRP', 'VBD', 'IN', 'DT', 'NN'), Frequency: 25
Structure: ('VBD', 'DT', 'NN', 'IN', 'DT'), Frequency: 23
Structure: ('JJ', 'NN', ',', 'PRP', 'VBD'), Frequency: 23
Structure: ('IN', '

Subset 9:
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 58
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 55
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 45
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 35
Structure: ('IN', 'DT', 'NN', 'IN', 'NN'), Frequency: 35
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 35
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 35
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 32
Structure: ('NN', 'IN', 'PRP$', 'NN', ','), Frequency: 32
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 31
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 31
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 29
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 27
Structure: ('DT', 'NN', 'IN', 'NN', 'IN'), Frequency: 24
Structure: ('IN', 'DT', 'JJ', 'NN', '.'), Frequency: 24
Structure: ('DT', 'JJ', 'NN', 'IN', 'NN'), Frequency: 22
Structure: ('DT', 'JJ', 'NN', ',', 'CC'), Frequency: 22
Structure: ('DT', 'NN'

Subset 11:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 55
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 41
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 38
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 37
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 36
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 35
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 31
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 29
Structure: ('NN', 'IN', 'PRP$', 'NN', ','), Frequency: 28
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 26
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 25
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 24
Structure: ('IN', 'DT', 'JJ', 'NN', '.'), Frequency: 23
Structure: ('NN', ',', 'IN', 'DT', 'NN'), Frequency: 22
Structure: ('PRP', 'VBD', 'DT', 'JJ', 'NN'), Frequency: 22
Structure: ('DT', 'JJ', 'NN', 'IN', 'NN'), Frequency: 21
Structure: ('IN', 'DT', 'NN', 'IN', 'NN'), Frequency: 21
Structure: ('DT', 'N

Subset 13:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 47
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 45
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 37
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 35
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 35
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 34
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 32
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 29
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 28
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 26
Structure: ('DT', 'NN', ',', 'DT', 'NN'), Frequency: 24
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 24
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 23
Structure: ('NN', ',', 'DT', 'JJ', 'NN'), Frequency: 20
Structure: ('DT', 'JJ', 'NN', ',', 'DT'), Frequency: 19
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 19
Structure: ('JJ', 'NN', ',', 'PRP', 'VBD'), Frequency: 18
Structure: ('DT', 'JJ', 

Subset 15:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 123
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 92
Structure: (',', "''", 'VBD', 'NNP', ','), Frequency: 76
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 74
Structure: ('NN', ',', "''", 'VBD', 'NNP'), Frequency: 58
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 55
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 54
Structure: (',', "''", 'VBD', 'NNP', '.'), Frequency: 54
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 53
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 43
Structure: ("''", 'VBD', 'NNP', ',', '``'), Frequency: 42
Structure: ('IN', 'DT', 'NN', '.', "''"), Frequency: 41
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 41
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 40
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 37
Structure: ('DT', 'JJ', 'NN', 'IN', 'NN'), Frequency: 35
Structure: ('RB', ',', "''", 'VBD', 'NNP'), Frequency: 34
Structure: ('DT', 'N

Subset 17:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 107
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 87
Structure: (',', "''", 'VBD', 'NNP', ','), Frequency: 63
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 60
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 58
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 57
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 53
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 52
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 51
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 51
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 51
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 48
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 43
Structure: ('IN', 'DT', 'NN', 'IN', 'NN'), Frequency: 40
Structure: ('IN', 'DT', 'JJ', 'NN', '.'), Frequency: 39
Structure: ('PRP', 'MD', 'VB', 'DT', 'NN'), Frequency: 39
Structure: ("''", 'VBD', 'NNP', ',', '``'), Frequency: 38
Structure: (',', "'

Subset 19:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 127
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 85
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 74
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 69
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 68
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 59
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 59
Structure: ('IN', 'DT', 'NN', '.', "''"), Frequency: 55
Structure: (',', "''", 'VBD', 'NNP', 'NNP'), Frequency: 50
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 49
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 48
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 46
Structure: (',', "''", 'VBD', 'DT', 'JJ'), Frequency: 44
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 43
Structure: ('DT', 'JJ', 'NN', 'IN', 'NN'), Frequency: 42
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 41
Structure: ('IN', 'DT', 'NN', 'IN', 'NN'), Frequency: 40
Structure: (',', "''",

Subset 21:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 143
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 97
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 92
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 70
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 66
Structure: (',', "''", 'VBD', 'DT', 'NN'), Frequency: 57
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 56
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 56
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 53
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 51
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 50
Structure: ('IN', 'DT', 'NN', ',', 'IN'), Frequency: 48
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 48
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 43
Structure: ('IN', 'DT', 'NN', '.', "''"), Frequency: 43
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 43
Structure: (',', 'IN', 'DT', 'JJ', 'NN'), Frequency: 43
Structure: (',', 'IN', '

Subset 23:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 143
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 114
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 83
Structure: ('NNP', 'NNP', 'NNP', 'NNP', 'NNP'), Frequency: 75
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 69
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 68
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 65
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 64
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 63
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 63
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 62
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 54
Structure: ('IN', 'DT', 'NN', ',', 'IN'), Frequency: 51
Structure: ('DT', 'NN', 'IN', 'DT', 'JJ'), Frequency: 46
Structure: (',', 'IN', 'DT', 'NN', 'IN'), Frequency: 44
Structure: ('NN', ',', 'IN', 'DT', 'NN'), Frequency: 43
Structure: ('NN', 'IN', 'PRP$', 'NN', ','), Frequency: 43
Structure: ('NN'

Subset 25:
Structure: ('DT', 'NN', 'IN', 'DT', 'NN'), Frequency: 72
Structure: ('DT', 'JJ', 'NN', 'IN', 'DT'), Frequency: 51
Structure: ('IN', 'DT', 'JJ', 'NN', 'IN'), Frequency: 50
Structure: ('DT', 'NN', 'IN', 'PRP$', 'NN'), Frequency: 50
Structure: ('JJ', 'NN', 'IN', 'DT', 'NN'), Frequency: 49
Structure: ('IN', 'DT', 'NN', 'IN', 'DT'), Frequency: 39
Structure: ('NN', 'IN', 'DT', 'NN', ','), Frequency: 37
Structure: ('IN', 'DT', 'JJ', 'NN', ','), Frequency: 35
Structure: ('IN', 'DT', 'NN', 'IN', 'PRP$'), Frequency: 33
Structure: ('NN', 'IN', 'DT', 'NN', 'IN'), Frequency: 33
Structure: ('NN', 'IN', 'DT', 'JJ', 'NN'), Frequency: 32
Structure: ('IN', 'DT', 'NN', ',', 'CC'), Frequency: 30
Structure: ('NN', 'IN', 'DT', 'NN', '.'), Frequency: 27
Structure: (',', 'IN', 'DT', 'JJ', 'NN'), Frequency: 26
Structure: ('DT', 'NN', 'IN', 'NNP', 'NNP'), Frequency: 24
Structure: ('PRP', 'VBD', 'DT', 'NN', 'IN'), Frequency: 23
Structure: ('NN', 'IN', 'NNP', 'NNP', 'NNP'), Frequency: 23
Structure: ('N

# Hapax legomenon & it's POS

In [69]:
# Stemming function
def stem_text(text):
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    stemmed_text = ' '.join([stemmer.stem(token) for token in tokens])
    return stemmed_text

# Allan Poe

In [23]:
from collections import Counter
from nltk.tokenize import word_tokenize

import pandas as pd

stemmed_text_Poe = stem_text(preprocessed_text_Poe)

# Tokenize the corpus into words
words_Poe = word_tokenize(stemmed_text_Poe)

# Count the frequency of each word
word_frequency_Poe = Counter(words_Poe)

# Identify Hapax legomena (words that occur only once)
hapax_legomena_Poe = [word for word, freq in word_frequency_Poe.items() if freq == 1]

# Initialize a dictionary to store the frequency of each general POS tag among Hapax legomena
pos_tag_frequency_Poe = {}

# Process each Hapax legomenon
for word in hapax_legomena_Poe:
    # Find the index of the first occurrence of the Hapax legomenon
    index_Poe = words_Poe.index(word)
    # Determine the POS tagging of the Hapax legomenon at its specific occurrence
    pos_tag_Poe = nltk.pos_tag([word])[0][1]
    # Generalize the POS tag to its main category
    general_pos_tag_Poe = pos_tag_Poe[:2]  # Take only the first two characters
    # Increment the frequency count for this general POS tag
    pos_tag_frequency_Poe[general_pos_tag_Poe] = pos_tag_frequency_Poe.get(general_pos_tag_Poe, 0) + 1

# Create a DataFrame from the pos_tag_frequency dictionary
Poe_Hapax = pd.DataFrame(list(pos_tag_frequency_Poe.items()), columns=['POS Tag', 'Frequency'])

# Calculate the total number of Hapax legomena
total_hapax_legomena_Poe = len(hapax_legomena_Poe)

# Calculate the relative frequency for each general POS tag among Hapax legomena
Poe_Hapax['Relative Frequency'] = Poe_Hapax['Frequency'] / total_hapax_legomena_Poe

In [24]:
Poe_Hapax.sort_values(by='Relative Frequency', ascending=False)

Unnamed: 0,POS Tag,Frequency,Relative Frequency
0,NN,3110,0.941568
1,JJ,116,0.03512
3,CD,50,0.015138
2,VB,15,0.004541
5,RB,10,0.003028
4,PR,1,0.000303
6,MD,1,0.000303


In [70]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
import re
import pandas as pd
from collections import Counter

# Define stemming function
def stem_text(text):
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    stemmed_text = ' '.join([stemmer.stem(token) for token in tokens])
    return stemmed_text

# Function to split text into sentences
def split_into_sentences(text):
    # Use regular expression to split text into sentences
    # Assumes sentences end with '.', '!', or '?' followed by a space or end of line
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return sentences

# Function to calculate hapax legomena and POS tag frequencies for a subset
def hapax_legomena_and_pos_tags(subset_text):
    # Stem the subset text
    stemmed_subset = stem_text(subset_text)
    
    # Tokenize the subset into words
    words = word_tokenize(stemmed_subset)
    
    # Count the frequency of each word
    word_frequency = Counter(words)
    
    # Identify hapax legomena (words that occur only once)
    hapax_legomena = [word for word, freq in word_frequency.items() if freq == 1]
    
    # Initialize a dictionary to store the frequency of each general POS tag among hapax legomena
    pos_tag_frequency = {}
    
    # Process each hapax legomenon
    for word in hapax_legomena:
        # Determine the POS tagging of the hapax legomenon
        pos_tag = nltk.pos_tag([word])[0][1]
        
        # Generalize the POS tag to its main category
        general_pos_tag = pos_tag[:2]  # Take only the first two characters
        
        # Increment the frequency count for this general POS tag
        pos_tag_frequency[general_pos_tag] = pos_tag_frequency.get(general_pos_tag, 0) + 1
    
    return pos_tag_frequency

# Read the Poe corpus text from file
file_path_Poe = 'Allan Poe - Full Corpus.txt'
with open(file_path_Poe, 'r') as file:
    text_Poe = file.read()

# Split the text into sentences
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5
print(subset_size)

# Find hapax legomena and POS tag frequencies in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    pos_tag_frequency_subset = hapax_legomena_and_pos_tags(subset_text)
    print(f"Hapax Legomena and POS Tag Frequencies in Subset {i + 1}:")
    print(pos_tag_frequency_subset)


1568
Hapax Legomena and POS Tag Frequencies in Subset 1:
{'NN': 1772, 'JJ': 57, 'VB': 19, 'RB': 13, 'IN': 7, 'CD': 27, 'DT': 1, ':': 1, 'CC': 1}
Hapax Legomena and POS Tag Frequencies in Subset 2:
{'NN': 2394, 'IN': 11, 'JJ': 89, 'VB': 15, 'RB': 18, 'CD': 2, 'PR': 1, 'WD': 1, 'DT': 1, 'CC': 1}
Hapax Legomena and POS Tag Frequencies in Subset 3:
{'NN': 2256, 'JJ': 84, 'RB': 20, 'CD': 11, 'VB': 16, 'IN': 2, 'CC': 1, 'MD': 1}
Hapax Legomena and POS Tag Frequencies in Subset 4:
{'NN': 1476, 'JJ': 64, 'VB': 8, 'RB': 9, 'CD': 53, "''": 1, 'PO': 1, '.': 1, 'IN': 1, 'WD': 1}
Hapax Legomena and POS Tag Frequencies in Subset 5:
{'NN': 2533, 'JJ': 119, 'VB': 15, 'CD': 7, 'RB': 16, 'CC': 1, 'IN': 5, 'PR': 1, 'WD': 1}


# Sheridan Le Fanu

In [20]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
from collections import Counter
from nltk.tokenize import word_tokenize
import pandas as pd

stemmed_text_Fanu = stem_text(preprocessed_text_Fanu)

# Tokenize the corpus into words
words_Fanu = word_tokenize(stemmed_text_Fanu)

# Count the frequency of each word
word_frequency_Fanu = Counter(words_Fanu)

# Identify Hapax legomena (words that occur only once)
hapax_legomena_Fanu = [word for word, freq in word_frequency_Fanu.items() if freq == 1]

# Initialize a dictionary to store the frequency of each general POS tag among Hapax legomena
pos_tag_frequency_Fanu = {}

# Process each Hapax legomenon
for word in hapax_legomena_Fanu:
    # Find the index of the first occurrence of the Hapax legomenon
    index_Fanu = words_Fanu.index(word)
    # Determine the POS tagging of the Hapax legomenon at its specific occurrence
    pos_tag_Fanu = nltk.pos_tag([word])[0][1]
    # Generalize the POS tag to its main category
    general_pos_tag_Fanu = pos_tag_Fanu[:2]  # Take only the first two characters
    # Increment the frequency count for this general POS tag
    pos_tag_frequency_Fanu[general_pos_tag_Fanu] = pos_tag_frequency_Fanu.get(general_pos_tag_Fanu, 0) + 1

# Create a DataFrame from the pos_tag_frequency dictionary
Fanu_Hapax = pd.DataFrame(list(pos_tag_frequency_Fanu.items()), columns=['POS Tag', 'Frequency'])

# Calculate the total number of Hapax legomena
total_hapax_legomena_Fanu = len(hapax_legomena_Fanu)

# Calculate the relative frequency for each general POS tag among Hapax legomena
Fanu_Hapax['Relative Frequency'] = Fanu_Hapax['Frequency'] / total_hapax_legomena_Fanu

In [21]:
Fanu_Hapax.sort_values(by='Relative Frequency', ascending=False)

Unnamed: 0,POS Tag,Frequency,Relative Frequency
0,NN,2843,0.960473
1,JJ,83,0.028041
3,CD,17,0.005743
2,VB,10,0.003378
4,RB,4,0.001351
5,MD,1,0.000338
6,SY,1,0.000338
7,IN,1,0.000338


In [115]:
# Split the text into sentences
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5
print(subset_size)

# Find hapax legomena and POS tag frequencies in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    pos_tag_frequency_subset = hapax_legomena_and_pos_tags(subset_text)
    print(f"Hapax Legomena and POS Tag Frequencies in Subset {i + 1}:")
    print(pos_tag_frequency_subset)

1097
Hapax Legomena and POS Tag Frequencies in Subset 1:
{'NN': 1851, 'JJ': 77, 'CD': 5, 'VB': 13, 'RB': 5, 'IN': 1, 'MD': 1}
Hapax Legomena and POS Tag Frequencies in Subset 2:
{'NN': 1576, 'JJ': 70, 'RB': 9, 'VB': 17, 'DT': 1, '(': 1, ')': 1, 'IN': 4, 'CD': 3}
Hapax Legomena and POS Tag Frequencies in Subset 3:
{'NN': 1578, 'JJ': 60, 'RB': 5, 'VB': 8, 'DT': 1, 'IN': 1, 'CD': 5}
Hapax Legomena and POS Tag Frequencies in Subset 4:
{'NN': 1688, 'JJ': 74, 'VB': 15, 'IN': 7, 'RB': 2, 'SY': 1, 'CD': 4, 'CC': 1, "''": 1}
Hapax Legomena and POS Tag Frequencies in Subset 5:
{'NN': 1697, 'JJ': 76, 'VB': 18, 'TO': 1, 'RB': 5, 'MD': 3, 'CD': 4, 'IN': 3}


# Gothic corpus

In [25]:
from nltk.stem import WordNetLemmatizer, PorterStemmer
from collections import Counter
from nltk.tokenize import word_tokenize
import pandas as pd

stemmed_text_Gothic = stem_text(preprocessed_text_Gothic)

# Tokenize the corpus into words
words_Gothic = word_tokenize(stemmed_text_Gothic)

# Count the frequency of each word
word_frequency_Gothic = Counter(words_Gothic)

# Identify Hapax legomena (words that occur only once)
hapax_legomena_Gothic = [word for word, freq in word_frequency_Gothic.items() if freq == 1]

# Initialize a dictionary to store the frequency of each general POS tag among Hapax legomena
pos_tag_frequency_Gothic = {}

# Process each Hapax legomenon
for word in hapax_legomena_Gothic:
    # Find the index of the first occurrence of the Hapax legomenon
    index_Gothic = words_Gothic.index(word)
    # Determine the POS tagging of the Hapax legomenon at its specific occurrence
    pos_tag_Gothic = nltk.pos_tag([word])[0][1]
    # Generalize the POS tag to its main category
    general_pos_tag_Gothic = pos_tag_Gothic[:2]  # Take only the first two characters
    # Increment the frequency count for this general POS tag
    pos_tag_frequency_Gothic[general_pos_tag_Gothic] = pos_tag_frequency_Gothic.get(general_pos_tag_Gothic, 0) + 1

# Create a DataFrame from the pos_tag_frequency dictionary
Gothic_Hapax = pd.DataFrame(list(pos_tag_frequency_Gothic.items()), columns=['POS Tag', 'Frequency'])

# Calculate the total number of Hapax legomena
total_hapax_legomena_Gothic = len(hapax_legomena_Gothic)

# Calculate the relative frequency for each general POS tag among Hapax legomena
Gothic_Hapax['Relative Frequency'] = Gothic_Hapax['Frequency'] / total_hapax_legomena_Gothic

In [26]:
Gothic_Hapax.sort_values(by='Relative Frequency', ascending=False)

Unnamed: 0,POS Tag,Frequency,Relative Frequency
1,NN,8342,0.95819
3,JJ,193,0.022169
0,CD,115,0.013209
4,VB,29,0.003331
2,RB,25,0.002872
5,IN,2,0.00023


In [71]:
#Poe vs Gothic
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1568

# Find hapax legomena and POS tag frequencies in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    pos_tag_frequency_subset = hapax_legomena_and_pos_tags(subset_text)
    print(f"Hapax Legomena and POS Tag Frequencies in Subset {i + 1}:")
    print(pos_tag_frequency_subset)

Hapax Legomena and POS Tag Frequencies in Subset 1:
{'CD': 4, 'NN': 1992, 'VB': 25, 'JJ': 74, 'RB': 12, 'IN': 5, 'PR': 2}
Hapax Legomena and POS Tag Frequencies in Subset 2:
{'NN': 1764, 'RB': 11, 'JJ': 69, 'VB': 16, 'IN': 5, 'CD': 3, 'CC': 1}
Hapax Legomena and POS Tag Frequencies in Subset 3:
{'NN': 1579, 'JJ': 53, 'RB': 15, 'VB': 25, 'IN': 3, 'SY': 1, 'DT': 1, 'CC': 1, 'CD': 1}
Hapax Legomena and POS Tag Frequencies in Subset 4:
{'NN': 1890, 'VB': 34, 'JJ': 78, 'RB': 8, 'CD': 1, 'WD': 1, 'IN': 4, 'DT': 1}
Hapax Legomena and POS Tag Frequencies in Subset 5:
{'NN': 2686, 'RB': 17, 'VB': 22, 'JJ': 119, 'DT': 1, 'IN': 5, 'CD': 2, 'CC': 1}
Hapax Legomena and POS Tag Frequencies in Subset 6:
{'NN': 2513, 'RB': 21, 'JJ': 106, 'VB': 26, 'CD': 2, 'IN': 2, 'CC': 1, 'DT': 1}
Hapax Legomena and POS Tag Frequencies in Subset 7:
{'NN': 2561, 'JJ': 105, 'VB': 25, 'RB': 14, 'IN': 7, 'DT': 3, 'CD': 1}
Hapax Legomena and POS Tag Frequencies in Subset 8:
{'NN': 2836, 'JJ': 109, 'IN': 7, 'VB': 24, 'RB'

In [72]:
#Fanu vs Gothic
# Split the text into sentences
sentences = split_into_sentences(text_Gothic)
subset_size = 1097

# Find hapax legomena and POS tag frequencies in each subset
for i in range(25):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    pos_tag_frequency_subset = hapax_legomena_and_pos_tags(subset_text)
    print(f"Hapax Legomena and POS Tag Frequencies in Subset {i + 1}:")
    print(pos_tag_frequency_subset)

Hapax Legomena and POS Tag Frequencies in Subset 1:
{'CD': 4, 'NN': 1846, 'VB': 22, 'JJ': 76, 'RB': 10, 'IN': 4, 'PR': 1, 'MD': 1, 'CC': 1}
Hapax Legomena and POS Tag Frequencies in Subset 2:
{'NN': 1425, 'IN': 2, 'JJ': 52, 'RB': 8, 'VB': 12, 'CD': 1, 'PR': 1, 'CC': 1}
Hapax Legomena and POS Tag Frequencies in Subset 3:
{'NN': 1618, 'VB': 18, 'JJ': 68, 'RB': 12, 'CD': 1, 'IN': 4, 'CC': 1}
Hapax Legomena and POS Tag Frequencies in Subset 4:
{'NN': 1377, 'RB': 14, 'JJ': 44, 'VB': 23, 'IN': 4, 'WP': 1, 'SY': 1, 'CD': 1}
Hapax Legomena and POS Tag Frequencies in Subset 5:
{'NN': 1390, 'JJ': 51, 'RB': 7, 'VB': 25, 'WP': 1, 'CC': 1, 'CD': 2, 'IN': 4, 'WD': 1, 'PR': 1}
Hapax Legomena and POS Tag Frequencies in Subset 6:
{'NN': 1805, 'JJ': 73, 'RB': 10, 'VB': 18, 'IN': 4, 'DT': 2, 'CD': 1}
Hapax Legomena and POS Tag Frequencies in Subset 7:
{'NN': 2309, 'JJ': 106, 'VB': 18, 'IN': 5, 'RB': 11, 'CD': 2, 'CC': 1}
Hapax Legomena and POS Tag Frequencies in Subset 8:
{'JJ': 84, 'NN': 2063, 'CD': 6, 

# N-gram

In [132]:
from nltk.tokenize import word_tokenize
from nltk import ngrams
from collections import Counter
import re

def find_ngrams(text, n):
    # Tokenize the text into words
    words = word_tokenize(text)
    # Remove punctuation
    words = [word for word in words if word.isalnum()]
    # Generate n-grams
    n_grams = ngrams(words, n)
    # Count the frequency of each n-gram
    n_gram_freq = Counter(n_grams)
    # Return the most common n-grams
    return n_gram_freq.most_common(25)


# Allan Poe

In [133]:
most_common_ngrams_Poe = find_ngrams(preprocessed_text_Poe, 3)
print("Most frequent {}-grams:".format(3), most_common_ngrams_Poe)


Most frequent 3-grams: [(('one', 'of', 'the'), 109), (('I', 'could', 'not'), 81), (('of', 'the', 'most'), 58), (('as', 'well', 'as'), 52), (('portion', 'of', 'the'), 44), (('would', 'have', 'been'), 44), (('that', 'of', 'the'), 43), (('might', 'have', 'been'), 42), (('which', 'I', 'had'), 36), (('I', 'had', 'been'), 35), (('part', 'of', 'the'), 34), (('that', 'it', 'was'), 34), (('in', 'regard', 'to'), 33), (('that', 'he', 'had'), 32), (('of', 'the', 'water'), 32), (('to', 'the', 'southward'), 32), (('on', 'account', 'of'), 31), (('by', 'means', 'of'), 31), (('to', 'and', 'fro'), 31), (('appeared', 'to', 'be'), 30), (('that', 'he', 'was'), 30), (('seemed', 'to', 'be'), 30), (('so', 'as', 'to'), 29), (('of', 'which', 'I'), 29), (('and', 'in', 'the'), 29)]


In [8]:
from nltk.tokenize import word_tokenize
from nltk import ngrams
from collections import Counter
import re
import pandas as pd

def find_ngrams(text, n):
    # Tokenize the text into words
    words = word_tokenize(text)
    # Remove punctuation
    words = [word for word in words if word.isalnum()]
    # Generate n-grams
    n_grams = ngrams(words, n)
    # Count the frequency of each n-gram
    n_gram_freq = Counter(n_grams)
    # Calculate the total number of n-grams
    total_ngrams = sum(n_gram_freq.values())
    # Calculate the relative frequency of each n-gram
    relative_frequency = {gram: freq / total_ngrams for gram, freq in n_gram_freq.items()}
    # Return the most common n-grams with their relative frequencies
    return [(gram, freq, relative_frequency[gram]) for gram, freq in n_gram_freq.most_common(25)]

most_common_ngrams_Poe = find_ngrams(preprocessed_text_Poe, 3)
print("Most frequent {}-grams:".format(3))
for gram, freq, relative_freq in most_common_ngrams_Poe:
    print(f"{gram}: Frequency = {freq}, Relative Frequency = {relative_freq:.6f}")

most_common_3grams_Poe_df = pd.DataFrame(most_common_ngrams_Poe, columns=['N-gram', 'Frequency', 'Relative Frequency'])


Most frequent 3-grams:
('one', 'of', 'the'): Frequency = 109, Relative Frequency = 0.000547
('I', 'could', 'not'): Frequency = 81, Relative Frequency = 0.000407
('of', 'the', 'most'): Frequency = 58, Relative Frequency = 0.000291
('as', 'well', 'as'): Frequency = 52, Relative Frequency = 0.000261
('portion', 'of', 'the'): Frequency = 44, Relative Frequency = 0.000221
('would', 'have', 'been'): Frequency = 44, Relative Frequency = 0.000221
('that', 'of', 'the'): Frequency = 43, Relative Frequency = 0.000216
('might', 'have', 'been'): Frequency = 42, Relative Frequency = 0.000211
('which', 'I', 'had'): Frequency = 36, Relative Frequency = 0.000181
('I', 'had', 'been'): Frequency = 35, Relative Frequency = 0.000176
('part', 'of', 'the'): Frequency = 34, Relative Frequency = 0.000171
('that', 'it', 'was'): Frequency = 34, Relative Frequency = 0.000171
('in', 'regard', 'to'): Frequency = 33, Relative Frequency = 0.000166
('that', 'he', 'had'): Frequency = 32, Relative Frequency = 0.000161
(

In [9]:
most_common_3grams_Poe_df.to_csv('most_common_3grams.csv', index=False)


In [10]:
most_common_ngrams_Poe = find_ngrams(preprocessed_text_Poe, 4)
print("Most frequent {}-grams:".format(4))
for gram, freq, relative_freq in most_common_ngrams_Poe:
    print(f"{gram}: Frequency = {freq}, Relative Frequency = {relative_freq:.6f}")
    
most_common_4grams_Poe_df = pd.DataFrame(most_common_ngrams_Poe, columns=['N-gram', 'Frequency', 'Relative Frequency'])


Most frequent 4-grams:
('on', 'the', 'part', 'of'): Frequency = 24, Relative Frequency = 0.000120
('for', 'the', 'purpose', 'of'): Frequency = 20, Relative Frequency = 0.000100
('that', 'is', 'to', 'say'): Frequency = 18, Relative Frequency = 0.000090
('on', 'account', 'of', 'the'): Frequency = 17, Relative Frequency = 0.000085
('ugh', 'ugh', 'ugh', 'ugh'): Frequency = 17, Relative Frequency = 0.000085
('I', 'could', 'not', 'help'): Frequency = 16, Relative Frequency = 0.000080
('in', 'regard', 'to', 'the'): Frequency = 16, Relative Frequency = 0.000080
('the', 'Barrière', 'du', 'Roule'): Frequency = 16, Relative Frequency = 0.000080
('by', 'means', 'of', 'a'): Frequency = 15, Relative Frequency = 0.000075
('the', 'vicinity', 'of', 'the'): Frequency = 15, Relative Frequency = 0.000075
('with', 'the', 'exception', 'of'): Frequency = 15, Relative Frequency = 0.000075
('the', 'head', 'of', 'the'): Frequency = 15, Relative Frequency = 0.000075
('the', 'middle', 'of', 'the'): Frequency = 13

In [12]:
most_common_4grams_Poe_df

Unnamed: 0,N-gram,Frequency,Relative Frequency
0,"(on, the, part, of)",24,0.00012
1,"(for, the, purpose, of)",20,0.0001
2,"(that, is, to, say)",18,9e-05
3,"(on, account, of, the)",17,8.5e-05
4,"(ugh, ugh, ugh, ugh)",17,8.5e-05
5,"(I, could, not, help)",16,8e-05
6,"(in, regard, to, the)",16,8e-05
7,"(the, Barrière, du, Roule)",16,8e-05
8,"(by, means, of, a)",15,7.5e-05
9,"(the, vicinity, of, the)",15,7.5e-05


In [11]:
most_common_4grams_Poe_df.to_csv('most_common_4grams.csv', index=False)


In [13]:
most_common_ngrams_Poe = find_ngrams(preprocessed_text_Poe, 5)
print("Most frequent {}-grams:".format(5))
for gram, freq, relative_freq in most_common_ngrams_Poe:
    print(f"{gram}: Frequency = {freq}, Relative Frequency = {relative_freq:.6f}")
    
most_common_5grams_Poe_df = pd.DataFrame(most_common_ngrams_Poe, columns=['N-gram', 'Frequency', 'Relative Frequency'])


Most frequent 5-grams:
('ugh', 'ugh', 'ugh', 'ugh', 'ugh'): Frequency = 15, Relative Frequency = 0.000075
('in', 'the', 'direction', 'of', 'the'): Frequency = 10, Relative Frequency = 0.000050
('in', 'the', 'vicinity', 'of', 'the'): Frequency = 9, Relative Frequency = 0.000045
('on', 'the', 'part', 'of', 'the'): Frequency = 9, Relative Frequency = 0.000045
('o', 'clock', 'in', 'the', 'morning'): Frequency = 8, Relative Frequency = 0.000040
('at', 'the', 'head', 'of', 'the'): Frequency = 7, Relative Frequency = 0.000035
('with', 'the', 'exception', 'of', 'the'): Frequency = 6, Relative Frequency = 0.000030
('in', 'the', 'bottom', 'of', 'the'): Frequency = 6, Relative Frequency = 0.000030
('the', 'bottom', 'of', 'the', 'boat'): Frequency = 6, Relative Frequency = 0.000030
('There', 'can', 'be', 'no', 'doubt'): Frequency = 5, Relative Frequency = 0.000025
('Madame', 'L', 'Espanaye', 'and', 'her'): Frequency = 5, Relative Frequency = 0.000025
('L', 'Espanaye', 'and', 'her', 'daughter'): Fr

In [14]:
most_common_5grams_Poe_df.to_csv('most_common_5grams.csv', index=False)


# Sheridan Le Fanu

In [134]:
most_common_ngrams_Fanu = find_ngrams(preprocessed_text_Fanu, 3)
print("Most frequent {}-grams:".format(3), most_common_ngrams_Fanu)


Most frequent 3-grams: [(('I', 'don', 't'), 57), (('a', 'sort', 'of'), 39), (('that', 'he', 'was'), 38), (('said', 'Sir', 'Bale'), 37), (('out', 'of', 'the'), 36), (('I', 'could', 'not'), 35), (('side', 'of', 'the'), 33), (('there', 'was', 'a'), 32), (('I', 'can', 't'), 32), (('as', 'well', 'as'), 31), (('in', 'the', 'same'), 30), (('It', 'was', 'a'), 29), (('of', 'the', 'lake'), 29), (('Sir', 'Bale', 'Mardykes'), 29), (('which', 'he', 'had'), 28), (('and', 'with', 'a'), 28), (('of', 'the', 'house'), 28), (('of', 'the', 'old'), 28), (('two', 'or', 'three'), 27), (('one', 'of', 'the'), 27), (('for', 'a', 'moment'), 26), (('he', 'could', 'not'), 25), (('he', 'did', 'not'), 25), (('don', 't', 'know'), 25), (('Sir', 'Bale', 's'), 25)]


In [18]:
most_common_ngrams_Fanu = find_ngrams(preprocessed_text_Fanu, 3)
print("Most frequent {}-grams:".format(3))
for gram, freq, relative_freq in most_common_ngrams_Fanu:
    print(f"{gram}: Frequency = {freq}, Relative Frequency = {relative_freq:.6f}")
    
most_common_3grams_Fanu_df = pd.DataFrame(most_common_ngrams_Fanu, columns=['N-gram', 'Frequency', 'Relative Frequency'])
    

Most frequent 3-grams:
('I', 'don', 't'): Frequency = 57, Relative Frequency = 0.000376
('a', 'sort', 'of'): Frequency = 39, Relative Frequency = 0.000257
('that', 'he', 'was'): Frequency = 38, Relative Frequency = 0.000250
('said', 'Sir', 'Bale'): Frequency = 37, Relative Frequency = 0.000244
('out', 'of', 'the'): Frequency = 36, Relative Frequency = 0.000237
('I', 'could', 'not'): Frequency = 35, Relative Frequency = 0.000231
('side', 'of', 'the'): Frequency = 33, Relative Frequency = 0.000218
('there', 'was', 'a'): Frequency = 32, Relative Frequency = 0.000211
('I', 'can', 't'): Frequency = 32, Relative Frequency = 0.000211
('as', 'well', 'as'): Frequency = 31, Relative Frequency = 0.000204
('in', 'the', 'same'): Frequency = 30, Relative Frequency = 0.000198
('It', 'was', 'a'): Frequency = 29, Relative Frequency = 0.000191
('of', 'the', 'lake'): Frequency = 29, Relative Frequency = 0.000191
('Sir', 'Bale', 'Mardykes'): Frequency = 29, Relative Frequency = 0.000191
('which', 'he', 'h

In [19]:
most_common_3grams_Fanu_df.to_csv('most_common_3grams_Fanu.csv', index=False)


In [20]:
most_common_ngrams_Fanu = find_ngrams(preprocessed_text_Fanu, 4)
print("Most frequent {}-grams:".format(4))
for gram, freq, relative_freq in most_common_ngrams_Fanu:
    print(f"{gram}: Frequency = {freq}, Relative Frequency = {relative_freq:.6f}")
    
most_common_4grams_Fanu_df = pd.DataFrame(most_common_ngrams_Fanu, columns=['N-gram', 'Frequency', 'Relative Frequency'])


Most frequent 4-grams:
('I', 'don', 't', 'know'): Frequency = 22, Relative Frequency = 0.000145
('the', 'side', 'of', 'the'): Frequency = 15, Relative Frequency = 0.000099
('at', 'the', 'same', 'time'): Frequency = 15, Relative Frequency = 0.000099
('the', 'foot', 'of', 'the'): Frequency = 15, Relative Frequency = 0.000099
('the', 'George', 'and', 'Dragon'): Frequency = 14, Relative Frequency = 0.000092
('as', 'I', 'have', 'said'): Frequency = 13, Relative Frequency = 0.000086
('for', 'the', 'first', 'time'): Frequency = 12, Relative Frequency = 0.000079
('at', 'the', 'other', 'side'): Frequency = 12, Relative Frequency = 0.000079
('at', 'the', 'foot', 'of'): Frequency = 11, Relative Frequency = 0.000073
('in', 'a', 'state', 'of'): Frequency = 10, Relative Frequency = 0.000066
('the', 'door', 'of', 'the'): Frequency = 9, Relative Frequency = 0.000059
('for', 'a', 'long', 'time'): Frequency = 9, Relative Frequency = 0.000059
('I', 'can', 't', 'say'): Frequency = 9, Relative Frequency = 

In [21]:
most_common_4grams_Fanu_df.to_csv('most_common_4grams_Fanu.csv', index=False)


In [22]:
most_common_ngrams_Fanu = find_ngrams(preprocessed_text_Fanu, 5)
print("Most frequent {}-grams:".format(5))
for gram, freq, relative_freq in most_common_ngrams_Fanu:
    print(f"{gram}: Frequency = {freq}, Relative Frequency = {relative_freq:.6f}")
    
most_common_5grams_Fanu_df = pd.DataFrame(most_common_ngrams_Fanu, columns=['N-gram', 'Frequency', 'Relative Frequency'])


Most frequent 5-grams:
('at', 'the', 'foot', 'of', 'the'): Frequency = 8, Relative Frequency = 0.000053
('the', 'foot', 'of', 'the', 'bed'): Frequency = 8, Relative Frequency = 0.000053
('the', 'margin', 'of', 'the', 'lake'): Frequency = 7, Relative Frequency = 0.000046
('of', 'the', 'George', 'and', 'Dragon'): Frequency = 7, Relative Frequency = 0.000046
('at', 'the', 'other', 'side', 'of'): Frequency = 7, Relative Frequency = 0.000046
('the', 'other', 'side', 'of', 'the'): Frequency = 7, Relative Frequency = 0.000046
('I', 'don', 't', 'know', 'what'): Frequency = 7, Relative Frequency = 0.000046
('and', 'at', 'the', 'same', 'time'): Frequency = 6, Relative Frequency = 0.000040
('made', 'up', 'his', 'mind', 'to'): Frequency = 6, Relative Frequency = 0.000040
('the', 'host', 'of', 'the', 'George'): Frequency = 5, Relative Frequency = 0.000033
('the', 'side', 'of', 'the', 'bed'): Frequency = 4, Relative Frequency = 0.000026
('in', 'the', 'direction', 'of', 'the'): Frequency = 4, Relativ

In [23]:
most_common_5grams_Fanu_df.to_csv('most_common_5grams_Fanu.csv', index=False)


# Gothic corpus

In [135]:
most_common_ngrams_Gothic = find_ngrams(preprocessed_text_Gothic, 3)
print("Most frequent {}-grams:".format(3), most_common_ngrams_Gothic)


Most frequent 3-grams: [(('I', 'don', 't'), 1002), (('I', 'can', 'not'), 759), (('I', 'could', 'not'), 750), (('one', 'of', 'the'), 686), (('out', 'of', 'the'), 614), (('I', 'did', 'not'), 567), (('that', 'it', 'was'), 523), (('that', 'I', 'was'), 492), (('that', 'he', 'was'), 488), (('as', 'well', 'as'), 486), (('that', 'he', 'had'), 482), (('I', 'do', 'not'), 467), (('that', 'I', 'had'), 454), (('Sir', 'Francis', 'Varney'), 451), (('for', 'a', 'moment'), 437), (('I', 'am', 'not'), 406), (('which', 'I', 'had'), 389), (('part', 'of', 'the'), 385), (('there', 'was', 'a'), 383), (('don', 't', 'know'), 382), (('I', 'had', 'been'), 367), (('would', 'have', 'been'), 363), (('I', 'should', 'have'), 361), (('which', 'he', 'had'), 328), (('It', 'was', 'a'), 326)]


In [25]:
most_common_ngrams_Gothic = find_ngrams(preprocessed_text_Gothic, 3)
print("Most frequent {}-grams:".format(3))
for gram, freq, relative_freq in most_common_ngrams_Gothic:
    print(f"{gram}: Frequency = {freq}, Relative Frequency = {relative_freq:.6f}")
    
most_common_3grams_Gothic_df = pd.DataFrame(most_common_ngrams_Gothic, columns=['N-gram', 'Frequency', 'Relative Frequency'])


Most frequent 3-grams:
('I', 'don', 't'): Frequency = 1002, Relative Frequency = 0.000334
('I', 'can', 'not'): Frequency = 759, Relative Frequency = 0.000253
('I', 'could', 'not'): Frequency = 750, Relative Frequency = 0.000250
('one', 'of', 'the'): Frequency = 686, Relative Frequency = 0.000229
('out', 'of', 'the'): Frequency = 614, Relative Frequency = 0.000205
('I', 'did', 'not'): Frequency = 567, Relative Frequency = 0.000189
('that', 'it', 'was'): Frequency = 523, Relative Frequency = 0.000174
('that', 'I', 'was'): Frequency = 492, Relative Frequency = 0.000164
('that', 'he', 'was'): Frequency = 488, Relative Frequency = 0.000163
('as', 'well', 'as'): Frequency = 486, Relative Frequency = 0.000162
('that', 'he', 'had'): Frequency = 482, Relative Frequency = 0.000161
('I', 'do', 'not'): Frequency = 467, Relative Frequency = 0.000156
('that', 'I', 'had'): Frequency = 454, Relative Frequency = 0.000151
('Sir', 'Francis', 'Varney'): Frequency = 451, Relative Frequency = 0.000150
('for

In [26]:
most_common_3grams_Gothic_df.to_csv('most_common_3grams_Gothic.csv', index=False)


In [27]:
most_common_ngrams_Gothic = find_ngrams(preprocessed_text_Gothic, 4)
print("Most frequent {}-grams:".format(4))
for gram, freq, relative_freq in most_common_ngrams_Gothic:
    print(f"{gram}: Frequency = {freq}, Relative Frequency = {relative_freq:.6f}")
    
most_common_4grams_Gothic_df = pd.DataFrame(most_common_ngrams_Gothic, columns=['N-gram', 'Frequency', 'Relative Frequency'])


Most frequent 4-grams:
('I', 'don', 't', 'know'): Frequency = 292, Relative Frequency = 0.000097
('at', 'the', 'same', 'time'): Frequency = 210, Relative Frequency = 0.000070
('for', 'the', 'first', 'time'): Frequency = 166, Relative Frequency = 0.000055
('the', 'door', 'of', 'the'): Frequency = 132, Relative Frequency = 0.000044
('at', 'the', 'end', 'of'): Frequency = 117, Relative Frequency = 0.000039
('the', 'end', 'of', 'the'): Frequency = 111, Relative Frequency = 0.000037
('in', 'the', 'midst', 'of'): Frequency = 107, Relative Frequency = 0.000036
('the', 'rest', 'of', 'the'): Frequency = 106, Relative Frequency = 0.000035
('I', 'do', 'not', 'know'): Frequency = 105, Relative Frequency = 0.000035
('in', 'a', 'state', 'of'): Frequency = 95, Relative Frequency = 0.000032
('I', 'don', 't', 'want'): Frequency = 94, Relative Frequency = 0.000031
('out', 'of', 'the', 'room'): Frequency = 92, Relative Frequency = 0.000031
('don', 't', 'know', 'what'): Frequency = 91, Relative Frequency 

In [28]:
most_common_4grams_Gothic_df.to_csv('most_common_4grams_Gothic.csv', index=False)


In [29]:
most_common_ngrams_Gothic = find_ngrams(preprocessed_text_Gothic, 5)
print("Most frequent {}-grams:".format(5))
for gram, freq, relative_freq in most_common_ngrams_Gothic:
    print(f"{gram}: Frequency = {freq}, Relative Frequency = {relative_freq:.6f}")
    
most_common_5grams_Gothic_df = pd.DataFrame(most_common_ngrams_Gothic, columns=['N-gram', 'Frequency', 'Relative Frequency'])


Most frequent 5-grams:
('I', 'don', 't', 'know', 'what'): Frequency = 64, Relative Frequency = 0.000021
('at', 'the', 'end', 'of', 'the'): Frequency = 61, Relative Frequency = 0.000020
('in', 'the', 'middle', 'of', 'the'): Frequency = 55, Relative Frequency = 0.000018
('I', 'don', 't', 'want', 'to'): Frequency = 47, Relative Frequency = 0.000016
('a', 'quarter', 'of', 'an', 'hour'): Frequency = 44, Relative Frequency = 0.000015
('and', 'at', 'the', 'same', 'time'): Frequency = 41, Relative Frequency = 0.000014
('at', 'the', 'door', 'of', 'the'): Frequency = 39, Relative Frequency = 0.000013
('the', 'other', 'side', 'of', 'the'): Frequency = 37, Relative Frequency = 0.000012
('at', 'the', 'foot', 'of', 'the'): Frequency = 36, Relative Frequency = 0.000012
('as', 'if', 'it', 'had', 'been'): Frequency = 34, Relative Frequency = 0.000011
('was', 'on', 'the', 'point', 'of'): Frequency = 34, Relative Frequency = 0.000011
('as', 'if', 'he', 'had', 'been'): Frequency = 34, Relative Frequency =

In [30]:
most_common_5grams_Gothic_df.to_csv('most_common_5grams_Gothic.csv', index=False)


 # Occurance of 1 - 10 letter words

In [2]:
def count_words_by_length(document):
    # Initialize a dictionary to store counts for each word length
    word_lengths = {i: 0 for i in range(1, 11)}

    # Split the document into words
    words = document.split()

    # Iterate over each word and update the counts
    for word in words:
        length = len(word)
        if 1 <= length <= 10:
            word_lengths[length] += 1

    return word_lengths

# Allan Poe

In [12]:
word_counts_Poe = count_words_by_length(preprocessed_text_Poe)
print("Word counts by length:")
for length, count in word_counts_Poe.items():
    print(f"Length {length}: {count}")


Word counts by length:
Length 1: 8447
Length 2: 37194
Length 3: 42986
Length 4: 33037
Length 5: 21292
Length 6: 16257
Length 7: 13748
Length 8: 10111
Length 9: 7311
Length 10: 4745


In [13]:
def relative_frequency_words_by_length(document):
    # Initialize a dictionary to store counts for each word length
    word_lengths = {i: 0 for i in range(1, 11)}

    # Split the document into words
    words = document.split()
    total_words = len(words)

    # Iterate over each word and update the counts
    for word in words:
        length = len(word)
        if 1 <= length <= 10:
            word_lengths[length] += 1

    # Calculate the relative frequency of each word length occurrence
    relative_frequencies = {length: count / total_words for length, count in word_lengths.items()}

    return relative_frequencies


In [14]:
relative_frequencies_Poe = relative_frequency_words_by_length(preprocessed_text_Poe)
print("Relative frequencies of word counts by length:")
for length, frequency in relative_frequencies_Poe.items():
    print(f"Length {length}: {frequency:.4f}")


Relative frequencies of word counts by length:
Length 1: 0.0423
Length 2: 0.1862
Length 3: 0.2152
Length 4: 0.1654
Length 5: 0.1066
Length 6: 0.0814
Length 7: 0.0688
Length 8: 0.0506
Length 9: 0.0366
Length 10: 0.0238


In [82]:
import re

# Function to split text into sentences
def split_into_sentences(text):
    # Use regular expression to split text into sentences
    # Assumes sentences end with '.', '!', or '?' followed by a space or end of line
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return sentences

# Function to count words by length in a subset of text
def count_words_by_length_subset(subset_text):
    # Initialize a dictionary to store counts for each word length
    word_lengths = {i: 0 for i in range(1, 11)}

    # Split the subset into words
    words = subset_text.split()

    # Iterate over each word and update the counts
    for word in words:
        length = len(word)
        if 1 <= length <= 10:
            word_lengths[length] += 1

    return word_lengths

# Split the text into sentences
sentences = split_into_sentences(text_Poe)
subset_size = len(sentences) // 5

# Find the number of words by length in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    word_lengths_subset = count_words_by_length_subset(subset_text)
    print(f"Word Lengths Frequency in Subset {i + 1}:")
    for length, count in word_lengths_subset.items():
        print(f"Length {length}: {count}")


Word Lengths Frequency in Subset 1:
Length 1: 1046
Length 2: 6703
Length 3: 7655
Length 4: 5685
Length 5: 3842
Length 6: 2898
Length 7: 2475
Length 8: 2049
Length 9: 1507
Length 10: 1063
Word Lengths Frequency in Subset 2:
Length 1: 1685
Length 2: 6276
Length 3: 7374
Length 4: 5368
Length 5: 3664
Length 6: 2940
Length 7: 2585
Length 8: 2078
Length 9: 1561
Length 10: 1042
Word Lengths Frequency in Subset 3:
Length 1: 1824
Length 2: 6593
Length 3: 7289
Length 4: 5366
Length 5: 3840
Length 6: 2924
Length 7: 2535
Length 8: 2157
Length 9: 1552
Length 10: 1049
Word Lengths Frequency in Subset 4:
Length 1: 1318
Length 2: 8497
Length 3: 9137
Length 4: 7140
Length 5: 5135
Length 6: 3560
Length 7: 3304
Length 8: 2375
Length 9: 1858
Length 10: 1232
Word Lengths Frequency in Subset 5:
Length 1: 1496
Length 2: 7155
Length 3: 9076
Length 4: 6391
Length 5: 4538
Length 6: 3545
Length 7: 3102
Length 8: 2331
Length 9: 1735
Length 10: 1265


# Le Fanu

In [15]:
relative_frequencies_Fanu = relative_frequency_words_by_length(preprocessed_text_Fanu)
print("Relative frequencies of word counts by length:")
for length, frequency in relative_frequencies_Fanu.items():
    print(f"Length {length}: {frequency:.4f}")


Relative frequencies of word counts by length:
Length 1: 0.0510
Length 2: 0.1676
Length 3: 0.2481
Length 4: 0.1831
Length 5: 0.1086
Length 6: 0.0810
Length 7: 0.0631
Length 8: 0.0401
Length 9: 0.0270
Length 10: 0.0158


In [130]:
# Split the text into sentences
sentences = split_into_sentences(text_Fanu)
subset_size = len(sentences) // 5

# Find the number of words by length in each subset
for i in range(5):
    subset_sentences = sentences[i * subset_size: (i + 1) * subset_size]
    subset_text = ' '.join(subset_sentences)
    word_lengths_subset = count_words_by_length_subset(subset_text)
    print(f"Word Lengths Frequency in Subset {i + 1}:")
    for length, count in word_lengths_subset.items():
        print(f"Length {length}: {count}")


Word Lengths Frequency in Subset 1:
Length 1: 1278
Length 2: 5348
Length 3: 8108
Length 4: 5574
Length 5: 3971
Length 6: 2925
Length 7: 2375
Length 8: 1746
Length 9: 1222
Length 10: 840
Word Lengths Frequency in Subset 2:
Length 1: 884
Length 2: 3744
Length 3: 6056
Length 4: 4531
Length 5: 2983
Length 6: 2164
Length 7: 1630
Length 8: 1159
Length 9: 764
Length 10: 440
Word Lengths Frequency in Subset 3:
Length 1: 1020
Length 2: 4310
Length 3: 6738
Length 4: 4974
Length 5: 3235
Length 6: 2422
Length 7: 1922
Length 8: 1332
Length 9: 862
Length 10: 554
Word Lengths Frequency in Subset 4:
Length 1: 1455
Length 2: 5391
Length 3: 7121
Length 4: 5272
Length 5: 3525
Length 6: 2655
Length 7: 2170
Length 8: 1642
Length 9: 1171
Length 10: 768
Word Lengths Frequency in Subset 5:
Length 1: 1051
Length 2: 4307
Length 3: 6978
Length 4: 4853
Length 5: 3662
Length 6: 2584
Length 7: 1983
Length 8: 1279
Length 9: 874
Length 10: 564


# Gothic corpus

In [16]:
relative_frequencies_Gothic = relative_frequency_words_by_length(preprocessed_text_Gothic)
print("Relative frequencies of word counts by length:")
for length, frequency in relative_frequencies_Gothic.items():
    print(f"Length {length}: {frequency:.4f}")


Relative frequencies of word counts by length:
Length 1: 0.0537
Length 2: 0.1834
Length 3: 0.2320
Length 4: 0.1787
Length 5: 0.1045
Length 6: 0.0773
Length 7: 0.0644
Length 8: 0.0422
Length 9: 0.0300
Length 10: 0.0175


In [83]:
#Poe vs Gothic
import random

# Function to select random sentences from the second corpus
def select_random_sentences(text, subset_size):
    sentences = split_into_sentences(text)
    random_subset = random.sample(sentences, subset_size)
    return ' '.join(random_subset)

# Define subset size based on the size of your first corpus
subset_size = 1568

# Find the number of words by length in each randomly selected subset
for i in range(25):
    subset_text = select_random_sentences(text_Gothic, subset_size)
    word_lengths_subset = count_words_by_length_subset(subset_text)
    print(f"Word Lengths Frequency in Subset {i + 1}:")
    for length, count in word_lengths_subset.items():
        print(f"Length {length}: {count}")


Word Lengths Frequency in Subset 1:
Length 1: 1817
Length 2: 7019
Length 3: 8705
Length 4: 6955
Length 5: 4503
Length 6: 3284
Length 7: 2678
Length 8: 1978
Length 9: 1332
Length 10: 887
Word Lengths Frequency in Subset 2:
Length 1: 1710
Length 2: 6588
Length 3: 8505
Length 4: 6588
Length 5: 4336
Length 6: 3164
Length 7: 2580
Length 8: 1918
Length 9: 1443
Length 10: 908
Word Lengths Frequency in Subset 3:
Length 1: 1628
Length 2: 6428
Length 3: 8279
Length 4: 6290
Length 5: 4096
Length 6: 3057
Length 7: 2602
Length 8: 1933
Length 9: 1358
Length 10: 852
Word Lengths Frequency in Subset 4:
Length 1: 1640
Length 2: 6797
Length 3: 8829
Length 4: 6724
Length 5: 4439
Length 6: 3259
Length 7: 2622
Length 8: 1930
Length 9: 1433
Length 10: 981
Word Lengths Frequency in Subset 5:
Length 1: 1659
Length 2: 6631
Length 3: 8486
Length 4: 6560
Length 5: 4372
Length 6: 3248
Length 7: 2588
Length 8: 2003
Length 9: 1459
Length 10: 938
Word Lengths Frequency in Subset 6:
Length 1: 1700
Length 2: 6744
Leng

In [84]:
#Fanu vs Gothic
import random

# Function to select random sentences from the second corpus
def select_random_sentences(text, subset_size):
    sentences = split_into_sentences(text)
    random_subset = random.sample(sentences, subset_size)
    return ' '.join(random_subset)

# Define subset size based on the size of your first corpus
subset_size = 1097

# Find the number of words by length in each randomly selected subset
for i in range(25):
    subset_text = select_random_sentences(text_Gothic, subset_size)
    word_lengths_subset = count_words_by_length_subset(subset_text)
    print(f"Word Lengths Frequency in Subset {i + 1}:")
    for length, count in word_lengths_subset.items():
        print(f"Length {length}: {count}")


Word Lengths Frequency in Subset 1:
Length 1: 1163
Length 2: 4806
Length 3: 6195
Length 4: 4744
Length 5: 3084
Length 6: 2338
Length 7: 1933
Length 8: 1429
Length 9: 1026
Length 10: 608
Word Lengths Frequency in Subset 2:
Length 1: 1189
Length 2: 4900
Length 3: 6208
Length 4: 4775
Length 5: 3209
Length 6: 2325
Length 7: 1953
Length 8: 1399
Length 9: 1002
Length 10: 686
Word Lengths Frequency in Subset 3:
Length 1: 1149
Length 2: 4635
Length 3: 5866
Length 4: 4626
Length 5: 3026
Length 6: 2292
Length 7: 1794
Length 8: 1371
Length 9: 950
Length 10: 638
Word Lengths Frequency in Subset 4:
Length 1: 1108
Length 2: 4687
Length 3: 5877
Length 4: 4542
Length 5: 2969
Length 6: 2349
Length 7: 1870
Length 8: 1390
Length 9: 1005
Length 10: 630
Word Lengths Frequency in Subset 5:
Length 1: 1205
Length 2: 4621
Length 3: 5876
Length 4: 4543
Length 5: 3003
Length 6: 2229
Length 7: 1841
Length 8: 1369
Length 9: 948
Length 10: 667
Word Lengths Frequency in Subset 6:
Length 1: 1106
Length 2: 4338
Length