In [340]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string

from nltk import WordNetLemmatizer, word_tokenize, pos_tag, SnowballStemmer
from nltk.corpus import stopwords, wordnet

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score

# Reading the Poems and Storing Each Line as a Single Input

In [302]:
# Load the poem file; iterating the handle yields one string per line
with open('edgar_allan_poe.txt', mode='r', encoding='utf-8') as file:
    lines = list(file)

# Wrap the lines in a single-column dataframe (column name: 'text')
edgar_allan_poe = pd.DataFrame(lines, columns=['text'])

edgar_allan_poe

Unnamed: 0,text
0,LO! Death hath rear'd himself a throne\n
1,"In a strange city, all alone,\n"
2,Far down within the dim west\n
3,"Where the good, and the bad, and the worst, an..."
4,Have gone to their eternal rest.\n
...,...
792,To a discordant melody;\n
793,"While, like a rapid ghastly river,\n"
794,"Through the pale door,\n"
795,"A hideous throng rush out forever,\n"


In [303]:
# Load the poem file; iterating the handle yields one string per line
with open('robert_frost_poe.txt', mode='r', encoding='utf-8') as file:
    lines = list(file)

# Wrap the lines in a single-column dataframe (column name: 'text')
robert_frost_poe = pd.DataFrame(lines, columns=['text'])

robert_frost_poe

Unnamed: 0,text
0,"Two roads diverged in a yellow wood,\n"
1,And sorry I could not travel both\n
2,"And be one traveler, long I stood\n"
3,And looked down one as far as I could\n
4,To where it bent in the undergrowth; \n
...,...
1576,\n
1577,A feather-hammer gives a double knock.\n
1578,This Eden day is done at two o'clock.\n
1579,An hour of winter day might seem too short\n


# Reading the Poems and Storing Each Section as a Single Input

In [304]:
# def poem_reader(input_file_path):
#     
#     try:
#         with open(input_file_path, 'r') as input_file:
#             content = input_file.read()  # Read the content of the input text file
#         
#         sections = content.split('\n\n')  # Split content into sections based on empty lines
#         data = {'text': sections}        
#         print('Done')
#         
#         return pd.DataFrame(data)
#         
#     except FileNotFoundError:
#         print("Input file not found.")
#     except Exception as e:
#         print("An error occurred:", e)

In [305]:
# edgar_allan_poe = poem_reader('edgar_allan_poe.txt')
# robert_frost_poe = poem_reader('robert_frost_poe.txt')

In [306]:
# edgar_allan_poe

In [307]:
# robert_frost_poe

# Preprocessing the Data: Removing Stop Words and Non-Alphabetic Tokens, Then Lemmatizing With POS Tags

In [308]:
# Fetch the NLTK data packages this notebook relies on. Packages that are
# already installed are only reported as up-to-date, so re-running is cheap.
nltk.download('punkt')  # data for tokenization
nltk.download('stopwords')  # stopword lists
nltk.download('wordnet')  # WordNet data
nltk.download('averaged_perceptron_tagger')  # data for the POS tagger

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Saeed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Saeed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Saeed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Saeed\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [309]:
def text_preprocessor(data):
    """Clean every text entry of `data` and return the cleaned entries.

    Each entry is tokenized, reduced to its purely alphabetic tokens
    (lower-cased), stripped of English stopwords, POS-tagged, and finally
    lemmatized with the WordNet POS derived from each tag.

    Parameters
    ----------
    data : object exposing ``.apply`` (the notebook passes a DataFrame column)
        Each element is one raw text string.

    Returns
    -------
    The result of ``data.apply``: one space-joined string of lemmas per
    entry. An entry with no surviving tokens becomes the empty string.
    """
    # Load English stopwords once; a set keeps the membership test cheap
    stop_words = set(stopwords.words('english'))

    # Initialize the WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()

    # Mapping between Penn Treebank POS tags and WordNet POS tags
    def penn_to_wordnet_pos(tag):
        if tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN  # Default to noun

    # Tokenize and preprocess a single text entry
    def preprocess_text(text):
        tokens = word_tokenize(text)  # Tokenize the text
        tokens = [token.lower() for token in tokens if token.isalpha()]  # Keep only alphabetic tokens
        tokens = [token for token in tokens if token not in stop_words]  # Remove stopwords

        pos_tags = pos_tag(tokens)  # Get part of speech tags

        # Lemmatize once, using the POS of each token. (A previous POS-less
        # pass was computed and immediately overwritten, so it was dropped.)
        lemmatized_tokens = [
            lemmatizer.lemmatize(token, pos=penn_to_wordnet_pos(pos))
            for token, pos in pos_tags
        ]
        return " ".join(lemmatized_tokens)  # Join the tokens back into a string

    return data.apply(preprocess_text)

In [310]:
# Clean each poet's lines and keep the result as a one-column dataframe
edgar_allan_preprocessed = text_preprocessor(edgar_allan_poe['text']).to_frame()
robert_frost_preprocessed = text_preprocessor(robert_frost_poe['text']).to_frame()

In [311]:
edgar_allan_preprocessed

Unnamed: 0,text
0,lo death hath rear throne
1,strange city alone
2,far within dim west
3,good bad bad best
4,go eternal rest
...,...
792,discordant melody
793,like rapid ghastly river
794,pale door
795,hideous throng rush forever


In [312]:
robert_frost_preprocessed

Unnamed: 0,text
0,two road diverge yellow wood
1,sorry could travel
2,one traveler long stand
3,look one far could
4,bent undergrowth
...,...
1576,
1577,give double knock
1578,eden day do two
1579,hour winter day might seem short


In [313]:
# Class labels: 0 for the Edgar Allan Poe lines, 1 for the Robert Frost lines
edgar_allan_preprocessed = edgar_allan_preprocessed.assign(label=0)
robert_frost_preprocessed = robert_frost_preprocessed.assign(label=1)

edgar_allan_preprocessed

Unnamed: 0,text,label
0,lo death hath rear throne,0
1,strange city alone,0
2,far within dim west,0
3,good bad bad best,0
4,go eternal rest,0
...,...,...
792,discordant melody,0
793,like rapid ghastly river,0
794,pale door,0
795,hideous throng rush forever,0


# Removing Empty Cells

In [314]:
# Remove rows whose 'text' column is empty (or missing) and renumber the index.
# The frame is rebuilt instead of calling
# `df['text'].replace('', pd.NA, inplace=True)`: that chained in-place call
# operates on a temporary Series, so under pandas Copy-on-Write the frame
# itself is left unchanged and the empty rows would survive.
edgar_allan_preprocessed = (
    edgar_allan_preprocessed
    .loc[lambda frame: frame['text'].notna() & frame['text'].ne('')]
    .reset_index(drop=True)
)

edgar_allan_preprocessed

Unnamed: 0,text,label
0,lo death hath rear throne,0
1,strange city alone,0
2,far within dim west,0
3,good bad bad best,0
4,go eternal rest,0
...,...,...
709,discordant melody,0
710,like rapid ghastly river,0
711,pale door,0
712,hideous throng rush forever,0


In [315]:
# Remove rows whose 'text' column is empty (or missing) and renumber the index.
# The frame is rebuilt instead of calling
# `df['text'].replace('', pd.NA, inplace=True)`: that chained in-place call
# operates on a temporary Series, so under pandas Copy-on-Write the frame
# itself is left unchanged and the empty rows would survive.
robert_frost_preprocessed = (
    robert_frost_preprocessed
    .loc[lambda frame: frame['text'].notna() & frame['text'].ne('')]
    .reset_index(drop=True)
)

robert_frost_preprocessed

Unnamed: 0,text,label
0,two road diverge yellow wood,1
1,sorry could travel,1
2,one traveler long stand,1
3,look one far could,1
4,bent undergrowth,1
...,...,...
1414,say bud leaf bloom,1
1415,give double knock,1
1416,eden day do two,1
1417,hour winter day might seem short,1


In [316]:
# Stack both poets' rows (Poe first, then Frost) under a fresh 0..n-1 index
all_poems_preprocessed = pd.concat(
    (edgar_allan_preprocessed, robert_frost_preprocessed),
    axis=0,
    ignore_index=True,
)

all_poems_preprocessed

Unnamed: 0,text,label
0,lo death hath rear throne,0
1,strange city alone,0
2,far within dim west,0
3,good bad bad best,0
4,go eternal rest,0
...,...,...
2128,say bud leaf bloom,1
2129,give double knock,1
2130,eden day do two,1
2131,hour winter day might seem short,1


# Splitting Data To Train And Test

In [317]:
# Features are everything except the label; the target is the label column
y = all_poems_preprocessed['label']
x = all_poems_preprocessed.drop(columns=['label'])

# Hold out 30% of the lines for testing. Splitting the 'text' column directly
# yields the train/test texts as Series.
x_train, x_test, y_train, y_test = train_test_split(
    x['text'], y, test_size=0.3, random_state=42
)

In [318]:
x_train

614       quicken spell doth u pass
1732                 put sign close
861                   lantern chore
1083                      come face
371           fell upturn face rose
                   ...             
1638     builder build little house
1095    mean mile mormon settlement
1130            never tend anything
1294        swollen tight bury snow
860         everyone mile could see
Name: text, Length: 1493, dtype: object

In [319]:
y_train

614     0
1732    1
861     1
1083    1
371     0
       ..
1638    1
1095    1
1130    1
1294    1
860     1
Name: label, Length: 1493, dtype: int64

# Create Word To Index Mapping

In [320]:
# Index 0 is reserved for unknown words; `idx` is the next free index
word2idx = {'<unk>': 0}
idx = len(word2idx)

In [321]:
# Give every word not seen before in the training texts the next free index
for text in x_train:
    tokens = text.split()
    for token in tokens:
        if token in word2idx:
            continue
        word2idx[token] = idx
        idx += 1

word2idx

{'<unk>': 0,
 'quicken': 1,
 'spell': 2,
 'doth': 3,
 'u': 4,
 'pass': 5,
 'put': 6,
 'sign': 7,
 'close': 8,
 'lantern': 9,
 'chore': 10,
 'come': 11,
 'face': 12,
 'fell': 13,
 'upturn': 14,
 'rose': 15,
 'isola': 16,
 'fior': 17,
 'di': 18,
 'levante': 19,
 'misty': 20,
 'mid': 21,
 'region': 22,
 'become': 23,
 'reconcile': 24,
 'see': 25,
 'stop': 26,
 'wonder': 27,
 'sulphurous': 28,
 'current': 29,
 'yaanek': 30,
 'go': 31,
 'mowing': 32,
 'field': 33,
 'gaze': 34,
 'entrance': 35,
 'adown': 36,
 'gorgeous': 37,
 'vista': 38,
 'son': 39,
 'never': 40,
 'could': 41,
 'find': 42,
 'whose': 43,
 'bone': 44,
 'wish': 45,
 'strong': 46,
 'small': 47,
 'book': 48,
 'pocket': 49,
 'double': 50,
 'redouble': 51,
 'song': 52,
 'twitter': 53,
 'long': 54,
 'town': 55,
 'know': 56,
 'kind': 57,
 'old': 58,
 'davis': 59,
 'own': 60,
 'solid': 61,
 'mica': 62,
 'mountain': 63,
 'think': 64,
 'much': 65,
 'shade': 66,
 'nothing': 67,
 'good': 68,
 'say': 69,
 'pay': 70,
 'sure': 71,
 'lead': 

# Converting Data Into Integers

In [322]:
# Index used for any word that is missing from the vocabulary
unk_idx = word2idx['<unk>']

# The vocabulary was built from x_train, so every training token has an index
x_train_int = [[word2idx[token] for token in text.split()] for text in x_train]

# Test texts may contain unseen words: map those to '<unk>'. `dict.get`
# handles only the missing-key case, unlike a bare `except:` which would also
# hide unrelated errors (and even KeyboardInterrupt).
x_test_int = [[word2idx.get(token, unk_idx) for token in text.split()] for text in x_test]


In [323]:
x_train_int

[[1, 2, 3, 4, 5],
 [6, 7, 8],
 [9, 10],
 [11, 12],
 [13, 14, 12, 15],
 [16, 17, 18, 19],
 [20, 21, 22],
 [23, 24],
 [25, 26],
 [27],
 [28, 29, 30],
 [31, 32, 33],
 [34, 35, 36, 37, 38],
 [39, 40, 41, 42, 43, 44],
 [45, 46],
 [47, 48, 49],
 [50, 51, 52, 53],
 [54, 55],
 [56, 57],
 [58, 59, 60, 61, 62, 63],
 [64, 65, 66],
 [67, 68, 69, 70],
 [71, 72, 4],
 [73, 74],
 [47, 75],
 [76, 76, 77, 78],
 [79, 80, 81],
 [58, 58, 82, 83, 78],
 [84, 85, 86, 87],
 [88, 89, 90, 91, 64],
 [92, 93, 94, 95, 96],
 [97, 69, 98, 56],
 [99, 100, 101, 56],
 [102, 69, 103],
 [104, 105, 106],
 [107, 108, 109],
 [110, 111, 112],
 [25, 113, 114],
 [115, 116, 31],
 [117, 118, 119],
 [120, 121, 122],
 [123, 124],
 [125],
 [126, 69, 127, 128],
 [129, 129, 129],
 [69],
 [11, 130, 131],
 [54, 132],
 [69, 133, 134],
 [97, 135, 136, 137],
 [138, 139, 102, 4, 139, 140],
 [141, 142, 143, 144],
 [113, 145, 146, 147],
 [148, 149, 150, 151, 152, 153],
 [102, 69, 103],
 [154, 155, 50, 155],
 [126, 156, 157],
 [56, 158, 159],


In [324]:
x_test_int

[[729, 1083, 979],
 [405, 1002, 0, 0],
 [0, 74, 0],
 [0, 547, 0, 127],
 [40, 113, 299, 1248],
 [226, 1090, 0],
 [353, 1081, 178],
 [453, 937, 181],
 [361, 362, 363, 30],
 [698, 1230, 148, 214, 0],
 [531, 0, 531],
 [0, 100, 125, 353],
 [1199, 268],
 [65, 0],
 [100, 0, 0],
 [1144, 572, 1132],
 [222, 376, 482, 48],
 [54, 860],
 [292, 107],
 [1424, 1111, 1685],
 [13, 317, 899],
 [56],
 [830, 0, 566, 0],
 [304, 0, 276, 89],
 [729, 1083, 979],
 [99, 126, 215, 360],
 [457, 55, 314, 1143],
 [41, 25, 1344, 0, 481],
 [0, 490, 1111, 1685],
 [1703, 556],
 [11, 0, 1618],
 [1535, 64, 0],
 [0],
 [113, 700, 556, 0],
 [376, 1556, 372, 160, 1749, 26],
 [344, 0, 0, 517, 529],
 [56, 385, 64],
 [1188, 597, 150],
 [788, 100, 571, 115, 689],
 [1227, 85, 0, 1215],
 [367, 906],
 [327, 223, 0, 1349, 0, 1068],
 [91, 141, 567, 186],
 [1201, 79, 915, 1051],
 [0, 0, 0, 0],
 [0, 653, 121],
 [0],
 [316, 316],
 [223, 0, 199],
 [114, 324],
 [1691, 0],
 [681, 265, 963, 289],
 [436, 25],
 [1332, 648, 1709, 301],
 [0, 0, 

# Initializing A and PI Matrices for Both Classes

In [325]:
# Vocabulary size, including the '<unk>' entry
V = len(word2idx)

# Per class: a V x V transition-count matrix and a length-V first-word count
# vector, all starting at one
A0, A1 = np.ones((V, V)), np.ones((V, V))
PI0, PI1 = np.ones(V), np.ones(V)

# Compute Counts For A and PI

In [326]:
def compute_counts(text_as_int, A, PI):
    """Add first-word and word-to-word transition counts to `PI` and `A`.

    For every index sequence in `text_as_int`, the first index increments
    `PI[index]` and each consecutive pair (previous, current) increments
    `A[previous, current]`. Both arrays are modified in place; nothing is
    returned.
    """
    for sequence in text_as_int:
        previous = None
        for position, current in enumerate(sequence):
            if position == 0:
                # first word of the sequence
                PI[current] += 1
            else:
                # a previous word exists, so this is a transition
                A[previous, current] += 1
            previous = current


# Accumulate counts separately for each class from its training sequences
compute_counts(
    [sequence for sequence, label in zip(x_train_int, y_train) if label == 0],
    A0,
    PI0,
)
compute_counts(
    [sequence for sequence, label in zip(x_train_int, y_train) if label == 1],
    A1,
    PI1,
)

In [327]:
PI1

array([1., 1., 2., ..., 1., 1., 2.])

In [328]:
A1

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [329]:
# Distinct entries of A1 and how often each one appears
unique_values, counts = np.unique(A1, return_counts=True)

# Keep the pairs available as a value -> count dictionary
value_count_dict = {value: count for value, count in zip(unique_values, counts)}

# Report each distinct value with its number of occurrences
for value, count in zip(unique_values, counts):
    print(f"Value {value} occurs {count} times.")

Value 1.0 occurs 3262949 times.
Value 2.0 occurs 2220 times.
Value 3.0 occurs 68 times.
Value 4.0 occurs 10 times.
Value 6.0 occurs 1 times.
Value 7.0 occurs 1 times.


# Normalize A and PI so They Are Valid Probability Matrices

In [330]:
# Scale each first-word vector so that it sums to one
PI0 /= PI0.sum()
PI1 /= PI1.sum()

# Scale each row of the transition matrices so that the row sums to one
A0 /= A0.sum(axis=1)[:, np.newaxis]
A1 /= A1.sum(axis=1)[:, np.newaxis]

In [331]:
A1

array([[0.0005534, 0.0005534, 0.0005534, ..., 0.0005534, 0.0005534,
        0.0005534],
       [0.0005534, 0.0005534, 0.0005534, ..., 0.0005534, 0.0005534,
        0.0005534],
       [0.0005531, 0.0005531, 0.0005531, ..., 0.0005531, 0.0005531,
        0.0005531],
       ...,
       [0.0005534, 0.0005534, 0.0005534, ..., 0.0005534, 0.0005534,
        0.0005534],
       [0.0005534, 0.0005534, 0.0005534, ..., 0.0005534, 0.0005534,
        0.0005534],
       [0.0005531, 0.0005531, 0.0005531, ..., 0.0005531, 0.0005531,
        0.0005531]])

In [332]:
PI1

array([0.00035499, 0.00035499, 0.00070998, ..., 0.00035499, 0.00035499,
       0.00070998])

In [333]:
# Distinct entries of A1 and how often each one appears
unique_values, counts = np.unique(A1, return_counts=True)

# Keep the pairs available as a value -> count dictionary
value_count_dict = {value: count for value, count in zip(unique_values, counts)}

# Report each distinct value with its number of occurrences
for value, count in zip(unique_values, counts):
    print(f"Value {value} occurs {count} times.")

Value 0.0005399568034557236 occurs 1767 times.
Value 0.0005428881650380022 occurs 1772 times.
Value 0.000544069640914037 occurs 3560 times.
Value 0.0005446623093681918 occurs 1780 times.
Value 0.0005452562704471102 occurs 5352 times.
Value 0.000546448087431694 occurs 1786 times.
Value 0.0005467468562055768 occurs 3571 times.
Value 0.0005482456140350877 occurs 7167 times.
Value 0.0005485463521667581 occurs 5376 times.
Value 0.0005488474204171241 occurs 1795 times.
Value 0.0005494505494505495 occurs 8975 times.
Value 0.0005497526113249038 occurs 8978 times.
Value 0.00055005500550055 occurs 5389 times.
Value 0.000550357732526142 occurs 10788 times.
Value 0.0005506607929515419 occurs 10790 times.
Value 0.0005509641873278236 occurs 14396 times.
Value 0.0005512679162072767 occurs 21600 times.
Value 0.0005515719801434088 occurs 34233 times.
Value 0.0005518763796909492 occurs 41449 times.
Value 0.0005521811154058532 occurs 75734 times.
Value 0.0005524861878453039 occurs 146128 times.
Value 0.0

In [334]:
# Work with log-probabilities from here on; the raw probabilities are not needed
logA0, logA1 = np.log(A0), np.log(A1)
logPI0, logPI1 = np.log(PI0), np.log(PI1)

# Compute Priors

In [335]:
# Class priors: the share of training lines that carries each label
total = len(y_train)
count0 = int((y_train == 0).sum())
count1 = int((y_train == 1).sum())

p0, p1 = count0 / total, count1 / total
logp0, logp1 = np.log(p0), np.log(p1)

p0, p1

(0.3235097119892833, 0.6764902880107166)

# Build A Classifier

In [336]:
class Classifier:
    """Assigns each index sequence to the class with the highest log-posterior.

    Every class has a log first-word vector and a log transition matrix. A
    sequence is scored as the first token's log-probability plus the
    log-probability of each consecutive transition, plus the class log-prior.
    """

    def __init__(self, logAs, logpis, logpriors):
        # All three collections are indexed by class id
        self.logpriors = logpriors
        self.logpis = logpis
        self.logAs = logAs
        self.K = len(logpriors)  # number of classes

    def _compute_log_likelihood(self, input_, class_):
        """Return the log-likelihood of the sequence `input_` under `class_`.

        An empty sequence scores 0.
        """
        first_word_logs = self.logpis[class_]
        transition_logs = self.logAs[class_]

        total = 0
        previous = None
        for position, current in enumerate(input_):
            if position == 0:
                # first token: use the first-word distribution
                total += first_word_logs[current]
            else:
                total += transition_logs[previous, current]
            previous = current

        return total

    def predict(self, inputs):
        """Return an array holding the predicted class id of every sequence."""
        predictions = np.zeros(len(inputs))

        for row, sequence in enumerate(inputs):
            scores = []
            for label in range(self.K):
                scores.append(
                    self._compute_log_likelihood(sequence, label) + self.logpriors[label]
                )
            predictions[row] = np.argmax(scores)

        return predictions


In [337]:
# Class 0 = Poe, class 1 = Frost: pass each parameter list in class order
clf = Classifier(
    logAs=[logA0, logA1],
    logpis=[logPI0, logPI1],
    logpriors=[logp0, logp1],
)

In [338]:
# Predict the training sequences and report the fraction of predictions that
# equal the training labels.
Ptrain = clf.predict(x_train_int)
print(f'Train acc: {np.mean(Ptrain == y_train)}')

Train acc: 0.9859343603482921


In [339]:
# Predict the held-out sequences and report the fraction of predictions that
# equal the test labels.
Ptest = clf.predict(x_test_int)
print(f'Test acc: {np.mean(Ptest == y_test)}')

Test acc: 0.7828125


In [341]:
# Confusion matrix of the training labels against the training predictions
cm = confusion_matrix(y_true=y_train, y_pred=Ptrain)

cm

array([[ 462,   21],
       [   0, 1010]], dtype=int64)

In [342]:
# Confusion matrix of the test labels against the test predictions
cm_test = confusion_matrix(y_true=y_test, y_pred=Ptest)

cm_test

array([[109, 122],
       [ 17, 392]], dtype=int64)

In [343]:
# F1 score on the training set
f1_score(y_true=y_train, y_pred=Ptrain)

0.9897109260166584

In [344]:
# F1 score on the test set
f1_score(y_true=y_test, y_pred=Ptest)

0.8494041170097508