<a href="https://colab.research.google.com/github/soroush1dft/Bachelor_Degree_Thesis/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import re
import nltk
from collections import Counter
import numpy as np

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [35]:
# English stop words from NLTK, stored as a set; read as a module-level global
# by the preprocessing functions, which drop every token found in it.
stop_words = set(stopwords.words('english'))

# Read and preprocess the text data
def read_and_preprocess(file_path):
    """Load a UTF-8 text file and return it as a normalised, space-joined string.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is deleted, the remainder is tokenized with
    ``word_tokenize``, and tokens found in the module-level ``stop_words`` set
    are dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The surviving tokens joined by single spaces.
    """
    with open(file_path, "r", encoding='utf-8') as handle:
        raw_text = handle.read()

    # Punctuation is deleted (not replaced by a space), so "can't" becomes
    # "cant" and hyphenated words are fused into a single token.
    stripped = re.sub(r'[^\w\s]', '', raw_text.lower())

    kept = []
    for token in word_tokenize(stripped):
        if token in stop_words:
            continue
        kept.append(token)

    return ' '.join(kept)

# Extract top N keywords from the text
def extract_top_keywords(content, top_n=20):
    """Return the ``top_n`` most frequent whitespace-separated words in ``content``.

    Args:
        content: Text whose words are separated by whitespace.
        top_n: Number of entries to return (defaults to 20).

    Returns:
        A list of ``(word, count)`` tuples as produced by
        ``Counter.most_common``, most frequent first.
    """
    frequencies = Counter()
    frequencies.update(content.split())
    return frequencies.most_common(top_n)

# Find common keywords between two lists of keywords
def find_common_keywords(keywords1, keywords2):
    """Return the words that appear in both keyword lists, sorted ascending.

    Args:
        keywords1: Iterable of ``(word, count)`` pairs.
        keywords2: Iterable of ``(word, count)`` pairs.

    Returns:
        A sorted list of the words present in both inputs; the counts are
        ignored.
    """
    first_words = set()
    for term, _count in keywords1:
        first_words.add(term)

    second_words = set()
    for term, _count in keywords2:
        second_words.add(term)

    # Sorting makes the result independent of set iteration order.
    shared = list(first_words & second_words)
    shared.sort()
    return shared

# Construct the term-document matrix
def construct_term_document_matrix(texts, keywords):
    """Count how often each keyword occurs in each text.

    Args:
        texts: Sequence of strings; each is split on whitespace into words.
        keywords: Sequence of words, one per matrix column.

    Returns:
        A float NumPy array of shape ``(len(texts), len(keywords))`` whose
        entry ``[i][j]`` is the number of times ``keywords[j]`` appears in
        ``texts[i]``.
    """
    n_docs, n_terms = len(texts), len(keywords)
    counts_matrix = np.zeros((n_docs, n_terms))
    for row, document in enumerate(texts):
        tally = Counter(document.split())
        # A Counter yields 0 for unseen words, so absent keywords need no special case.
        counts_matrix[row, :] = [tally[term] for term in keywords]
    return counts_matrix


# File paths (relative to the notebook's working directory)
txt_AlicesAdventuresinWonderland = "LewisCarroll-AlicesAdventuresinWonderland.txt"
txt_ThroughtheLookingGlass = "LewisCarroll-ThroughtheLooking-Glass.txt"

# Tunable sizes, named so that comments and printed labels cannot drift away
# from the values actually used.
TOP_N_PER_TEXT = 1000       # most frequent words kept from each book
MAX_COMMON_KEYWORDS = 200   # cap on the shared keywords used as matrix columns

# Read and preprocess texts
content_Alices = read_and_preprocess(txt_AlicesAdventuresinWonderland)
content_Through = read_and_preprocess(txt_ThroughtheLookingGlass)

# Extract the TOP_N_PER_TEXT most frequent words from each text
top_keywords_Alices = extract_top_keywords(content_Alices, TOP_N_PER_TEXT)
top_keywords_Through = extract_top_keywords(content_Through, TOP_N_PER_TEXT)

# Find the words that appear in both top lists
common_keywords = find_common_keywords(top_keywords_Alices, top_keywords_Through)
# find_common_keywords returns the words sorted alphabetically, so this slice
# keeps the alphabetically first MAX_COMMON_KEYWORDS words, not the most frequent.
common_top_n_keywords = common_keywords[:MAX_COMMON_KEYWORDS]

# Construct term-document matrix: row 0 = Alice's Adventures, row 1 = Looking-Glass
tdm = construct_term_document_matrix([content_Alices, content_Through], common_top_n_keywords)

print("Common Keywords (first {} alphabetically):".format(len(common_top_n_keywords)), common_top_n_keywords)
print("Term-Document Matrix:\n", tdm)




Common Top 20 Keywords: ['across', 'added', 'afraid', 'afterwards', 'age', 'ah', 'air', 'alice', 'alices', 'alive', 'almost', 'alone', 'along', 'aloud', 'always', 'among', 'angrily', 'angry', 'another', 'answer', 'answered', 'anxious', 'anxiously', 'anything', 'argument', 'arm', 'arms', 'ask', 'asked', 'asking', 'asleep', 'away', 'back', 'beat', 'beautiful', 'beg', 'began', 'begin', 'beginning', 'begun', 'behind', 'believe', 'besides', 'best', 'better', 'bird', 'birds', 'bit', 'body', 'book', 'box', 'breadandbutter', 'breath', 'bright', 'bring', 'brought', 'business', 'cake', 'call', 'called', 'came', 'cant', 'care', 'carefully', 'case', 'catch', 'caught', 'certain', 'certainly', 'chance', 'changed', 'chapter', 'child', 'children', 'chorus', 'civil', 'close', 'come', 'comes', 'coming', 'consider', 'continued', 'conversation', 'corner', 'could', 'couldnt', 'course', 'crab', 'creature', 'creatures', 'cried', 'crossed', 'crown', 'cry', 'curiosity', 'curious', 'cut', 'dare', 'dark', 'day',

Now it's time to convert it to an np.array.

In [36]:
import numpy as np

# tdm was allocated with np.zeros inside construct_term_document_matrix, so it
# is already a NumPy array at this point.
tdm_np_array = np.array(tdm)

In [37]:
def calculate_term_document_matrix_for_input(input_string, common_top_n_keywords):
    """Turn a raw string into a single-row keyword-count matrix.

    The string is lower-cased, stripped of every character that is neither a
    word character nor whitespace, tokenized with ``word_tokenize``, and
    filtered against the module-level ``stop_words`` set before counting.

    Args:
        input_string: Raw text to vectorise.
        common_top_n_keywords: Sequence of keywords, one per column.

    Returns:
        A float NumPy array of shape ``(1, len(common_top_n_keywords))``
        holding how many times each keyword occurs among the surviving tokens.
    """
    normalised = re.sub(r'[^\w\s]', '', input_string.lower())
    tally = Counter(
        token for token in word_tokenize(normalised) if token not in stop_words
    )

    row = np.zeros((1, len(common_top_n_keywords)))
    for column, term in enumerate(common_top_n_keywords):
        # Counter returns 0 for keywords that never occurred in the input.
        row[0, column] = tally[term]
    return row

In [38]:
# Ask for the passage to vectorise; the value is reused by the cells below that
# print its keyword counts and classify it.
string_input_toTDM = input("Please enter any string to calcualate its Term Document Matrix:\n")

Please enter any string to calcualate its Term Document Matrix:
CHAPTER II.   THE POOL OF TEARS.   "Curiouser and curiouser!" cried Alice (she was so much surprised, that for the moment she quite forgot how to speak good English); "now I'm opening out like the largest telescope that ever was! Good-bye, feet!" (for when she looked down at her feet, they seemed to be almost out of sight, they were getting so far off). "Oh, my poor little feet, I wonder who will put on your shoes and stockings for you now, dears? I'm sure I shan't be able! I shall be a great deal too far off to trouble myself about you: you must manage the best way you can;—but I must be kind to them," thought Alice, "or perhaps they won't walk the way I want to go! Let me see: I'll give them a new pair of boots every Christmas."  And she went on planning to herself how she would manage it. "They must go by the carrier," she thought; "and how funny it'll seem, sending presents to one's own feet! And how odd the directions

In [39]:
# Show the single-row keyword-count matrix computed for the entered text.
print(calculate_term_document_matrix_for_input(string_input_toTDM, common_top_n_keywords))

[[ 0.  0.  2.  0.  1.  1.  0. 24.  2.  0.  2.  1.  1.  0.  1.  0.  0.  1.
   1.  1.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  3.  4.  0.  0.  3.
   6.  0.  0.  0.  1.  0.  1.  1.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  1.  3.  2.  0.  0.  1.  0.  0.  2.  0.  0.  3.  1.
   1.  2.  0.  0.  0.  6.  0.  1.  0.  0.  1.  0.  7.  0.  0.  0.  0.  1.
   8.  1.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  2. 10.  1.  0.  0.  1.
   1.  0.  1.  0.  1.  0.  1.  3.  2.  3.  3.  0.  0.  0.  0.  0.  0.  0.
   1.  0.  0.  0.  0.  0.  1.  0.  0.  1.  3.  0.  0.  4.  2.  0.  3.  0.
   0.  1.  3.  1.  0.  1.  0.  0.  2.  0.  0.  0.  0.  0.  1.  8.  0.  2.
   3.  0.  0.  1.  4.  0.  0.  1.  0.  0.  0.  0.  2.  4.  2.  0.  1.  0.
   0.  0.  3.  1.  0.  1.  3.  2.  1.  1.  1.  1. 10.  1.  1.  0.  2.  2.
   0.  0.]]


In [40]:
from numpy.linalg import norm

def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors.

    Args:
        vec1: First numeric vector (intended for 1-D input).
        vec2: Second numeric vector of the same length.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)`` using the 2-norm, or
        ``0.0`` when either vector has zero norm. In that case the ratio is
        undefined (0/0) and would otherwise evaluate to ``nan`` with a
        runtime warning.
    """
    denominator = norm(vec1, 2) * norm(vec2, 2)
    if denominator == 0:
        # A zero vector has no direction; treat it as sharing nothing.
        return 0.0
    return np.dot(vec1, vec2) / denominator

def classify_text(input_string, common_top_20_keywords, tdm):
    """Decide which of the two reference books a text resembles more.

    The input is converted to a keyword-count vector and compared, by cosine
    similarity, with the two rows of ``tdm``.

    Args:
        input_string: Raw text to classify.
        common_top_20_keywords: Keywords that define the vector columns; they
            must be the same keywords, in the same order, used to build ``tdm``.
        tdm: Term-document matrix whose row 0 is expected to hold the counts
            for Alice's Adventures in Wonderland and row 1 those for Through
            the Looking-Glass.

    Returns:
        A sentence naming the more similar book, or a sentence saying the text
        cannot be classified when it contains none of the keywords.
    """
    # Calculate the term-document matrix for the input (a single row)
    input_tdm = calculate_term_document_matrix_for_input(input_string, common_top_20_keywords)
    input_vector = input_tdm[0]

    # With no keyword hits the vector is all zeros and cosine similarity is
    # undefined; comparing the results would silently pick the else branch.
    if not input_vector.any():
        return "The input text contains none of the common keywords, so it cannot be classified."

    # Compare with Alice's Adventures in Wonderland (row 0) and Through the Looking-Glass (row 1)
    similarity_Alices = cosine_similarity(input_vector, tdm[0])
    similarity_Through = cosine_similarity(input_vector, tdm[1])

    # Classification based on higher cosine similarity; ties go to Looking-Glass
    if similarity_Alices > similarity_Through:
        return "The input text is more similar to Alice's Adventures in Wonderland."
    return "The input text is more similar to Through the Looking-Glass."

In [41]:
# Classify the text entered earlier against the two-book term-document matrix
# and print the resulting sentence.
test_txt = classify_text(string_input_toTDM, common_top_n_keywords, tdm)
print(test_txt)

The input text is more similar to Alice's Adventures in Wonderland.
