In [1]:
# Install necessary libraries
!pip install langdetect datasets nltk

# Import required libraries
import re
import nltk
import textwrap
from langdetect import detect
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Ensure necessary NLTK resources are downloaded for text processing
# Fetch each NLTK resource in turn; the order matches the original one-call-per-line list.
for _resource in (
    'stopwords',                       # Stop words like "and", "the", etc.
    'punkt',                           # Tokenizers for sentences and words
    'wordnet',                         # Lemmatization
    'averaged_perceptron_tagger',      # POS tagging
    'maxent_treebank_pos_tagger',      # Additional POS tagger model
    'maxent_ne_chunker',               # Named entity recognition
    'words',                           # Wordlists for NER
    'punkt_tab',                       # Additional resources for tokenization
    'maxent_ne_chunker_tab',           # Additional resources for NER
    'averaged_perceptron_tagger_eng',  # Presumably the English tagger in the newer resource layout — TODO confirm
):
    nltk.download(_resource)

# Load a sample dataset
# Request only the first row of the test split of CNN/DailyMail (config "3.0.0")
# so the rest of the notebook works on a single article.
dataset = load_dataset(
    path="cnn_dailymail",
    name="3.0.0",
    split="test[:1]",
)
# The 'article' column holds the article bodies.
texts = dataset['article']

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/981.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m614.4/981.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/maxent_treebank_pos_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package maxent

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [2]:
# Function to clean text (remove unwanted characters and standardize)
def clean_text(text):
    """Normalize raw text into lowercase, space-separated word tokens.

    Steps, in order: lowercase, drop URLs, turn every non-word character into
    a space, drop stand-alone single ASCII letters, collapse whitespace.

    Args:
        text (str): Raw input text.

    Returns:
        str: The cleaned text, without leading or trailing whitespace.
    """
    text = text.lower()
    # Strip only real URLs ("http://", "https://", "www."). The previous pattern
    # `http\S+|www\S+` also deleted ordinary words that merely start with
    # "http"/"www" (e.g. "httpd").
    text = re.sub(r'\b(?:https?://|www\.)\S+', ' ', text)
    # Every non-word character (punctuation, symbols) becomes a space.
    text = re.sub(r'\W', ' ', text)
    # Word boundaries instead of `\s+[a-zA-Z]\s+`: the old pattern consumed the
    # surrounding whitespace, so it skipped every second letter in runs such as
    # "a b c" and never matched a single letter at the start or end of the text.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)
    # Collapse the whitespace left behind by the substitutions above.
    return re.sub(r'\s+', ' ', text).strip()

# Function to remove stop words from text
def remove_stop_words(text, language='english'):
    """Drop stop words from ``text``.

    The text is tokenized with ``word_tokenize`` and every token whose
    lowercase form appears in the NLTK stop-word list for ``language`` is
    removed. Matching is case-insensitive so capitalized stop words
    (e.g. "The" at the start of a sentence) are filtered as well.

    Args:
        text (str): Input text.
        language (str): Name of the NLTK stop-word list to use.
            Defaults to ``'english'``, the list the original code hard-coded.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    stop_words = set(stopwords.words(language))
    kept_tokens = [
        token for token in word_tokenize(text)
        if token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)

# Function to lemmatize text (reduce words to base forms)
def lemmatize_text(text, pos='v'):
    """Lemmatize every token of ``text`` with NLTK's ``WordNetLemmatizer``.

    Args:
        text (str): Input text; it is split with ``word_tokenize``.
        pos (str): Part-of-speech hint passed to ``lemmatize`` for every token.
            Defaults to ``'v'`` (the value the original code hard-coded), so
            existing callers get the same result. Pass another WordNet POS
            code to lemmatize with a different hint.

    Returns:
        str: The lemmatized tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(token, pos=pos) for token in word_tokenize(text)
    )

# Function to replace synonyms or abbreviations in text
def replace_synonyms(text, synonym_dict):
    """Swap whitespace-separated words of ``text`` using ``synonym_dict``.

    Args:
        text (str): Input text; it is split on whitespace.
        synonym_dict (dict): Maps a word to its replacement. The lookup is an
            exact (case-sensitive) match on the whole word.

    Returns:
        str: The words, replaced where a mapping exists, joined by single spaces.
    """
    swapped = []
    for token in text.split():
        # Words without an entry in the mapping pass through unchanged.
        swapped.append(synonym_dict.get(token, token))
    return ' '.join(swapped)

# Function for sentence tokenization
def sentence_tokenization(text):
    """Split ``text`` into sentences.

    Thin wrapper that returns whatever ``sent_tokenize`` produces for ``text``.
    """
    return sent_tokenize(text)

# Function for word tokenization
def word_tokenization(text):
    """Split ``text`` into word tokens.

    Thin wrapper that returns whatever ``word_tokenize`` produces for ``text``.
    """
    return word_tokenize(text)

# Function for POS tagging
def pos_tagging(text):
    """Tag each token of ``text`` with its part of speech.

    The text is tokenized with ``word_tokenize`` and the tokens are passed to
    ``nltk.pos_tag``; that call's result is returned unchanged.
    """
    return nltk.pos_tag(word_tokenize(text))

# Function for named entity recognition (NER)
def named_entity_recognition(text, binary=True):
    """Run NLTK named-entity chunking over ``text``.

    The text is tokenized with ``word_tokenize``, POS-tagged with
    ``nltk.pos_tag`` and the tagged tokens are passed to ``nltk.ne_chunk``.

    Args:
        text (str): Input text.
        binary (bool): Forwarded to ``nltk.ne_chunk``. Defaults to ``True``,
            the value the original code hard-coded, so existing callers are
            unaffected.

    Returns:
        The object returned by ``nltk.ne_chunk`` for the tagged tokens.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    return nltk.ne_chunk(tagged_tokens, binary=binary)

# Function to detect the language of text
def is_english(text):
    """Report whether ``detect`` classifies ``text`` as English.

    Args:
        text (str): Text to classify.

    Returns:
        bool: ``True`` when ``detect(text)`` returns ``'en'``; ``False`` for any
        other language code or when detection raises an ordinary exception.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # Best-effort: a failed detection counts as "not English". Catching
        # Exception instead of using a bare `except:` lets KeyboardInterrupt
        # and SystemExit propagate, so the cell can still be interrupted.
        return False

# Function to compute TF-IDF matrix for documents
def compute_tfidf(documents):
    """Fit a default ``TfidfVectorizer`` on ``documents``.

    Args:
        documents: Iterable of text documents passed to ``fit_transform``.

    Returns:
        tuple: ``(matrix, feature_names)`` — the result of ``fit_transform`` and
        the vectorizer's ``get_feature_names_out()``.
    """
    vectorizer = TfidfVectorizer()
    # Fit first: the feature names are only available once the vocabulary is learned.
    matrix = vectorizer.fit_transform(documents)
    vocabulary = vectorizer.get_feature_names_out()
    return matrix, vocabulary

# Function to pad sequences (useful for models that require fixed-length input)
def pad_sequences_text(documents, max_length=50):
    """Convert ``documents`` to integer sequences of a fixed length.

    A fresh ``Tokenizer`` is fitted on ``documents``, the documents are mapped
    to sequences, and ``pad_sequences`` brings every sequence to ``max_length``
    with both padding and truncation applied at the end (``'post'``).

    Args:
        documents: Texts to encode.
        max_length (int): Target length of each sequence. Defaults to 50.

    Returns:
        The value returned by ``pad_sequences``.
    """
    text_tokenizer = Tokenizer()
    text_tokenizer.fit_on_texts(documents)
    return pad_sequences(
        text_tokenizer.texts_to_sequences(documents),
        maxlen=max_length,
        padding='post',
        truncating='post',
    )

# Apply all functionalities on the first text
text = texts[0]  # Select the first article

# --- Normalization pipeline: each stage consumes the previous stage's output ---
cleaned_text = clean_text(text)                   # Lowercase, strip URLs/punctuation
filtered_text = remove_stop_words(cleaned_text)   # Drop stop words
lemmatized_text = lemmatize_text(filtered_text)   # Reduce words to base forms

# Replace synonyms/abbreviations.
# Keys are lowercase because clean_text() lowercases the text upstream and
# replace_synonyms() does an exact-match lookup; the previous uppercase keys
# ("NLP", "AI") could never match anything.
synonym_dict = {"nlp": "Natural Language Processing", "ai": "Artificial Intelligence"}
replaced_text = replace_synonyms(lemmatized_text, synonym_dict)

# --- Analyses that need the original casing and punctuation run on `text` ---
sentences = sentence_tokenization(text)
# Use the notebook's own wrapper, mirroring sentence_tokenization() above.
words = word_tokenization(text)

# POS tagging and NER run on the original article: the normalized text has lost
# the capitalization, punctuation and function words these steps rely on.
pos_tags = pos_tagging(text)
ner_tree = named_entity_recognition(text)

# --- Vectorization runs on the fully normalized text ---
documents = [replaced_text]
tfidf_matrix, feature_names = compute_tfidf(documents)
padded_sequences = pad_sequences_text(documents)

# Print results with proper formatting
print("Original Text:\n", textwrap.fill(text, width=180))  # Original text with line wrapping
print("\nCleaned Text:\n", textwrap.fill(cleaned_text, width=180))  # Cleaned text
print("\nFiltered Text:\n", textwrap.fill(filtered_text, width=180))  # Text without stop words
print("\nLemmatized Text:\n", textwrap.fill(lemmatized_text, width=180))  # Lemmatized text
print("\nReplaced Text:\n", textwrap.fill(replaced_text, width=180))  # Text after replacing synonyms
print("\nSentences:", sentences)  # Tokenized sentences
print("\nWords:", words)  # Tokenized words
print("\nPOS Tags:\n", pos_tags)  # POS tagging output
print("\nNamed Entity Recognition:\n", ner_tree)  # NER tree
print("\nTF-IDF Matrix:\n", tfidf_matrix.toarray())  # TF-IDF matrix as an array
print("\nTF-IDF Feature Names:\n", feature_names)  # Feature names from TF-IDF
print("\nPadded Sequences:\n", padded_sequences)  # Padded sequences for models

Original Text:
 (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in
Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome
Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014."
Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As
members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to
join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesda