In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import pandas as pd

# Download required NLTK data: the Punkt tokenizer models, the stop-word
# corpus, the averaged-perceptron POS tagger, WordNet and the 'omw-1.4'
# package. The final call is the cell's last expression, so its return value
# is what the cell displays.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [9]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import pandas as pd

# Download all required NLTK data with robust error handling
def download_nltk_resources(fallback_dir='/root/nltk_data'):
    """Download every NLTK package this notebook needs and report the outcome.

    Each package is first downloaded to NLTK's default location. If that
    attempt fails, a second attempt targets ``fallback_dir`` and, on success,
    that directory is registered in ``nltk.data.path`` (once).

    ``nltk.download`` reports most failures by returning ``False`` instead of
    raising, so the return value is checked in addition to catching
    exceptions; otherwise a failed download would be reported as a success.

    Args:
        fallback_dir: Directory used for the second download attempt.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Package id -> location of the unpacked resource under an nltk_data
    # directory. Only the ids are needed for downloading; the locations are
    # kept as a reference for where each package ends up.
    resources = {
        'punkt': 'tokenizers/punkt',
        'stopwords': 'corpora/stopwords',
        'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
        'wordnet': 'corpora/wordnet',
        'omw-1.4': 'corpora/omw-1.4',
        'punkt_tab': 'tokenizers/punkt_tab',
        'averaged_perceptron_tagger_eng': 'taggers/averaged_perceptron_tagger_eng'
    }

    for resource in resources:
        succeeded, reason = _try_nltk_download(resource)
        if succeeded:
            print(f"Successfully downloaded {resource}")
            continue

        print(f"Failed to download {resource}: {reason}")
        # Try alternative download method
        succeeded, reason = _try_nltk_download(resource, download_dir=fallback_dir)
        if succeeded:
            # Register the directory only once so repeated runs do not keep
            # growing NLTK's search path with duplicates.
            if fallback_dir not in nltk.data.path:
                nltk.data.path.append(fallback_dir)
            print(f"Successfully downloaded {resource} to alternate location")
            continue

        print(f"Completely failed to download {resource}: {reason}")
        if resource == 'averaged_perceptron_tagger_eng':
            # This is a known issue, we can use the regular tagger
            print("Will use averaged_perceptron_tagger instead")


def _try_nltk_download(resource, **download_kwargs):
    """Run one ``nltk.download`` attempt; return ``(succeeded, reason)``.

    ``reason`` is ``None`` on success, otherwise a short description of why
    the attempt is considered failed.
    """
    try:
        succeeded = nltk.download(resource, **download_kwargs)
    except Exception as e:
        return False, str(e)
    if succeeded:
        return True, None
    return False, "nltk.download returned False"

download_nltk_resources()

# Make sure NLTK also searches /root/nltk_data. This extends NLTK's own data
# search path (nltk.data.path), not the interpreter's sys.path. The membership
# check keeps re-runs of this cell from appending duplicate entries.
if '/root/nltk_data' not in nltk.data.path:
    nltk.data.path.append('/root/nltk_data')

# Sample document
document = """
Natural language processing (NLP) is a subfield of linguistics, computer science,
and artificial intelligence concerned with the interactions between computers and human language.
It focuses on how to program computers to process and analyze large amounts of natural language data.
The result is a computer capable of understanding the contents of documents, including the contextual
nuances of the language within them. The technology can then accurately extract information and
insights contained in the documents as well as categorize and organize the documents themselves.
"""

# 1. Tokenization with error handling
def tokenize_text(text):
    """Split ``text`` into sentences, then split each sentence into words.

    Returns a ``(sentences, word_tokens)`` pair where ``word_tokens`` holds one
    list of word tokens per sentence. If the tokenizer data is missing
    (``LookupError``), the 'punkt' and 'punkt_tab' packages are downloaded and
    tokenization is attempted one more time.
    """
    def _split(raw_text):
        # Sentence split first; each sentence is then word-tokenized on its own.
        sentence_list = sent_tokenize(raw_text)
        words_per_sentence = []
        for one_sentence in sentence_list:
            words_per_sentence.append(word_tokenize(one_sentence))
        return sentence_list, words_per_sentence

    try:
        return _split(text)
    except LookupError:
        # Try to download punkt again if missing
        for package in ('punkt', 'punkt_tab'):
            nltk.download(package)
        return _split(text)

sentences, word_tokens = tokenize_text(document)
# Each heading is printed on its own line, followed by the value on the next.
print("\nSentence Tokens:", sentences, sep="\n")
print("\nWord Tokens:", word_tokens, sep="\n")

# 2. POS Tagging with fallback
def pos_tagging(tokens):
    """Part-of-speech tag every tokenized sentence, recovering from missing data.

    Args:
        tokens: Sequence of sentences, each a list of word tokens.

    Returns:
        One list of ``(token, tag)`` pairs per sentence.

    If the tagger model is missing (``LookupError``), both tagger packages
    used by this notebook are downloaded and tagging is retried. If the retry
    fails with an ordinary exception, a last attempt is made with the
    universal tagset. ``KeyboardInterrupt`` and ``SystemExit`` are never
    swallowed.
    """
    def _tag_all(**tag_kwargs):
        return [pos_tag(sentence, **tag_kwargs) for sentence in tokens]

    try:
        return _tag_all()
    except LookupError:
        print("POS tagger resource missing, trying to download...")
        # The model is published under two package ids; which one is looked
        # up depends on the installed NLTK release, so fetch both.
        for package in ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
            nltk.download(package)
        try:
            return _tag_all()
        except Exception:
            print("Still can't load POS tagger, using simplified tags")
            # Fallback to universal tagset if full tagger fails.
            # NOTE(review): the universal tagset appears to be a mapping
            # applied on top of the same tagger, so this can raise again if
            # the model is genuinely unavailable -- confirm against NLTK docs.
            nltk.download('universal_tagset')
            return _tag_all(tagset='universal')

pos_tags = pos_tagging(word_tokens)
print("\nPOS Tags:")
# One heading line per sentence, followed by that sentence's (token, tag) pairs.
for i, sentence_tags in enumerate(pos_tags):
    print(f"Sentence {i+1}:", sentence_tags, sep="\n")

# 3. Stop Words Removal
def remove_stopwords(tokens):
    """Lower-case each token and drop English stop words and punctuation.

    Args:
        tokens: Sequence of sentences, each a list of word tokens.

    Returns:
        One list of lower-cased tokens per input sentence, in the original
        order, without stop words and without tokens made up entirely of
        punctuation characters.

    Tokens such as ``'``'``, ``"''"`` or ``'...'`` are treated as punctuation
    too; a plain membership test against ``string.punctuation`` would only
    catch single-character tokens. If the stop-word corpus is missing
    (``LookupError``) it is downloaded and loaded again.
    """
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        print("Stopwords missing, downloading...")
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    punctuation = set(string.punctuation)

    def _is_punctuation(word):
        # Non-empty and every character is a punctuation mark.
        return bool(word) and all(char in punctuation for char in word)

    return [
        [word.lower() for word in sentence
         if word.lower() not in stop_words and not _is_punctuation(word)]
        for sentence in tokens
    ]

filtered_tokens = remove_stopwords(word_tokens)
# Heading on one line, the filtered token lists on the next.
print("\nAfter Stopword Removal:", filtered_tokens, sep="\n")

# 4. Stemming
def stem_words(tokens):
    """Stem every word of every tokenized sentence with a Porter stemmer.

    Returns a new list with one list of stems per input sentence.
    """
    stemmer = PorterStemmer()
    stemmed_sentences = []
    for sentence in tokens:
        stemmed_sentences.append(list(map(stemmer.stem, sentence)))
    return stemmed_sentences

stemmed_tokens = stem_words(filtered_tokens)
# Heading on one line, the stemmed token lists on the next.
print("\nAfter Stemming:", stemmed_tokens, sep="\n")

# 5. Lemmatization
def lemmatize_words(tokens):
    """Lemmatize every word of every tokenized sentence with WordNet.

    Args:
        tokens: Sequence of sentences, each a list of word tokens.

    Returns:
        One list of lemmas per input sentence. Words are lemmatized with the
        lemmatizer's default part of speech.

    If the WordNet data is missing (``LookupError``), both 'wordnet' and
    'omw-1.4' are downloaded before retrying once. This notebook treats both
    packages as required, so fetching only 'wordnet' could leave the retry
    failing for the same reason.
    """
    def _lemmatize_all():
        lemmatizer = WordNetLemmatizer()
        return [[lemmatizer.lemmatize(word) for word in sentence] for sentence in tokens]

    try:
        return _lemmatize_all()
    except LookupError:
        print("WordNet missing, downloading...")
        for package in ('wordnet', 'omw-1.4'):
            nltk.download(package)
        return _lemmatize_all()

lemmatized_tokens = lemmatize_words(filtered_tokens)
# Heading on one line, the lemmatized token lists on the next.
print("\nAfter Lemmatization:", lemmatized_tokens, sep="\n")

# 6. TF-IDF Representation
def create_tfidf_representation(sentences):
    """Build a TF-IDF table with one row per sentence.

    The result depends only on the ``sentences`` argument; no notebook-level
    variable is read, so the function behaves the same regardless of which
    cells were run before it.

    Args:
        sentences: Iterable of sentences. Each item is either a raw string,
            which is word-tokenized and cleaned with ``remove_stopwords``, or
            an already cleaned list of tokens, which is used as given.

    Returns:
        ``(tfidf_df, tfidf_vectorizer)``: a DataFrame indexed
        ``"Sentence 1"``, ``"Sentence 2"``, ... with one column per term, and
        the fitted vectorizer.

    Raises:
        ValueError: If ``sentences`` is empty.
    """
    sentences = list(sentences)
    if not sentences:
        raise ValueError("create_tfidf_representation needs at least one sentence")

    def _as_clean_text(sentence):
        # Raw strings go through the same tokenization and stop-word/punctuation
        # cleaning as the rest of the pipeline; token lists are trusted as-is.
        if isinstance(sentence, str):
            sentence = remove_stopwords([word_tokenize(sentence)])[0]
        return ' '.join(sentence)

    clean_sentences = [_as_clean_text(sentence) for sentence in sentences]

    tfidf_vectorizer = TfidfVectorizer(
        stop_words='english',
        lowercase=True,
        tokenizer=word_tokenize,
        # A custom tokenizer replaces the regex token pattern; setting it to
        # None states that explicitly instead of leaving an unused default.
        token_pattern=None,
        use_idf=True,
        norm='l2',
        smooth_idf=True
    )

    tfidf_matrix = tfidf_vectorizer.fit_transform(clean_sentences)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=feature_names,
        index=[f"Sentence {i+1}" for i in range(len(clean_sentences))]
    )

    return tfidf_df, tfidf_vectorizer

tfidf_df, vectorizer = create_tfidf_representation(sentences)
print("\nTF-IDF Representation:", tfidf_df.head(), sep="\n")

# For every sentence, list the five terms with the highest TF-IDF weight
print("\nTop Terms per Sentence:")
for i in range(len(sentences)):
    top_terms = tfidf_df.iloc[i].sort_values(ascending=False).iloc[:5]
    print(f"Sentence {i+1}: {', '.join(top_terms.index)}")

Successfully downloaded punkt
Successfully downloaded stopwords
Successfully downloaded averaged_perceptron_tagger
Successfully downloaded wordnet
Successfully downloaded omw-1.4
Successfully downloaded punkt_tab
Successfully downloaded averaged_perceptron_tagger_eng

Sentence Tokens:
['\nNatural language processing (NLP) is a subfield of linguistics, computer science, \nand artificial intelligence concerned with the interactions between computers and human language.', 'It focuses on how to program computers to process and analyze large amounts of natural language data.', 'The result is a computer capable of understanding the contents of documents, including the contextual \nnuances of the language within them.', 'The technology can then accurately extract information and \ninsights contained in the documents as well as categorize and organize the documents themselves.']

Word Tokens:
[['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'linguistics', ',

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
