<a href="https://colab.research.google.com/github/smrutipunto/NLP/blob/main/Practical_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download necessary NLTK resources (run once).
# Recent NLTK releases make word_tokenize load 'punkt_tab' instead of the
# pickled 'punkt' models; fetch both so tokenization works on either version.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

def morphological_analysis(text):
    """Tokenize `text`, drop stop words, normalize, and lemmatize as nouns.

    Steps, applied per token produced by ``word_tokenize``:
      1. skip the token if its lowercase form is an English stop word;
      2. lowercase it and strip leading/trailing punctuation;
      3. skip it if nothing is left (the token was pure punctuation);
      4. lemmatize it with WordNet using the noun part of speech.

    Args:
        text: The input string to analyze.

    Returns:
        A list of lemmatized tokens in their original order. Because the
        lemmatizer is called with ``pos='n'``, verb forms such as "playing"
        are returned unchanged.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    lemmatized_tokens = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue

        normalized = lowered.strip(string.punctuation)
        if not normalized:
            # Tokens like "," or "." collapse to "" once punctuation is
            # stripped; they carry no morphology, so leave them out.
            continue

        lemmatized_tokens.append(lemmatizer.lemmatize(normalized, pos='n'))

    return lemmatized_tokens

# Example usage: run the NLTK pipeline on a sample sentence and print the
# resulting list of lemmatized tokens.
text_sample = "The cats are playing with the ball and they are enjoying happiness"
processed_text = morphological_analysis(text_sample)
print(processed_text)


['cat', 'playing', 'ball', 'enjoying', 'happiness']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import spacy

# Load the spaCy English model "en_core_web_sm" once at module level;
# spacy_morphological_analysis() below reads this `nlp` global.
nlp = spacy.load("en_core_web_sm")

def spacy_morphological_analysis(word):
    """Return ``(text, lemma)`` pairs for every token spaCy finds in `word`.

    Args:
        word: The string to analyze; it is passed to the module-level
            `nlp` pipeline as-is.

    Returns:
        A list of ``(token.text, token.lemma_)`` tuples, one per token,
        in document order.
    """
    pairs = []
    for tok in nlp(word):
        pairs.append((tok.text, tok.lemma_))
    return pairs

# Example usage: analyze a single inflected word and print its
# (text, lemma) pairs.
word_sample = "playing"
analyzed_word = spacy_morphological_analysis(word_sample)
print(analyzed_word)

[('playing', 'play')]


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Ensure NLTK resources are downloaded.
# Recent NLTK releases make word_tokenize load 'punkt_tab' instead of the
# pickled 'punkt' models; fetch both so tokenization works on either version.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Sample for a morphological parser function
# Index reference for the slices used below, with "hello" as the example:
#   forward:  h=0, e=1, l=2, l=3, o=4
#   backward: o=-1, l=-2, l=-3, e=-4, h=-5
def morphological_parsing(word):
    """Split `word` into morphemes using a few hard-coded affix rules.

    Rules are tried in order and the first match wins:
      1. ``un`` + root + ``ness`` (word longer than 8 characters)
         -> ``['un-', root, '-ness']``
      2. root + ``ness`` -> ``[root, '-ness']``
      3. root + ``ing``  -> ``[root, 'ing']``
      4. root + ``ed``   -> ``[root, 'ed']``
      5. otherwise       -> ``[word]``

    A suffix rule only applies when the word is longer than the suffix, so
    the root is never empty.

    Args:
        word: A single (already normalized) word.

    Returns:
        A list of morpheme strings. No spelling repair is attempted on the
        root, e.g. "unhappiness" yields the root "happi".
    """
    # The prefix rule slices off both 'un' and 'ness', so it must confirm the
    # suffix is really there; otherwise words such as "understanding" would
    # lose their last four letters and gain a bogus '-ness'.
    if word.startswith('un') and word.endswith('ness') and len(word) > 8:
        return ['un-', word[2:-4], '-ness']

    if word.endswith('ness') and len(word) > 4:
        return [word[:-4], '-ness']

    if word.endswith('ing') and len(word) > 3:
        return [word[:-3], 'ing']

    if word.endswith('ed') and len(word) > 2:
        return [word[:-2], 'ed']

    # No rule matched: return the word as is.
    return [word]

# Placeholder for Finite State Transducer (FST) example
def finite_state_transducer(word):
    """Strip a plural ``s`` from `word` (toy stand-in for a real FST).

    The final ``s`` is removed only when it plausibly marks a plural:
      * the word does not end in ``ss`` ("happiness", "class" are kept), and
      * something remains after stripping (a bare "s" is kept).

    Args:
        word: A single (already normalized) word.

    Returns:
        The word without its trailing ``s``, or the word unchanged when the
        rule does not apply.
    """
    is_plural_like = (
        len(word) > 1
        and word.endswith('s')
        and not word.endswith('ss')  # '-ss' endings are not plural markers
    )
    if is_plural_like:
        return word[:-1]  # Remove plural 's'
    return word

def morphological_analysis(text):
    """Tokenize `text` and collect base forms and morphemes of its words.

    Each token from ``word_tokenize`` is skipped if its lowercase form is an
    English stop word, then lowercased and stripped of leading/trailing
    punctuation. For every remaining non-empty word the result receives:
      * the output of ``finite_state_transducer(word)``, and
      * every morpheme returned by ``morphological_parsing(word)``.

    Args:
        text: The input string to analyze.

    Returns:
        A list of the unique, non-empty processed tokens, ordered by first
        appearance so the output is reproducible between runs.
    """
    stop_words = set(stopwords.words('english'))

    processed_tokens = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue

        word = lowered.strip(string.punctuation)
        if not word:
            # Pure-punctuation tokens ("," or ".") have nothing to analyze.
            continue

        # Apply finite state transducer, then morphological parsing.
        processed_tokens.append(finite_state_transducer(word))
        processed_tokens.extend(morphological_parsing(word))

    # dict.fromkeys de-duplicates while keeping insertion order; a plain
    # set() would return the tokens in an order that varies between runs.
    return [token for token in dict.fromkeys(processed_tokens) if token]

# Example usage: run the rule-based pipeline on a sample sentence and print
# the unique base forms and morphemes it produces.
text_sample = "The cats are playing with the ball and they are enjoying No unhappiness reported"
processed_text = morphological_analysis(text_sample)
print(processed_text)


['reported', 'cat', 'enjoy', 'happi', 'cats', 'enjoying', 'report', '-ness', 'ing', 'unhappines', 'un-', 'ball', 'ed', 'play', 'playing']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
