<a href="https://colab.research.google.com/github/puji2004-oss/training/blob/main/day7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
import os

# Download the NLTK data this script relies on.
nltk.download('wordnet')
nltk.download('omw-1.4')
# nltk.pos_tag (called by get_pos) needs the averaged-perceptron tagger model;
# without it the first tagging call raises LookupError. Newer NLTK releases
# look the model up under an "_eng"-suffixed name, so request both.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Define stemmer
stemmer = SnowballStemmer("english")

# Define lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to get part of speech tag
def get_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own (as a one-element list) with
    ``nltk.pos_tag``; only the first letter of the resulting tag is used
    to choose between adjective, verb, adverb and noun.

    Args:
        word: The token to classify.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.ADV`` or
        ``wordnet.NOUN``. Noun is the fallback for any other tag.
    """
    tagged = nltk.pos_tag([word])
    # tagged is [(word, tag)]; the tag's leading letter identifies the class.
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as nouns.
    return wordnet.NOUN

# Function to preprocess text
def preprocess(text):
    """Tokenize ``text`` and return its normalized content tokens.

    The text is split with gensim's ``simple_preprocess``. Tokens that are
    in ``STOPWORDS`` or are three characters or shorter are discarded.
    Each remaining token is lemmatized with the part of speech reported by
    ``get_pos`` and the lemma is then stemmed.

    Args:
        text: Raw input text.

    Returns:
        A list of stemmed lemmas, in the order the tokens appeared.
    """
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token in STOPWORDS or len(token) <= 3:
            continue
        # Lemmatize BEFORE stemming: the lemmatizer and the POS tagger both
        # need the real word. A stem is frequently not a dictionary entry,
        # in which case lemmatizing it would simply return it unchanged.
        lemma = lemmatizer.lemmatize(token, get_pos(token))
        result.append(stemmer.stem(lemma))
    return result

# Get the current working directory
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

# Construct the file path
file_path = os.path.join(current_directory, 'sample.txt')

# Open the file directly rather than testing os.path.exists() first: the file
# could be removed between the check and the open, so handling the failure of
# the open itself is the only race-free way to detect a missing file.
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
except FileNotFoundError:
    print(f"Error: The file 'sample.txt' was not found in the current directory: {current_directory}")
    print("Please ensure the file exists or provide the correct file path.")
else:
    # Preprocess only after a successful read, outside the try body so that
    # errors raised during preprocessing are not mistaken for a missing file.
    processed_text = preprocess(text)

    print(processed_text)

Current working directory: /content
Error: The file 'sample.txt' was not found in the current directory: /content
Please ensure the file exists or provide the correct file path.


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
