# Developing an NLP application to analyze and process text data.

## Importing necessary libraries


In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import gensim.downloader as api

## Download necessary NLTK data


In [10]:
# Fetch the NLTK resources used by the preprocessing cells below:
# the Punkt tokenizer models, the WordNet dictionary, and the stop-word lists.
nltk.download('punkt')
# Newer NLTK releases load the Punkt tokenizer from the 'punkt_tab' package
# instead of 'punkt'; without it word_tokenize() raises LookupError there.
# Downloading both keeps the notebook runnable on old and new NLTK versions.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Function to preprocess text
 

In [11]:
def preprocess_text(text, *, stem=True, lemmatize=True):
    """Tokenize ``text`` and normalise the resulting tokens.

    The steps run in this order:

    1. Lower-case the text and split it into tokens with ``word_tokenize``.
    2. Keep only tokens that are purely alphanumeric and are not in NLTK's
       English stop-word list.
    3. Optionally reduce each token to its Porter stem.
    4. Optionally lemmatize each token with ``WordNetLemmatizer``.

    Args:
        text: The input string to process.
        stem: Apply Porter stemming. Defaults to ``True``, which matches the
            original behaviour. Stems are frequently not dictionary words, so
            pass ``False`` when the tokens will be looked up in a word-embedding
            vocabulary.
        lemmatize: Apply WordNet lemmatization. Defaults to ``True``.

    Returns:
        A list of processed token strings, in their original order.
    """
    # Tokenization: split the lower-cased text into word tokens.
    tokens = word_tokenize(text.lower())

    # Stop-word removal: drop punctuation/non-alphanumeric tokens and common words.
    stop_words = set(stopwords.words('english'))
    processed = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    # Stemming: strip suffixes to reach a crude base form.
    if stem:
        stemmer = PorterStemmer()
        processed = [stemmer.stem(tok) for tok in processed]

    # Lemmatization: map each token to its dictionary form where WordNet knows one.
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        processed = [lemmatizer.lemmatize(tok) for tok in processed]

    return processed

## Function to preprocess text and get word embeddings

In [12]:
def preprocess_and_embed(text, model):
    """Return the embedding vectors for the preprocessed tokens of ``text``.

    ``text`` is first passed through ``preprocess_text``. Each resulting token
    that is contained in ``model`` (checked with ``in``) is looked up with
    ``model[token]``; tokens that are not in ``model`` are skipped, so the
    returned list can be shorter than the token list.

    Args:
        text: The input string.
        model: An object supporting ``token in model`` and ``model[token]``.

    Returns:
        A list with one looked-up value per in-vocabulary token, in token order.
    """
    vectors = []
    for token in preprocess_text(text):
        # Out-of-vocabulary tokens are ignored rather than raising on lookup.
        if token in model:
            vectors.append(model[token])
    return vectors


## Load the pre-trained word-embedding model (GloVe, `glove-wiki-gigaword-50`)

In [15]:
# Load pre-trained word vectors through gensim's downloader API.
# NOTE: despite the variable name, "glove-wiki-gigaword-50" is a GloVe model,
# not Word2Vec. The name is kept because later cells refer to it.
word2vec_model = api.load("glove-wiki-gigaword-50")



### Input text

In [17]:
# Read the text to analyse interactively from the user.
text = input("Enter the text : ")

# Show the token list produced by the preprocessing pipeline.
processed_text = preprocess_text(text)
print("Processed Text:", processed_text)

# Look up the embedding vector of a fixed example word, independent of the input.
vector = word2vec_model['king']
print("Vector for 'king':", vector)

# List the model's nearest neighbours of the example word as (word, score) pairs.
similar_words = word2vec_model.most_similar('king')
print("Words similar to 'king':", similar_words)

# Embed the user's text; tokens missing from the model's vocabulary are skipped
# by preprocess_and_embed, so there may be fewer vectors than processed tokens.
embeddings = preprocess_and_embed(text, word2vec_model)
print("Embeddings for processed text:", embeddings)

Enter the text :  Hi i am rohan thite from pune 


Processed Text: ['hi', 'rohan', 'thite', 'pune']
Vector for 'king': [ 0.50451   0.68607  -0.59517  -0.022801  0.60046  -0.13498  -0.08813
  0.47377  -0.61798  -0.31012  -0.076666  1.493    -0.034189 -0.98173
  0.68229   0.81722  -0.51874  -0.31503  -0.55809   0.66421   0.1961
 -0.13495  -0.11476  -0.30344   0.41177  -2.223    -1.0756   -1.0783
 -0.34354   0.33505   1.9927   -0.04234  -0.64319   0.71125   0.49159
  0.16754   0.34344  -0.25663  -0.8523    0.1661    0.40102   1.1685
 -1.0137   -0.21585  -0.15155   0.78321  -0.91241  -1.6106   -0.64426
 -0.51042 ]
Words similar to 'king': [('prince', 0.8236179351806641), ('queen', 0.7839043140411377), ('ii', 0.7746230363845825), ('emperor', 0.7736247777938843), ('son', 0.766719400882721), ('uncle', 0.7627150416374207), ('kingdom', 0.7542160749435425), ('throne', 0.7539913654327393), ('brother', 0.7492411136627197), ('ruler', 0.7434253692626953)]
Embeddings for processed text: [array([-0.54313  ,  0.34427  ,  0.27125  ,  1.0487   , -1.1642 

## Working and Explanation

### Text Preprocessing

### Word Embeddings