<a href="https://colab.research.google.com/github/puji2004-oss/nlpworks/blob/main/nlp2_4_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [6]:
from IPython import get_ipython
from IPython.display import display
# %%

import nltk
import spacy
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Download the NLTK resources used below: tokenizer models, the English
# stopword list, and WordNet for lemmatization. `punkt_tab` is the tokenizer
# resource word_tokenize loads on NLTK >= 3.9; `punkt` is kept so older
# NLTK releases keep working.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Load the small English spaCy pipeline once; it is reused by
# named_entity_recognition().
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Normalise raw text into a space-joined string of lemmatized tokens.

    The text is lowercased, every non-alphanumeric character is replaced by
    a space, the result is tokenized with NLTK, English stopwords are
    dropped, and each remaining token is lemmatized with WordNet.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())
    candidate_tokens = word_tokenize(cleaned)
    english_stopwords = set(stopwords.words('english'))
    wordnet_lemmatizer = nltk.WordNetLemmatizer()

    kept = []
    for token in candidate_tokens:
        if token in english_stopwords:
            continue
        kept.append(wordnet_lemmatizer.lemmatize(token))
    return " ".join(kept)

def sentiment_analysis(text):
    """Score `text` with a freshly constructed VADER analyzer.

    Returns whatever `SentimentIntensityAnalyzer.polarity_scores` produces
    for the given text.
    """
    return SentimentIntensityAnalyzer().polarity_scores(text)

def named_entity_recognition(text):
    """Extract named entities from `text` using the module-level spaCy pipeline.

    Returns a list of (entity text, entity label) tuples, in the order the
    parsed document exposes them through `doc.ents`.
    """
    found = []
    for entity in nlp(text).ents:
        found.append((entity.text, entity.label_))
    return found

def text_classification(texts):
    """Turn a collection of texts into a dense TF-IDF feature array.

    A new TfidfVectorizer is fitted on `texts` on every call and the
    resulting matrix is converted with `.toarray()`. Despite the function
    name, no classifier is trained here; only the features are produced.
    """
    return TfidfVectorizer().fit_transform(texts).toarray()

if __name__ == "__main__":
    # Small end-to-end demo: run each helper on one sentence and print the
    # result. Each step is evaluated lazily right before it is printed.
    sample_text = "Apple Inc. is looking at buying a U.K. startup for $1 billion."

    demo_steps = (
        ("Preprocessed Text:", lambda: preprocess_text(sample_text)),
        ("Sentiment Analysis:", lambda: sentiment_analysis(sample_text)),
        ("Named Entities:", lambda: named_entity_recognition(sample_text)),
        ("TF-IDF Matrix:", lambda: text_classification([sample_text, "Google is a big company too."])),
    )
    for label, run_step in demo_steps:
        print(label, run_step())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessed Text: apple inc looking buying u k startup 1 billion
Sentiment Analysis: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Named Entities: [('Apple Inc.', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]
TF-IDF Matrix: [[0.34287126 0.34287126 0.         0.34287126 0.34287126 0.
  0.34287126 0.         0.34287126 0.24395573 0.34287126 0.34287126
  0.        ]
 [0.         0.         0.47107781 0.         0.         0.47107781
  0.         0.47107781 0.         0.33517574 0.         0.
  0.47107781]]
