# NLP

In [None]:
#Lowercasing

In [None]:
# Demo: fold a mixed-case sentence to lower case and show the result.
text = "This is an Example Text."
lowercased_text = str.lower(text)
print(lowercased_text)

In [None]:
# Removing Special Characters

In [None]:
import re

# Compiled once so repeated calls do not look the pattern up again.
_NON_ASCII_ALNUM = re.compile(r"[^a-zA-Z0-9\s]")
# "\w" is Unicode-aware for str patterns but also matches "_", so the
# underscore is listed explicitly to keep treating it as a special character.
_NON_UNICODE_ALNUM = re.compile(r"[^\w\s]|_")


def remove_special_characters(text, *, keep_unicode=False):
    """Strip every character that is not a letter, a digit or whitespace.

    Args:
        text: The string to clean.
        keep_unicode: When False (the default) only the ASCII letters
            a-z / A-Z and the digits 0-9 survive, so accented or non-Latin
            letters are deleted as well. When True, any character that the
            regex engine classifies as a Unicode letter or digit is kept.

    Returns:
        A copy of ``text`` with the disallowed characters removed.
        Whitespace is never touched, so a removed character can leave two
        adjacent spaces behind.
    """
    pattern = _NON_UNICODE_ALNUM if keep_unicode else _NON_ASCII_ALNUM
    return pattern.sub("", text)

# Demo: clean a short sentence containing commas, an exclamation mark and a full stop.
cleaned_text = remove_special_characters(text="Hello, world! This is a test.")
print(cleaned_text)

In [None]:
# Handling Contractions

In [None]:
import contractions

def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``.

    Args:
        text: The string to process.

    Returns:
        Whatever ``contractions.fix`` produces for ``text`` -- presumably the
        same text with contractions such as "can't" written out in full
        (behaviour is defined by the third-party ``contractions`` package).
    """
    expanded = contractions.fix(text)
    return expanded

# Demo: run a sentence containing the contraction "can't" through the helper.
expanded_text = expand_contractions(text="I can't believe it!")
print(expanded_text)

In [None]:
# Removing HTML Tags

In [None]:
from bs4 import BeautifulSoup

def remove_html_tags(text):
    """Return the text content of an HTML fragment.

    ``text`` is parsed by BeautifulSoup using the "html.parser" backend and
    the result of ``get_text()`` on the parsed document is returned.

    Args:
        text: A string containing HTML markup.

    Returns:
        The string produced by ``BeautifulSoup.get_text()``.
    """
    return BeautifulSoup(text, "html.parser").get_text()

# Demo: a paragraph that contains nested bold markup.
html_text = "<p>This is <b>HTML</b> content.</p>"
cleaned_text = remove_html_tags(text=html_text)
print(cleaned_text)

In [None]:
# Removing Numbers

In [None]:
def remove_numbers(text):
    """Delete every run of digits from ``text``.

    Only the digits themselves are removed; surrounding whitespace is kept,
    so "a 12 b" becomes "a  b" (two spaces).

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without any digit characters.
    """
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# Demo: drop the number from a short sentence.
numeric_text = "There are 123 apples."
cleaned_text = remove_numbers(text=numeric_text)
print(cleaned_text)

In [None]:
# Spell Checking

In [None]:
from autocorrect import Speller

# Build one English Speller and run it over a few misspelled words,
# printing the value it returns for each.
spell = Speller(lang='en')
for misspelled in ('caaar', 'mussage', 'survice', 'hte'):
    print(spell(misspelled))

In [None]:
# Remove Stopwords

In [None]:
# Built-in stop-word list, stored as a frozenset so membership tests are O(1)
# and the collection is not rebuilt on every call.
_DEFAULT_STOPWORDS = frozenset({"this", "is", "an", "with", "some"})


def remove_stopwords(text, stopwords=None):
    """Drop stop words from ``text`` and return the remaining words.

    The text is split on whitespace and every token is compared
    case-insensitively against the stop-word collection. Tokens are compared
    as they are, so a word with punctuation attached (for example "is,") is
    not recognised as a stop word.

    Args:
        text: The string to filter.
        stopwords: Optional iterable of words to remove. When omitted, the
            small built-in list ("this", "is", "an", "with", "some") is used.
            Entries are lower-cased before matching.

    Returns:
        The surviving tokens joined by single spaces; any run of whitespace
        in the input is therefore collapsed to one space.
    """
    if stopwords is None:
        blocked = _DEFAULT_STOPWORDS
    else:
        blocked = frozenset(word.lower() for word in stopwords)
    return " ".join(token for token in text.split() if token.lower() not in blocked)

# Usage:
# Demo: filter a sentence that contains several of the built-in stop words.
cleaned_text = remove_stopwords(text="This is an example sentence with some stopwords.")
print(cleaned_text)

In [None]:
# Sentence Tokenization

In [None]:
def segment_sentences(text):
    """Split ``text`` into a list of sentences.

    A sentence boundary is any run of whitespace that directly follows ".",
    "!" or "?". The terminating punctuation stays attached to its sentence.
    This is a simple heuristic: abbreviations such as "Dr. Smith" are split
    after the period as well.

    Args:
        text: The string to segment.

    Returns:
        A list of sentence strings in their original order. Empty or
        whitespace-only input yields an empty list; text without any
        terminator is returned as a single-element list.
    """
    stripped = text.strip()
    if not stripped:
        return []
    # The look-behind keeps the terminator with the sentence it ends; only
    # the whitespace after it is consumed by the split.
    return re.split(r"(?<=[.!?])\s+", stripped)

# Demo: pass a two-sentence input to segment_sentences and print the list it returns.
input_text = "This is an example sentence. Another sentence follows."
sentences = segment_sentences(text=input_text)
print(sentences)

In [None]:
# Removing Punctuation

In [None]:
def remove_punctuation(text, punctuation_marks=None):
    """Delete punctuation characters from ``text``.

    Args:
        text: The string to clean.
        punctuation_marks: Optional iterable of single characters to delete
            (a string works too, e.g. ``string.punctuation``). When omitted,
            the six marks ``. , ! ? ; :`` are removed; quotes, brackets,
            dashes and other symbols are left in place.

    Returns:
        A copy of ``text`` with every listed character removed. All other
        characters, including whitespace, are unchanged.
    """
    if punctuation_marks is None:
        punctuation_marks = ".,!?;:"
    # A translation table deletes all listed characters in a single pass
    # instead of testing each character against a list.
    deletion_table = str.maketrans("", "", "".join(punctuation_marks))
    return text.translate(deletion_table)

# Demo: strip the trailing exclamation mark from a sample sentence.
input_text = "This is an example sentence with some punctuation!"
cleaned_text = remove_punctuation(text=input_text)
print(cleaned_text)

In [None]:
# Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Two-document toy corpus.
sentences = [
    "John likes to watch movies. Mary likes movies too.",
    "John also likes to watch football games."
]

# Fit the vectorizer on the corpus and build the document-term matrix in one step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sentences)
vocabulary = vectorizer.get_feature_names_out()

# Densify the matrix once and print one row per input sentence.
for sentence_number, counts in enumerate(X.toarray(), start=1):
    print(f"Sentence {sentence_number} vector: {counts}")

print(f"Vocabulary: {vocabulary}")

In [None]:
# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Two-document toy corpus.
sentences = [
    "John likes to watch movies. Mary likes movies too.",
    "John also likes to watch football games."
]

# Fit the vectorizer on the corpus and build the TF-IDF matrix in one step.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(sentences)
vocabulary = vectorizer.get_feature_names_out()

# Densify the matrix once and print one row per input sentence.
for sentence_number, weights in enumerate(X_tfidf.toarray(), start=1):
    print(f"Sentence {sentence_number} TF-IDF vector: {weights}")

print(f"Vocabulary: {vocabulary}")

In [None]:
# pip install gensim

In [None]:
# Word2Vec

In [None]:
from gensim.models import Word2Vec

# Raw corpus; it is tokenised below before being handed to Word2Vec.
raw_sentences = [
    "John likes to watch movies. Mary likes movies too.",
    "John also likes to watch football games."
]

# Keep only runs of word characters. A plain str.split() leaves punctuation
# attached, so "movies." and "movies" would become two unrelated vocabulary
# entries. Case is preserved. Note that "\w+" also splits on apostrophes, so
# contractions should be expanded before this step.
sentences = [re.findall(r"\w+", raw_sentence) for raw_sentence in raw_sentences]

# min_count=1 keeps every token, which is needed for a corpus this small.
# sg=1 selects the skip-gram training algorithm (see the gensim Word2Vec docs).
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)

# Get the vector representation for a specific word (e.g., 'movies')
word_vector = model.wv['movies']
print(f"Vector for 'movies': {word_vector}")


In [None]:
# Part-of-Speech Tagging (POS) with spaCy

In [None]:
# pip install spacy
# python -m spacy download en_core_web_sm
import spacy

# Load the "en_core_web_sm" pipeline and run it over a two-sentence sample.
nlp = spacy.load("en_core_web_sm")

sentence = "John likes to watch movies. Mary likes movies too."
doc = nlp(sentence)

# Header line, then one row per token: its text followed by its pos_ tag.
print("Token\tPOS Tag")
for token_text, pos_tag in ((token.text, token.pos_) for token in doc):
    print(token_text, "\t", pos_tag)

In [None]:
# Named Entity Recognition (NER) with spaCy

In [None]:
import spacy

# Load the "en_core_web_sm" pipeline and run it over a sample sentence.
nlp = spacy.load("en_core_web_sm")

sentence = "Barack Obama was born in Hawaii and served as the 44th President of the United States."
doc = nlp(sentence)

# Heading line, then one row per entity in doc.ents: its text and its label.
print("Named Entities:")
for entity_text, entity_label in ((ent.text, ent.label_) for ent in doc.ents):
    print(entity_text, "\t", entity_label)

In [None]:
'''ORG: Stands for "Organization." It refers to named entities that represent companies, institutions, or other organized groups.

DATE: Represents expressions of dates or periods in time.

TIME: Denotes expressions of time, including specific times of the day or durations.

MONEY: Indicates expressions of monetary values, including currencies and amounts.

PERCENT: Refers to expressions of percentages, such as "10%" or "50 percent."

CARDINAL: Represents numerals that do not fall under another entity type, such as plain counts and quantities (e.g., "three" or "1,000").

FAC: Stands for "Facility." It refers to named entities that represent buildings, airports, highways, bridges, etc.

NORP: Denotes named entities representing nationalities, ethnic groups, or religious groups.

PRODUCT: Represents named entities that are products or goods, such as "iPhone" or "Coca-Cola."

EVENT: Indicates named entities representing events, such as "World War II" or "Super Bowl."'''

In [None]:
# Sentiment Analysis

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Score a short review with VADER and report the overall polarity.
text = "I love this product! It's amazing."

analyzer = SentimentIntensityAnalyzer()
sentiment_scores = analyzer.polarity_scores(text)
print("Sentiment Scores:", sentiment_scores)

# Classify on the 'compound' score: >= 0.05 is positive, <= -0.05 is
# negative, anything in between is neutral.
compound = sentiment_scores['compound']
if compound >= 0.05:
    verdict = "Overall Sentiment: Positive"
elif compound <= -0.05:
    verdict = "Overall Sentiment: Negative"
else:
    verdict = "Overall Sentiment: Neutral"
print(verdict)


In [1]:
# Text classification

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 Newsgroups data (all subsets, all categories), shuffled with a fixed seed.
data = fetch_20newsgroups(subset='all', categories=None, shuffle=True, random_state=42)

# Hold out 20% of the documents for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

# Fit the TF-IDF vectorizer (capped at 5000 features) on the training split
# only, then reuse the fitted vectorizer to transform the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a multinomial Naive Bayes classifier and predict the held-out labels.
clf = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)