In [None]:
#Semantic Analysis

In [21]:
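# CBOW-style toy example: predict the next word from a bag-of-words of the preceding words (Naive Bayes classifier, not a neural CBOW embedding model)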
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
import numpy as np
import random

# Sample corpus of sentences
sentences = [
    "I am learning Python",
    "Python is a versatile language",
    "Learning Python is fun"
]

# Tokenize the corpus on whitespace; case is preserved, so the predicted
# words keep the spelling they have in the corpus.
tokens = [sentence.split() for sentence in sentences]

# Build (context, target) training pairs: every proper prefix of a sentence is
# a context and the word that immediately follows it is the target.
input_sequences = []
target_words = []
for tokens_in_sentence in tokens:
    for i in range(1, len(tokens_in_sentence)):
        # CountVectorizer expects strings, so the prefix is re-joined.
        input_sequences.append(' '.join(tokens_in_sentence[:i]))
        target_words.append(tokens_in_sentence[i])

# Create a CountVectorizer instance.
# The default token_pattern only keeps tokens of two or more characters, which
# silently dropped "I" and "a" (the context "I" became an all-zero row).
# This pattern keeps one-character words so the features match the
# whitespace tokenization above. Lower-casing is still applied by default.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")

# Fit and transform the input sequences to create the bag-of-words
# representation (word order inside a context is not kept).
X = vectorizer.fit_transform(input_sequences)

# Encode target words as integer class labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(target_words)

# Create a Multinomial Naive Bayes model for word prediction
classifier = MultinomialNB()
classifier.fit(X, y_encoded)

# Given a seed sentence, predict the next word.
# Words of the seed that never occurred in a training context are ignored by
# the vectorizer.
seed_sentence = "Python"
encoded_seed = vectorizer.transform([seed_sentence]).toarray()
predicted_next_word_index = classifier.predict(encoded_seed)
# Map the predicted class index back to the original word
predicted_next_word = label_encoder.inverse_transform(predicted_next_word_index)

# Print the predicted next word
print("Seed Sentence:", seed_sentence)
print("Predicted Next Word:", predicted_next_word[0])


Seed Sentence: Python
Predicted Next Word: is


In [None]:
# Skip-gram model

In [15]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec

In [16]:
# Download the NLTK movie_reviews corpus and the tokenizer data used by
# word_tokenize in a later cell.
nltk.download("movie_reviews")
nltk.download("punkt")
# Newer NLTK releases load "punkt_tab" instead of "punkt" inside
# word_tokenize; fetching both keeps the notebook runnable on old and new
# versions.
nltk.download("punkt_tab")

# Load the movie_reviews dataset as (list of tokens, category) pairs,
# one pair per review file.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
# Shuffle the dataset in place with a fixed seed so that a fresh
# "Restart & Run All" always yields the same document order. A local
# Random instance is used so the global random state is left untouched.
random.Random(42).shuffle(documents)

In [18]:
# Unzip the (tokens, category) pairs into two parallel tuples
texts, labels = zip(*documents)

# Rebuild every review as one space-separated string for the Skip-gram cell
text_str = list(map(' '.join, texts))

In [19]:
# Skip-gram model: tokenize each review string, then train Word2Vec on the
# resulting token lists.
tokenized_texts = list(map(word_tokenize, text_str))
skipgram_model = Word2Vec(
    sentences=tokenized_texts,
    sg=1,            # skip-gram mode, as named by this cell
    vector_size=100,
    window=5,
    min_count=1,     # keep every word, even those seen only once
)
print("Skip-gram model vocabulary size:", len(skipgram_model.wv))

Skip-gram model vocabulary size: 39741


In [20]:
# Probe the trained Skip-gram vectors with a pair of related words and
# print the similarity score the model reports for them.
word1, word2 = "good", "great"
print(
    "Semantic similarity between '{}' and '{}':".format(word1, word2),
    skipgram_model.wv.similarity(word1, word2),
)


Semantic similarity between 'good' and 'great': 0.76957107


In [8]:
#Sentiment Analysis

In [9]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Download the NLTK movie_reviews corpus (and the punkt tokenizer data)
nltk.download("movie_reviews")
nltk.download("punkt")

# Load the movie_reviews dataset as (list of tokens, category) pairs,
# one pair per review file.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the dataset with a fixed seed. An unseeded shuffle here changed the
# row order on every run, so the fixed random_state of train_test_split below
# did not actually make the split (or the reported accuracy) reproducible.
random.Random(42).shuffle(documents)

# Extract text and labels from the dataset
texts, labels = zip(*documents)

# Prepare data for sentiment analysis: one space-joined string per review
text_str = [' '.join(text) for text in texts]

# Bag of Words model for sentiment analysis
# NOTE(review): the vocabulary is fitted on the full corpus before the split,
# so test-set words influence the feature space; consider fitting the
# vectorizer on the training texts only.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text_str)
y = list(labels)

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Naive Bayes Classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Test the classifier on the held-out 20%
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Sentiment Analysis Classifier accuracy:", accuracy)


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Sentiment Analysis Classifier accuracy: 0.8325


In [None]:
# Display the labels predicted for the test split (the full array is shown)
y_pred

array(['pos', 'pos', 'pos', 'neg', 'pos', 'neg', 'pos', 'pos', 'pos',
       'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'neg',
       'neg', 'neg', 'pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg',
       'pos', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'neg', 'pos',
       'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos',
       'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos',
       'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'neg', 'neg', 'pos',
       'pos', 'pos', 'neg', 'neg', 'neg', 'pos', 'neg', 'pos', 'pos',
       'neg', 'neg', 'neg', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg',
       'neg', 'pos', 'neg', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg',
       'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg',
       'pos', 'neg', 'pos', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg',
       'neg', 'neg', 'pos', 'pos', 'neg', 'neg', 'pos', 'neg', 'neg',
       'pos', 'neg',