In [1]:
pip install transformers torch gensim sentencepiece nltk


Defaulting to user installation because normal site-packages is not writeable
Collecting sentencepiece
  Obtaining dependency information for sentencepiece from https://files.pythonhosted.org/packages/a2/f6/587c62fd21fc988555b85351f50bbde43a51524caafd63bc69240ded14fd/sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Collecting FuzzyTM>=0.4.0 (from gensim)
  Obtaining dependency information for FuzzyTM>=0.4.0 from https://files.pythonhosted.org/packages/2d/30/074bac7a25866a2807c1005c7852c0139ac22ba837871fc01f16df29b9dc/FuzzyTM-2.0.9-py3-none-any.whl.metadata
  Downloading FuzzyTM-2.0.9-py3-none-any.whl.metadata (7.9 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim)
  Obtaining dependency information for pyfume from https://files.pythonhosted.org/packages/ed/ea/a3b120e251145dcdb10777f2bc5f18b1496fd999d705a178c1b0ad947ce1/pyFUME-0.3.4-py3-none-any.whl.metadata
  Downloading pyFUME-0.3.4-py3-none-any.whl.m

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tables 3.8.0 requires blosc2~=2.0.0, which is not installed.
tables 3.8.0 requires cython>=0.29.21, which is not installed.


In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokens = tokenizer.tokenize("Renewable energy technologies like solar panels and wind turbines are revolutionizing power generation.")
print("BERT Tokens:", tokens)


Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BERT Tokens: ['renewable', 'energy', 'technologies', 'like', 'solar', 'panels', 'and', 'wind', 'turbines', 'are', 'revolution', '##izing', 'power', 'generation', '.']


In [5]:
from tokenizers import ByteLevelBPETokenizer

# Initialize the tokenizer
tokenizer = ByteLevelBPETokenizer()

# Train on the input text (normally trained on large corpora)
tokenizer.train_from_iterator(["Renewable energy technologies like solar panels and wind turbines..."], vocab_size=1000)

# Tokenize the text
tokens = tokenizer.encode("Renewable energy technologies like solar panels and wind turbines...")
print(tokens.tokens)


['R', 'ene', 'w', 'a', 'b', 'l', 'e', 'Ġ', 'ene', 'r', 'g', 'y', 'Ġt', 'e', 'c', 'h', 'n', 'ol', 'o', 'g', 'i', 'e', 's', 'Ġ', 'l', 'i', 'k', 'e', 'Ġ', 's', 'ol', 'a', 'r', 'Ġ', 'p', 'a', 'ne', 'l', 's', 'Ġ', 'a', 'nd', 'Ġ', 'w', 'i', 'nd', 'Ġt', 'u', 'r', 'b', 'i', 'ne', 's', '..', '.']


In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("punkt")
nltk.download("stopwords")

text = "Renewable energy technologies like solar panels and wind turbines are revolutionizing power generation."
domain_specific_stopwords = {"energy", "power"}

tokens = word_tokenize(text)
filtered_tokens = [word for word in tokens if word.lower() not in domain_specific_stopwords]
print(filtered_tokens)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...


['Renewable', 'technologies', 'like', 'solar', 'panels', 'and', 'wind', 'turbines', 'are', 'revolutionizing', 'generation', '.']


[nltk_data]   Unzipping corpora\stopwords.zip.


In [7]:
from transformers import BertTokenizer, BertModel
import torch

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Tokenize and obtain embeddings
text = "Renewable energy technologies like solar panels and wind turbines..."
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
outputs = model(**inputs)

# Extract embeddings
embeddings = outputs.last_hidden_state
print(embeddings.shape)  # (batch_size, sequence_length, hidden_dim)


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

torch.Size([1, 14, 768])


In [10]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK data
nltk.download("punkt")

# Input text
text = """Renewable energy technologies like solar panels and wind turbines are revolutionizing 
power generation. Advances in battery storage enable efficient energy distribution even during 
low-production periods. Governments worldwide are investing in smart grids to optimize renewable 
resource allocation."""

# Tokenization & Preprocessing
sentences = [word_tokenize(sentence.lower()) for sentence in text.split(".") if sentence]

# Train Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Function to get similar words safely
def get_similar_words(word):
    if word in model.wv:
        return model.wv.most_similar(word, topn=5)
    else:
        return f"'{word}' not found in vocabulary"

# Get similar words
similar_solar = get_similar_words("solar")
similar_grid = get_similar_words("grid")
similar_storage = get_similar_words("storage")

# Print results
print("Similar to 'solar':", similar_solar)
print("Similar to 'grid':", similar_grid)
print("Similar to 'storage':", similar_storage)


Similar to 'solar': [('efficient', 0.3190879821777344), ('allocation', 0.17484663426876068), ('optimize', 0.1280771642923355), ('resource', 0.11968991160392761), ('turbines', 0.11134804040193558)]
Similar to 'grid': 'grid' not found in vocabulary
Similar to 'storage': [('revolutionizing', 0.2378092110157013), ('enable', 0.15626586973667145), ('turbines', 0.1560634970664978), ('renewable', 0.10841913521289825), ('and', 0.10184258222579956)]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
import nltk
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.tokenize import word_tokenize

# Download NLTK dataset if not already available
nltk.download("movie_reviews")
nltk.download("punkt")

# 1. Load and Preprocess Dataset
def extract_features(words):
    return {word: True for word in words}  # Convert words into feature dictionary

# Load positive and negative reviews
positive_reviews = [(extract_features(movie_reviews.words(fileid)), "pos") for fileid in movie_reviews.fileids("pos")]
negative_reviews = [(extract_features(movie_reviews.words(fileid)), "neg") for fileid in movie_reviews.fileids("neg")]

# Split into training and test data
train_data = positive_reviews[:800] + negative_reviews[:800]  # 1600 for training
test_data = positive_reviews[800:] + negative_reviews[800:]  # Remaining for testing

# Train Naïve Bayes Classifier
classifier = NaiveBayesClassifier.train(train_data)

# 2. Define Test Sentences
test_sentences = [
    "This eco-friendly appliance drastically reduced my electricity bills!",
    "Poor durability – the solar charger failed within two months.",
    "Innovative battery design but complex installation process."
]

# 3. Predict Sentiment for Test Sentences
print("\nTest Sentence Predictions:")
for sentence in test_sentences:
    tokens = word_tokenize(sentence.lower())
    features = extract_features(tokens)
    prediction = classifier.classify(features)
    print(f"'{sentence}' → Sentiment: {prediction}")

# 4. Calculate Accuracy
test_accuracy = accuracy(classifier, test_data)
print("\nOverall Model Accuracy:", round(test_accuracy * 100, 2), "%")


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



Test Sentence Predictions:
'This eco-friendly appliance drastically reduced my electricity bills!' → Sentiment: pos
'Poor durability – the solar charger failed within two months.' → Sentiment: neg
'Innovative battery design but complex installation process.' → Sentiment: pos

Overall Model Accuracy: 73.5 %
