In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Example dataset
data = {
    'text': [
        'I love this product!', 
        'This is the worst thing I have ever bought.', 
        'It is okay, neither good nor bad.',
        'Absolutely fantastic experience!', 
        'I hate it, very disappointing.', 
        'Not great, not terrible.'
    ],
    'sentiment': ['positive', 'negative', 'neutral', 'positive', 'negative', 'neutral']
}

# Convert to DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,text,sentiment
0,I love this product!,positive
1,This is the worst thing I have ever bought.,negative
2,"It is okay, neither good nor bad.",neutral
3,Absolutely fantastic experience!,positive
4,"I hate it, very disappointing.",negative
5,"Not great, not terrible.",neutral


In [16]:

# Step 1: Text Preprocessing and Vectorization
# Create the Bag-of-Words model
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])
print(X)

  (0, 9)	1
  (0, 11)	1
  (1, 14)	1
  (1, 13)	1
  (1, 2)	1
  (2, 10)	1
  (2, 6)	1
  (2, 1)	1
  (3, 0)	1
  (3, 5)	1
  (3, 4)	1
  (4, 8)	1
  (4, 3)	1
  (5, 7)	1
  (5, 12)	1


In [17]:

# Encode the sentiment labels
y = df['sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})
print(y)

0    2
1    0
2    1
3    2
4    0
5    1
Name: sentiment, dtype: int64


In [7]:

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
print(X_train)

  (0, 7)	1
  (0, 12)	1
  (1, 10)	1
  (1, 6)	1
  (1, 1)	1
  (2, 8)	1
  (2, 3)	1
  (3, 0)	1
  (3, 5)	1
  (3, 4)	1


In [19]:
print(X_test)

  (0, 9)	1
  (0, 11)	1
  (1, 14)	1
  (1, 13)	1
  (1, 2)	1


In [20]:
print(y_train)

5    1
2    1
4    0
3    2
Name: sentiment, dtype: int64


In [21]:
print(y_test)

0    2
1    0
Name: sentiment, dtype: int64


In [26]:

# Step 3: Build and train the classifier (Logistic Regression)
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)


In [27]:

# Step 4: Evaluate the model
y_pred = classifier.predict(X_test)
print(y_pred)

[1 1]


In [29]:
print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))


              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       1.0
     neutral       0.00      0.00      0.00       0.0
    positive       0.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset


In [6]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
data = {
    'text': [
        'I love this product!', 
        'This is the worst thing I have ever bought.', 
        'It is okay, neither good nor bad.',
        'Absolutely fantastic experience!', 
        'I hate it, very disappointing.', 
        'Not great, not terrible.'
    ],
    'sentiment': ['positive', 'negative', 'neutral', 'positive', 'negative', 'neutral']
}

# Convert to DataFrame
df_data = pd.DataFrame(data)
df_data

Unnamed: 0,text,sentiment
0,I love this product!,positive
1,This is the worst thing I have ever bought.,negative
2,"It is okay, neither good nor bad.",neutral
3,Absolutely fantastic experience!,positive
4,"I hate it, very disappointing.",negative
5,"Not great, not terrible.",neutral


In [11]:

# Encode the sentiment labels
df_data['sentiment'] = df_data['sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})

df_data['sentiment']

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
Name: sentiment, dtype: float64

In [12]:
df_data

Unnamed: 0,text,sentiment
0,I love this product!,
1,This is the worst thing I have ever bought.,
2,"It is okay, neither good nor bad.",
3,Absolutely fantastic experience!,
4,"I hate it, very disappointing.",
5,"Not great, not terrible.",


In [14]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df

Unnamed: 0,text,sentiment
5,"Not great, not terrible.",1
2,"It is okay, neither good nor bad.",1
4,"I hate it, very disappointing.",0
3,Absolutely fantastic experience!,2


In [15]:
# 1. Bag-of-Words Model
def bow_model(train_df, test_df):
    vectorizer = CountVectorizer(stop_words='english')
    X_train = vectorizer.fit_transform(train_df['text'])
    X_test = vectorizer.transform(test_df['text'])
    y_train = train_df['sentiment']
    y_test = test_df['sentiment']
    
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train, y_train)
    
    y_pred = classifier.predict(X_test)
    print("Bag-of-Words Model")
    print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))


In [16]:
# 2. Word Embeddings Model
def word_embeddings_model(train_df, test_df):
    stop_words = set(stopwords.words('english'))
    glove_vectors = api.load("glove-wiki-gigaword-50")
    
    def preprocess(text):
        tokens = word_tokenize(text.lower())
        tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
        return tokens

    def get_document_vector(tokens):
        vectors = [glove_vectors[word] for word in tokens if word in glove_vectors]
        return np.mean(vectors, axis=0) if vectors else np.zeros(glove_vectors.vector_size)

    train_df['tokens'] = train_df['text'].apply(preprocess)
    test_df['tokens'] = test_df['text'].apply(preprocess)
    X_train = np.vstack(train_df['tokens'].apply(get_document_vector))
    X_test = np.vstack(test_df['tokens'].apply(get_document_vector))
    y_train = train_df['sentiment']
    y_test = test_df['sentiment']
    
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train, y_train)
    
    y_pred = classifier.predict(X_test)
    print("Word Embeddings Model")
    print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))


In [25]:
# 3. Transformer Model
def transformer_model(train_df, test_df):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True)

    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)
    
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'sentiment'])
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'sentiment'])
    
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
    
    training_args = TrainingArguments(
        output_dir='./results', 
        num_train_epochs=2, 
        per_device_train_batch_size=4, 
        per_device_eval_batch_size=4, 
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs', 
        logging_steps=10,
        evaluation_strategy="epoch"
    )
    
    trainer = Trainer(
        model=model, 
        args=training_args, 
        train_dataset=train_dataset, 
        eval_dataset=test_dataset
    )
    
    trainer.train()
    
    predictions = trainer.predict(test_dataset)
    preds = np.argmax(predictions.predictions, axis=-1)
    y_test = test_df['sentiment'].values
    print("Transformer Model")
    print(classification_report(y_test, preds, target_names=['negative', 'neutral', 'positive']))



In [None]:
bow_model(train_df, test_df)
word_embeddings_model(train_df, test_df)
transformer_model(train_df, test_df)

Bag-of-Words Model
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       1.0
     neutral       0.00      0.00      0.00       0.0
    positive       0.00      0.00      0.00       1.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
