In [1]:
!pip install nltk
!pip install transformers
!pip install scikit-learn
!pip install pandas



In [2]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
import torch


In [3]:
# Toy sentiment corpus (swap in your own CSV if you have one).
# Each entry is (text, label) with 1 = Positive, 0 = Negative.
samples = [
    ("The product is amazing!", 1),
    ("I hated the movie. Total waste of time.", 0),
    ("It was okay, not the best.", 1),
    ("Absolutely loved it! Will recommend.", 1),
    ("Worst experience ever.", 0),
    ("Good quality and fast delivery.", 1),
    ("Terrible acting and storyline.", 0),
    ("Superb! Totally worth the money.", 1),
]

# Unzip the pairs into the column-oriented dict the DataFrame is built from.
texts, labels = zip(*samples)
data = {"text": list(texts), "label": list(labels)}

df = pd.DataFrame(data)
df.head()


Unnamed: 0,text,label
0,The product is amazing!,1
1,I hated the movie. Total waste of time.,0
2,"It was okay, not the best.",1
3,Absolutely loved it! Will recommend.,1
4,Worst experience ever.,0


In [4]:
# Fetch the VADER lexicon, then build the analyzer that vader_sentiment() reads
# from the notebook's global namespace.
nltk.download("vader_lexicon")
vader = SentimentIntensityAnalyzer()

def vader_sentiment(text, threshold=0.0):
    """Classify ``text`` as positive (1) or negative (0) from VADER's compound score.

    Parameters
    ----------
    text : str
        Text to score with the notebook-level ``vader`` analyzer.
    threshold : float, optional
        Smallest compound score that is labelled positive. The default of 0.0
        keeps the original behaviour, in which a compound score of exactly 0
        is labelled positive. Pass a larger value (e.g. 0.05) to require
        clearly positive text.

    Returns
    -------
    int
        1 if the compound score is >= ``threshold``, otherwise 0.
    """
    compound = vader.polarity_scores(text)['compound']
    return int(compound >= threshold)

# Label every row with VADER and keep the predictions next to the gold labels.
vader_preds = df['text'].map(vader_sentiment)
df['vader_pred'] = vader_preds

# Per-class precision / recall / F1 of VADER against the gold labels.
print("VADER Accuracy:")
vader_report = classification_report(y_true=df['label'], y_pred=df['vader_pred'])
print(vader_report)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


VADER Accuracy:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       1.00      0.80      0.89         5

    accuracy                           0.88         8
   macro avg       0.88      0.90      0.87         8
weighted avg       0.91      0.88      0.88         8



In [5]:
# Load the pre-trained sentiment pipeline with the checkpoint and revision pinned.
# These are the values the argument-less call was defaulting to (see the
# transformers log emitted by this cell), so the model is the same one, but a
# future change of the library default can no longer swap it silently.
# NOTE: this checkpoint is DistilBERT fine-tuned on SST-2, not full-size BERT.
bert_pipe = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

def bert_sentiment(text):
    """Classify ``text`` as positive (1) or negative (0) with the notebook-level ``bert_pipe``.

    Parameters
    ----------
    text : str
        Text to classify.

    Returns
    -------
    int
        1 if the pipeline's top prediction is labelled ``'POSITIVE'``,
        otherwise 0.
    """
    # truncation=True clips inputs that exceed the model's maximum sequence
    # length instead of letting the forward pass fail on long texts.
    top_prediction = bert_pipe(text, truncation=True)[0]
    return 1 if top_prediction['label'] == 'POSITIVE' else 0

# Label every row with the transformer pipeline and keep the predictions
# next to the gold labels.
bert_preds = df['text'].map(bert_sentiment)
df['bert_pred'] = bert_preds

# Per-class precision / recall / F1 of the pipeline against the gold labels.
print("BERT Accuracy:")
bert_report = classification_report(y_true=df['label'], y_pred=df['bert_pred'])
print(bert_report)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


BERT Accuracy:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       1.00      0.80      0.89         5

    accuracy                           0.88         8
   macro avg       0.88      0.90      0.87         8
weighted avg       0.91      0.88      0.88         8



In [6]:
# Hold out the test rows BEFORE building the vocabulary, so that nothing about
# the held-out texts leaks into the features the model is trained on.
y = df['label']
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.3, random_state=42
)

# Learn the bag-of-words vocabulary from the training texts only, then reuse
# that fitted vocabulary to encode the held-out texts.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Document-term matrix for all rows under the train-only vocabulary; kept so
# the name `X` stays defined for any code that still refers to it.
X = vectorizer.transform(df['text'])

# Train logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the held-out rows only
y_pred = model.predict(X_test)

# NOTE: unlike the VADER / pipeline reports, which score every row of df,
# this report covers only the held-out split (test_size=0.3).
print("Logistic Regression Accuracy:")
print(classification_report(y_test, y_pred))


Logistic Regression Accuracy:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      0.50      0.50         2

    accuracy                           0.33         3
   macro avg       0.25      0.25      0.25         3
weighted avg       0.33      0.33      0.33         3

