<a href="https://colab.research.google.com/github/roggersanguzu/Anguzu-Sentiment-Analysis-AI/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab') # Add this line to download the missing resource

df = pd.read_csv('https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv')

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [w for w in tokens if w.isalpha() and w not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(w) for w in filtered_tokens])

df['cleaned_review'] = df['reviewText'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import classification_report, confusion_matrix

nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    score = analyzer.polarity_scores(text)
    return 1 if score['compound'] >= 0.05 else 0

df['vader_sentiment'] = df['cleaned_review'].apply(get_sentiment)
print(confusion_matrix(df['Positive'], df['vader_sentiment']))
print(classification_report(df['Positive'], df['vader_sentiment']))


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


[[ 2382  2385]
 [ 1473 13760]]
              precision    recall  f1-score   support

           0       0.62      0.50      0.55      4767
           1       0.85      0.90      0.88     15233

    accuracy                           0.81     20000
   macro avg       0.74      0.70      0.71     20000
weighted avg       0.80      0.81      0.80     20000



In [5]:
!pip install transformers datasets torch
from transformers import pipeline

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [6]:
import torch
torch.cuda.is_available()

True

In [7]:
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Predict
df['bert_sentiment'] = df['reviewText'].apply(lambda x: classifier(x)[0]['label'])
df['bert_sentiment'] = df['bert_sentiment'].map({'POSITIVE': 1, 'NEGATIVE': 0})

print(classification_report(df['Positive'], df['bert_sentiment']))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


              precision    recall  f1-score   support

           0       0.63      0.93      0.75      4767
           1       0.97      0.83      0.90     15233

    accuracy                           0.85     20000
   macro avg       0.80      0.88      0.82     20000
weighted avg       0.89      0.85      0.86     20000



In [8]:
from sklearn.metrics import accuracy_score

vader_acc = accuracy_score(df['Positive'], df['vader_sentiment'])
bert_acc = accuracy_score(df['Positive'], df['bert_sentiment'])

print(f"VADER Accuracy: {vader_acc:.2f}")
print(f"BERT Accuracy: {bert_acc:.2f}")


VADER Accuracy: 0.81
BERT Accuracy: 0.85


In [13]:
import gradio as gr
from transformers import pipeline
import torch

device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device)

def predict_sentiment(text):
    if not text.strip():
        return " Please enter some text."

    result = classifier(text)[0]
    label = result['label']
    score = round(result['score'], 4)

    emoji = "😃" if label == "POSITIVE" else "😠"
    verdict = f"{emoji} {label} ({score * 100:.1f}% confidence)"

    return verdict

demo = gr.Interface(
    fn=predict_sentiment,
    inputs=gr.Textbox(
        lines=5,
        placeholder="Paste your Amazon review, tweet, or customer rant here...",
        label="Enter Text"
    ),
    outputs=gr.Text(label=" Anguzu's AI Sentiment Analysis"),
    title="Anguzu Sentiment AI",
    description="""
Built by Anguzu,I believer emotions matter, and data doesn't lie.
This AI uses transformer-based deep learning to classify text as either positive or negative.
Try me on feedback, tweets, product reviews, or even your ex’s last message .

Examples:
- “I love this app  it's pure genius!”
- “Total garbage. Waste of my time.”
- “Eh... it works okay I guess.”
""",
    theme="soft",
    allow_flagging="never"
)

demo.launch()


Device set to use cuda:0


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://17799f0cc250c36220.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer.save_pretrained("./anguzu-sentiment-model")
model.save_pretrained("./anguzu-sentiment-model")
