# Sentiment Analysis with Fine-Tuned BERT

This project demonstrates how to build a sentiment analysis system using a fine-tuned BERT model.

In [2]:


!pip install transformers tensorflow pandas scikit-learn matplotlib




In [4]:
# Download the sentiment CSV and store it as imdb.csv in the working directory.
# NOTE(review): the recorded output of this cell shows the server answering
# "404 Not Found", so this URL appears to be dead and the later cells that read
# imdb.csv cannot succeed as-is -- confirm the dataset location and update the URL.
!wget -O imdb.csv https://raw.githubusercontent.com/datasets/sentiment/master/data/imdb.csv


--2024-11-21 15:06:54--  https://raw.githubusercontent.com/datasets/sentiment/master/data/imdb.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2024-11-21 15:06:55 ERROR 404: Not Found.



## Step 1: Load Dataset

In [None]:

import pandas as pd

# Local path of the CSV written by the download cell (the name `url` is kept
# because it is the variable this cell has always defined).
url = "imdb.csv"

# Fail early with an actionable message. A failed `wget -O` can leave an empty
# file behind, which pandas reports as a terse "No columns to parse" error.
try:
    data = pd.read_csv(url)
except FileNotFoundError as exc:
    raise FileNotFoundError(
        f"{url!r} does not exist; run the download cell first and check that it succeeded."
    ) from exc
except pd.errors.EmptyDataError as exc:
    raise pd.errors.EmptyDataError(
        f"{url!r} is empty; the download most likely failed, check the output of the download cell."
    ) from exc

# The rest of the notebook reads exactly these two columns.
missing_columns = {"text", "label"} - set(data.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

# Last expression of the cell: rendered with the notebook's rich table display.
data.head()




## Step 2: Preprocess Text

In [None]:

import re

def preprocess_text(text):
    """Normalize raw text into lowercase, letters-and-whitespace-only form.

    Steps, in order: remove URLs (``http`` up to the next whitespace), remove
    ``@mentions``, remove every character that is not an ASCII letter or
    whitespace, then lowercase. Whitespace is never collapsed, so removed
    tokens can leave consecutive spaces behind.

    Args:
        text: The raw text. ``None`` and float ``NaN`` (a missing cell in a
            pandas column) are treated as empty text; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    # Missing values would otherwise make re.sub raise TypeError.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # URLs first, so their punctuation does not survive as stray letters.
    text = re.sub(r"http\S+", "", text)

    # @mentions must go before the non-letter filter strips the "@" marker.
    text = re.sub(r"@\w+", "", text)

    # Keep ASCII letters and whitespace only (digits and punctuation are dropped).
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    text = text.lower()
    return text

# Run every entry of the 'text' column through preprocess_text and keep the
# result next to the original in a new 'clean_text' column.
cleaned_reviews = data['text'].apply(preprocess_text)
data['clean_text'] = cleaned_reviews


## Step 3: Split Data into Train and Test Sets

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
reviews, sentiments = data['clean_text'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    sentiments,
    test_size=0.2,
    random_state=42,
)


## Step 4: Tokenization using BERT Tokenizer

In [None]:

from transformers import BertTokenizer

# Instantiate the tokenizer from the same pretrained checkpoint name that the
# model is loaded from later in the notebook.
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)


def tokenize_data(texts, max_len=128):
    """Tokenize one or more texts into fixed-length TensorFlow tensors.

    Args:
        texts: An iterable of strings (list, pandas Series, ...) or a single
            string. A lone string is wrapped in a one-element list; passing it
            to ``list()`` directly would split it into single characters and
            encode every character as its own example.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch when
        called with ``return_tensors="tf"``; the notebook reads its
        ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    batch = [texts] if isinstance(texts, str) else list(texts)
    return tokenizer(
        batch,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors="tf"
    )

# Encode the training and test texts with identical tokenizer settings.
train_tokens, test_tokens = [tokenize_data(split) for split in (X_train, X_test)]


## Step 5: Load Pre-Trained BERT and Fine-Tune

In [None]:

from transformers import TFBertForSequenceClassification


# Load TFBertForSequenceClassification from the 'bert-base-uncased' checkpoint,
# configured for two output labels.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


### Compile the Model

In [None]:

import tensorflow as tf

# from_logits=True: the loss is given the model's raw scores rather than
# probabilities. Labels are expected as integer class ids (sparse loss).
adam = tf.keras.optimizers.Adam(learning_rate=2e-5)
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=adam, loss=sparse_ce, metrics=['accuracy'])


### Train the Model

In [None]:

# Only these two tensors are passed to the model. Note that the held-out test
# split is also what gets used as validation data during training.
MODEL_INPUT_KEYS = ('input_ids', 'attention_mask')
fit_inputs = {key: train_tokens[key] for key in MODEL_INPUT_KEYS}
val_inputs = {key: test_tokens[key] for key in MODEL_INPUT_KEYS}

history = model.fit(
    x=fit_inputs,
    y=y_train,
    validation_data=(val_inputs, y_test),
    epochs=3,
    batch_size=16,
)


## Step 6: Evaluate the Model

In [None]:

# Score the model on the test split using the same two input tensors as training.
eval_inputs = {key: test_tokens[key] for key in ('input_ids', 'attention_mask')}
results = model.evaluate(x=eval_inputs, y=y_test)

# The value at index 1 is reported as the accuracy; it is scaled by 100 to
# print it as a percentage.
accuracy_pct = results[1] * 100
print("Test Accuracy: {:.2f}%".format(accuracy_pct))


## Step 7: Test the Model on Custom Inputs

In [None]:

def predict_sentiment(text):
    """Classify a single piece of text as "Positive" or "Negative".

    The text is tokenized with ``tokenize_data``, scored by the module-level
    ``model``, and the index of the largest logit selects the label: index 1
    maps to "Positive", anything else to "Negative".

    Args:
        text: The text to classify.

    Returns:
        The string "Positive" or "Negative".
    """
    encoded = tokenize_data([text])
    model_inputs = {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
    }
    prediction = model.predict(model_inputs)

    # One text went in, so row 0 of the argmax holds its predicted class.
    predicted_class = tf.argmax(prediction.logits, axis=1).numpy()[0]
    if predicted_class == 1:
        return "Positive"
    return "Negative"

# Smoke-test the classifier on one clearly positive and one clearly negative sentence.
for sample_review in (
    "I absolutely loved this product!",
    "It was a terrible experience.",
):
    print(predict_sentiment(sample_review))


## Step 8: Save the Model

In [None]:

# Save in the Hugging Face format so the directory can be reloaded with
# `from_pretrained`, and store the tokenizer alongside the weights: serving a
# prediction needs both, and `model.save` alone would not persist the tokenizer.
save_dir = 'bert_sentiment_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


## Step 9: Deploy with FastAPI (Optional)

In [None]:

from fastapi import FastAPI

app = FastAPI()


@app.post("/predict/")
def predict(text: str):
    """Return the predicted sentiment of ``text``.

    Delegates to ``predict_sentiment`` (defined in Step 7) so the endpoint and
    the notebook share a single inference path instead of duplicating the
    tokenizer settings and the label mapping.

    NOTE(review): ``text`` is declared as a plain ``str`` parameter, which
    FastAPI presumably reads from the query string even on POST -- confirm
    that this is the intended request shape.

    Args:
        text: The text to classify.

    Returns:
        A dict of the form ``{"sentiment": "Positive"}`` or
        ``{"sentiment": "Negative"}``.
    """
    return {"sentiment": predict_sentiment(text)}
