<a href="https://colab.research.google.com/github/sharwanbagaria03/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import pandas as pd
import numpy as np

In [None]:
# Read the raw reviews from data.csv in the working directory and print the first rows.
data = pd.read_csv("data.csv")
first_rows = data.head()
print(first_rows)

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Build one text field per row in the form "<header>: <body>".
# Missing values are replaced with '' first: calling astype(str) directly on NaN
# would inject the literal token "nan" into the text later fed to the model.
data['review'] = (
    data['Review_Header'].fillna('').astype(str)
    + ': '
    + data['Review_text'].fillna('').astype(str)
)

In [None]:
# Discard the identifier/metadata columns and the two text columns already merged into 'review'.
unused_columns = ['Unique_ID', 'Category', 'Rating', 'Review_Header', 'Review_text']
data = data.drop(columns=unused_columns)

In [None]:
# Down-sample the 'Positive' class to at most 10,000 rows while keeping every
# 'Negative' and 'Neutral' row.
positive_data = data[data['Own_Rating'] == 'Positive']
negative_neutral_data = data[data['Own_Rating'].isin(['Negative', 'Neutral'])]
# DataFrame.sample (replace=False) raises ValueError when n exceeds the number of
# rows, so cap the request at what is actually available.
positive_sample_size = min(10000, len(positive_data))
positive_sampled_data = positive_data.sample(n=positive_sample_size, random_state=42)
# ignore_index=True yields a fresh 0..n-1 index, same as reset_index(drop=True).
data = pd.concat([negative_neutral_data, positive_sampled_data], ignore_index=True)

In [None]:
# Display (rows, columns) of the current frame.
data.shape

In [None]:
# Preview the first rows of the current frame.
data.head()

In [None]:
# Separate the model input (review text) from the target labels.
reviews = data['review']
sentiments = data['Own_Rating'].tolist()

### Clean text data

In [None]:
import nltk
nltk.download('wordnet')
import re
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Shared WordNet lemmatizer instance; clean() calls lemmatizer.lemmatize on every token.
lemmatizer = WordNetLemmatizer()
# NOTE(review): `ps` is not referenced anywhere else in the visible notebook — confirm before removing.
ps = PorterStemmer()
# Pre-compiled pattern: non-greedy match of everything from '<' to the next '>' (HTML-style tags).
CLEANR = re.compile('<.*?>')

def clean(review):
    """Normalise one review string before tokenisation.

    Steps: strip HTML-style tags, drop every character that is neither an
    ASCII letter nor whitespace, lower-case, and lemmatise each
    whitespace-separated token.

    Args:
        review: Raw review text (str).

    Returns:
        str: The cleaned tokens joined by single spaces.
    """
    # Replace tags with a space (not '') so "good<br/>bad" stays two words.
    review = re.sub(CLEANR, ' ', review)
    # Keep every kind of whitespace (\n, \t, ...) rather than only ' ';
    # otherwise words separated by a line break are glued together.
    # split() below collapses any run of whitespace.
    review = re.sub(r'[^a-zA-Z\s]', '', review)
    review = review.lower()
    return ' '.join(lemmatizer.lemmatize(token) for token in review.split())

In [None]:
# Run clean() over every review, then preview the first ten results.
reviews = reviews.map(clean)
reviews.head(10)

In [None]:
# Binary target: 1 where the rating is 'Positive', 0 otherwise.
# The dtype is pinned because get_dummies returns uint8 columns on pandas < 2.0 but
# bool columns on pandas >= 2.0; a fixed integer dtype keeps the label tensors and
# the exported CSV identical across versions.
y = pd.get_dummies(sentiments, dtype='int64')['Positive']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned reviews for validation; the fixed seed makes the split repeatable.
review_texts = list(reviews)
x_train, x_test, y_train, y_test = train_test_split(
    review_texts,
    y,
    test_size=0.2,
    random_state=0,
)

### Set up Transformers

In [None]:
import transformers
import tensorflow as tf

In [None]:
# Tokenization
from transformers import AutoTokenizer

# Load the tokenizer that matches the DistilBERT checkpoint.
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
# Tokenise both splits with the same settings: truncation and padding enabled.
tokenizer_options = dict(truncation=True, padding=True)

train_encodings = tokenizer(x_train, **tokenizer_options)
val_encodings = tokenizer(x_test, **tokenizer_options)

In [None]:
# Wrap each split as a tf.data.Dataset of (encoded-features dict, label) pairs.
train_features = dict(train_encodings)
val_features = dict(val_encodings)

train_dataset = tf.data.Dataset.from_tensor_slices((train_features, y_train))
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, y_test))

### Load pre-trained model

In [None]:
# Load Model
from transformers import TFDistilBertForSequenceClassification, AutoTokenizer
import tensorflow as tf

# Load the pre-trained checkpoint with num_labels=1, and build the tokenizer
# from the same checkpoint name.
checkpoint_name = 'distilbert-base-uncased'
model = TFDistilBertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=1)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

### Train model

In [None]:
# Compile for binary classification on a single raw logit.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
# The loss is told it receives logits (from_logits=True), so it applies the
# sigmoid internally.
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# The string 'accuracy' resolves to binary accuracy with a 0.5 cut-off, which is
# only correct for probabilities. On logits the equivalent decision boundary is
# 0.0 (sigmoid(0) == 0.5), so the threshold is set explicitly. The metric keeps
# the name 'accuracy' so the training log columns are unchanged.
accuracy_metric = tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])

# Train the model
model.fit(train_dataset.shuffle(100).batch(16),
          epochs=2,
          validation_data=val_dataset.shuffle(100).batch(16))


In [None]:
# Continue fine-tuning the same model for 2 more epochs.
train_batches = train_dataset.shuffle(100).batch(16)
val_batches = val_dataset.shuffle(100).batch(16)
model.fit(train_batches, epochs=2, validation_data=val_batches)

In [None]:
# Persist the fine-tuned model under /content/output.
model_output_dir = "/content/output"
model.save_pretrained(model_output_dir)

In [None]:
# Reload the saved model from disk.
# NOTE(review): `loaded_model` is not used by the later cells shown here, which keep calling `model`.
saved_model_dir = "/content/output"
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(saved_model_dir)

In [None]:
# Score the validation split without shuffling, then convert the logits to probabilities.
eval_batches = val_dataset.batch(16)
model_output = model.predict(eval_batches)
predictions = model_output.logits
predicted_probs = tf.sigmoid(predictions)

In [None]:
# Define the classification function
def classify_predictions(predictions, threshold_positive=0.7, threshold_neutral=0.3):
    """Map raw single-logit model outputs to sentiment labels.

    Each logit is passed through a sigmoid and bucketed:
      * probability >= ``threshold_positive``  -> 'Positive'
      * probability <= ``threshold_neutral``   -> 'Negative'
      * anything in between                    -> 'Neutral'

    Args:
        predictions: Logits, one per sample, shaped (n,) or (n, 1). Any
            array-like accepted by ``np.asarray`` works.
        threshold_positive: Lower probability bound for 'Positive'.
        threshold_neutral: Upper probability bound for 'Negative'. Despite the
            name, 'Neutral' is the band strictly between the two thresholds.

    Returns:
        list[str]: One label per sample, in input order.

    Raises:
        ValueError: If ``predictions`` holds more than one value per sample.
    """
    logits = np.asarray(predictions, dtype=np.float64)
    if logits.ndim >= 2 and logits.size != logits.shape[0]:
        raise ValueError(
            "classify_predictions expects one logit per sample, got shape "
            f"{logits.shape}"
        )
    logits = logits.reshape(-1)

    # Sigmoid written via tanh: mathematically identical to 1 / (1 + exp(-x))
    # but never overflows for large-magnitude logits.
    probabilities = 0.5 * (np.tanh(0.5 * logits) + 1.0)

    # Vectorised form of the if / elif / else chain; 'Positive' is tested first
    # so it wins if the two thresholds overlap.
    labels = np.where(
        probabilities >= threshold_positive,
        'Positive',
        np.where(probabilities <= threshold_neutral, 'Negative', 'Neutral'),
    )
    return labels.tolist()

In [None]:
# Label every validation logit, then put reviews, true targets and predicted labels side by side.
# NOTE(review): 'True Sentiment' carries the binary 'Positive' indicator from `y`, while
# 'Predicted Sentiment' carries one of three string labels — confirm this mismatch is intended.
classified_predictions = classify_predictions(predictions)
results_columns = {
    'Review': x_test,
    'True Sentiment': y_test,
    'Predicted Sentiment': classified_predictions,
}
results_df = pd.DataFrame(results_columns)

In [None]:
# Export the results table as CSV, leaving out the DataFrame index.
predictions_path = '/content/predictions.csv'
results_df.to_csv(predictions_path, index=False)