<a href="https://colab.research.google.com/github/ritikpathania/project/blob/main/sentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split

# Mount Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

# Define the local path where you want to save the dataset zip file
local_zip_path = '/content/dataset.zip'

# Download the zip file from the Google Drive link
!gdown --id 1AAdD7d1TZSQE8mYFZI8_febLIwgznxhR -O "$local_zip_path"

# Specify the local directory where you want to extract the files
extraction_path = '/content/dataset'

# Create the extraction directory if it doesn't exist
import os
os.makedirs(extraction_path, exist_ok=True)

# Unzip the dataset
import zipfile
with zipfile.ZipFile('/content/gdrive/MyDrive/archive.zip', 'r') as zip_ref:
    zip_ref.extractall(extraction_path)

# List the contents of the extraction directory to verify
extracted_files = os.listdir(extraction_path)
print(f"Extracted files: {extracted_files}")

# Load the dataset from the extracted directory.
# The CSV path is derived from `dataset_dir` so the directory is spelled once.
dataset_dir = '/content/dataset'
dataset_path = f"{dataset_dir}/Twitter_Data.csv"
dataset = pd.read_csv(dataset_path)

# Fail early, with a clear message, if the columns the later cells rely on
# ('clean_text' for the input text, 'category' for the label) are absent.
required_columns = {'clean_text', 'category'}
missing_columns = required_columns - set(dataset.columns)
if missing_columns:
    raise ValueError(f"{dataset_path} is missing expected columns: {sorted(missing_columns)}")

# Split the dataset into training (80%) and validation (20%) data.
# A fixed random_state keeps the split reproducible across runs.
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Downloading...
From: https://drive.google.com/uc?id=1AAdD7d1TZSQE8mYFZI8_febLIwgznxhR
To: /content/dataset.zip
100% 20.9M/20.9M [00:00<00:00, 49.5MB/s]
Extracted files: ['__MACOSX', 'Twitter_Data.csv']


In [None]:
# Raw sentiment labels (-1, 0, 1) are shifted to the class ids 0, 1, 2
# expected by a sparse categorical loss.
LABEL_MAP = {-1: 0, 0: 1, 1: 2}


def _prepare_split(frame):
    """Return a cleaned copy of one data split, ready for tokenization.

    Rows are kept only when 'clean_text' is present and not blank and when
    the raw label is one of the keys of LABEL_MAP. The raw label is preserved
    in 'category_raw', and 'category' is rewritten as an integer class id.

    Mapping always starts from 'category_raw' when that column exists, so
    calling this again on an already-prepared frame yields the same result
    instead of remapping the labels a second time.
    """
    text = frame['clean_text']
    has_text = text.notna() & (text.astype(str).str.strip() != '')

    raw_labels = frame['category_raw'] if 'category_raw' in frame.columns else frame['category']
    class_ids = raw_labels.map(LABEL_MAP)

    keep = has_text & class_ids.notna()
    # Explicit copy: the splits are slices of `dataset`, and assigning columns
    # on a slice triggers pandas' SettingWithCopy behaviour.
    prepared = frame.loc[keep].copy()
    prepared['category_raw'] = raw_labels[keep]
    prepared['category'] = class_ids[keep].astype('int64')
    return prepared


# Apply identical cleaning to both splits so the validation data cannot carry
# missing text or missing labels into tokenization and evaluation.
train_data = _prepare_split(train_data)
val_data = _prepare_split(val_data)

In [None]:
# Pretrained checkpoint shared by the tokenizer and the classifier
PRETRAINED_NAME = "bert-base-uncased"

# Tokenize the text using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)


def _encode_texts(frame):
    """Tokenize `frame['clean_text']` and return a plain dict of TensorFlow tensors.

    Every sequence is padded or truncated to 128 tokens, and token type ids
    are not requested from the tokenizer.
    """
    encoded = tokenizer(
        list(frame['clean_text']),
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors='tf',
        return_token_type_ids=False,
    )
    # Convert to TensorFlow tensors, keyed by the tokenizer's output names
    return {name: tf.convert_to_tensor(encoded[name]) for name in encoded}


X_train = _encode_texts(train_data)
X_val = _encode_texts(val_data)

# Prepare labels
y_train = train_data['category'].to_numpy()
y_val = val_data['category'].to_numpy()

# Build the BERT-based model with a three-class classification head
model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=3)

# Compile the model; the loss is configured with from_logits=True and takes
# integer class ids as targets.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train the model: 3 passes over the training set in batches of 8, with the
# validation split scored by Keras during fitting.
fit_options = {
    'validation_data': (X_val, y_val),
    'epochs': 3,
    'batch_size': 8,
}
history = model.fit(X_train, y_train, **fit_options)

# Evaluate the model on the validation split and report both metrics
validation_metrics = model.evaluate(X_val, y_val)
loss, accuracy = validation_metrics
print(f'Validation Loss: {loss:.4f}, Validation Accuracy: {accuracy:.4f}')

Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation Loss: 0.9380, Validation Accuracy: 0.5450
