<a href="https://colab.research.google.com/github/prince02356/movie_recommendation_system/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install transformers tensorflow datasets
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, create_optimizer
import pandas as pd
from sklearn.model_selection import train_test_split

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [14]:
# Read the labelled corpus and keep only the two columns used downstream:
# the raw text and its sentiment label.
data = pd.read_csv('/content/sentiment_analysis.csv')[['text', 'sentiment']]

In [15]:
# String label -> integer class id fed to the classifier.
sentiment_mapping = {'positive': 0, 'neutral': 1, 'negative': 2}  # Replace with your actual sentiment labels

# Encode the labels. Values that are not keys of the mapping (e.g. ids produced
# by a previous run of this cell) pass through unchanged, so re-running the
# cell is a no-op instead of turning every label into NaN.
encoded_sentiment = data['sentiment'].map(
    lambda label: sentiment_mapping.get(label, label)
)

# Fail loudly on anything that is not a known class id (typos, different
# casing, missing values) rather than letting NaN labels reach training.
valid_class_ids = set(sentiment_mapping.values())
unmapped = encoded_sentiment[~encoded_sentiment.isin(valid_class_ids)]
if not unmapped.empty:
    raise ValueError(
        f"{len(unmapped)} row(s) have sentiment labels missing from "
        f"sentiment_mapping: {sorted(unmapped.astype(str).unique())}"
    )

# assign() builds a new frame instead of writing into the existing one.
data = data.assign(sentiment=encoded_sentiment.astype('int64'))

# 80/20 train/validation split with a fixed seed for reproducibility.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['text'], data['sentiment'], test_size=0.2, random_state=42
)

In [16]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are tokenised with the same settings: sequences longer than
# 128 tokens are truncated and padding is enabled.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)

train_encodings = tokenizer(train_texts.to_list(), **tokenize_kwargs)
val_encodings = tokenizer(val_texts.to_list(), **tokenize_kwargs)

In [17]:
def encode_data(encodings, labels, batch_size=16, shuffle=False, seed=42):
    """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        encodings: Tokenizer output; converted with ``dict(...)`` so each key
            becomes one named feature of the dataset.
        labels: Sequence of labels aligned with ``encodings``.
        batch_size: Number of examples per batch (defaults to the original
            hard-coded value of 16).
        shuffle: When True, examples are shuffled before batching, with a
            buffer covering the whole dataset. Intended for training data
            only; leave False for validation so evaluation order is stable.
        seed: Seed passed to the shuffle op.

    Returns:
        A ``tf.data.Dataset`` yielding ``(dict_of_features, labels)`` batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    # Skip shuffling for an empty dataset: shuffle() needs a positive buffer.
    if shuffle and len(labels) > 0:
        dataset = dataset.shuffle(buffer_size=len(labels), seed=seed)
    return dataset.batch(batch_size)

# Shuffle only the training set, so epochs do not all see the same batches
# in the same order; validation keeps its order.
train_dataset = encode_data(train_encodings, train_labels.to_list(), shuffle=True)
val_dataset = encode_data(val_encodings, val_labels.to_list())

In [18]:
strategy = tf.distribute.MirroredStrategy()

# The model, optimizer, loss and metric objects are all created inside the
# strategy scope.
with strategy.scope():
    model = TFBertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=3,
    )
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # Separate accuracy trackers for the training and validation passes.
    train_accuracy, val_accuracy = (
        tf.keras.metrics.SparseCategoricalAccuracy() for _ in range(2)
    )

# Wrap both input pipelines so the strategy can hand batches to its replicas.
train_dist_dataset, val_dist_dataset = map(
    strategy.experimental_distribute_dataset,
    (train_dataset, val_dataset),
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
@tf.function
def train_step(inputs):
    """Run one distributed optimisation step and return the batch loss.

    Args:
        inputs: One element of ``train_dist_dataset``; unpacked on each
            replica as ``(features, labels)``.

    Returns:
        Scalar tensor: the per-replica losses summed across replicas. Each
        replica's loss is pre-scaled by ``1 / num_replicas_in_sync``, so the
        sum is the mean of the replica losses (unchanged on one replica).
    """
    num_replicas = strategy.num_replicas_in_sync

    def step_fn(inputs):
        x, y = inputs
        with tf.GradientTape() as tape:
            outputs = model(x, training=True)
            # loss_fn averages over the replica-local batch, while gradients
            # are sum-aggregated across replicas when applied (per the
            # tf.distribute custom-training guide). Scaling here keeps the
            # effective gradient and the reported loss independent of the
            # number of replicas.
            loss = loss_fn(y, outputs.logits) / num_replicas
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        train_accuracy.update_state(y, outputs.logits)
        return loss

    per_replica_losses = strategy.run(step_fn, args=(inputs,))
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

@tf.function
def val_step(inputs):
    """Score one distributed validation batch.

    Runs the model in inference mode on every replica and folds the
    predictions into ``val_accuracy``. Nothing is returned; read the metric
    via ``val_accuracy.result()``.

    Args:
        inputs: One element of ``val_dist_dataset``; unpacked on each
            replica as ``(features, labels)``.
    """
    def _evaluate_on_replica(batch):
        features, labels = batch
        outputs = model(features, training=False)
        val_accuracy.update_state(labels, outputs.logits)

    strategy.run(_evaluate_on_replica, args=(inputs,))

In [20]:
EPOCHS = 3

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")

    # Clear both accuracy trackers so the figures reported below cover this
    # epoch only.
    for metric in (train_accuracy, val_accuracy):
        metric.reset_state()

    # Training pass: log the loss and running accuracy on every 100th step
    # (step 0 included).
    for step, inputs in enumerate(train_dist_dataset):
        loss = train_step(inputs)
        if not step % 100:
            print(f"Step {step}, Loss: {loss.numpy():.4f}, Accuracy: {train_accuracy.result().numpy():.4f}")

    # Validation pass: only updates val_accuracy.
    for val_batch in val_dist_dataset:
        val_step(val_batch)

    # End-of-epoch summary of both metrics.
    print(f"Epoch {epoch + 1} - Train Accuracy: {train_accuracy.result().numpy():.4f}, "
          f"Validation Accuracy: {val_accuracy.result().numpy():.4f}")


Epoch 1/3
Step 0, Loss: 1.1016, Accuracy: 0.3750
Epoch 1 - Train Accuracy: 0.3960, Validation Accuracy: 0.3600

Epoch 2/3
Step 0, Loss: 1.0969, Accuracy: 0.3750
Epoch 2 - Train Accuracy: 0.5789, Validation Accuracy: 0.4400

Epoch 3/3
Step 0, Loss: 0.8871, Accuracy: 0.5000
Epoch 3 - Train Accuracy: 0.7118, Validation Accuracy: 0.5800
