In [1]:
pip install torch torchvision torchaudio tensorflow transformers datasets matplotlib

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

file_path = r"/content/glassdoor_reviews_val_reviewResponses.csv"

df = pd.read_csv(file_path)

df["reviews.val.reviewResponses.val.createDate"] = pd.to_datetime(df["reviews.val.reviewResponses.val.createDate"])
df = df.sort_values(by="reviews.val.reviewResponses.val.createDate")

df = df[["reviews.val.reviewResponses.val.responseText", "reviews.val.reviewResponses.val.helpfulCount"]].dropna()

tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df["reviews.val.reviewResponses.val.responseText"])
sequences = tokenizer.texts_to_sequences(df["reviews.val.reviewResponses.val.responseText"])

max_len = 100
X_padded = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")

y = np.array(df["reviews.val.reviewResponses.val.helpfulCount"])

X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

batch_size = 32
train_dataset = TextDataset(X_train_tensor, y_train_tensor)
test_dataset = TextDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

vocab_size = len(tokenizer.word_index) + 1

print("Data preprocessing complete.")


Data preprocessing complete.


In [5]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])
        return out

In [6]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

In [7]:
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        return out

In [9]:
def train_model(model, train_loader, test_loader, num_epochs=20):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            y_pred = model(X_batch).view(-1, 1)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    print("Training complete.")

embed_dim = 128
hidden_dim = 64
output_dim = 1

rnn_model = RNNModel(vocab_size, embed_dim, hidden_dim, output_dim)

print("Training RNN...")
train_model(rnn_model, train_loader, test_loader)

del rnn_model
torch.cuda.empty_cache()



Training RNN...
Epoch 1/20, Loss: 0.0007883246289566159
Epoch 2/20, Loss: 4.827381781069562e-05
Epoch 3/20, Loss: 1.7462046031369027e-08
Epoch 4/20, Loss: 0.0001319755392614752
Epoch 5/20, Loss: 3.6953519156668335e-05
Epoch 6/20, Loss: 5.881193283130415e-05
Epoch 7/20, Loss: 5.4073010687716305e-05
Epoch 8/20, Loss: 1.3463161849358585e-05
Epoch 9/20, Loss: 5.565949479091614e-08
Epoch 10/20, Loss: 5.718412467103917e-06
Epoch 11/20, Loss: 7.426489446515916e-06
Epoch 12/20, Loss: 1.5559136954834685e-05
Epoch 13/20, Loss: 6.766345904907212e-06
Epoch 14/20, Loss: 4.136377356189769e-06
Epoch 15/20, Loss: 3.9785727494745515e-06
Epoch 16/20, Loss: 1.340925678050553e-06
Epoch 17/20, Loss: 8.532170170383324e-08
Epoch 18/20, Loss: 6.081435458327178e-06
Epoch 19/20, Loss: 6.495471183143309e-08
Epoch 20/20, Loss: 7.872761671023909e-06
Training complete.


In [10]:
def train_model(model, train_loader, test_loader, num_epochs=20):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            y_pred = model(X_batch).view(-1, 1)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    print("Training complete.")

embed_dim = 128
hidden_dim = 64
output_dim = 1

lstm_model = LSTMModel(vocab_size, embed_dim, hidden_dim, output_dim)

print("Training LSTM...")
train_model(lstm_model, train_loader, test_loader)

del lstm_model
torch.cuda.empty_cache()

Training LSTM...
Epoch 1/20, Loss: 3.5686105093191145e-06
Epoch 2/20, Loss: 2.4379182832490187e-06
Epoch 3/20, Loss: 4.869214535574429e-05
Epoch 4/20, Loss: 0.000134305126266554
Epoch 5/20, Loss: 2.35141351367929e-06
Epoch 6/20, Loss: 2.713463982217945e-05
Epoch 7/20, Loss: 1.1034371993901004e-07
Epoch 8/20, Loss: 8.924265785026364e-06
Epoch 9/20, Loss: 4.8908723329077475e-06
Epoch 10/20, Loss: 1.6740119690439315e-06
Epoch 11/20, Loss: 2.7393266464059707e-06
Epoch 12/20, Loss: 7.663428505111369e-07
Epoch 13/20, Loss: 3.367177470181559e-08
Epoch 14/20, Loss: 8.720852662236211e-08
Epoch 15/20, Loss: 1.0420690159662627e-06
Epoch 16/20, Loss: 1.2372335334021045e-07
Epoch 17/20, Loss: 1.0936803391814465e-06
Epoch 18/20, Loss: 5.098818434134955e-08
Epoch 19/20, Loss: 1.0012775675249941e-07
Epoch 20/20, Loss: 4.10841096254444e-07
Training complete.


In [11]:
def train_model(model, train_loader, test_loader, num_epochs=20):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            y_pred = model(X_batch).view(-1, 1)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    print("Training complete.")

embed_dim = 128
hidden_dim = 64
output_dim = 1

gru_model = GRUModel(vocab_size, embed_dim, hidden_dim, output_dim)

print("Training GRU...")
train_model(gru_model, train_loader, test_loader)

del gru_model
torch.cuda.empty_cache()

Training GRU...
Epoch 1/20, Loss: 7.941242074593902e-05
Epoch 2/20, Loss: 2.641279024828691e-05
Epoch 3/20, Loss: 0.0002931212948169559
Epoch 4/20, Loss: 4.140390956308693e-06
Epoch 5/20, Loss: 5.05127536598593e-05
Epoch 6/20, Loss: 3.163808287354186e-05
Epoch 7/20, Loss: 5.368213351175655e-06
Epoch 8/20, Loss: 2.6451677967997966e-06
Epoch 9/20, Loss: 1.3264468634588411e-06
Epoch 10/20, Loss: 1.1292986528133042e-05
Epoch 11/20, Loss: 5.122758011566475e-06
Epoch 12/20, Loss: 3.825518433586694e-06
Epoch 13/20, Loss: 2.0297797163948417e-05
Epoch 14/20, Loss: 1.2213388345116982e-06
Epoch 15/20, Loss: 1.9448675629973877e-06
Epoch 16/20, Loss: 2.5541638137838163e-07
Epoch 17/20, Loss: 6.23634775820392e-07
Epoch 18/20, Loss: 5.842661721544573e-06
Epoch 19/20, Loss: 3.410814599646983e-07
Epoch 20/20, Loss: 1.9637536752270535e-07
Training complete.


In [None]:
from transformers import pipeline

explainer = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-alpha")

prompt = "Explain why LSTM performs better than RNN for long-term dependencies in time-series data."
explanation = explainer(prompt, max_length=100)

print(explanation[0]["generated_text"])
# It was showing the storage is full because of that the excecution is got stucked, i'm unable to run it


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]