In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
import torch
from collections import Counter
from torch.utils.data import WeightedRandomSampler

# Load the dataset
df = pd.read_csv('dataset.csv',encoding='windows-1252')

# Calculate class weights to handle class imbalance
class_weights = torch.tensor([1.0 / count for count in Counter(df["category"]).values()])

# Map category labels to integers
label_map = {label: i for i, label in enumerate(df["category"].unique())}

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [5]:
!pip install datasets


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
     -------------------------------------- 547.8/547.8 kB 2.9 MB/s eta 0:00:00
Collecting multiprocess
  Downloading multiprocess-0.70.16-py39-none-any.whl (133 kB)
     -------------------------------------- 133.4/133.4 kB 7.7 MB/s eta 0:00:00
Collecting aiohttp
  Downloading aiohttp-3.9.5-cp39-cp39-win_amd64.whl (371 kB)
     -------------------------------------- 371.6/371.6 kB 5.7 MB/s eta 0:00:00
Collecting requests>=2.32.2
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
     ---------------------------------------- 64.9/64.9 kB 3.6 MB/s eta 0:00:00
Collecting xxhash
  Downloading xxhash-3.4.1-cp39-cp39-win_amd64.whl (29 kB)
Collecting pyarrow-hotfix
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting tqdm>=4.66.3
  Downloading tqdm-4.66.4-py3-none-any.whl (78 kB)
     ---------------------------------------- 78.3/78.3 kB 4.5 MB/s eta 0:00:00
Collecting frozenlist>=1.1.1
  Downloa

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
anaconda-project 0.11.1 requires ruamel-yaml, which is not installed.
gradio 4.29.0 requires urllib3~=2.0, but you have urllib3 1.25.11 which is incompatible.
conda-repo-cli 1.0.20 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.20 requires nbformat==5.4.0, but you have nbformat 5.5.0 which is incompatible.
conda-repo-cli 1.0.20 requires requests==2.28.1, but you have requests 2.32.3 which is incompatible.
anaconda-client 1.11.0 requires urllib3>=1.26.4, but you have urllib3 1.25.11 which is incompatible.


In [2]:
label_map

{'quality': 0, 'overall': 1, 'service': 2, 'price': 3}

In [3]:
# Define custom dataset class
class ReviewDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        review = self.dataframe.iloc[idx]["review"]
        category = self.dataframe.iloc[idx]["category"]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        label = torch.tensor(label_map[category])

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": label,
        }


In [5]:
# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
hf_token='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'
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_map))

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Specify the output directory
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
)

# Create datasets and dataloaders
train_dataset = ReviewDataset(train_df, tokenizer, max_length=128)
test_dataset = ReviewDataset(test_df, tokenizer, max_length=128)

# Compute weights for each sample in the dataset
class_weights = [class_weights[label_map[label]] for label in train_df["category"]]
sampler = WeightedRandomSampler(class_weights, len(train_dataset), replacement=True)

train_dataloader = DataLoader(train_dataset, batch_size=8, sampler=sampler)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Define metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)

print(results)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 180
  Batch size = 8


{'eval_loss': 0.1362021118402481, 'eval_accuracy': 0.9666666666666667, 'eval_f1': 0.9666774536295432, 'eval_precision': 0.9679906796300239, 'eval_recall': 0.9666666666666667, 'eval_runtime': 133.524, 'eval_samples_per_second': 1.348, 'eval_steps_per_second': 0.172, 'epoch': 3.0}


In [10]:
trainer.save_model()

Saving model checkpoint to tf_model.h5
Configuration saved in tf_model.h5\config.json
Model weights saved in tf_model.h5\pytorch_model.bin


In [None]:
!pip install accelerate>=0.21.0
!pip install transformers[torch]
!pip install accelerate -U

In [14]:
from transformers import TFAutoModelForSequenceClassification

model1=TFAutoModelForSequenceClassification.from_pretrained("results", num_labels=len(label_map),from_pt=True)


loading configuration file results\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading w

In [16]:
user_review = input('Enter review: ')

# Tokenize the user review (assuming you have a tokenizer)
tokenized_user_review = tokenizer(user_review, truncation=True, padding=True, return_tensors="tf")

# Make predictions
predictions = model1.predict(dict(tokenized_user_review))

# Extract the predicted probabilities for each class
predicted_probabilities = predictions[0][0]

# Get the predicted class index
predicted_class_index = predicted_probabilities.argmax()

def print_key_by_value(dict, value1):
  for key, value in dict.items():
    if value == value1:
      return key

# Example usage:
predicted_category=print_key_by_value(label_map,predicted_class_index )
# Print the predicted class index
print('predicted index',predicted_class_index)
print("Predicted category:", predicted_category)

Enter review: The service was atrocious. It took weeks to receive my order, and customer support was unresponsive.
predicted index 2
Predicted category: service
