## Read the dataset csv file

In [1]:
!pip install accelerate --upgrade

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [4]:
import pandas as pd
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [27]:
# Separating positive and negative samples
positive_samples = df[df['sentiment'] == 'positive'].sample(n=7500, random_state=42)
negative_samples = df[df['sentiment'] == 'negative'].sample(n=7500, random_state=42)

# Concatenating the samples to get the final subset
subset_df = pd.concat([positive_samples, negative_samples])

# Shuffling the subset
subset_df = subset_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Displaying the first few rows of the subset
print(subset_df.head())

                                              review sentiment
0  As a kid, I never understood WHY anyone would ...  negative
1  ......... and you get Chori Chori Chupke Chupk...  positive
2  There were many 'spooky' westerns made in the ...  negative
3  I enjoyed this movie, granted it is mainly bec...  positive
4  This was an interesting movie. I could have do...  positive


In [28]:


# Getting the total number of samples
total_samples = subset_df.shape[0]

# Counting the number of positive and negative samples
class_counts = subset_df['sentiment'].value_counts()

# Displaying the results
print(f'Total number of samples: {total_samples}')
print(f'Number of positive samples: {class_counts["positive"]}')
print(f'Number of negative samples: {class_counts["negative"]}')

Total number of samples: 15000
Number of positive samples: 7500
Number of negative samples: 7500


## Process the data

In [30]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [31]:
def process_data(row):

    text = row['review']
    text = str(text)
    text = ' '.join(text.split())

    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=128)

    label = 0
    if row['sentiment'] == 'positive':
        label += 1

    encodings['label'] = label
    encodings['text'] = text

    return encodings

In [32]:
print(process_data({
    'review': 'this is a sample review of a movie.',
    'sentiment': 'positive'
}))

{'input_ids': [101, 2023, 2003, 1037, 7099, 3319, 1997, 1037, 3185, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label': 1, 'text': 'this is a sample review of a movie.'}


In [33]:
processed_data = []

for i in range(len(subset_df)):
    processed_data.append(process_data(subset_df.iloc[i]))

## Generate the dataset

In [34]:
from sklearn.model_selection import train_test_split

new_df = pd.DataFrame(processed_data)

train_df, valid_df = train_test_split(
    new_df,
    test_size=0.2,
    random_state=2022
)

In [35]:
import pyarrow as pa
from datasets import Dataset

train_hg = Dataset(pa.Table.from_pandas(train_df))
valid_hg = Dataset(pa.Table.from_pandas(valid_df))

## Create a model

In [70]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
from transformers import TrainingArguments
import torch
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)


from sklearn.metrics import f1_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"f1": f1_score(labels, predictions)}


from transformers import TrainerCallback

class F1ScoreCallback(TrainerCallback):
    def __init__(self):
        super().__init__()
        self.best_f1 = 0.0
        self.best_model = None

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics["eval_f1"] > self.best_f1:
            self.best_f1 = metrics["eval_f1"]
            self.best_model = kwargs["model"].to("cpu")
            # Saving the best model
            self.best_model.save_pretrained(f"./best_model_epoch_{state.epoch}")



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


from transformers import Trainer

class CustomTrainer(Trainer):
    def training_step(self, model, inputs):
        # Move the model to the GPU
        model.to(self.args.device)

        # Move all tensors in inputs to the GPU device
        inputs = {k: v.to(self.args.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
        return super().training_step(model, inputs)




# Initializing the Custom Trainer with the same parameters as before
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_hg,
    eval_dataset=valid_hg,
    compute_metrics=compute_metrics,
    callbacks=[F1ScoreCallback()]
)


## Train and Evaluate the model

In [72]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,0.308497,0.873799
2,No log,0.310922,0.873371
3,0.260800,0.372663,0.879316


TrainOutput(global_step=564, training_loss=0.24384238787576662, metrics={'train_runtime': 409.0993, 'train_samples_per_second': 87.998, 'train_steps_per_second': 1.379, 'total_flos': 1192206587904000.0, 'train_loss': 0.24384238787576662, 'epoch': 3.0})

In [73]:
from transformers import Trainer

class CustomTrainer(Trainer):
    def training_step(self, model, inputs):
        # Move the model to the GPU
        model.to(self.args.device)

        # Move all tensors in inputs to the GPU device
        inputs = {k: v.to(self.args.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
        return super().training_step(model, inputs)




# Initialize the Custom Trainer with the same parameters as before
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_hg,
    eval_dataset=valid_hg,
    compute_metrics=compute_metrics,
    callbacks=[F1ScoreCallback()]
)
trainer.evaluate()

{'eval_loss': 0.372662752866745,
 'eval_f1': 0.8793160144689246,
 'eval_runtime': 9.8727,
 'eval_samples_per_second': 303.869,
 'eval_steps_per_second': 4.761}

## Save the model

In [74]:
model.save_pretrained('./model/')

## Load the model

In [75]:
from transformers import AutoModelForSequenceClassification

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

new_model = AutoModelForSequenceClassification.from_pretrained('./model/').to(device)

In [76]:
from transformers import AutoTokenizer

new_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

## Get predictions

In [77]:
import torch
import numpy as np

def get_prediction(text, model, tokenizer):
    # Encoding the input text
    encoding = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)

    # Moving the input encoding to the same device as the model
    encoding = {k: v.to(model.device) for k, v in encoding.items()}

    # Forward pass through the model
    with torch.no_grad():
        outputs = model(**encoding)

    logits = outputs.logits

    # Converting logits to probabilities
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze())
    probs = probs.cpu().detach().numpy()

    # Determining the label using probability
    label = np.argmax(probs, axis=-1)

    if label == 1:
        return {
            'sentiment': 'Positive',
            'probability': probs[1]
        }
    else:
        return {
            'sentiment': 'Negative',
            'probability': probs[0]
        }

text_to_predict = "This is a positive example."
prediction = get_prediction(text_to_predict, new_model, new_tokenizer)
print(prediction)


{'sentiment': 'Positive', 'probability': 0.56779325}


In [78]:
get_prediction('The movie was awful.', new_model, new_tokenizer)

{'sentiment': 'Negative', 'probability': 0.9465001}