In [1]:
!pip install transformers
!pip install transformers[torch]
!pip install accelerate -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m99.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m121.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [1]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import Trainer, TrainingArguments


train_df = pd.read_csv('filtered_data_colab_clean.csv')

# Ensure that the text data is string
train_df['Description'] = train_df['Description'].astype(str)

train_texts = train_df['Description'].tolist()
train_labels = train_df['Policy_type'].tolist()

# Use a pre-trained RoBERTa tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

# Tokenize the texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True)

# Define a PyTorch dataset
class TosdrDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create the dataset
train_dataset = TosdrDataset(train_encodings, train_labels)

# Initialize a pre-trained RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Define a trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()







Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should pr

Step,Training Loss
10,1.6501
20,1.634
30,1.6137
40,1.5143
50,1.2788
60,0.9631
70,0.7954
80,0.6125
90,0.3366
100,0.2781


TrainOutput(global_step=1256, training_loss=0.10946027759084802, metrics={'train_runtime': 469.682, 'train_samples_per_second': 42.761, 'train_steps_per_second': 2.674, 'total_flos': 5284464773246976.0, 'train_loss': 0.10946027759084802, 'epoch': 2.0})

In [2]:
from sklearn.metrics import classification_report
import numpy as np

# Load and preprocess the test data
test_df = pd.read_csv('test_data_clean.csv')

# Ensure that the text data is string
test_df['Description'] = test_df['Description'].astype(str)

test_texts = test_df['Description'].tolist()
test_labels = test_df['Policy_type'].tolist()  # true labels

# Define a dictionary for mapping labels
label_mapping = {0: "Terms of Service", 
                 1: "Privacy Policy", 
                 2: "Cookie Policy", 
                 3: "Data Policy", 
                 4: "Unknown Policy"}


test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Create the test dataset
test_dataset = TosdrDataset(test_encodings, test_labels)

# Predict the labels for the test data
predictions = trainer.predict(test_dataset)

# Convert the predictions to labels
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Print the predicted labels
print(predicted_labels)

# Print the classification report
print(classification_report(test_labels, predicted_labels))

policy_names = [label_mapping[label] for label in predicted_labels]
test_df['Predicted Policy'] = policy_names

# Save DataFrame to a new CSV file
test_df.to_csv('test_with_policy_roberta.csv', index=False)


[1 0 0 ... 2 2 2]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1491
           1       1.00      1.00      1.00      1888
           2       0.99      1.00      1.00       101
           3       0.68      0.94      0.79        16
           4       0.95      0.95      0.95       215

    accuracy                           0.99      3711
   macro avg       0.92      0.98      0.95      3711
weighted avg       0.99      0.99      0.99      3711



In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Convert the predictions to labels for roberta model
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Calculate metrics
accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels, average='weighted')
recall = recall_score(test_labels, predicted_labels, average='weighted')
f1 = f1_score(test_labels, predicted_labels, average='weighted')

print(f"Roberta Model Metrics: Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")


Roberta Model Metrics: Accuracy: 0.9954190245216923, Precision: 0.9954639814836846, Recall: 0.9954190245216923, F1: 0.9954334227166756


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import pandas as pd

# Load and preprocess the training data
train_df = pd.read_csv('filtered_data_colab.csv')
train_df['Description'] = train_df['Description'].astype(str)
texts = train_df['Description'].tolist()
labels = train_df['Policy_type'].tolist()

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Train the LinearSVC model
model_svc = LinearSVC()
model_svc.fit(X_train, y_train)

# # Predict the labels for the test data
# y_pred = model.predict(X_test)

# # Print the classification report
# print(classification_report(y_test, y_pred))

from sklearn.metrics import classification_report

# Load and preprocess the test data
test_df = pd.read_csv('other_data_colab.csv')
test_df['Description'] = test_df['Description'].astype(str)

test_texts = test_df['Description'].tolist()
test_labels = test_df['Policy_type'].tolist()  # true labels

# Transform the test data using the TF-IDF vectorizer
X_test = vectorizer.transform(test_texts)

# Predict the labels for the test data
predicted_labels = model_svc.predict(X_test)

# Define the label mapping
label_mapping = {
    0: "Terms of Service", 
    1: "Privacy Policy", 
    2: "Cookie Policy", 
    3: "Data Policy", 
    4: "Unknown Policy"
}

# Convert numerical labels to textual labels
predicted_policies = [label_mapping[label] for label in predicted_labels]

# Print the classification report
print(classification_report(test_labels, predicted_labels))

test_df['Predicted Policy'] = predicted_policies

# Save DataFrame to a new CSV file
test_df.to_csv('test_with_policy_svm.csv', index=False)



              precision    recall  f1-score   support

           0       0.93      0.99      0.96      1491
           1       0.95      0.98      0.96      1888
           2       0.83      0.77      0.80       101
           3       0.00      0.00      0.00        16
           4       0.92      0.41      0.57       215

    accuracy                           0.94      3711
   macro avg       0.73      0.63      0.66      3711
weighted avg       0.93      0.94      0.93      3711



In [8]:
# Predict the labels for the test data for svm model
predicted_labels = model_svc.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels, average='weighted')
recall = recall_score(test_labels, predicted_labels, average='weighted')
f1 = f1_score(test_labels, predicted_labels, average='weighted')

print(f"LinearSVC Model Metrics: Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")


LinearSVC Model Metrics: Accuracy: 0.9377526273241714, Precision: 0.9333502226242438, Recall: 0.9377526273241714, F1: 0.9295988862638168


In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Load and preprocess the training data
train_df = pd.read_csv('filtered_data_colab.csv')
train_df['Description'] = train_df['Description'].astype(str)

train_texts = train_df['Description'].tolist()
train_labels = train_df['Policy_type'].tolist()

# Vectorize the training texts
X_train = vectorizer.fit_transform(train_texts)

# Train a KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, train_labels)

# Load and preprocess the test data
test_df = pd.read_csv('other_data_colab.csv')
test_df['Description'] = test_df['Description'].astype(str)

test_texts = test_df['Description'].tolist()
test_labels = test_df['Policy_type'].tolist()

# Vectorize the test texts
X_test = vectorizer.transform(test_texts)

# Predict the labels for the test data
predicted_labels = knn.predict(X_test)

# Define the label mapping
label_mapping = {
    0: "Terms of Service", 
    1: "Privacy Policy", 
    2: "Cookie Policy", 
    3: "Data Policy", 
    4: "Unknown Policy"
}

# Convert numerical labels to textual labels
predicted_policies = [label_mapping[label] for label in predicted_labels]

# Print the classification report
print(classification_report(test_labels, predicted_labels))

test_df['Predicted Policy'] = predicted_policies

# Save DataFrame to a new CSV file
test_df.to_csv('test_with_policy_KN.csv', index=False)


              precision    recall  f1-score   support

           0       0.78      0.92      0.85      1491
           1       0.85      0.85      0.85      1888
           2       0.52      0.23      0.32       101
           3       0.00      0.00      0.00        16
           4       1.00      0.06      0.11       215

    accuracy                           0.81      3711
   macro avg       0.63      0.41      0.43      3711
weighted avg       0.82      0.81      0.79      3711



In [9]:
# Predict the labels for the test data for knn model
predicted_labels = knn.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels, average='weighted')
recall = recall_score(test_labels, predicted_labels, average='weighted')
f1 = f1_score(test_labels, predicted_labels, average='weighted')

print(f"KNN Model Metrics: Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")


KNN Model Metrics: Accuracy: 0.8127189436809485, Precision: 0.8177732510154699, Recall: 0.8127189436809485, F1: 0.787112240642588


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import Trainer, TrainingArguments

# Load and preprocess the training data
train_df = pd.read_csv('filtered_data_colab.csv')

# Ensure that the text data is string
train_df['Description'] = train_df['Description'].astype(str)

train_texts = train_df['Description'].tolist()
train_labels = train_df['Policy_type'].tolist()

# Use the pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained('mukund/privbert')

# Tokenize the texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True)

# Define a PyTorch dataset
class TosdrDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create the dataset
train_dataset = TosdrDataset(train_encodings, train_labels)

# Initialize the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained('mukund/privbert', num_labels=5)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Define a trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Train the model
trainer.train()

# Load and preprocess the test data
test_df = pd.read_csv('other_data_colab.csv')
test_df['Description'] = test_df['Description'].astype(str)

test_texts = test_df['Description'].tolist()
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# Create the test dataset
test_dataset = TosdrDataset(test_encodings, test_labels)

# Predict the labels for the test data
predictions = trainer.predict(test_dataset)

# Convert the predictions to labels
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Print the predicted labels
print(predicted_labels)

# Print the classification report
print(classification_report(test_labels, predicted_labels))

policy_names = [label_mapping[label] for label in predicted_labels]
test_df['Predicted Policy'] = policy_names

# Save DataFrame to a new CSV file
test_df.to_csv('test_with_policy_privbert.csv', index=False)


Downloading (…)okenizer_config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at mukund/privbert were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at mukund/privbert and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifie

Step,Training Loss
10,1.544
20,1.5557
30,1.5126
40,1.4583
50,1.358
60,1.1191
70,0.9036
80,0.6621
90,0.3986
100,0.3472


[1 0 0 ... 2 2 2]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      1491
           1       1.00      1.00      1.00      1888
           2       1.00      1.00      1.00       101
           3       0.88      0.94      0.91        16
           4       0.97      0.97      0.97       215

    accuracy                           1.00      3711
   macro avg       0.97      0.98      0.97      3711
weighted avg       1.00      1.00      1.00      3711



In [10]:
# Convert the predictions to labels for privbert model
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Calculate metrics
accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels, average='weighted')
recall = recall_score(test_labels, predicted_labels, average='weighted')
f1 = f1_score(test_labels, predicted_labels, average='weighted')

print(f"PrivBert Model Metrics: Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")


PrivBert Model Metrics: Accuracy: 0.9954190245216923, Precision: 0.9954639814836846, Recall: 0.9954190245216923, F1: 0.9954334227166756
