In [None]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [None]:
# Location of the input CSV (presumably on a mounted Google Drive — confirm the mount step).
DATA_PATH = '/content/drive/MyDrive/SharedTask/coherence_data_with_length_and_complexity.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
data.head(n=5)

Unnamed: 0,text,model,label,source,id,coherence,complexity,length
0,". Yasuo Ōtsuka (大塚 康雄 Ōtsuka Yasuo, born Febru...",davinci,1,wikipedia,12996,18.012884,13.2,444
1,\nWe present a detailed study of the inner reg...,cohere,1,arxiv,33734,9.027633,13.7,182
2,The two most popular sports in America in the...,cohere,1,reddit,56228,22.171431,13.9,402
3,Open your iMovie program and select either a p...,dolly,1,wikihow,52940,7.956687,9.5,185
4,\nThis paper addresses the problem of training...,cohere,1,peerread,46630,14.839405,16.3,85


In [None]:
# Pull the `text` and `label` columns out of the frame as arrays.
texts, labels = (data[column].values for column in ('text', 'label'))

In [None]:

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary and weights on the training texts only, then
# transform the held-out texts with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)



In [None]:
# XLM-RoBERTa tokenizer and sequence-classification model with a two-label head.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)

# Tokenize the raw texts. Each split is padded to its longest sequence and
# truncated to the model's maximum input length; the result holds both
# `input_ids` and the matching `attention_mask`.
inputs_train = tokenizer(train_texts.tolist(), padding=True, truncation=True, return_tensors="pt")
inputs_test = tokenizer(test_texts.tolist(), padding=True, truncation=True, return_tensors="pt")

# `input_ids` are integer vocabulary indices for the embedding layer, not
# embeddings. Appending TF-IDF weights to them cannot produce valid model input:
# the extra columns are floats rather than token ids, and densifying the
# documents x vocabulary TF-IDF matrix with `.toarray()` can exhaust memory.
# The model features are therefore the token ids alone. The TF-IDF matrices
# remain available, still sparse, as `X_train_tfidf` / `X_test_tfidf`; fusing
# them with the transformer would need a separate classifier or a custom head.
# The `combined_features_*` names are kept because later cells reference them.
combined_features_train = inputs_train['input_ids']
combined_features_test = inputs_test['input_ids']



sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Create PyTorch Dataset and DataLoader
class CustomDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Args:
        features: Indexable, sized collection of per-example model inputs.
        labels: Indexable, sized collection of targets, one per feature row.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """

    def __init__(self, features, labels):
        # Fail at construction time rather than with an IndexError (or silently
        # ignored rows) part-way through an epoch.
        if len(features) != len(labels):
            raise ValueError(
                "features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with ``input_ids`` and ``labels`` keys."""
        return {'input_ids': self.features[idx], 'labels': self.labels[idx]}

# Kept at the top of the cell so its dependencies are visible before use.
from transformers import Trainer, TrainingArguments

BATCH_SIZE = 32

# Token ids come straight from the tokenizer output. `input_ids` must be integer
# vocabulary indices, so the TF-IDF-augmented `combined_features_*` tensors are
# deliberately not used as model input here.
train_dataset = CustomDataset(inputs_train['input_ids'], torch.tensor(train_labels))
test_dataset = CustomDataset(inputs_test['input_ids'], torch.tensor(test_labels))

# Stand-alone loaders. The Trainer below builds its own loaders from the datasets;
# this one is only used to derive `total_steps`.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Optimizer and Scheduler
# NOTE(review): these two objects are not handed to the Trainer, which creates its
# own optimizer and linear schedule from `training_args`. They are kept only in
# case a later cell uses them — remove them if nothing does.
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 1
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


def collate_with_attention_mask(batch):
    """Stack dataset items into a model-ready batch.

    Args:
        batch: List of dicts as returned by ``CustomDataset.__getitem__``, each
            with an ``input_ids`` tensor and a ``labels`` tensor.

    Returns:
        Dict with stacked ``input_ids`` and ``labels`` plus an ``attention_mask``
        that is 0 wherever ``input_ids`` equals the tokenizer's pad token id, so
        padding positions are not attended to.
    """
    input_ids = torch.stack([example['input_ids'] for example in batch])
    labels = torch.stack([example['labels'] for example in batch])
    attention_mask = (input_ids != tokenizer.pad_token_id).long()
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


def compute_metrics(eval_prediction):
    """Return classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_prediction: Object exposing ``predictions`` (per-class logits) and
            ``label_ids`` (true labels) as arrays.

    Returns:
        Dict with a single ``accuracy`` entry.
    """
    predicted = eval_prediction.predictions.argmax(axis=-1)
    return {'accuracy': float((predicted == eval_prediction.label_ids).mean())}


# Fine-tuning with the Trainer class
training_args = TrainingArguments(
    output_dir="./bert_fine_tuned",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=5e-5,  # explicit so the Trainer's rate matches `optimizer` above
    warmup_steps=0,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=collate_with_attention_mask,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Evaluate the model (reports loss and accuracy on the held-out split)
results = trainer.evaluate()
print(results)
