In [None]:
!pip install pandas

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m89.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m103.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4


In [None]:

from google.colab import drive

# Attach Google Drive to the Colab runtime so files under MyDrive are reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List the training folder on Drive to confirm the dataset CSVs are present.
!ls /content/drive/MyDrive/train/

DataSet_Misinfo_FAKE.csv  DataSet_Misinfo_TRUE.csv  shuffled_data.csv


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset


class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per item on demand.

    Args:
        dataframe: Frame providing a ``text`` column and a ``label`` column.
        tokenizer: Callable tokenizer applied to each text.
        padding_length: Every sample is padded/truncated to this many tokens.
    """

    def __init__(self, dataframe, tokenizer, padding_length=512):
        self.tokenizer = tokenizer
        self.text = dataframe['text'].tolist()
        self.labels = dataframe['label'].tolist()
        self.padding_length = padding_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.text[idx],
            padding='max_length',
            truncation=True,
            max_length=self.padding_length,
            return_tensors='pt',
        )
        # The tokenizer output is batched; [0] selects the single encoded row.
        return {
            'input_ids': encoded['input_ids'][0],
            'attention_mask': encoded['attention_mask'][0],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }


def train(dataset_path, save_dir='/content/drive/MyDrive/train', max_length=512):
    """Fine-tune ``distilbert-base-uncased`` as a 2-class text classifier.

    Reads the CSV, holds out 20% of the rows for validation (fixed
    ``random_state=42``), trains for 3 epochs with the Hugging Face
    ``Trainer`` and writes the model and tokenizer to ``save_dir``.

    Args:
        dataset_path: Path to a CSV file with ``text`` and ``label`` columns.
        save_dir: Directory that receives the fine-tuned model and tokenizer.
            Defaults to the location the notebook has always used.
        max_length: Token length every sample is padded/truncated to.

    Returns:
        tuple: ``(model, tokenizer)`` so callers can keep using the trained
        objects without reloading them from disk.
    """
    # Rows with missing text or label are dropped up front: casting NaN with
    # astype(str) would otherwise feed the literal string "nan" to the model.
    df = (
        pd.read_csv(dataset_path)
        .dropna(subset=['text', 'label'])
        .assign(text=lambda frame: frame['text'].astype(str))
        .reset_index(drop=True)
    )
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    # Model and tokenizer setup
    model_name = 'distilbert-base-uncased'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Create datasets
    train_dataset = NewsDataset(train_df, tokenizer, padding_length=max_length)
    val_dataset = NewsDataset(val_df, tokenizer, padding_length=max_length)

    # Set up the Trainer
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=32,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="epoch",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Train the model
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    return model, tokenizer


# Entry point: kick off fine-tuning on the shuffled dataset stored on Drive.
if __name__ == '__main__':
    dataset_path = '/content/drive/MyDrive/train/shuffled_data.csv'
    train(dataset_path=dataset_path)
   



Epoch,Training Loss,Validation Loss
1,0.001,0.05334
2,0.0228,0.048948
3,0.0,0.056304


In [None]:
  # train() has already written the model and tokenizer to this directory, and
  # its `model`/`tokenizer` variables are local to that function. Reload the
  # saved artifacts so they are available at notebook scope on a fresh kernel.
  model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/train')
  tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/train')