In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
import torch


# Read the test CSV from disk and keep a handle on its 'text' column,
# which is what gets tokenized further down.
test_data = pd.read_csv('data/Test.csv')
test_text = test_data.loc[:, 'text']


# Define a custom dataset class
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on demand.

    Parameters
    ----------
    texts : pandas.Series or sequence
        The documents to encode. Items are always looked up by position, so
        a Series with a non-default index works, and so does a plain
        list/tuple of strings.
    tokenizer : callable
        Hugging Face style tokenizer. It is called with the text plus the
        keyword arguments used in ``__getitem__`` and must return a mapping
        containing ``'input_ids'`` and ``'attention_mask'`` tensors.
    max_length : int
        Every item is padded/truncated to exactly this many tokens.
    """

    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position ``idx``.

        Returns a dict with flattened ``'input_ids'`` and ``'attention_mask'``
        tensors. An out-of-range ``idx`` raises ``IndexError``, which is what
        lets plain ``for sample in dataset`` iteration terminate.
        """
        # Positional access: pandas objects need .iloc (their [] is label
        # based); ordinary sequences are indexed directly.
        if hasattr(self.texts, 'iloc'):
            raw_text = self.texts.iloc[idx]
        else:
            raw_text = self.texts[idx]
        # str() keeps non-string cells (numbers, NaN) from crashing the tokenizer.
        text = str(raw_text)

        # Calling the tokenizer directly is the supported entry point;
        # `encode_plus` is deprecated in recent transformers releases.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields tensors with a leading batch axis;
        # flatten so the DataLoader can stack samples into (batch, seq).
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

# The tokenizer is loaded from the base 'distilbert-base-uncased' checkpoint,
# while the classifier is restored from the local fine-tuned directory.
# NOTE(review): this assumes fine-tuning did not alter the vocabulary — confirm.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained("./fine_tuned_model")

# Wrap the raw test text so every sample is tokenized lazily to 512 tokens.
test_dataset = NewsDataset(texts=test_text, tokenizer=tokenizer, max_length=512)




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Perform inference on new data
def predict_on_new_data(texts, classifier=None, batch_size=8):
    """Predict a class id for every (non-None) sample in ``texts``.

    Parameters
    ----------
    texts : iterable
        Already-tokenized samples; each item is a dict holding
        ``'input_ids'`` and ``'attention_mask'`` tensors (for example a
        ``NewsDataset``). ``None`` items are skipped, in which case the
        result is shorter than the input and no longer position-aligned.
    classifier : optional
        Model to run. Defaults to the notebook-level ``model``. It must
        expose ``.device`` and return an object with a ``.logits`` tensor.
    batch_size : int, default 8
        Number of samples per forward pass.

    Returns
    -------
    list of int
        One predicted class index per kept sample, as plain Python ints.
    """
    net = model if classifier is None else classifier

    # Remove None samples (this also materializes the samples into a list,
    # so any iterable is accepted).
    inference_dataset = [sample for sample in texts if sample is not None]
    # Initialize DataLoader
    inference_dataloader = DataLoader(inference_dataset, batch_size=batch_size)

    # Inference must run with dropout disabled; remember the previous mode so
    # the caller's model is handed back in the state it arrived in.
    was_training = net.training
    net.eval()
    predictions = []
    try:
        with torch.no_grad():
            for batch in inference_dataloader:
                input_ids = batch['input_ids'].to(net.device)
                attention_mask = batch['attention_mask'].to(net.device)
                outputs = net(input_ids, attention_mask=attention_mask)
                batch_predictions = torch.argmax(outputs.logits, dim=1)
                # .tolist() yields built-in ints (and copies off the GPU if
                # needed), so the result prints and serializes cleanly.
                predictions.extend(batch_predictions.tolist())
    finally:
        net.train(was_training)
    return predictions

In [3]:
# Run the classifier over the tokenized test set and show the predicted class ids.
inference_results = predict_on_new_data(texts=test_dataset)
print("Inference Results:", inference_results)

Inference Results: [1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [4]:
# Store the predictions as a new column, translating class ids to label
# names (0 -> 'Real', 1 -> 'Fake') in a single assignment.
test_data['prediction'] = pd.Series(inference_results, index=test_data.index).map({0: 'Real', 1: 'Fake'})

In [5]:
# Fraction of rows whose predicted label equals the 'Ground_Truth' column.
# NOTE(review): assumes 'Ground_Truth' uses the same 'Real'/'Fake' strings as
# the 'prediction' column — confirm against the CSV.
accuracy = accuracy_score(
    y_true=test_data['Ground_Truth'],
    y_pred=test_data['prediction'],
)
accuracy

0.8490566037735849

In [8]:
import os
from email.message import Message

import requests

# Google Drive file link
file_link = "https://drive.google.com/uc?export=download&id=122F6JBh8o_N7K4fGemdNcjjt2c7XoxMD"


def _filename_from_content_disposition(header_value, default="downloaded_file.bin"):
    """Extract a safe local file name from a Content-Disposition header.

    Returns ``default`` when the header is missing, carries no filename, or
    the filename reduces to nothing usable once directories are stripped.
    """
    if not header_value:
        return default
    # Let the stdlib parse the header: it copes with quoting, additional
    # parameters after the filename and RFC 2231 `filename*=` encoding.
    parsed = Message()
    parsed['content-disposition'] = header_value
    candidate = parsed.get_filename()
    if not candidate:
        return default
    # The header is server-controlled: drop any directory components so the
    # download can never be written outside the working directory.
    candidate = os.path.basename(candidate.replace('\\', '/'))
    if candidate in ('', '.', '..'):
        return default
    return candidate


# Send a GET request to the file link; the timeout keeps a stalled
# connection from blocking the kernel indefinitely.
response = requests.get(file_link, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Get the file name from the response headers (falls back to a default
    # name if the header is absent or unusable)
    content_disposition = response.headers.get('content-disposition')
    file_name = _filename_from_content_disposition(content_disposition)

    # Save the file to local filesystem
    with open(file_name, 'wb') as f:
        f.write(response.content)

    print(f"File '{file_name}' downloaded successfully.")
else:
    print("Failed to download the file. Status code:", response.status_code)


File 'config.json' downloaded successfully.
