In [None]:
from transformers import DistilBertTokenizer, BertForSequenceClassification
import torch

# The tokenizer files and the fine-tuned weights are both fetched from this Hugging Face repo.
# NOTE(review): a DistilBERT tokenizer class is paired with a BERT model class here —
# confirm this matches how the checkpoint was trained.
checkpoint = 'shafitanvir31/bangla-bert-finetuned-sagor2'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint)

# Prefer CUDA when it is available, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model to the chosen device and switch it to evaluation mode
model.to(device).eval()

print("Model and tokenizer loaded successfully with the default tokenizer!")



In [16]:
import pandas as pd
# Read the evaluation CSV from the Kaggle input directory
dataset_path = '/kaggle/input/test-dataset354/modified_dataset (2).csv'
new_df = pd.read_csv(dataset_path)

# Show the first rows and the total number of rows
print(new_df.head())
print('Number of samples:', len(new_df))


   ID                                              TITLE               SOURCE  \
0   1                            খেজুরবাগানে গুড় সম্মেলন          prothom alo   
1   2  টয়লেটে মুঠোফোন চালিয়ে পাইলসের ঝুঁকি বাড়াচ্ছেন ...          prothom alo   
2   3            ফারুকীকে অভিনন্দন জানিয়ে কী লিখলেন তিশা          prothom alo   
3   4  খুন হওয়ার আশঙ্কা করে ভাষণ দেওয়ার পরদিন ২৮ গুলি...          prothom alo   
4   5                             দীর্ঘ হচ্ছে লাশের সারি  bangladesh protidin   

                                             ARTICLE  LABEL          TOPIC  \
0  সকালে অতিথিদের আপ্যায়নে ছিল খেজুরের রস। পরে চা...      1     enviroment   
1  মোডে বসে মুঠোফোন চালালে যা হয়এতে দুটি বিষয় ঘটে...      1        general   
2  অন্তর্বর্তী সরকারের উপদেষ্টা হিসেবে শপথ নিলেন ...      1  entertainment   
3  ভারতের প্রভাবশালী রাজনৈতিক পরিবারে তাঁর জন্ম। ...      1       politics   
4  ডেঙ্গুজ্বরে আক্রান্ত হয়ে সারা দেশের হাসপাতালে ...      1         health   

   YEAR          BASE  
0   NaN  Common Sens

In [17]:
# Merge TITLE and ARTICLE into a single 'news' text column.
# Missing values are replaced per column *before* concatenating: in pandas,
# NaN + str yields NaN, so filling only after the merge would throw away the
# surviving title (or article) of any row where the other field is missing.
title_text = new_df['TITLE'].fillna('')
article_text = new_df['ARTICLE'].fillna('')
new_df['news'] = title_text + ' ' + article_text

# Sanity check: number of missing values left in the 'news' column (expected: 0)
print(new_df['news'].isnull().sum())

# Preview the 'news' column
print(new_df[['news']].head())


0
                                                news
0  খেজুরবাগানে গুড় সম্মেলন সকালে অতিথিদের আপ্যায়ন...
1  টয়লেটে মুঠোফোন চালিয়ে পাইলসের ঝুঁকি বাড়াচ্ছেন ...
2  ফারুকীকে অভিনন্দন জানিয়ে কী লিখলেন তিশা অন্তর্...
3  খুন হওয়ার আশঙ্কা করে ভাষণ দেওয়ার পরদিন ২৮ গুলি...
4  দীর্ঘ হচ্ছে লাশের সারি ডেঙ্গুজ্বরে আক্রান্ত হয়...


In [18]:
def tokenize_and_split(text, tokenizer, max_length):
    """Tokenize ``text`` and cut the token list into consecutive chunks.

    Every chunk holds at most ``max_length - 2`` tokens; the two spare
    positions are reserved for the [CLS] and [SEP] special tokens that are
    added when a chunk is encoded.

    Args:
        text: The string to tokenize.
        tokenizer: Object exposing ``tokenize(text)`` that returns a list of tokens.
        max_length: Total sequence length including the two special tokens.
            Must be at least 3 so that each chunk can carry one token.

    Returns:
        A list of token lists in original order. Text that produces no tokens
        yields ``[[]]`` (a single empty chunk).

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens.
    """
    chunk_size = max_length - 2  # room left once [CLS] and [SEP] are accounted for
    if chunk_size < 1:
        # A zero step would make range() fail with an unrelated message, and a
        # negative step would silently produce no chunks at all.
        raise ValueError(
            f"max_length must be at least 3 to fit [CLS], [SEP] and one token, got {max_length}"
        )

    tokens = tokenizer.tokenize(text)

    # Short (or empty) inputs are returned as one chunk
    if len(tokens) <= chunk_size:
        return [tokens]

    return [tokens[start:start + chunk_size] for start in range(0, len(tokens), chunk_size)]


In [19]:
def prepare_inference_data(df, tokenizer, max_length):
    """Turn the ``news`` column of ``df`` into padded model inputs, chunk by chunk.

    Each text is split with ``tokenize_and_split``; every chunk is converted
    back to a string and encoded with ``tokenizer.encode_plus`` (special tokens
    added, padded/truncated to ``max_length``).

    Args:
        df: DataFrame with a ``news`` column; missing values are treated as ''.
        tokenizer: Tokenizer providing ``tokenize``, ``convert_tokens_to_string``
            and ``encode_plus``.
        max_length: Sequence length of every encoded chunk.

    Returns:
        Tuple ``(input_ids, attention_masks, article_ids)``. The first two have
        one row per chunk; ``article_ids[i]`` is the 0-based row *position* in
        ``df`` of the article that chunk ``i`` came from. For a frame with no
        rows, empty tensors are returned instead of raising.
    """
    input_ids = []
    attention_masks = []
    article_ids = []  # To keep track of which chunks belong to which article

    # Row positions (not index labels) are recorded so that a lookup by
    # range(len(df)) finds every article even when df has a filtered,
    # shuffled or non-numeric index.
    for position, text in enumerate(df['news']):
        # Handle possible NaN values
        if pd.isna(text):
            text = ''

        for chunk in tokenize_and_split(text, tokenizer, max_length):
            # Reconstruct the chunk back to a string
            chunk_text = tokenizer.convert_tokens_to_string(chunk)

            # Encode the chunk
            encoding = tokenizer.encode_plus(
                chunk_text,
                add_special_tokens=True,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )

            input_ids.append(encoding['input_ids'])
            attention_masks.append(encoding['attention_mask'])
            article_ids.append(position)

    if not input_ids:
        # torch.cat() raises on an empty list, so build empty results explicitly.
        # NOTE(review): int64 is assumed to match the tokenizer's tensor dtype — confirm.
        return (
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty(0, dtype=torch.long),
        )

    # Convert lists to tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    article_ids = torch.tensor(article_ids)

    return input_ids, attention_masks, article_ids


In [20]:
# Sequence length of every encoded chunk; use the same MAX_LEN as during training
MAX_LEN = 512

# Tokenize and encode every article of the test set for inference
input_ids, attention_masks, article_ids = prepare_inference_data(
    df=new_df,
    tokenizer=tokenizer,
    max_length=MAX_LEN,
)


In [21]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

# Bundle the tensors so that each dataset item is (input_ids, attention_mask, article_id)
inference_dataset = TensorDataset(input_ids, attention_masks, article_ids)

# Items are served in their stored order; adjust batch_size to the available GPU memory
batch_size = 4
inference_dataloader = DataLoader(
    inference_dataset,
    sampler=SequentialSampler(inference_dataset),
    batch_size=batch_size,
)


In [22]:
import numpy as np
# Make sure the model is in evaluation mode before predicting
model.eval()

# Maps article id -> list of predicted class indices, one entry per chunk of that article
article_predictions = {}

for b_input_ids, b_attention_mask, b_article_ids in inference_dataloader:
    # Only the model inputs need to live on the compute device
    b_input_ids = b_input_ids.to(device)
    b_attention_mask = b_attention_mask.to(device)

    # Gradients are not needed for inference
    with torch.no_grad():
        logits = model(b_input_ids, attention_mask=b_attention_mask).logits

    # Highest-scoring class per chunk. .tolist() yields plain Python ints, so the
    # stored votes have the same type as the integer fallback used when
    # aggregating and print as 0/1 regardless of the installed NumPy version.
    preds = np.argmax(logits.cpu().numpy(), axis=1).tolist()

    for article_id, pred in zip(b_article_ids.tolist(), preds):
        article_predictions.setdefault(article_id, []).append(pred)


In [23]:
# Collapse the chunk-level votes into a single label for every article
final_predictions = []

for row_position in range(len(new_df)):
    # Articles without any recorded vote fall back to class '0'
    votes = article_predictions.get(row_position, [0])
    # Majority vote: keep the label that occurs most often among the chunks
    final_predictions.append(max(set(votes), key=votes.count))


In [24]:
# Store the aggregated label of every article in a new column
new_df['PREDICTION'] = final_predictions

# Optional: translate the numeric classes into readable names,
# e.g. when the labels are 0 and 1 and should read 'Fake' and 'Real':
# label_mapping = {0: 'Fake', 1: 'Real'}
# new_df['PREDICTION_LABEL'] = new_df['PREDICTION'].map(label_mapping)

# Show the identifying columns next to the prediction for a quick check
preview_columns = ['ID', 'TITLE', 'PREDICTION']
print(new_df[preview_columns].head())


   ID                                              TITLE  PREDICTION
0   1                            খেজুরবাগানে গুড় সম্মেলন           0
1   2  টয়লেটে মুঠোফোন চালিয়ে পাইলসের ঝুঁকি বাড়াচ্ছেন ...           1
2   3            ফারুকীকে অভিনন্দন জানিয়ে কী লিখলেন তিশা           1
3   4  খুন হওয়ার আশঙ্কা করে ভাষণ দেওয়ার পরদিন ২৮ গুলি...           0
4   5                             দীর্ঘ হচ্ছে লাশের সারি           1


In [25]:
# Destination of the result file inside the Kaggle working directory
output_file_path = '/kaggle/working/predicted_dataset.csv'

# Write the full DataFrame (including the predictions) without the row index
new_df.to_csv(path_or_buf=output_file_path, index=False)

print(f"Dataset with predictions saved to {output_file_path}")


Dataset with predictions saved to /kaggle/working/predicted_dataset.csv


In [26]:
# Display the complete list of aggregated predictions (one entry per row of new_df)
final_predictions

[0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]