In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the merged dataset, named once so it is easy to find and change.
MERGED_CSV_PATH = '/content/merged.csv'
df = pd.read_csv(MERGED_CSV_PATH)

FileNotFoundError: [Errno 2] No such file or directory: '/content/merged.csv'

In [None]:
# Remove rows containing any missing value, then discard the 'neutral' class.
# Written as one chain so no intermediate frame is left bound to a name.
df = (
    df
    .dropna()
    .loc[lambda frame: frame['label'] != 'neutral']
)
# Cell output: how many rows remain for each label.
df['label'].value_counts()

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import os

# Keep NLTK resources in a folder next to the notebook and register that folder
# on NLTK's search path. The membership check keeps the path list free of
# duplicates when this cell is re-run.
NLTK_DATA_DIR = os.path.join(os.getcwd(), "nltk_data")
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# preprocess_text() below calls nltk.word_tokenize ('punkt_tab'),
# stopwords.words('english') ('stopwords') and WordNetLemmatizer ('wordnet').
# Downloading only 'punkt_tab' leaves the other two lookups to fail with
# LookupError on a fresh environment.
# NOTE(review): 'omw-1.4' is included because some NLTK releases require it
# when WordNet is loaded — drop it if the installed version does not need it.
for resource in ('punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, download_dir=NLTK_DATA_DIR)


# The steps below expect the DataFrame `df` (loaded and filtered above) to have
# a text column 'comment' and a class column 'label'.

# 1. Preprocessing with NLTK
# Cache for the NLTK objects used by preprocess_text. It is filled on the first
# call with a string so the stop-word list is read once, not once per row.
_PREPROCESS_RESOURCES = {}


def preprocess_text(text):
    """Normalise one raw comment into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase, strip every character that is neither a word
    character (letters, digits, underscore) nor whitespace, tokenize with
    ``nltk.word_tokenize``, drop English stop words, lemmatise each remaining
    token with ``WordNetLemmatizer``.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            a float NaN) is treated as missing.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when ``text`` is
        not a string.
    """
    if not isinstance(text, str):
        return ""

    if not _PREPROCESS_RESOURCES:
        # Both values are built before the dict is touched, so a failed corpus
        # lookup leaves the cache empty rather than half-populated.
        _PREPROCESS_RESOURCES.update(
            stop_words=frozenset(stopwords.words('english')),
            lemmatizer=WordNetLemmatizer(),
        )
    stop_words = _PREPROCESS_RESOURCES['stop_words']
    lemmatizer = _PREPROCESS_RESOURCES['lemmatizer']

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = nltk.word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Overwrite the 'comment' column with its cleaned version.
cleaned_comments = df['comment'].apply(preprocess_text)
df['comment'] = cleaned_comments

# 2. Hold out 20% of the rows for testing; stratifying on the label keeps the
#    class proportions the same in both parts.
features, targets = df['comment'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)


In [None]:
# prompt: torch transformer model to classify whether a comment is fake or real from X_train / y_train

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Tokenize the text data
def tokenize_text(text, max_length=64):
    """Encode one string into fixed-length inputs using the module-level tokenizer.

    Args:
        text: The string to encode.
        max_length: Number of token positions requested from the tokenizer,
            special tokens included. Shorter inputs are padded up to it and
            longer inputs are truncated down to it.

    Returns:
        A tuple ``(input_ids, attention_mask)`` taken from the tokenizer
        output, requested as PyTorch tensors. Callers concatenate these along
        dim 0, which relies on every call producing the same length.
    """
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,  # explicit, so no row can exceed max_length
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']

def _encode_texts(texts):
    """Tokenize each string in `texts` and concatenate the per-text tensors along dim 0."""
    ids_per_text, masks_per_text = zip(*map(tokenize_text, texts))
    return torch.cat(ids_per_text, dim=0), torch.cat(masks_per_text, dim=0)


X_train_ids, X_train_masks = _encode_texts(X_train)
X_test_ids, X_test_masks = _encode_texts(X_test)


# Map each string label to an integer class id. A label missing from the
# mapping raises KeyError.
label_mapping = {'fake': 0, 'real': 1}  # Example mapping
y_train_num, y_test_num = (
    torch.tensor([label_mapping[name] for name in split_labels])
    for split_labels in (y_train, y_test)
)


# Bundle ids, masks and labels per split, then batch them. Only the training
# loader shuffles.
BATCH_SIZE = 16

train_dataset = TensorDataset(X_train_ids, X_train_masks, y_train_num)
test_dataset = TensorDataset(X_test_ids, X_test_masks, y_test_num)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Define the classifier model
class BERT_Classifier(nn.Module):
    """BERT encoder followed by dropout and a linear layer that emits two class logits.

    Args:
        freeze_bert: If True, gradients are disabled for every encoder
            parameter, leaving only the linear head trainable.
        bert: Optional encoder module. Defaults to the module-level
            ``bert_model``. When called with ``return_dict=False`` it must
            return a pair whose second item is the pooled output with 768
            features.
    """

    def __init__(self, freeze_bert=False, bert=None):
        super().__init__()
        # NOTE: the default encoder is the shared module-level object, so every
        # instance built without `bert` trains the same underlying weights.
        self.bert = bert_model if bert is None else bert
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(768, 2)  # 2 output classes (fake/real)

    def forward(self, input_id, mask):
        """Return the raw class logits for a batch.

        Args:
            input_id: Token ids passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder as ``attention_mask``.

        Returns:
            The output of the linear head with no activation applied.
            ``nn.CrossEntropyLoss`` expects unnormalised logits; a ReLU here
            would clamp negative logits to zero and block their gradient.
        """
        _, pooled_output = self.bert(
            input_ids=input_id, attention_mask=mask, return_dict=False
        )
        return self.linear(self.dropout(pooled_output))

# Initialize the model, optimizer, and loss function
# Seed torch so the linear-head initialisation, dropout masks and batch shuffling
# repeat across runs; 42 matches the random_state used for the train/test split.
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE: BERT_Classifier wraps the shared module-level `bert_model`, so re-running
# this cell continues from the already fine-tuned encoder weights unless the
# cell that loads `bert_model` is re-run first.
model = BERT_Classifier().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

# Training loop
epochs = 3
for epoch in range(epochs):
    # from_pretrained returns the encoder in evaluation mode, so switch the
    # whole model to training mode; otherwise the encoder's dropout stays off.
    model.train()
    for input_ids, attention_masks, labels in train_loader:
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_masks)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1} complete.')


# Evaluation
# Switch every layer to inference mode first. Without this the classifier's
# dropout stays active and randomly perturbs the test predictions.
model.eval()
y_pred = []
with torch.no_grad():
    # The labels in each batch are not needed here; the report below compares
    # against y_test_num directly.
    for input_ids, attention_masks, _ in test_loader:
        outputs = model(input_ids.to(device), attention_masks.to(device))
        predicted = outputs.argmax(dim=1)  # index of the larger logit per row
        y_pred.extend(predicted.cpu().numpy())

# classification_report returns a plain-text table, so print() is the right display.
print(classification_report(y_test_num.numpy(), y_pred))
