# Imports

In [None]:
import os
import re
import nltk
import pandas as pd
import torch
import torch.nn as nn
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from torch.utils.data import Dataset
from transformers import BertModel, BertTokenizer
import matplotlib.pyplot as plt

In [None]:
# Fetch the NLTK resources the cleaning step relies on: the Punkt tokenizer
# models (word_tokenize), WordNet (WordNetLemmatizer) and the stop-word lists.
nltk.download('punkt')
# Newer NLTK releases (3.8.2 and later) load the Punkt tokenizer from the
# 'punkt_tab' package instead of the pickled 'punkt' one, so fetch both to
# keep word_tokenize working on old and new installations.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

# Preparing Data

## Reading

### Gun Violence

In [None]:
# Load the two gun-violence CSVs (context spam / not context spam), discard
# the tweet_id column of each, and stack them into a single frame whose index
# is renumbered from 0.
gun_context_spam = pd.read_csv(
    'dataset/gun-violence/context-spam/published_data_spam-MLJ-2022_gun-violence_context-spam_context_spam_5000_no_link.csv'
).drop(columns=['tweet_id'])
gun_not_context_spam = pd.read_csv(
    'dataset/gun-violence/context-spam/published_data_spam-MLJ-2022_gun-violence_context-spam_not_context_spam_5000_no_link.csv'
).drop(columns=['tweet_id'])
gun = pd.concat([gun_context_spam, gun_not_context_spam], ignore_index=True)

In [None]:
# Show the combined gun-violence frame as the cell output.
gun

### MeToo

In [None]:
# Load the two MeToo CSVs (context spam / not context spam), discard the
# tweet_id column of each, and stack them into a single frame whose index is
# renumbered from 0.
metoo_context_spam = pd.read_csv(
    'dataset/metoo/context-spam/published_data_spam-MLJ-2022_metoo_context-spam_context_spam_5000_no_link.csv'
).drop(columns=['tweet_id'])
metoo_not_context_spam = pd.read_csv(
    'dataset/metoo/context-spam/published_data_spam-MLJ-2022_metoo_context-spam_not_context_spam_5000_no_link.csv'
).drop(columns=['tweet_id'])
metoo = pd.concat([metoo_context_spam, metoo_not_context_spam], ignore_index=True)

In [None]:
# Show the combined MeToo frame as the cell output.
metoo

### Parenting

In [None]:
# Load the two parenting CSVs (context spam / not context spam), discard the
# tweet_id column of each, and stack them into a single frame whose index is
# renumbered from 0.
parenting_context_spam = pd.read_csv(
    'dataset/parenting/context-spam/published_data_spam-MLJ-2022_parenting_context-spam_context_spam_5000_no_link.csv'
).drop(columns=['tweet_id'])
parenting_not_context_spam = pd.read_csv(
    'dataset/parenting/context-spam/published_data_spam-MLJ-2022_parenting_context-spam_not_context_spam_5000_no_link.csv'
).drop(columns=['tweet_id'])
parenting = pd.concat([parenting_context_spam, parenting_not_context_spam], ignore_index=True)

In [None]:
# Show the combined parenting frame as the cell output.
parenting

## Cleaning

In [None]:
# Patterns used by clean_text, compiled once at definition time.
_MENTION_RE = re.compile(r"@\w+")
_EM_TAG_RE = re.compile(r"<em>.*?</em>")
_URL_RE = re.compile(r"http\S+|www\S+|https\S+")
_NON_WORD_RE = re.compile(r"[^\w\s]")
_DIGITS_RE = re.compile(r"\d+")

# Lazily filled cache of the NLTK helpers shared by every clean_text call.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return (stop_words, stemmer, lemmatizer), building them on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        # Build everything before publishing so a failed corpus lookup does
        # not leave a half-initialised cache behind.
        resources = {
            'stop_words': frozenset(stopwords.words('english')),
            'stemmer': PorterStemmer(),
            'lemmatizer': WordNetLemmatizer(),
        }
        _CLEAN_TEXT_RESOURCES.update(resources)
    return (
        _CLEAN_TEXT_RESOURCES['stop_words'],
        _CLEAN_TEXT_RESOURCES['stemmer'],
        _CLEAN_TEXT_RESOURCES['lemmatizer'],
    )


def clean_text(text):
    """Normalise one tweet into a space-joined string of processed tokens.

    Steps, in order: remove @mentions, ``<em>...</em>`` spans and URLs; remove
    every character that is neither a word character nor whitespace; remove
    digit runs; lower-case; tokenize; drop English stop words; stem each token
    with the Porter stemmer and then lemmatize it with WordNet.

    The stop-word set, stemmer and lemmatizer are created once and reused, so
    applying this function to a whole column does not re-read the stop-word
    corpus for every row.

    Args:
        text: The raw tweet text (a string).

    Returns:
        The cleaned tokens joined by single spaces.
    """
    text = _MENTION_RE.sub("", text)
    text = _EM_TAG_RE.sub("", text)
    text = _URL_RE.sub("", text)
    text = _NON_WORD_RE.sub("", text)
    text = _DIGITS_RE.sub("", text)
    text = text.lower()

    stop_words, stemmer, lemmatizer = _clean_text_resources()

    # Stop words are filtered before stemming, matching the original order.
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    processed = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens]

    return " ".join(processed)

In [None]:
# Replace the raw tweet text of every domain frame with its cleaned form.
for domain_frame in (gun, metoo, parenting):
    domain_frame['text'] = domain_frame['text'].apply(clean_text)

## Splitting

In [None]:
def _split_train_test_val(frame):
    """Split ``frame`` into (train, test, val) frames, stratified on 'label'.

    10% of the rows are held out as the test set; 10% of the remaining rows
    become the validation set. Both splits use random_state=42. Each part is
    rebuilt with the columns of ``frame`` and its index renumbered from 0.
    """
    remainder, test_part = train_test_split(
        frame, test_size=0.1, random_state=42, stratify=frame['label'])
    train_part, val_part = train_test_split(
        remainder, test_size=0.1, random_state=42, stratify=remainder['label'])

    return tuple(
        pd.DataFrame(part, columns=frame.columns).reset_index(drop=True)
        for part in (train_part, test_part, val_part)
    )


gun_train, gun_test, gun_val = _split_train_test_val(gun)
metoo_train, metoo_test, metoo_val = _split_train_test_val(metoo)
parenting_train, parenting_test, parenting_val = _split_train_test_val(parenting)


# Feature Extraction

## Bag Of Words

In [None]:
# Single bag-of-words vectorizer, re-fitted on each domain's training text in
# the cells below, so its vocabulary always belongs to the last domain fitted.
vectorizer = CountVectorizer()

### Parenting

In [None]:
# Fit the shared vectorizer on the parenting training text, encode the test
# text with that same vocabulary, and convert both results to dense arrays.
parenting_train_bow = vectorizer.fit_transform(parenting_train['text']).toarray()
parenting_test_bow = vectorizer.transform(parenting_test['text']).toarray()

### Gun

In [None]:
# Fit the shared vectorizer on the gun training text, encode the test text
# with that same vocabulary, and convert both results to dense arrays.
gun_train_bow = vectorizer.fit_transform(gun_train['text']).toarray()
gun_test_bow = vectorizer.transform(gun_test['text']).toarray()

### MeToo

In [None]:
# Fit the shared vectorizer on the MeToo training text, encode the test text
# with that same vocabulary, and convert both results to dense arrays.
metoo_train_bow = vectorizer.fit_transform(metoo_train['text']).toarray()
metoo_test_bow = vectorizer.transform(metoo_test['text']).toarray()

# Classic Algorithms

## Random Forest

In [None]:
# Hyper-parameter grid for the random forest: 4 * 4 * 3 * 3 = 144 candidate
# settings, each evaluated with 5-fold cross-validation on macro-averaged F1.
param_grid = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[2, 4, 8, 16],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
rf_model = RandomForestClassifier()
grid_search = GridSearchCV(rf_model, param_grid, scoring='f1_macro', cv=5)

### Parenting

In [None]:
# Run the grid search on the parenting bag-of-words features, report the
# winning setting, and predict the parenting test split with the best forest.
grid_search.fit(X=parenting_train_bow, y=parenting_train['label'])

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print('Best Parameters:', best_params)

parenting_rf_pred = best_model.predict(parenting_test_bow)

### Gun

In [None]:
# Run the grid search on the gun bag-of-words features, report the winning
# setting, and predict the gun test split with the best forest.
grid_search.fit(X=gun_train_bow, y=gun_train['label'])

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print('Best Parameters:', best_params)

gun_rf_pred = best_model.predict(gun_test_bow)

### MeToo

In [None]:
# Run the grid search on the MeToo bag-of-words features, report the winning
# setting, and predict the MeToo test split with the best forest.
grid_search.fit(X=metoo_train_bow, y=metoo_train['label'])

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print('Best Parameters:', best_params)

metoo_rf_pred = best_model.predict(metoo_test_bow)

# Neural Network

## Model

In [None]:
# Pick the compute device: CUDA when present, otherwise Apple MPS when it is
# actually usable, otherwise the CPU. Without the CPU fallback every
# `.to(device)` call fails on machines that have neither accelerator.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [None]:
class MyModel(nn.Module):
    """Two-class classifier: encoder -> first-token vector -> dropout -> linear."""

    def __init__(self, bert_model):
        """Wrap ``bert_model`` and size the linear head from its hidden size."""
        super().__init__()
        self.bert_model = bert_model
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert_model.config.hidden_size
        self.linear = nn.Linear(hidden_size, 2)

    def forward(self, inputs):
        """Return the two logits per example for a dict of encoder inputs.

        ``inputs`` is unpacked as keyword arguments into the encoder; only the
        hidden state at sequence position 0 is fed to the classification head.
        """
        first_token = self.bert_model(**inputs).last_hidden_state[:, 0, :]
        return self.linear(self.dropout(first_token))

## Loading Data

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint, shared by every dataset
# created below.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
class CustomDataset(Dataset):
    """Torch dataset serving tokenized text/label pairs from a DataFrame.

    Rows are addressed by position, so the frame does not need to carry a
    0..n-1 index.
    """

    def __init__(self, df, tokenizer, max_length=512):
        """Store the frame and tokenizer.

        Args:
            df: DataFrame with a 'text' column and a 'label' column.
            tokenizer: Callable invoked as
                ``tokenizer(text, padding=..., truncation=..., max_length=...,
                return_tensors='pt')`` whose result provides 'input_ids' and
                'attention_mask' tensors.
            max_length: Length every sequence is padded or truncated to.
        """
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the tokenized row at position ``idx``.

        Returns:
            A dict with flattened 'input_ids' and 'attention_mask' tensors
            and the row's label as a tensor under 'label'.
        """
        # Positional lookup: idx comes from a sampler counting 0..len-1, which
        # only coincides with index labels when the frame's index was reset.
        text = self.df['text'].iloc[idx]
        label = self.df['label'].iloc[idx]
        tokenized = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' yields a leading batch axis; flatten drops it.
            'input_ids': tokenized['input_ids'].flatten(),
            'attention_mask': tokenized['attention_mask'].flatten(),
            'label': torch.tensor(label),
        }

### Datasets

In [None]:
# Wrap every train / validation / test frame of each domain in a
# CustomDataset that shares the BERT tokenizer.
gun_train_dataset, gun_val_dataset, gun_test_dataset = (
    CustomDataset(split, bert_tokenizer)
    for split in (gun_train, gun_val, gun_test)
)

metoo_train_dataset, metoo_val_dataset, metoo_test_dataset = (
    CustomDataset(split, bert_tokenizer)
    for split in (metoo_train, metoo_val, metoo_test)
)

parenting_train_dataset, parenting_val_dataset, parenting_test_dataset = (
    CustomDataset(split, bert_tokenizer)
    for split in (parenting_train, parenting_val, parenting_test)
)

### Data Loaders

In [None]:
batch_size = 22
num_epochs = 20

# Only the training loaders shuffle. Validation and test loaders iterate in a
# fixed order: shuffling them cannot change the reported metrics and would
# only consume random-number state, making runs harder to reproduce.
gun_train_dataloader = torch.utils.data.DataLoader(gun_train_dataset, batch_size=batch_size, shuffle=True)
gun_val_dataloader = torch.utils.data.DataLoader(gun_val_dataset, batch_size=batch_size, shuffle=False)
gun_test_dataloader = torch.utils.data.DataLoader(gun_test_dataset, batch_size=batch_size, shuffle=False)

metoo_train_dataloader = torch.utils.data.DataLoader(metoo_train_dataset, batch_size=batch_size, shuffle=True)
metoo_val_dataloader = torch.utils.data.DataLoader(metoo_val_dataset, batch_size=batch_size, shuffle=False)
metoo_test_dataloader = torch.utils.data.DataLoader(metoo_test_dataset, batch_size=batch_size, shuffle=False)

parenting_train_dataloader = torch.utils.data.DataLoader(parenting_train_dataset, batch_size=batch_size, shuffle=True)
parenting_val_dataloader = torch.utils.data.DataLoader(parenting_val_dataset, batch_size=batch_size, shuffle=False)
parenting_test_dataloader = torch.utils.data.DataLoader(parenting_test_dataset, batch_size=batch_size, shuffle=False)

## Learning

In [None]:
def train_model(bert_model, train_dataloader, validation_dataloader,
                num_epochs=20, learning_rate=0.000005, patience=5) -> MyModel:
    """Fine-tune an encoder plus linear head and return the best model found.

    Args:
        bert_model: Encoder to wrap in ``MyModel``; it is moved to ``device``.
        train_dataloader: Yields dict batches holding a 'label' entry; every
            other entry is forwarded to the model as an input.
        validation_dataloader: Same batch format, evaluated after each epoch.
        num_epochs: Maximum number of training epochs.
        learning_rate: Learning rate of the Adam optimizer.
        patience: Number of consecutive epochs without a new best validation
            accuracy after which training stops early.

    Returns:
        The ``MyModel`` on ``device``, carrying the weights of the epoch with
        the highest validation accuracy (or the final weights when validation
        accuracy never rose above 0).
    """
    bert_model = bert_model.to(device)
    my_model = MyModel(bert_model)
    my_model = my_model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(my_model.parameters(), lr=learning_rate)

    best_accuracy = 0.0
    best_state = None  # CPU snapshot of the best weights seen so far
    counter = 0

    for epoch in range(num_epochs):
        # Switch to training mode at the start of every epoch, including the
        # first: the wrapped encoder may arrive in eval mode, and validation
        # below leaves the model in eval mode.
        my_model.train()

        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        for batch in train_dataloader:
            inputs = {key: value.to(device) for key, value in batch.items() if key != 'label'}
            labels = batch['label'].to(device)

            optimizer.zero_grad()

            outputs = my_model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # argmax over logits equals argmax over softmax probabilities;
            # detach so the bookkeeping does not extend the autograd graph.
            predictions = torch.argmax(outputs.detach(), dim=1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.shape[0]
        torch.cuda.empty_cache()

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        epoch_loss = running_loss / max(len(train_dataloader), 1)
        epoch_accuracy = correct_predictions / max(total_predictions, 1)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

        # Validation
        my_model.eval()
        val_correct_predictions = 0
        val_total_predictions = 0

        with torch.no_grad():
            for val_batch in validation_dataloader:
                val_inputs = {key: value.to(device) for key, value in val_batch.items() if key != 'label'}
                val_labels = val_batch['label'].to(device)

                val_outputs = my_model(val_inputs)
                val_predictions = torch.argmax(val_outputs, dim=1)
                val_correct_predictions += (val_predictions == val_labels).sum().item()
                val_total_predictions += val_labels.shape[0]
        torch.cuda.empty_cache()

        val_accuracy = val_correct_predictions / max(val_total_predictions, 1)
        print(f"Validation Accuracy: {val_accuracy:.4f}")

        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            counter = 0
            # Keep the snapshot on the CPU so it does not occupy accelerator
            # memory alongside the live model.
            best_state = {
                name: tensor.detach().to('cpu', copy=True)
                for name, tensor in my_model.state_dict().items()
            }
        else:
            counter += 1
            if counter >= patience:
                print("Early stopping triggered. Training stopped.")
                break

    # Hand back the best-validated weights rather than whatever the last
    # (possibly worse) epoch produced.
    if best_state is not None:
        my_model.load_state_dict(best_state)

    return my_model

### Parenting

In [None]:
# Load the fine-tuned parenting classifier from disk when a checkpoint exists,
# otherwise train it and cache the weights for later runs.
model_file = 'models/parenting_model.pth'
parenting_bert_model = BertModel.from_pretrained('bert-base-uncased')

if os.path.isfile(model_file):
    parenting_model = MyModel(parenting_bert_model)
    # map_location lets a checkpoint saved on another device type (e.g. CUDA)
    # load on the device selected for this run.
    parenting_model.load_state_dict(torch.load(model_file, map_location=device))
    parenting_model = parenting_model.to(device)
else:
    parenting_model = train_model(parenting_bert_model, parenting_train_dataloader, parenting_val_dataloader)
    # Create the checkpoint directory first so the save cannot fail after a
    # full training run.
    os.makedirs(os.path.dirname(model_file), exist_ok=True)
    torch.save(parenting_model.state_dict(), model_file)

### Gun

In [None]:
# Load the fine-tuned gun classifier from disk when a checkpoint exists,
# otherwise train it and cache the weights for later runs.
model_file = 'models/gun_model.pth'
gun_bert_model = BertModel.from_pretrained('bert-base-uncased')

if os.path.isfile(model_file):
    # Wrap this cell's own encoder. Wrapping parenting_bert_model here would
    # load the gun weights into the encoder object that parenting_model also
    # uses, silently overwriting the parenting classifier.
    gun_model = MyModel(gun_bert_model)
    # map_location lets a checkpoint saved on another device type (e.g. CUDA)
    # load on the device selected for this run.
    gun_model.load_state_dict(torch.load(model_file, map_location=device))
    gun_model = gun_model.to(device)
else:
    gun_model = train_model(gun_bert_model, gun_train_dataloader, gun_val_dataloader)
    # Create the checkpoint directory first so the save cannot fail after a
    # full training run.
    os.makedirs(os.path.dirname(model_file), exist_ok=True)
    torch.save(gun_model.state_dict(), model_file)

### MeToo

In [None]:
# Load the fine-tuned MeToo classifier from disk when a checkpoint exists,
# otherwise train it and cache the weights for later runs.
model_file = 'models/metoo_model.pth'
metoo_bert_model = BertModel.from_pretrained('bert-base-uncased')

if os.path.isfile(model_file):
    metoo_model = MyModel(metoo_bert_model)
    # map_location lets a checkpoint saved on another device type (e.g. CUDA)
    # load on the device selected for this run.
    metoo_model.load_state_dict(torch.load(model_file, map_location=device))
    metoo_model = metoo_model.to(device)
else:
    metoo_model = train_model(metoo_bert_model, metoo_train_dataloader, metoo_val_dataloader)
    # Create the checkpoint directory first so the save cannot fail after a
    # full training run.
    os.makedirs(os.path.dirname(model_file), exist_ok=True)
    torch.save(metoo_model.state_dict(), model_file)

# Testing

In [None]:
def report(test, pred):
    """Print a classification report and plot the confusion matrix.

    ``test`` holds the true labels and ``pred`` the predicted ones. The matrix
    is computed for classes 0 and 1, shown as 'not spam' and 'spam'.
    """
    print("Classification Report:")
    print(classification_report(test, pred))

    class_names = ['not spam', 'spam']
    matrix = confusion_matrix(test, pred, labels=[0, 1])
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=class_names).plot()
    plt.show()

In [None]:
def test_model(my_model, test_dataloader):
    """Evaluate ``my_model`` on ``test_dataloader`` and report the results.

    The model is put in eval mode and run without gradient tracking. True and
    predicted labels are collected across all batches and passed to ``report``.
    Each batch is a dict whose 'label' entry is the target and whose other
    entries are the model inputs.
    """
    my_model.eval()

    y_true, y_pred = [], []

    with torch.no_grad():
        for batch in test_dataloader:
            labels = batch['label'].to(device)
            features = {
                name: tensor.to(device)
                for name, tensor in batch.items()
                if name != 'label'
            }

            logits = my_model(features)
            probabilities = torch.nn.functional.softmax(logits, dim=1)
            batch_predictions = torch.argmax(probabilities, dim=1)

            y_true.extend(labels.cpu().numpy())
            y_pred.extend(batch_predictions.cpu().numpy())
        torch.cuda.empty_cache()

    report(y_true, y_pred)

## Within Domain

### Parenting

#### Neural Network

In [None]:
# Within-domain: parenting classifier on the parenting test split.
test_model(parenting_model, parenting_test_dataloader)

#### Random Forest

In [None]:
# Within-domain: random-forest predictions against the parenting test labels.
report(parenting_test['label'], parenting_rf_pred)

### Gun

#### Neural Network

In [None]:
# Within-domain: gun classifier on the gun test split.
test_model(gun_model, gun_test_dataloader)

#### Random Forest

In [None]:
# Within-domain: random-forest predictions against the gun test labels.
report(gun_test['label'], gun_rf_pred)

### MeToo

#### Neural Network

In [None]:
# Within-domain: MeToo classifier on the MeToo test split.
test_model(metoo_model, metoo_test_dataloader)

#### Random Forest

In [None]:
# Within-domain: random-forest predictions against the MeToo test labels.
report(metoo_test['label'], metoo_rf_pred)

## Cross Domain

### Parenting Model

#### Gun Data

In [None]:
# Cross-domain: parenting classifier on the gun test split.
test_model(parenting_model, gun_test_dataloader)

#### MeToo Data

In [None]:
# Cross-domain: parenting classifier on the MeToo test split.
test_model(parenting_model, metoo_test_dataloader)

### Gun Model

#### Parenting Data

In [None]:
# Cross-domain: gun classifier on the parenting test split.
test_model(gun_model, parenting_test_dataloader)

#### MeToo Data

In [None]:
# Cross-domain: gun classifier on the MeToo test split.
test_model(gun_model, metoo_test_dataloader)

### MeToo Model

#### Gun Data

In [None]:
# Cross-domain: MeToo classifier on the gun test split.
test_model(metoo_model, gun_test_dataloader)

#### Parenting Data

In [None]:
# Cross-domain: MeToo classifier on the parenting test split.
test_model(metoo_model, parenting_test_dataloader)