In [17]:
import re
import nltk
import pandas as pd
import torch
import torch.nn as nn
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import BertModel, BertTokenizer

In [2]:
# Fetch the NLTK data packages this notebook relies on: the 'punkt' tokenizer
# models, the WordNet corpus and the stop-word lists. Each call is a no-op
# when the package is already present locally (see the log output below).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/storm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/storm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/storm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Preparing Data

## Reading

### Gun violence

In [3]:
# Read the context-spam and not-context-spam files for the gun-violence
# topic, discard the tweet identifier column, and stack both halves into one
# frame whose index is renumbered from zero.
gun_context_spam = pd.read_csv(
    'dataset/gun-violence/context-spam/published_data_spam-MLJ-2022_gun-violence_context-spam_context_spam_5000_no_link.csv'
).drop(columns=['tweet_id'])
gun_not_context_spam = pd.read_csv(
    'dataset/gun-violence/context-spam/published_data_spam-MLJ-2022_gun-violence_context-spam_not_context_spam_5000_no_link.csv'
).drop(columns=['tweet_id'])
gun = pd.concat([gun_context_spam, gun_not_context_spam], ignore_index=True)

In [4]:
# Display the combined gun-violence frame (pandas truncates the rendering).
gun

Unnamed: 0,text,label
0,Stack-On PDS-1500 Gun Safe <em>URL01 Removed</...,1
1,#protection #safe Buy Now and be Safe <em>URL0...,1
2,I'm in the running to win the PolyCase - CZ P-...,1
3,"ZEV Technologies Prizefighter, a cheap Agency ...",1
4,I'm in the running to win a Springfield 1911 P...,1
...,...,...
4995,"Sai Baba! Sai Buhari! ""Dead men are getting ap...",0
4996,Alpha Krav Maga CT Pistol retention while firi...,0
4997,@USER01 “Uhm...Well...I’ll just be straightfor...,0
4998,The 2nd Amendment states it it PLAIN LANGUAGE ...,0


### MeToo

In [5]:
# Read the context-spam and not-context-spam files for the MeToo topic,
# discard the tweet identifier column, and stack both halves into one frame
# whose index is renumbered from zero.
metoo_context_spam = pd.read_csv(
    'dataset/metoo/context-spam/published_data_spam-MLJ-2022_metoo_context-spam_context_spam_5000_no_link.csv'
).drop(columns=['tweet_id'])
metoo_not_context_spam = pd.read_csv(
    'dataset/metoo/context-spam/published_data_spam-MLJ-2022_metoo_context-spam_not_context_spam_5000_no_link.csv'
).drop(columns=['tweet_id'])
metoo = pd.concat([metoo_context_spam, metoo_not_context_spam], ignore_index=True)

In [6]:
# Display the combined MeToo frame (pandas truncates the rendering).
metoo

Unnamed: 0,text,label
0,Check out the latest happening in #events. #Ev...,1
1,"Will b at #Half-PriceBooks, 15146 5/12: book s...",1
2,Thursday at 12 noon CT URL01_Removed recaps 5 ...,1
3,Move To #LosAngeles - Come #Glow Up And Be A S...,1
4,Men's Sweater Hoodie Fashion Casual Slim Ch......,1
...,...,...
4995,This started when i was 6. A elderly man who h...,0
4996,Meet the woman who started #MeToo 10 years ago...,0
4997,@USER01 @USER02 @USER03 @USER04 #MeToo and no ...,0
4998,@USER01 @USER02 Correct. But she’s not talking...,0


### Parenting

In [7]:
# Read the context-spam and not-context-spam files for the parenting topic,
# discard the tweet identifier column, and stack both halves into one frame
# whose index is renumbered from zero.
parenting_context_spam = pd.read_csv(
    'dataset/parenting/context-spam/published_data_spam-MLJ-2022_parenting_context-spam_context_spam_5000_no_link.csv'
).drop(columns=['tweet_id'])
parenting_not_context_spam = pd.read_csv(
    'dataset/parenting/context-spam/published_data_spam-MLJ-2022_parenting_context-spam_not_context_spam_5000_no_link.csv'
).drop(columns=['tweet_id'])
parenting = pd.concat([parenting_context_spam, parenting_not_context_spam], ignore_index=True)

In [8]:
# Display the combined parenting frame (pandas truncates the rendering).
parenting

Unnamed: 0,text,label
0,Pura Stainless Drinking Bottles Bundle Prizepa...,1
1,LG’s Twin Wash System @USER02 Is Everything Yo...,1
2,#AD #GetActiveAtWalmart Enter to #win some awe...,1
3,@USER01 tweet me when your hosting these givea...,1
4,Giveaway! Three Winners! Scribblenauts™ Showdo...,1
...,...,...
4995,@USER01 We sure can! Give us a quick call when...,0
4996,This Windex ad about kids growing up is making...,0
4997,Tips for surviving Daylight Savings Time - <em...,0
4998,The Lane Bryant ad the networks didn't want yo...,0


## Cleaning

In [9]:
def _text_resources():
    """Return the shared ``(stop_words, stemmer, lemmatizer)`` triple.

    The triple is built on first use and stored on the function object, so
    the stop-word list is read once instead of once per cleaned tweet.
    """
    if not hasattr(_text_resources, "_cache"):
        _text_resources._cache = (
            frozenset(stopwords.words('english')),
            PorterStemmer(),
            WordNetLemmatizer(),
        )
    return _text_resources._cache


def clean_text(text):
    """Normalise one tweet into a space-joined string of processed tokens.

    Steps, in order: strip @mentions, ``<em>...</em>`` placeholders and URLs;
    drop punctuation and digits; lowercase; tokenize; remove English stop
    words; Porter-stem and then WordNet-lemmatize each remaining token.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example the
            ``NaN`` pandas uses for a missing cell) is treated as empty.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # re.sub raises TypeError on NaN/None; a missing tweet cleans to nothing.
    if not isinstance(text, str):
        return ""

    text = re.sub(r"@\w+", "", text)                      # @mentions
    text = re.sub(r"<em>.*?</em>", "", text)              # <em>...</em> placeholders
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)   # links
    text = re.sub(r"[^\w\s]", "", text)                   # punctuation
    text = re.sub(r"\d+", "", text)                       # digits
    text = text.lower()

    stop_words, stemmer, lemmatizer = _text_resources()

    # Stop words are filtered before stemming; the lemmatizer then runs on
    # the stemmed form, matching the original processing order.
    processed = [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    return " ".join(processed)

In [10]:
# Replace the raw tweet text of every topic frame with its cleaned form.
for topic_frame in (gun, metoo, parenting):
    topic_frame['text'] = topic_frame['text'].apply(clean_text)

## Splitting

In [None]:
def _stratified_split(frame):
    """Split ``frame`` 80/20 with seed 42, stratified on its 'label' column."""
    return train_test_split(
        frame, test_size=0.2, random_state=42, stratify=frame['label'])


gun_train, gun_test = _stratified_split(gun)
metoo_train, metoo_test = _stratified_split(metoo)
parenting_train, parenting_test = _stratified_split(parenting)

# Deep Learning

## Model

In [16]:
# Pick the best available accelerator: CUDA first, then Apple's MPS backend,
# and plain CPU when neither exists. MPS is only selected after an explicit
# availability check, so the notebook also runs on machines without a GPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [12]:
class MyModel(nn.Module):
    """Two-class classifier: an encoder followed by dropout and a linear head."""

    def __init__(self, bert_model):
        """Wrap ``bert_model`` and size the head from its ``config.hidden_size``."""
        super().__init__()
        self.bert_model = bert_model
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(bert_model.config.hidden_size, 2)

    def forward(self, inputs):
        """Return logits of shape (batch, 2) for a dict of encoder keyword inputs.

        ``inputs`` is unpacked as keyword arguments into the encoder; only the
        hidden state at sequence position 0 feeds the classification head.
        """
        encoded = self.bert_model(**inputs)
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return self.linear(self.dropout(first_token_state))

## Loading Data

In [14]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint; it is shared by
# every dataset built later in the notebook.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [13]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one row of a text/label frame per item.

    Args:
        df: DataFrame with a 'text' column and a 'label' column. Its index
            may be arbitrary (e.g. the shuffled, gapped index left by
            ``train_test_split``); rows are addressed by position.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')`` that
            returns a mapping with 'input_ids' and 'attention_mask' tensors.
        max_length: Length every sequence is padded/truncated to.
            Defaults to 512, the value previously hard-coded.
    """

    def __init__(self, df, tokenizer, max_length=512):
        # A DataLoader asks for items 0..len-1. Renumbering the rows makes
        # those positions valid even when df kept its pre-split index labels,
        # which would otherwise raise KeyError under label-based lookup.
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the 'input_ids', 'attention_mask' and 'label' tensors for row ``idx``."""
        # Positional access: independent of whatever index labels df carried.
        text = self.df['text'].iloc[idx]
        label = self.df['label'].iloc[idx]
        tokenized = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # return_tensors='pt' yields a leading batch axis of size 1; flatten
        # drops it so the DataLoader can stack items into a batch.
        return {
            'input_ids': tokenized['input_ids'].flatten(),
            'attention_mask': tokenized['attention_mask'].flatten(),
            'label': torch.tensor(label),
        }

### Datasets

In [None]:
# Wrap every train/test frame in a CustomDataset; all of them share the one
# tokenizer instance.
gun_train_dataset, gun_test_dataset = (
    CustomDataset(split, bert_tokenizer) for split in (gun_train, gun_test)
)

metoo_train_dataset, metoo_test_dataset = (
    CustomDataset(split, bert_tokenizer) for split in (metoo_train, metoo_test)
)

parenting_train_dataset, parenting_test_dataset = (
    CustomDataset(split, bert_tokenizer) for split in (parenting_train, parenting_test)
)

### Data Loaders

In [None]:
batch_size = 16
num_epochs = 10

# Training loaders reshuffle every epoch. Evaluation loaders keep a fixed
# order: shuffling adds nothing to aggregate metrics and makes per-example
# outputs impossible to line up between runs.
gun_train_dataloader = torch.utils.data.DataLoader(gun_train_dataset, batch_size=batch_size, shuffle=True)
gun_test_dataloader = torch.utils.data.DataLoader(gun_test_dataset, batch_size=batch_size, shuffle=False)

metoo_train_dataloader = torch.utils.data.DataLoader(metoo_train_dataset, batch_size=batch_size, shuffle=True)
metoo_test_dataloader = torch.utils.data.DataLoader(metoo_test_dataset, batch_size=batch_size, shuffle=False)

parenting_train_dataloader = torch.utils.data.DataLoader(parenting_train_dataset, batch_size=batch_size, shuffle=True)
parenting_test_dataloader = torch.utils.data.DataLoader(parenting_test_dataset, batch_size=batch_size, shuffle=False)

## Learning

In [None]:
# Load the pretrained 'bert-base-uncased' encoder that the classifier wraps.
bert_model = BertModel.from_pretrained('bert-base-uncased')

### Parenting

In [15]:
# Build the classifier around the shared encoder and move it to the target
# device. Module.to() moves submodules too, so bert_model (the same object)
# ends up on `device` without a separate call.
parenting_model = MyModel(bert_model).to(device)

criterion = nn.CrossEntropyLoss()
# NOTE(review): lr=0.001 is well above the 2e-5..5e-5 range usually used for
# fine-tuning BERT; left unchanged here -- confirm it is intentional.
optimizer = torch.optim.Adam(parenting_model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    # from_pretrained() hands back the encoder in eval mode, which disables
    # its dropout layers. Switch the whole model to training mode explicitly
    # (and again each epoch, in case an evaluation pass ran in between).
    parenting_model.train()

    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for batch in parenting_train_dataloader:
        # Everything except the label is forwarded to the encoder as kwargs.
        inputs = {key: value.to(device) for key, value in batch.items() if key != 'label'}
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = parenting_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Softmax is monotonic, so argmax over the raw logits picks the same
        # class as argmax over the probabilities.
        predictions = outputs.argmax(dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total_predictions += labels.shape[0]

    # Loss is averaged over batches, accuracy over individual examples.
    epoch_loss = running_loss / len(parenting_train_dataloader)
    epoch_accuracy = correct_predictions / total_predictions
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

Epoch 1/10, Loss: 0.1955, Accuracy: 0.9624


KeyboardInterrupt: 