In [1]:
%pip install pandas
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"device is {device}")

Note: you may need to restart the kernel to use updated packages.
device is cpu


In [2]:
#downloading the data

import requests

url = 'https://zenodo.org/records/4561253/files/WELFake_Dataset.csv'  
response = requests.get(url)

with open('news_data.csv', 'wb') as file:
    file.write(response.content)

print("downloading finished")

downloading finished


In [3]:
import pandas as pd
df = pd.read_csv('news_data.csv')
df.head()


Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [4]:
df.info()
df.isnull().sum()
df.describe()
df.columns


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null  int64 
 1   title       71576 non-null  object
 2   text        72095 non-null  object
 3   label       72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [5]:
#removing duds

df = df.dropna()
df = df.drop(columns=['Unnamed: 0'])
df['label'] = pd.to_numeric(df['label'], errors='coerce')
df.head()
df = df.drop_duplicates()
df = df.dropna(subset=['text', 'title'])


In [6]:
#some regex cleaning

import re

def cleanup(text):
    if pd.isna(text):
        return ""
    text = text.strip()  
    #remove any web urls
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) 
    #remove email addresses
    text = re.sub(r'\b\w+@\w+\.\w+\b', '', text)  
    #get rid of any duplicate spacing
    text = re.sub(r'\s+', ' ', text)  
    #get rid of punctuaation
    text = re.sub(r'[^\w\s]', '', text) 
    #lowercase all text
    text = text.lower() 
    return text

df['text'] = df['text'].apply(cleanup)
df['title'] = df['title'].apply(cleanup)

In [7]:
df.head()

Unnamed: 0,title,text,label
0,law enforcement on high alert following threat...,no comment is expected from barack obama membe...,1
2,unbelievable obamas attorney general says most...,now most of the demonstrators gathered last ni...,1
3,bobby jindal raised hindu uses story of christ...,a dozen politically active pastors came here f...,0
4,satan 2 russia unvelis an image of its terrify...,the rs28 sarmat missile dubbed satan 2 will re...,1
5,about time christian group sues amazon and spl...,all we can say on this one is it s about time ...,1


In [8]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [9]:
#removing any stop words

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    return ' '.join(word for word in text.split() if word not in stop_words)

df['text'] = df['text'].apply(remove_stop_words)
df['title'] = df['title'].apply(remove_stop_words)

[nltk_data] Downloading package stopwords to /home/rachel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
print(stop_words)

{'was', 'into', 'again', 'by', 'we', 'are', 'he', 'her', "that'll", 'about', 'during', 'won', 'same', 'very', 'am', 'will', 'until', 'under', 'no', 'if', 'being', "won't", 'haven', 'what', 'above', 'ourselves', 'aren', 'now', 'wouldn', 'doing', 'yourself', 'then', 'she', 'has', 'yourselves', 'here', "you'll", 'your', "didn't", 'before', 've', 'our', 'more', 'just', "you've", 'have', "hasn't", 'd', 'as', 'hers', 'on', 'theirs', 'whom', 'been', 'a', 'o', 'because', 'his', "haven't", "it's", 'couldn', 'himself', 'didn', 'through', 'these', 'in', 'there', 'and', "should've", 'did', "wasn't", 'why', 's', 'how', 'or', 'its', 'both', "you're", 'him', 'should', 'against', "weren't", 'my', 'with', 'weren', 'where', 'needn', 'me', 'be', 'out', 'm', 'who', 'once', 'myself', "she's", 'while', 'some', 'the', 'after', 'them', 'mightn', "you'd", 'too', 'to', 'that', 'down', 'itself', 'hadn', 'isn', 'any', "shan't", 'few', "mustn't", 'themselves', "don't", 'mustn', 'own', 'each', "hadn't", 'they', 'yo

In [11]:
print(df)

                                                   title  \
0      law enforcement high alert following threats c...   
2      unbelievable obamas attorney general says char...   
3      bobby jindal raised hindu uses story christian...   
4      satan 2 russia unvelis image terrifying new su...   
5      time christian group sues amazon splc designat...   
...                                                  ...   
72127  wikileaks email shows clinton foundation funds...   
72129  russians steal research trump hack us democrat...   
72130  watch giuliani demands democrats apologize tru...   
72131   migrants refuse leave train refugee camp hungary   
72132  trump tussle gives unpopular mexican leader mu...   

                                                    text  label  
0      comment expected barack obama members fyf911 f...      1  
2      demonstrators gathered last night exercising c...      1  
3      dozen politically active pastors came private ...      0  
4      rs28 sar

In [12]:
#tokenizing and lemmatizing the text with POS (took around 6 mins to run)

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

lemmatizer = WordNetLemmatizer()

def wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'a'
    elif treebank_tag.startswith('V'):
        return 'v' 
    elif treebank_tag.startswith('N'):
        return 'n' 
    elif treebank_tag.startswith('R'):
        return 'r'  
    else:
        return 'n'  

def pos_lemmatize(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(token, wordnet_pos(tag)) for token, tag in pos_tags]
    return ' '.join(lemmatized_tokens)

df['text'] = df['text'].apply(lambda x: pos_lemmatize(str(x)))
df['title'] = df['title'].apply(lambda x: pos_lemmatize(str(x)))

df.head()

Unnamed: 0,title,text,label
0,law enforcement high alert follow threat cop w...,comment expect barack obama member fyf911 fuky...,1
2,unbelievable obamas attorney general say charl...,demonstrator gather last night exercise consti...,1
3,bobby jindal raise hindu us story christian co...,dozen politically active pastor come private d...,0
4,satan 2 russia unvelis image terrify new super...,rs28 sarmat missile dub satan 2 replace ss18 f...,1
5,time christian group sue amazon splc designati...,say one time someone sue southern poverty law ...,1


In [13]:
#split the data up into train, val, and test

%pip install scikit-learn
import sklearn
from sklearn.model_selection import train_test_split

#combining the title with the text of article
df['combined'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

X = df['combined']  
y = df['label']  

#training data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

#rest is validation and testing
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(X_train)

Note: you may need to restart the kernel to use updated packages.
6183     hurray riff raffs alynda segarra find concept ...
60285    brooklyn mother three doesnt work shes proudly...
51211    barbarian gate muslim morocco keep breaking sp...
45217    jezebel mike penny twitchycom steven superaiel...
25090    corinthian college must pay student 1 billion ...
                               ...                        
71433    half briton want stay eu poll edinburgh reuter...
41294    bill hillary clinton inc sale right price spec...
869      orlando gunman shoot least 8 time autopsy find...
16332    lethal gap supreme court handle death penalty ...
63490    poll show world overwhelmingly love president ...
Name: combined, Length: 44184, dtype: object


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

#training data will create a vocabulary
vectorizer.fit(X_train)

training_text_vector = vectorizer.transform(X_train)
validation_text_vector = vectorizer.transform(X_val)
test_text_vector = vectorizer.transform(X_test)

In [39]:

import torch
from scipy.sparse import csr_matrix

#need to use sparse matrix because the dense versions take too much memory

def matrix_to_sparse_tensor(matrix):
    coo = matrix.tocoo()
    indices = torch.tensor([coo.row, coo.col], dtype=torch.long)
    values = torch.tensor(coo.data, dtype=torch.float32)
    size = torch.Size(coo.shape)
    
    return torch.sparse_coo_tensor(indices, values, size)

train_feature_tensor = matrix_to_sparse_tensor(training_text_vector)
val_feature_tensor = matrix_to_sparse_tensor(validation_text_vector)
test_feature_tensor = matrix_to_sparse_tensor(test_text_vector)

In [40]:
train_label_tensor = torch.tensor(y_train.values, dtype=torch.long)
val_label_tensor = torch.tensor(y_val.values, dtype=torch.long)
test_label_tensor = torch.tensor(y_test.values, dtype=torch.long)


In [41]:
print(train_feature_tensor.shape)
print(val_feature_tensor.shape)
print(test_feature_tensor.shape)
print(train_feature_tensor)


torch.Size([44184, 291416])
torch.Size([9468, 291416])
torch.Size([9469, 291416])
tensor(indices=tensor([[     0,      0,      0,  ...,  44183,  44183,  44183],
                       [  3773,   4103,   4597,  ..., 266784, 268234, 268804]]),
       values=tensor([0.0283, 0.0278, 0.0133,  ..., 0.0683, 0.0384, 0.0313]),
       size=(44184, 291416), nnz=8879963, layout=torch.sparse_coo)


In [43]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        feature = self.features[idx]
        label = self.labels[idx]
    
        if isinstance(feature, torch.sparse.Tensor):
            feature = feature.to_dense()
            
        return feature, label

In [44]:
from torch.utils.data import DataLoader

train_custom_dataset = CustomDataset(train_feature_tensor, train_label_tensor)
val_custom_dataset = CustomDataset(val_feature_tensor, val_label_tensor)
test_custom_dataset = CustomDataset(test_feature_tensor, test_label_tensor)

#maybe try 32 or 64?
batch_size = 16

#create loaders for the train val and test data
train_loader = DataLoader(train_custom_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_custom_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_custom_dataset, batch_size=batch_size, shuffle=False)

In [30]:

import torch.nn as nn

#define LSTM cell
class Cell(nn.Module):
    def __init__(self, input_dimension, hidden_dimension):
        super(Cell, self).__init__()
        self.hidden_dimension = hidden_dimension
        
        #we have the input, forget, and output gates. we also have the cell state
        self.W_input_gate = nn.Linear(input_dimension + hidden_dimension, hidden_dimension)
        self.W_forget_gate = nn.Linear(input_dimension + hidden_dimension, hidden_dimension)
        self.W_o_output_gate = nn.Linear(input_dimension + hidden_dimension, hidden_dimension)
        self.W_Cell_state_update = nn.Linear(input_dimension + hidden_dimension, hidden_dimension)
        
    def forward(self, x, hidden):
        
        prev_hidden, prev_cell_state = hidden

        #first the current cell and previous hidden cell are concatenated
        combined = torch.cat((x, prev_hidden), dim=1)
        
        #apply the transformations to the gates and cell state
        input = torch.sigmoid(self.W_input_gate(combined))
        forget = torch.sigmoid(self.W_forget_gate(combined))
        output = torch.sigmoid(self.W_o_output_gate(combined))
        cell_state = torch.tanh(self.W_Cell_state_update(combined))
        
        #calculate the new cell and hidden state
        new_cell = forget * prev_cell_state + input * cell_state
        new_hidden_state = output * torch.tanh(new_cell)
        
        return new_hidden_state, new_cell


In [31]:

import torch
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, input_dimension, hidden_dimension, output_dimension, num_layers):
        super(LSTMModel, self).__init__()
        self.hidden_dimension = hidden_dimension
        self.num_layers = num_layers  # Define num_layers here

        # Create a list of LSTM cells
        self.cells = nn.ModuleList(
            [nn.LSTM(input_dimension if i == 0 else hidden_dimension, hidden_dimension) for i in range(num_layers)]
        )
        self.fully_connected_layer = nn.Linear(hidden_dimension, output_dimension)

    def forward(self, x):
        batch_size = x.size(0)
        h = [torch.zeros(batch_size, self.hidden_dimension) for _ in range(self.num_layers)]
        c = [torch.zeros(batch_size, self.hidden_dimension) for _ in range(self.num_layers)]

        for t in range(x.size(1)):
            for i, cell in enumerate(self.cells):
                x, (h[i], c[i]) = cell(x, (h[i], c[i]))
        
        output = self.fully_connected_layer(x)
        return output

In [32]:
for inputs, labels in train_loader:
    print(inputs, labels)

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0152, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0933, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]) tensor([0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
        1, 1, 0, 1, 1, 0, 1, 1])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]) tensor([1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
        1, 0, 1, 1, 0, 1, 0, 0])
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0

KeyboardInterrupt: 

In [45]:
import torch
import torch.optim as optim
from sklearn.metrics import accuracy_score

# Define model parameters
input_dimension = 291416  # Example feature dimension
hidden_dimension = 128
output_dimension = 2  # For binary classification
num_layers = 2

# Instantiate the model
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'
model = LSTMModel(input_dimension, hidden_dimension, output_dimension, num_layers).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train(model, dataloader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in dataloader:
            # Move data to GPU
            inputs, labels = inputs.to(device), labels.to(device)
            print(inputs.shape)

            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * inputs.size(0)
        
        epoch_loss = running_loss / len(dataloader.dataset)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

# Evaluation loop
def evaluate(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in dataloader:
            # Move data to GPU
            inputs = inputs.to_dense()
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    print(f'Accuracy: {accuracy:.4f}')

# Example usage:
# Assuming you have DataLoaders named `train_loader` and `test_loader`

# Train the model
train(model, train_loader, criterion, optimizer, num_epochs=10)

# Evaluate the model
evaluate(model, test_loader)

RuntimeError: DataLoader worker (pid(s) 152132) exited unexpectedly