In [1]:
%pip install pandas


Note: you may need to restart the kernel to use updated packages.


In [2]:
#downloading the data

import requests

url = 'https://zenodo.org/records/4561253/files/WELFake_Dataset.csv'  
response = requests.get(url)

with open('news_data.csv', 'wb') as file:
    file.write(response.content)

print("downloading finished")

downloading finished


In [3]:
import pandas as pd
df = pd.read_csv('news_data.csv')
df.head()


Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [4]:
df.info()
df.isnull().sum()
df.describe()
df.columns


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null  int64 
 1   title       71576 non-null  object
 2   text        72095 non-null  object
 3   label       72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [5]:
#removing duds

df = df.dropna()
df = df.drop(columns=['Unnamed: 0'])
df['label'] = pd.to_numeric(df['label'], errors='coerce')
df.head()
df = df.drop_duplicates()
df = df.dropna(subset=['text', 'title'])


In [6]:
#some regex cleaning

import re

def cleanup(text):
    if pd.isna(text):
        return ""
    text = text.strip()  
    #remove any web urls
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) 
    #remove email addresses
    text = re.sub(r'\b\w+@\w+\.\w+\b', '', text)  
    #get rid of any duplicate spacing
    text = re.sub(r'\s+', ' ', text)  
    #get rid of punctuaation
    text = re.sub(r'[^\w\s]', '', text) 
    #lowercase all text
    text = text.lower() 
    return text

df['text'] = df['text'].apply(cleanup)
df['title'] = df['title'].apply(cleanup)

In [7]:
df.head()

Unnamed: 0,title,text,label
0,law enforcement on high alert following threat...,no comment is expected from barack obama membe...,1
2,unbelievable obamas attorney general says most...,now most of the demonstrators gathered last ni...,1
3,bobby jindal raised hindu uses story of christ...,a dozen politically active pastors came here f...,0
4,satan 2 russia unvelis an image of its terrify...,the rs28 sarmat missile dubbed satan 2 will re...,1
5,about time christian group sues amazon and spl...,all we can say on this one is it s about time ...,1


In [8]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [9]:
#removing any stop words

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    return ' '.join(word for word in text.split() if word not in stop_words)

df['text'] = df['text'].apply(remove_stop_words)
df['title'] = df['title'].apply(remove_stop_words)

[nltk_data] Downloading package stopwords to /home/rachel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
print(stop_words)

{'out', 'd', 'for', 'then', 'again', 'couldn', 'an', 'her', 'll', 'what', 'own', 'by', 'about', 'myself', 'each', "shouldn't", 'in', 'm', "it's", 'until', 'doesn', 'i', 'themselves', 'this', 'me', "you'll", 'yours', 'these', 't', 'same', 'should', "won't", "wasn't", 'yourselves', 'which', 'and', 'you', 'most', "isn't", 'needn', 'whom', 'him', 'that', "hadn't", 'above', 'y', 'our', 'not', 'such', 'their', 'how', 'they', 'did', 'your', 'weren', 'shan', 'who', 'didn', 'now', 're', "didn't", 'is', 'both', 'have', 'before', 'all', 'haven', "mustn't", 'of', 'some', 'ours', 'there', 'just', 'his', 'had', "doesn't", 'my', "shan't", 'won', 'has', 'isn', 'other', 'ourselves', 'nor', 'from', 'ma', 'wouldn', 'below', 'with', "she's", 's', "should've", 'as', 'herself', 'than', 'she', 'few', 'doing', "couldn't", 'off', 'been', 'down', 'at', 'into', 'against', 'we', 'its', 'will', 'am', 'wasn', 'here', "haven't", 'are', 'or', 'once', 'no', 'only', "hasn't", 'the', 'hadn', 'can', "needn't", 'over', "w

In [11]:
print(df)

                                                   title  \
0      law enforcement high alert following threats c...   
2      unbelievable obamas attorney general says char...   
3      bobby jindal raised hindu uses story christian...   
4      satan 2 russia unvelis image terrifying new su...   
5      time christian group sues amazon splc designat...   
...                                                  ...   
72127  wikileaks email shows clinton foundation funds...   
72129  russians steal research trump hack us democrat...   
72130  watch giuliani demands democrats apologize tru...   
72131   migrants refuse leave train refugee camp hungary   
72132  trump tussle gives unpopular mexican leader mu...   

                                                    text  label  
0      comment expected barack obama members fyf911 f...      1  
2      demonstrators gathered last night exercising c...      1  
3      dozen politically active pastors came private ...      0  
4      rs28 sar

In [12]:
#tokenizing and lemmatizing the text with POS (took around 6 mins to run)

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

lemmatizer = WordNetLemmatizer()

def wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'a'
    elif treebank_tag.startswith('V'):
        return 'v' 
    elif treebank_tag.startswith('N'):
        return 'n' 
    elif treebank_tag.startswith('R'):
        return 'r'  
    else:
        return 'n'  

def pos_lemmatize(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(token, wordnet_pos(tag)) for token, tag in pos_tags]
    return ' '.join(lemmatized_tokens)

df['text'] = df['text'].apply(lambda x: pos_lemmatize(str(x)))
df['title'] = df['title'].apply(lambda x: pos_lemmatize(str(x)))

df.head()

Unnamed: 0,title,text,label
0,law enforcement high alert follow threat cop w...,comment expect barack obama member fyf911 fuky...,1
2,unbelievable obamas attorney general say charl...,demonstrator gather last night exercise consti...,1
3,bobby jindal raise hindu us story christian co...,dozen politically active pastor come private d...,0
4,satan 2 russia unvelis image terrify new super...,rs28 sarmat missile dub satan 2 replace ss18 f...,1
5,time christian group sue amazon splc designati...,say one time someone sue southern poverty law ...,1


In [20]:
#split the data up into train, val, and test

%pip install scikit-learn
import sklearn
from sklearn.model_selection import train_test_split

#combining the title with the text of article
df['combined'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

X = df['combined']  
y = df['label']  

#training data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

#rest is validation and testing
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(X_train)

Note: you may need to restart the kernel to use updated packages.
6183     hurray riff raffs alynda segarra find concept ...
60285    brooklyn mother three doesnt work shes proudly...
51211    barbarian gate muslim morocco keep breaking sp...
45217    jezebel mike penny twitchycom steven superaiel...
25090    corinthian college must pay student 1 billion ...
                               ...                        
71433    half briton want stay eu poll edinburgh reuter...
41294    bill hillary clinton inc sale right price spec...
869      orlando gunman shoot least 8 time autopsy find...
16332    lethal gap supreme court handle death penalty ...
63490    poll show world overwhelmingly love president ...
Name: combined, Length: 44184, dtype: object


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

#training data will create a vocabulary
vectorizer.fit(X_train)

training_text_vector = vectorizer.transform(X_train)
validation_text_vector = vectorizer.transform(X_val)
test_text_vector = vectorizer.transform(X_test)

In [32]:
import torch
from scipy.sparse import csr_matrix

#need to use sparse matrix because the dense versions take too much memory

def matrix_to_sparse_tensor(matrix):
    coo = matrix.tocoo()
    indices = torch.tensor([coo.row, coo.col], dtype=torch.long)
    values = torch.tensor(coo.data, dtype=torch.float32)
    size = torch.Size(coo.shape)
    
    return torch.sparse_coo_tensor(indices, values, size)

train_feature_tensor = matrix_to_sparse_tensor(training_text_vector)
val_feature_tensor = matrix_to_sparse_tensor(validation_text_vector)
test_feature_tensor = matrix_to_sparse_tensor(test_text_vector)




In [33]:
train_label_tensor = torch.tensor(y_train.values, dtype=torch.long)
val_label_tensor = torch.tensor(y_val.values, dtype=torch.long)
test_label_tensor = torch.tensor(y_test.values, dtype=torch.long)


In [34]:
print(train_feature_tensor.shape)
print(val_feature_tensor.shape)
print(test_feature_tensor.shape)
print(train_feature_tensor)


torch.Size([44184, 291416])
torch.Size([9468, 291416])
torch.Size([9469, 291416])
tensor(indices=tensor([[     0,      0,      0,  ...,  44183,  44183,  44183],
                       [  3773,   4103,   4597,  ..., 266784, 268234, 268804]]),
       values=tensor([0.0283, 0.0278, 0.0133,  ..., 0.0683, 0.0384, 0.0313]),
       size=(44184, 291416), nnz=8879963, layout=torch.sparse_coo)
