In [2]:
# Fake News Classifier - Preprocessing & Feature Extraction

# Now that we have the combined dataset, we’ll:

# 1. Load the dataset we saved earlier
# 2. Clean the text (remove stopwords, punctuation, lowercase)
# 3. Tokenize words
# 4. Convert text into numerical features using **TF-IDF Vectorizer**


In [4]:
import pandas as pd
import numpy as np
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

#download stopwords
nltk.download("stopwords")
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\punwa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
print(len(stop_words))

198


In [7]:
#Load clean dataset
news_df = pd.read_csv("./data/combined_news.csv")
news_df.head()

Unnamed: 0,title,text,subject,date,label
0,Boiler Room #93 – The Outgoing Head of Hydra,Tune in to the Alternate Current Radio Network...,US_News,"January 19, 2017",0
1,Britain's May heads to Middle East,LONDON (Reuters) - British Prime Minister Ther...,worldnews,"November 28, 2017",1
2,FAMILY OF S.C. SHOOTING VICTIM HAS A MESSAGE F...,"Poor little Al if he s not race baiting, he s ...",politics,"Apr 9, 2015",0
3,Venezuelan crisis spawns boom in gambling,"CARACAS (Reuters) - The Whale , The Dog or ...",worldnews,"November 7, 2017",1
4,N KOREA JUST REVEALED Plans To Unleash An Unim...,WFB North Korea for the first time this week ...,left-news,"Sep 6, 2017",0


In [23]:
#text cleaning

def clean_text(text):
    text =  text.lower()
    text = text.translate(str.maketrans("","",string.punctuation))  #remove punctuation
    words = text.split()
    words = [w for w in words if w not in stop_words] #remove stopwords
    return  " ".join(words)

# Apply cleaning to create the new column
news_df["clean_title"] = news_df["title"].apply(clean_text)

# Verify
print(news_df[["title", "clean_title"]].head())

                                               title  \
0       Boiler Room #93 – The Outgoing Head of Hydra   
1                 Britain's May heads to Middle East   
2  FAMILY OF S.C. SHOOTING VICTIM HAS A MESSAGE F...   
3          Venezuelan crisis spawns boom in gambling   
4  N KOREA JUST REVEALED Plans To Unleash An Unim...   

                                         clean_title  
0               boiler room 93 – outgoing head hydra  
1                     britains may heads middle east  
2  family sc shooting victim message al sharpton ...  
3             venezuelan crisis spawns boom gambling  
4  n korea revealed plans unleash unimaginable at...  


In [28]:
# Convert text into numerical features
vectorizer = TfidfVectorizer(max_features=5000)  # limit to top 5000 words
X = vectorizer.fit_transform(news_df["clean_title"]).toarray()
y = news_df["label"].values

print("Feature Matrix Shape:", X.shape)

Feature Matrix Shape: (44898, 5000)


In [30]:
# split data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,y, test_size = 0.2, random_state =42
)

print("Training Set:",  X_train.shape)
print("Testing Set:", y_test.shape)

Training Set: (35918, 5000)
Testing Set: (8980,)


In [31]:
# Save for model training notebook
np.save("./data/X_train.npy", X_train)
np.save("./data/X_test.npy", X_test)
np.save("./data/y_train.npy", y_train)
np.save("./data/y_test.npy", y_test)