# Fake News Detection Project
## Using Logistic Regression




In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import re
from nltk.corpus import stopwords
from sklearn. feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [55]:
train = pd.read_csv('data/train.csv').drop(columns='id')
train.sample(5)

Unnamed: 0,title,author,text,label
20705,Tony Romo’s Star Is Eclipsed by Another Feel-G...,Juliet Macur,"FRISCO, Tex. — The Dallas Cowboys’ new head...",0
7239,"Mosul Suicide Bomber Was British, Says Islamic...",Breitbart London,BAGHDAD (AFP) — The Islamic State group on ...,0
9008,"No, Mexico Doesn’t Have A Wall On Its Southern...",Allan Wall,X Dear Reader! VDARE.com isn’t just a website....,1
1298,North Korea’s Nuclear Blasts Keep Getting Stro...,Michael Forsythe,North Korea said it conducted its fifth underg...,0
14010,State Department Informants Tip Clinton Campai...,Terresa Monroe-Hamilton,"1 comment \nGee, the State Department and Hill...",1


In [56]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   20242 non-null  object
 1   author  18843 non-null  object
 2   text    20761 non-null  object
 3   label   20800 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 650.1+ KB


### Data Preprocessing

In [57]:
# Remove missing values
train.isna().sum()

title      558
author    1957
text        39
label        0
dtype: int64

In [58]:
train = train.fillna('')
train.isna().sum()

title     0
author    0
text      0
label     0
dtype: int64

In [59]:
# Seperate input and target
X = train.drop(columns='label')
y = train['label']

### Stemming

Stemming is the process of reducing a complex word to its Root i.e. most simplest word <br>
example: actor, actress, acting --> act

In [60]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re


def stemming(content):
    """
    Perform stemming on the given content.
    Args:
        content (str): The input text to be stemmed.
    Returns:
        str: The stemmed version of the input text.
    """
    # Initialize the stemmer and preprocess the content
    stemmer = PorterStemmer()
    # Remove non-alphabetic characters (numerics and special characters) and convert to lowercase
    clean_content = re.sub(r'[^a-zA-Z]', ' ', content).lower()

    # Split into words and filter out stopwords
    filtered_words = [
        stemmer.stem(word)
        for word in clean_content.split()
        if word not in stopwords.words('english')
    ]

    # Join the stemmed words back into a single string sepreated by empty spaces
    return ' '.join(filtered_words)

In [61]:
# Test the 'stemming' function on the example string
test_stemming_func = "Hello Darkness! my old friend 123"
stemming(test_stemming_func)

'hello dark old friend'

In [62]:
def combine_title_author(df):
    """Combines 'title' and 'author' columns into a single 'content' column."""
    df['content'] = df['title'] + ' - by ' + df['author']
    return df[['content']]


def content_stemming(df):
    """Applies stemming to the 'content' column in the DataFrame."""
    df.loc[:, 'content'] = df['content'].apply(
        stemming)  # Apply stemming to the content column
    return df[['content']]


# post-preprocessing : JUST FOR DEMONSTARATION PURPOSES
content_stemming(combine_title_author(X)).head()

Unnamed: 0,content
0,hous dem aid even see comey letter jason chaff...
1,flynn hillari clinton big woman campu breitbar...
2,truth might get fire consortiumnew com
3,civilian kill singl us airstrik identifi jessi...
4,iranian woman jail fiction unpublish stori wom...


## Vectorization

- As the values are still in text format, we need to convert them in the numeric frm in order to work with ML models
- Converting Textual Data to Numeric is called ***Vectorization***
- We will use `TfidfVectorizer()`, which  converts a raw count matrix (produced by CountVectorizer) into a TF-IDF (Term Frequency-Inverse Document Frequency) matrix. It applies weighting to each term based on its frequency within the document and across the entire corpus.



In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
final_pipeline = Pipeline(
    steps=[
        ('combine_content', FunctionTransformer(lambda df: combine_title_author(
            df), validate=False)),  # Combine title and author
        ('content_stemming', FunctionTransformer(lambda df: content_stemming(
            df), validate=False)),  # stemming the strings in each content
        # Apply TFIDF Vectorizer to 'content' column
        ('tf-ifd',
         ColumnTransformer(transformers=[('tfidf', TfidfVectorizer(), 'content')])),
        ('logreg', LogisticRegression(max_iter=200))
    ])

### Splitting into Train and validation sets


In [64]:
# stratify=y make sure that training and validation sets have equal proportion of the different target labels.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

(14560, 4)
(6240, 4)
(14560,)
(6240,)


### Training ML Model

In [65]:
final_pipeline.fit(X_train, y_train)
y_pred_val = final_pipeline.predict(X_val)

### Model Evaluation on validation set

In [66]:
from sklearn.metrics import classification_report, f1_score

print(f1_score(y_val, y_pred_val))
print(classification_report(y_val, y_pred_val))

0.9746256895193065
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      3116
           1       0.96      0.99      0.97      3124

    accuracy                           0.97      6240
   macro avg       0.97      0.97      0.97      6240
weighted avg       0.97      0.97      0.97      6240



### Hyperparameter Tuning
No Need to optimize hyperparameters as the model is already performing really well, The model is highly accurate (97%) and performs well on both classes, as indicated by the high precision, recall, and F1-scores.

### Final Test on test set

In [75]:
test = pd.read_csv('data/test.csv')
test.sample(5)

Unnamed: 0,id,title,author,text
2021,22821,SU 35 Vs F 35 Unbiased Detailed Comparison: To...,wmw_admin,"Behind the headlines - conspiracies, cover-ups..."
2713,23513,Torch-Wielding Mob Of Democrat Villagers Grab ...,Daisy Luther,"in: Special Interests , US News Do you remembe..."
3365,24165,Pat Cleveland: Early Supermodel and Author Wit...,Guy Trebay,"WILLINGBORO, N. J. — The peacocks were root..."
3531,24331,#LoudonClear: Tonight’s Special Guest Tim Burton,Trevor Loudon,We are Gulag Bound / *Resisters' Log* / #Lou...
3250,24050,Palestine Considers Suing Israel in Internatio...,James M. Dorsey,2 Shares\n2 0 0 0\nThe Palestine Football Asso...


In [68]:
test.isna().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [None]:
test = test.fillna('')
test.isna().sum()

id        0
title     0
author    0
text      0
dtype: int64

In [70]:
X_test = test.drop(columns='label')
y_test = test['label']

final_y_pred = final_pipeline.predict(X_test)

KeyError: "['label'] not found in axis"

In [None]:
print(f1_score(y_test, final_y_pred))
print(classification_report(y_test, final_y_pred))