# Fake News Detection Project
## Using Logistic Regression




In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


import re
from nltk.corpus import stopwords
from sklearn. feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [31]:
train = pd.read_csv('data/train.csv').drop(columns='id')
train.sample(5)

Unnamed: 0,title,author,text,label
8909,Camera Catches Hillary’s Sick Hidden Message T...,Prissy Holly,Camera Catches Hillary’s Sick Hidden Message T...,1
1962,Hillary Clinton Would Use SCOTUS Vacancies to ...,"Joe Wolverton, II, J.D.",Email \nJust days before the presidential elec...,1
3848,‘Arab Spring’ and the Washington-Brussels-Riya...,The Saker,Be the First to Comment! Search articles,1
11486,"Pour booster les ventes, Dassault offre un por...",,,1
2397,Donald Trump Meets with Victims of Obamacare a...,Charlie Spiering,President Donald Trump hosted a listening sess...,0


In [32]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   20242 non-null  object
 1   author  18843 non-null  object
 2   text    20761 non-null  object
 3   label   20800 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 650.1+ KB


### Data Preprocessing

In [33]:
# Remove missing values
train.isna().sum()

title      558
author    1957
text        39
label        0
dtype: int64

In [34]:
train = train.fillna('')
train.isna().sum()

title     0
author    0
text      0
label     0
dtype: int64

### Stemming

Stemming is the process of reducing a complex word to its Root i.e. most simplest word <br>
example: actor, actress, acting --> act

In [35]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

def stemming(content):
    """
    Perform stemming on the given content.
    Args:
        content (str): The input text to be stemmed.
    Returns:
        str: The stemmed version of the input text.
    """
    # Initialize the stemmer and preprocess the content
    stemmer = PorterStemmer()
    # Remove non-alphabetic characters (numerics and special characters) and convert to lowercase
    clean_content = re.sub(r'[^a-zA-Z]', ' ', content).lower()

    # Split into words and filter out stopwords
    filtered_words = [
        stemmer.stem(word)
        for word in clean_content.split()
        if word not in stopwords.words('english')
    ]

    # Join the stemmed words back into a single string sepreated by empty spaces
    return ' '.join(filtered_words)

In [36]:
# Test the function on the example string
ger = "Hello Darkness! my old friend 123"
stemming(ger)

'hello dark old friend'

In [37]:
train['content'] = train['title'] +' - by ' + train['author']
train.iloc[0:5,-1].values

array(['House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It - by Darrell Lucus',
       'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart - by Daniel J. Flynn',
       'Why the Truth Might Get You Fired - by Consortiumnews.com',
       '15 Civilians Killed In Single US Airstrike Have Been Identified - by Jessica Purkiss',
       'Iranian woman jailed for fictional unpublished story about woman stoned to death for adultery - by Howard Portnoy'],
      dtype=object)

In [38]:
train['content']  = train['content'].apply(func=stemming)
train.iloc[0:5, -1].values

array(['hous dem aid even see comey letter jason chaffetz tweet darrel lucu',
       'flynn hillari clinton big woman campu breitbart daniel j flynn',
       'truth might get fire consortiumnew com',
       'civilian kill singl us airstrik identifi jessica purkiss',
       'iranian woman jail fiction unpublish stori woman stone death adulteri howard portnoy'],
      dtype=object)

In [39]:
# Seperate input and target
X = train['content'].values
y = train['label'].values

## Vectorization

- As the values are still in text format, we need to convert them in the numeric frm in order to work with ML models
- Converting Textual Data to Numeric is called ***Vectorization***
- We will use `TfidfVectorizer()`, which  converts a raw count matrix (produced by CountVectorizer) into a TF-IDF (Term Frequency-Inverse Document Frequency) matrix. It applies weighting to each term based on its frequency within the document and across the entire corpus.



In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(X)

In [41]:
# 16984 unique words which are not stop words
len(tfidf.vocabulary_)

17128

### Splitting into Train and validation sets


In [42]:
# stratify=y make sure that training and validation sets have equal proportion of the different target labels.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)


(14560, 17128)
(6240, 17128)
(14560,)
(6240,)


### Training ML Model

In [43]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=200)

logreg.fit(X_train, y_train)
y_pred_val = logreg.predict(X_val)

### Model Evaluation on validation set

In [44]:
from sklearn.metrics import classification_report, f1_score

print(f1_score(y_val, y_pred_val))
print(classification_report(y_val, y_pred_val))

0.9737627651217596
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      3116
           1       0.96      0.99      0.97      3124

    accuracy                           0.97      6240
   macro avg       0.97      0.97      0.97      6240
weighted avg       0.97      0.97      0.97      6240



### Hyperparameter Tuning
No Need to optimize hyperparameters as the model is already performing really well, The model is highly accurate (97%) and performs well on both classes, as indicated by the high precision, recall, and F1-scores.

### Final Test on test set

In [46]:
test = pd.read_csv('data/test.csv')
test.sample(5)

Unnamed: 0,id,title,author,text
1656,22456,Huma may have violated ‘legal obligation’ rega...,Howard Portnoy,Print \nRepublican National Committee spokesma...
1384,22184,US hacking ‘hysteria’ aimed at distracting vot...,admin,US hacking ‘hysteria’ aimed at distracting vot...
3442,24242,Man gets only one missed call from Mom,,"Topics: Parents , Phone calls , mom \nLocal ..."
797,21597,Must See Documentary on the July 2016 Turkish ...,Jafe Arnoldski (noreply@blogger.com),"October 28, 2016 - \nKatehon - \n\n\n\nIn this..."
493,21293,North Dakota’s Public Bank Is Funding Police R...,,North Dakota’s Public Bank Is Funding Police R...


In [47]:
train.isna().sum()

title      0
author     0
text       0
label      0
content    0
dtype: int64