Importing the Requried Libraries

In [42]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Importing Datasets

In [43]:
true = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')


  fake = pd.read_csv('Fake.csv')


In [44]:
# Assigning labels
true["label"] = 1
fake["label"] = 0

In [45]:
# Mergeing datasets
data = pd.concat([true, fake], axis=0)

In [46]:
# Drop unnecessary columns
data = data.drop(["title", "subject", "date"], axis=1)

In [47]:
# Shuffle data
data = data.sample(frac=1).reset_index(drop=True)

Preprocessing the datas

In [48]:
# Preprocessing function
def preprocess_text(text_data):
    preprocessed_text = []
    for sentence in tqdm(text_data):
        sentence = re.sub(r'[^\w\s]', '', sentence)
        preprocessed_text.append(' '.join(
            token.lower() for token in str(sentence).split()
            if token not in stopwords.words('english') or token in ["not", "no"]
        ))
    return preprocessed_text

In [49]:
data['text'] = preprocess_text(data['text'].values)

100%|██████████| 44919/44919 [27:08<00:00, 27.59it/s]


In [50]:
# Feature Engineering
data['word_count'] = data['text'].apply(lambda x: len(x.split()))
data['char_count'] = data['text'].apply(lambda x: len(x))
data['special_char_count'] = data['text'].apply(lambda x: sum(1 for c in x if not c.isalnum()))

In [51]:
# Splitting data
X = data['text']
y = data['label']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [52]:
# Vectorization
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [53]:
# Logistic Regression Model
model = LogisticRegression()
model.fit(x_train, y_train)

In [54]:
# Testing & Accuracy
print("Logistic Regression Accuracy:", accuracy_score(y_test, model.predict(x_test)))

Logistic Regression Accuracy: 0.9886910062333036


In [55]:
# Decision Tree Model
tree_model = DecisionTreeClassifier()
tree_model.fit(x_train, y_train)

In [56]:
# Testing & Accuracy
print("Decision Tree Accuracy:", accuracy_score(y_test, tree_model.predict(x_test)))

Decision Tree Accuracy: 0.9951914514692787


In [57]:
# Manual Testing Function
def manual_testing(news):
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)

    # Preprocess text
    pptext= preprocess_text(new_def_test['text'].values)

    # Transform using trained vectorizer
    tf = vectorizer.transform(pptext)

    # Prediction
    yy = model.predict(tf)

    if yy[0] == 0:
        print("Fake News")
    else:
        print("Real News")

In [59]:
# Run manual testing
getinput = input("Enter news text: ")
manual_testing(getinput)

Enter news text: 21st Century Wire says Does Hollywood do anything other than comic book remakes any more? The latest over-budget chimera, Batman V Superman, cost around $410 million to make. You d think they could squeeze a decent film out of that. You d think. He hasn t felt this low since Gigli , said the Huff Post. Movie star Ben Affleck may have just beaten Chris Christie for this year s most vacant stare. Watch: READ MORE HOLLYWOOD NEWS AT: 21st Century Wire Hollywood Files


100%|██████████| 1/1 [00:00<00:00, 108.94it/s]

Fake News



