In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Download stopwords
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [9]:
news = pd.read_csv('/content/sample_data/fake_news_dataset.csv')

In [10]:
print(news.head())

   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1  
1  Ever get the feeling your life circles the rou...      0  
2  Why the Truth Might Get You Fired October 29, ...      1  
3  Videos 15 Civilians Killed In Single US Airstr...      1  
4  Print \nAn Iranian woman has been sentenced to...      1  


In [11]:
# Initialize the Lemmatizer
lemmatizer = WordNetLemmatizer()

In [12]:
# Function for Lemmatization
def lemmatization(content):
    lemmatized_content = re.sub('[^a-zA-Z]', ' ', content)  # Remove non-alphabetic characters
    lemmatized_content = lemmatized_content.lower()  # Convert to lowercase
    lemmatized_content = lemmatized_content.split()  # Split into words
    lemmatized_content = [lemmatizer.lemmatize(word) for word in lemmatized_content if word not in stopwords.words('english')]
    lemmatized_content = ' '.join(lemmatized_content)  # Join the words back into a single string
    return lemmatized_content

In [13]:
# Missing values
news = news.fillna('')

In [15]:
# Merging the author name and news title
news['content'] = news['author'] + ' ' + news['title']

In [16]:
# Lemmatization
news['content'] = news['content'].apply(lemmatization)

In [17]:
# Separating the data & label
X = news['content'].values
Y = news['label'].values

In [18]:
# Converting the text data to numerical data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [19]:
# Train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [20]:
# Regression Model
model = LogisticRegression()
model.fit(X_train, Y_train)

In [21]:
# Accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print("Accuracy Score of the training data: ", training_data_accuracy)

Accuracy Score of the training data:  0.9868990384615385


In [29]:
# Accuracy score on the testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print("Accuracy Score of the testing data: ", testing_data_accuracy)

Accuracy Score of the testing data:  0.9778846153846154


In [30]:
# Predict on a test sample
X_new = X_test[3]
prediction = model.predict(X_new)
print(prediction)

# Prediction result
if(prediction[0] == 0):
    print('The news is Real')
else:
    print('The news is Fake')

[0]
The news is Real


In [28]:
# Print the true label for the test sample
print(Y_test[3])

0
