Import Libraries and Dataset

In [None]:
import numpy as np
import pandas as pd
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the news dataset; later cells read its 'text' and 'label' columns.
df = pd.read_csv('news.csv')

Analyse the data

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# (rows, columns) of the loaded dataset.
df.shape

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# Preview the last 10 rows.
df.tail(10)

In [None]:
# Tally how many articles carry each label (REAL vs. FAKE)
Counter(df['label'].tolist())

In [None]:
# Bar chart of article counts per label, most frequent label first
label_order = df['label'].value_counts().index
sns.countplot(x='label', data=df, order=label_order, palette="magma_r")


In [None]:
# Share of REAL vs. FAKE articles as a pie chart
fig = px.pie(
    df,
    names='label',
    title='Proportion of Real vs. Fake News',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()

In [None]:
# Pull out the target column and peek at its first 10 entries
labels = df['label']
labels.head(10)

In [None]:
# Scatter of article length against label, one point per article.

# Columns used by the plot
texts = df['text']
labels = df['label']

# len() of each article's text
lengths = texts.apply(len)

# REAL articles are drawn green, everything else red
colors = labels.apply(lambda label: 'red' if label != 'REAL' else 'green')

# Wide, short canvas: there are only two categories on the y axis
plt.figure(figsize=(16,4))
plt.title('Relationship between length of the text article and its label',fontsize=20)

plt.scatter(lengths, labels, c=colors)

plt.xlabel('Length')
plt.ylabel('Label')

plt.show()

Split the dataset into training and testing sets

In [None]:
# Hold out 20% of the articles for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Sizes of the train and test feature splits, one per line
print(x_train.shape, x_test.shape, sep='\n')

In [None]:
# Sizes of the train and test label splits, one per line
print(y_train.shape, y_test.shape, sep='\n')

Data Preprocessing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
# Preprocess the data
# The stop-word set and the stemmer do not depend on the input text, so they
# are built once here instead of on every call (preprocess runs once per
# article).
# NOTE(review): this needs the NLTK 'stopwords' corpus and tokenizer data to be
# installed; no nltk.download(...) call is visible in this notebook - confirm.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess(text):
    """Normalise one article for vectorisation.

    The text is lowercased, tokenised with ``nltk.word_tokenize``, stripped
    of NLTK's English stop words, Porter-stemmed, and re-joined with single
    spaces.

    Args:
        text: The raw article text (a ``str``).

    Returns:
        The stemmed, stop-word-free tokens joined into one string.
    """
    tokens = nltk.word_tokenize(text.lower())
    return ' '.join(
        _STEMMER.stem(token) for token in tokens if token not in _STOP_WORDS
    )

# Clean every article and keep the result alongside the raw text
df['preprocessed_text'] = df['text'].map(preprocess)


In [None]:
# Reuse the preprocessed text already computed for the whole frame instead of
# running the tokenise/stem pipeline over every article a second time.
# The train/test Series keep the row labels of df, so selecting by their index
# picks exactly the rows that belong to each split.
x_train_preprocessed = df.loc[x_train.index, 'preprocessed_text']
x_test_preprocessed = df.loc[x_test.index, 'preprocessed_text']

In [None]:
# Preview the first 10 rows, now including the 'preprocessed_text' column.
df.head(10)

Visualizing preprocessed text data

In [None]:
# Word clouds of the preprocessed text, one per class
from wordcloud import WordCloud

# Columns the clouds are built from
texts = df['preprocessed_text']
labels = df['label']

# Concatenate all articles of each class into one long string
real_text = ' '.join(texts[labels == 'REAL'])
fake_text = ' '.join(texts[labels == 'FAKE'])

# One cloud per class, default WordCloud settings
real_wordcloud = WordCloud().generate(real_text)
fake_wordcloud = WordCloud().generate(fake_text)

# Render both clouds with identical figure settings, REAL first
for heading, cloud in (('Real News', real_wordcloud), ('Fake News', fake_wordcloud)):
    plt.figure(figsize=(8,6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(heading,fontsize=20)
    plt.show()


TfidfVectorizer Initialization

In [None]:
# TF-IDF vectoriser: drop English stop words and any term that appears in
# more than 70% of the documents
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)

In [None]:
# Learn the vocabulary and IDF weights from the training text only, then
# project the test text onto that same vocabulary
tfidf_train = tfidf_vectorizer.fit_transform(raw_documents=x_train_preprocessed)
tfidf_test = tfidf_vectorizer.transform(raw_documents=x_test_preprocessed)

In [None]:
# (documents, vocabulary size) of the test matrix, then of the train matrix
print(tfidf_test.shape, tfidf_train.shape, sep='\n')

Model training, Evaluation, and Prediction along with Classification report and Confusion matrix

PassiveAggressiveClassifier

In [None]:
# Initialize a PassiveAggressiveClassifier and fit it on the TF-IDF training matrix.
# random_state pins the shuffling of the training data between epochs, so the
# reported accuracy is reproducible - consistent with the seeded train/test split.
model1 = PassiveAggressiveClassifier(max_iter=50, random_state=42)
model1.fit(tfidf_train,y_train)


In [None]:
# Score the held-out articles with the PassiveAggressiveClassifier and report accuracy
y_pred1 = model1.predict(tfidf_test)
score1 = accuracy_score(y_true=y_test, y_pred=y_pred1)
print('Accuracy: {}'.format(round(score1, 3)))

In [None]:
# Text summary of the per-class metrics for the PassiveAggressiveClassifier
report = classification_report(y_true=y_test, y_pred=y_pred1)
print(report)

In [None]:
# Confusion matrix for PassiveAggressiveClassifier, classes ordered FAKE then REAL
cm1 = confusion_matrix(y_true=y_test, y_pred=y_pred1, labels=['FAKE', 'REAL'])

In [None]:
# Bold heading via ANSI escape codes, then the matrix with absolute counts
# and normalised values
print("\033[1m", 'Confusion Matrix for PassiveAggressiveClassifier', "\033[0m", sep='')
plot_confusion_matrix(
    conf_mat=cm1,
    class_names=['FAKE','REAL'],
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    cmap=plt.cm.magma_r,
)

LogisticRegression

In [None]:
# Initialize a LogisticRegression and fit it on the TF-IDF training matrix.
# The iteration cap used to be 50, below the library default of 100, while the
# global warnings filter hides any ConvergenceWarning - a solver that ran out
# of iterations would go unnoticed. A generous cap lets the solver stop on its
# own tolerance instead.
model2 = LogisticRegression(max_iter=1000)
model2.fit(tfidf_train,y_train)

In [None]:
# Score the held-out articles with LogisticRegression and report accuracy
y_pred2 = model2.predict(tfidf_test)
score2 = accuracy_score(y_true=y_test, y_pred=y_pred2)
print('Accuracy: {}'.format(round(score2, 3)))

In [None]:
# Text summary of the per-class metrics for LogisticRegression
report = classification_report(y_true=y_test, y_pred=y_pred2)
print(report)

In [None]:
# Confusion matrix for LogisticRegression, classes ordered FAKE then REAL
cm2 = confusion_matrix(y_true=y_test, y_pred=y_pred2, labels=['FAKE', 'REAL'])

In [None]:
# Bold heading via ANSI escape codes, then the matrix with absolute counts
# and normalised values
print("\033[1m", 'Confusion Matrix for LogisticRegression', "\033[0m", sep='')
plot_confusion_matrix(
    conf_mat=cm2,
    class_names=['FAKE','REAL'],
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    cmap=plt.cm.magma_r,
)

DecisionTreeClassifier

In [None]:
# Initialize DecisionTreeClassifier and fit it on the TF-IDF training matrix.
# The tree builder has a random component, so an unseeded tree can differ
# between runs; random_state makes the fitted tree and its accuracy
# reproducible, consistent with the seeded train/test split.
model3 = DecisionTreeClassifier(random_state=42)
model3.fit(tfidf_train,y_train)

In [None]:
# Score the held-out articles with the DecisionTreeClassifier and report accuracy
y_pred3 = model3.predict(tfidf_test)
score3 = accuracy_score(y_true=y_test, y_pred=y_pred3)
print('Accuracy: {}'.format(round(score3, 5)))

In [None]:
# Text summary of the per-class metrics for the DecisionTreeClassifier
report = classification_report(y_true=y_test, y_pred=y_pred3)
print(report)

In [None]:
# Confusion matrix for DecisionTreeClassifier, classes ordered FAKE then REAL
cm3 = confusion_matrix(y_true=y_test, y_pred=y_pred3, labels=['FAKE', 'REAL'])

In [None]:
# Bold heading via ANSI escape codes, then the matrix with absolute counts
# and normalised values
print("\033[1m", 'Confusion Matrix for DecisionTreeClassifier', "\033[0m", sep='')
plot_confusion_matrix(
    conf_mat=cm3,
    class_names=['FAKE','REAL'],
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    cmap=plt.cm.magma_r,
)

RandomForestClassifier

In [None]:
# Initialize RandomForestClassifier and fit it on the TF-IDF training matrix.
# The forest is built with random sampling, so an unseeded model scores
# differently on every run; random_state makes the result reproducible,
# consistent with the seeded train/test split.
model4 = RandomForestClassifier(random_state=42)
model4.fit(tfidf_train,y_train)

In [None]:
# Score the held-out articles with the RandomForestClassifier and report accuracy
y_pred4 = model4.predict(tfidf_test)
score4 = accuracy_score(y_true=y_test, y_pred=y_pred4)
print('Accuracy: {}'.format(round(score4, 5)))

In [None]:
# Text summary of the per-class metrics for the RandomForestClassifier
report = classification_report(y_true=y_test, y_pred=y_pred4)
print(report)

In [None]:
# Confusion matrix for RandomForestClassifier, classes ordered FAKE then REAL
cm4 = confusion_matrix(y_true=y_test, y_pred=y_pred4, labels=['FAKE', 'REAL'])

In [None]:
# Bold heading via ANSI escape codes, then the matrix with absolute counts
# and normalised values
print("\033[1m", 'Confusion Matrix for RandomForestClassifier', "\033[0m", sep='')
plot_confusion_matrix(
    conf_mat=cm4,
    class_names=['FAKE','REAL'],
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    cmap=plt.cm.magma_r,
)

Support Vector Machine Classifier

In [None]:
from sklearn.svm import SVC

# Initialize a support vector classifier with default settings and fit it on
# the TF-IDF training matrix
model5 = SVC()
model5.fit(X=tfidf_train, y=y_train)

In [None]:
# Score the held-out articles with the support vector classifier and report accuracy
y_pred5 = model5.predict(tfidf_test)
score5 = accuracy_score(y_true=y_test, y_pred=y_pred5)
print('Accuracy: {}'.format(round(score5, 5)))

In [None]:
# Text summary of the per-class metrics for the support vector classifier
report = classification_report(y_true=y_test, y_pred=y_pred5)
print(report)

In [None]:
# Confusion matrix for the support vector classifier, classes ordered FAKE then REAL
cm5 = confusion_matrix(y_true=y_test, y_pred=y_pred5, labels=['FAKE', 'REAL'])

In [None]:
# Bold heading via ANSI escape codes, then the matrix with absolute counts
# and normalised values
print("\033[1m", 'Confusion Matrix for SupportVectorMachine', "\033[0m", sep='')
plot_confusion_matrix(
    conf_mat=cm5,
    class_names=['FAKE','REAL'],
    show_absolute=True,
    show_normed=True,
    colorbar=True,
    cmap=plt.cm.magma_r,
)

Data Visualization

In [None]:
# Short display name -> test accuracy for each of the five models.
# Note: this rebinds `labels`, which earlier cells used for the label column.
labels = {
    'PA Classifier': score1,
    'LR': score2,
    'DT Classifier': score3,
    'RF Classifier': score4,
    'SVM': score5,
}

In [None]:
# Bar chart comparing the test accuracy of the five models
plt.figure(figsize=(10,6))
plt.title('Accuracy Comparison of ML Models Bar Chart',fontsize=20)

# Same fill colour for every bar
colors = ['lightblue'] * 5

# Tick styling
plt.xticks(fontsize=10,color='midnightblue')
plt.yticks(fontsize=16,color='midnightblue')

# Axis titles
plt.xlabel('Models',fontsize=16)
plt.ylabel('Accuracy',fontsize=16)

plt.bar(
    labels.keys(),
    labels.values(),
    color=colors,
    edgecolor='black',
    linewidth=2,
    alpha=0.8,
)

In [None]:
# Line plot of the same five test accuracies.

# Full model names for the x axis, in the same order as the scores below
model_names = [
    'Passive Aggressive Classifier',
    'Logistic Regression',
    'Decision Tree Classifier',
    'Random Forest Classifier',
    'Support Vector Machine',
]
accuracies = [score1, score2, score3, score4, score5]

plt.figure(figsize=(10,6))
plt.title('Accuracy Comparison of ML Models Line Plot',fontsize=20)

# One line through the five accuracy values
plt.plot(model_names, accuracies)

plt.legend(['Accuracy'])
plt.xlabel('Model', fontsize=16)
plt.ylabel('Accuracy', fontsize=16)

plt.show()
