# Fake News Detection Using Dataset 3

In [None]:
import numpy as np
import pandas as pd
import re,string,unicodedata
import os

# NLP Libs
import nltk
from nltk.corpus import stopwords
#from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
#from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#ML Algos
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.metrics import (accuracy_score,
                             precision_score,
                             recall_score,
                             f1_score,
                             matthews_corrcoef,
                             cohen_kappa_score,
                             roc_auc_score)

In [None]:
# Import our data

df = pd.read_csv("./dataset/WELFake_dataset/WELFake_Dataset.csv")

# Data Analysis

In [None]:
df.head(10)

In [None]:
# Invert the labels: 0 to 1 and 1 to 0
df['label'] = 1 - df['label']

In [None]:
df.head(10)

In [None]:
df.shape

In [None]:
# Visualization libraries, then a bar chart of how many articles carry each label.
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(16,9))

# One bar per distinct value of the 'label' column.
sns.countplot(data=df, x='label')

plt.title('Total Fake and Real News Articles', fontsize=24)
plt.ylabel('Total', fontsize=16)
plt.xlabel('')
# Tick position 1 is captioned 'Fake' and position 0 'Real'; this matches the
# label inversion (label = 1 - label) applied in an earlier cell.
plt.xticks([1, 0], ['Fake', 'Real'], fontsize=16)

plt.show()

In [None]:
df.columns

In [None]:
print(df.isnull().sum())

In [None]:
df.sample(10)

In [None]:
print(df.isnull().sum())

In [None]:
df.info()

In [None]:
# Build the corpus column used by the NLP pipeline:
#   1. replace every missing value with a single space, so the string
#      concatenation below never produces NaN;
#   2. join title and body into one 'total' column, separated by a space.
df = df.fillna(' ').assign(
    total=lambda frame: frame['title'] + ' ' + frame['text']
)

In [None]:
print(df.isnull().sum())

In [None]:
df = df.drop(['Unnamed: 0', 'title','text'],axis = 1)

In [None]:
df.head()

# Cleaning and preprocessing

In [None]:
#import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('omw-1.4')

In [None]:
# A set gives O(1) stop-word membership tests (the original list was scanned
# linearly for every token).
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Compiled once: matches every character that is neither a word character
# nor whitespace, i.e. punctuation and symbols.
_punctuation = re.compile(r'[^\w\s]')


def _clean_text(sentence):
    """Return ``sentence`` normalized for vectorization.

    Steps: strip punctuation, tokenize, lowercase, drop English stop words,
    lemmatize, and re-join the tokens with single spaces.

    Tokens are lowercased *before* the stop-word test. The stop-word list is
    lowercase, so testing the raw token (as the previous version did) let
    capitalized stop words such as "The" or "And" slip through.
    """
    without_punctuation = _punctuation.sub('', sentence)
    tokens = (token.lower() for token in nltk.word_tokenize(without_punctuation))
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


# Column-wise apply replaces the row-by-row iterrows()/df.loc writes.
df['total'] = df['total'].apply(_clean_text)

In [None]:
df.head()

In [None]:
df.to_csv("./dataset/WELFake_dataset/cleaned_dataset_22.csv")

# Vectorization

In [None]:
X_train = df['total']
Y_train = df['label']

In [None]:
# Feature extraction: raw term counts followed by TF-IDF re-weighting.
# Each transformer is fitted exactly once; fit_transform learns the parameters
# and produces the matrix in a single pass (the previous version fitted both
# transformers twice and discarded one result each time).
# NOTE(review): both transformers are fitted on the full corpus before the
# train/test split, so vocabulary and IDF weights are influenced by documents
# that later end up in the test set. Consider fitting on the training split only.
count_vectorizer = CountVectorizer()
freq_term_matrix = count_vectorizer.fit_transform(X_train)
tfidf = TfidfTransformer(norm="l2")
tf_idf_matrix = tfidf.fit_transform(freq_term_matrix)

In [None]:
tf_idf_matrix

# Modeling

In [None]:
# Re-vectorize the full corpus with the already-fitted transformers.
# NOTE(review): test_counts / test_tfidf are not referenced anywhere else in
# the visible notebook; kept in case a later cell relies on them.
test_counts = count_vectorizer.transform(df['total'].values)
test_tfidf = tfidf.transform(test_counts)

from sklearn.model_selection import StratifiedShuffleSplit

X = tf_idf_matrix  # Features (sparse TF-IDF matrix)
y = Y_train        # Target labels (0 or 1)

# Stratified 80/20 shuffle split with a fixed seed for reproducibility.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

# Only one split is used downstream. Keep the LAST of the ten generated splits
# (the same split the original loop ended with), but slice the data once
# instead of on every iteration.
*_, (train_index, test_index) = sss.split(X, y)

X_train, X_test = X[train_index], X[test_index]
# split() yields positional indices, so the label Series must be indexed
# positionally with .iloc. Plain y[...] is label-based and is only correct
# while the Series index happens to be 0..n-1.
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# X_train / y_train now hold the training data and labels;
# X_test / y_test hold the test data and labels.

In [None]:
print(X_train.shape, X_test.shape)

In [None]:
print(y_train.shape, y_test.shape)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

# Training and Evaluation

In [None]:
# DEMO: fit a weakly-regularized logistic regression (C=1e5) on the training
# split and report its scores on the held-out test split.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

logreg = LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
pred = logreg.predict(X_test)

# Mean accuracy as reported by the estimator itself.
train_score = logreg.score(X_train, y_train)
test_score = logreg.score(X_test, y_test)
print('Accuracy of Logistic classifier on training set: {:.2f}'.format(train_score))
print('Accuracy of Logistic classifier on test set: {:.2f}'.format(test_score))
print('====================================')

# Metrics computed from the hard predictions, printed in a fixed order.
for caption, metric_fn in (
    ('Model Precison: ', precision_score),
    ('Model Recall Score: ', recall_score),
    ('Model F1 Score: ', f1_score),
    ('Model Accuracy Score: ', accuracy_score),
):
    print(caption, metric_fn(y_test, pred))
print('====================================')

print('classification_report: ')
print(classification_report(y_test, pred))

# Raw confusion matrix is kept in `cm`; the display object below is the
# cell's final expression and is what the notebook renders.
cm = confusion_matrix(y_test, pred)
ConfusionMatrixDisplay.from_predictions(y_test,pred)

In [None]:
def get_evaluation_result(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its evaluation metrics.

    Args:
        model: Unfitted estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. Results are printed and a confusion-matrix plot is drawn.

    Side effects:
        ``model`` is fitted in place.
    """
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    # Test-set metrics, all derived from the hard class predictions.
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    mcc = matthews_corrcoef(y_test, pred)
    kappa = cohen_kappa_score(y_test, pred)
    # NOTE: computed from predicted labels rather than scores/probabilities,
    # so this is the AUC of a single operating point.
    roc_auc = roc_auc_score(y_test, pred)

    # Report accuracy for the model that was actually passed in. The previous
    # version called the global ``logreg`` here, so every classifier printed
    # the demo logistic-regression scores under a "Logistic classifier" label.
    model_name = type(model).__name__
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    print('Accuracy of {} on training set: {:.2f}'.format(model_name, train_accuracy))
    print('Accuracy of {} on test set: {:.2f}'.format(model_name, accuracy))
    print('====================================')
    # Print the results
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"Matthews Correlation Coefficient: {mcc}")
    print(f"Cohen's Kappa: {kappa}")
    print(f"AUC-ROC: {roc_auc}")
    print('====================================')
    print('classification_report: ')
    print(classification_report(y_test, pred))

    # Draw the confusion matrix for the test predictions.
    ConfusionMatrixDisplay.from_predictions(y_test, pred)

# 1. “lbfgs” Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lbfgs = LogisticRegression(solver='lbfgs')

get_evaluation_result(lbfgs, X_train, X_test, y_train, y_test)

# 2. “liblinear” Logistic Regression

In [None]:
liblinear = LogisticRegression(solver='liblinear')

get_evaluation_result(liblinear, X_train, X_test, y_train, y_test)

# 3. “newton-cg” Logistic Regression

In [None]:
newton_cg = LogisticRegression(solver='newton-cg')

get_evaluation_result(newton_cg, X_train, X_test, y_train, y_test)

# 4. “sag” Logistic Regression

In [None]:
sag = LogisticRegression(solver='sag')

get_evaluation_result(sag, X_train, X_test, y_train, y_test)

# 5. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

get_evaluation_result(rf, X_train, X_test, y_train, y_test)

# 6. Perceptron

In [None]:
from sklearn.linear_model import Perceptron

perceptron = Perceptron(max_iter=1000, eta0=0.1, random_state=0)
get_evaluation_result(perceptron, X_train, X_test, y_train, y_test)

# 7. RidgeClassifier

In [None]:
from sklearn.linear_model import RidgeClassifier
ridge_classifier = RidgeClassifier(alpha=1.0, solver='auto', random_state=0)
get_evaluation_result(ridge_classifier, X_train, X_test, y_train, y_test)

# 8. CatBoostClassifier

In [None]:
from catboost import CatBoostClassifier
# Create a CatBoostClassifier instance with specified hyperparameters
catboost_classifier = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6, loss_function='Logloss', verbose=0)

get_evaluation_result(catboost_classifier, X_train, X_test, y_train, y_test)

# 9. NearestCentroidClassifier

In [None]:
from sklearn.neighbors import NearestCentroid
ncc = NearestCentroid()

get_evaluation_result(ncc, X_train, X_test, y_train, y_test)

# 10. Stochastic Gradient Descent (SGDClassifier)

In [None]:
from sklearn.linear_model import SGDClassifier
sgd_classifier = SGDClassifier()

get_evaluation_result(sgd_classifier, X_train, X_test, y_train, y_test)

# 11. SVC (kernel=”linear”, C=0.025):

In [None]:
from sklearn.svm import SVC
svm_lin = SVC(kernel="linear", C=0.025)

get_evaluation_result(svm_lin, X_train, X_test, y_train, y_test)

# 12. SVC (gamma=2, C=1)

In [None]:
svm_rbf = SVC(kernel="rbf", gamma=2, C=1)

get_evaluation_result(svm_rbf, X_train, X_test, y_train, y_test)

# 13. LinearSVC

In [None]:
from sklearn.svm import LinearSVC
linear_svm_classifier = LinearSVC(C=1.0, random_state=0)

get_evaluation_result(linear_svm_classifier, X_train, X_test, y_train, y_test)

# 14. SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with stochastic gradient descent, using hinge loss
# (SVM-like behavior) and L2 regularization. This rebinds the `sgd_classifier`
# name already used by the earlier default-parameter SGDClassifier cell.
# NOTE(review): no learning_rate schedule is passed here; with scikit-learn's
# default schedule eta0 is presumably not used — confirm against the
# SGDClassifier documentation before relying on eta0=0.01.
sgd_classifier = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, eta0=0.01, random_state=0, max_iter=1000)

# Fit on the training split and print the evaluation metrics.
get_evaluation_result(sgd_classifier, X_train, X_test, y_train, y_test)

# 15. ZeroR (DummyClassifier)

In [None]:
from sklearn.dummy import DummyClassifier

# Create a ZeroR classifier
zeror_model = DummyClassifier(strategy="most_frequent")

get_evaluation_result(zeror_model, X_train, X_test, y_train, y_test)

# 16. DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier(max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=0)

get_evaluation_result(dt_classifier, X_train, X_test, y_train, y_test)

# 17.Passive Aggressive (PassiveAggressiveClassifier)

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier
pa_classifier = PassiveAggressiveClassifier(C=1.0, random_state=0, max_iter=1000)

get_evaluation_result(pa_classifier, X_train, X_test, y_train, y_test)

# 18. ExtraTreesClassifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
extra_tree_classifier = ExtraTreesClassifier(n_estimators=100, random_state=0)

get_evaluation_result(extra_tree_classifier, X_train, X_test, y_train, y_test)

# 19.Random Patches

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Create a base learner (e.g., Decision Tree)
base_learner = DecisionTreeClassifier()

# Create a BaggingClassifier with Random Patches
bagging = BaggingClassifier(base_learner, max_samples=0.8, max_features=0.8, n_estimators=10, random_state=42)

get_evaluation_result(bagging, X_train, X_test, y_train, y_test)

# 20.VotingClassifier

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Define individual classifiers
classifier1 = DecisionTreeClassifier()
classifier2 = SVC(probability=True)  # Use probability=True for soft voting
classifier3 = RandomForestClassifier()

# Create a VotingClassifier instance with soft voting
voting_classifier = VotingClassifier(estimators=[
    ('decision_tree', classifier1),
    ('svm', classifier2),
    ('random_forest', classifier3)
], voting='soft')  # You can also use 'hard' for hard voting

get_evaluation_result(voting_classifier, X_train, X_test, y_train, y_test)

# 21. Stacked_generalization

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Define base models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=0)),
    ('svc', SVC(probability=True)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=0))
]

# Define the meta-model
meta_model = LogisticRegression()

# Create a StackingClassifier instance
stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_model)

get_evaluation_result(stacked_model, X_train, X_test, y_train, y_test)

# 22.MLPClassifier:

In [None]:
from sklearn.neural_network import MLPClassifier

# Create an MLPClassifier instance with specified architecture and hyperparameters
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', max_iter=1000, random_state=0)

get_evaluation_result(mlp_classifier, X_train, X_test, y_train, y_test)

# 23. LightGBM (LGBMClassifier)

In [None]:
import lightgbm as lgb
lgb_classifier = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, learning_rate=0.05, n_estimators=100)

get_evaluation_result(lgb_classifier, X_train, X_test, y_train, y_test)

# 24.AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

adaboost_classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=0)

get_evaluation_result(adaboost_classifier, X_train, X_test, y_train, y_test)

# 25.GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0)

get_evaluation_result(gb_classifier, X_train, X_test, y_train, y_test)

# 26.Ordinal Learning Model

In [None]:
from mord import LogisticAT

# Create and train an Ordinal Logistic Regression model (LogisticAT)
ordinal_classifier = LogisticAT(alpha=1.0)  # You can adjust the regularization parameter alpha

get_evaluation_result(ordinal_classifier, X_train, X_test, y_train, y_test)

# 27.Xgboost

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Create an XGBClassifier instance with specified hyperparameters
xgb_classifier = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0)

get_evaluation_result(xgb_classifier, X_train, X_test, y_train, y_test)

# 28.Decision Stump

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Create and train a Decision Stump (a DecisionTreeClassifier with max_depth=1)
decision_stump = DecisionTreeClassifier(max_depth=1)

get_evaluation_result(decision_stump, X_train, X_test, y_train, y_test)

# 29.ComplementNB

In [None]:
from sklearn.naive_bayes import ComplementNB

# Create a ComplementNB instance
complement_nb_classifier = ComplementNB()
get_evaluation_result(complement_nb_classifier, X_train, X_test, y_train, y_test)

# 30.MultinomialNB

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Create a MultinomialNB instance
multinomial_nb_classifier = MultinomialNB()
get_evaluation_result(multinomial_nb_classifier, X_train, X_test, y_train, y_test)