In [1]:
import re
import pandas as pd
import spacy
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [2]:
# Load the news dataset; the `News` column carries the fake/real label.
df = pd.read_csv("FakeReal_NewsData.csv")
# Preview the first rows to sanity-check the columns.
df.head()

Unnamed: 0,title,text,subject,News
0,Tim Kaine Shows Trump DECENT Way To Respond A...,"Senator Tim Kaine (D VA) , Hillary Clinton s...",News,fake
1,AMERICAN NANNY Helping Muslim Refugees Found B...,Is the government of Austria harboring violent...,Government News,fake
2,"Trump presidency faces longer odds after Iowa,...",NEW YORK - Republican candidate Donald Trump...,politicsNews,real
3,Red Cross says life has stopped in Myanmar's R...,GENEVA - Life has stopped in its tracks in M...,worldnews,real
4,Obama And Justin Trudeau Had Dinner Tuesday; ...,As the nation is embroiled in one scandal afte...,News,fake


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(44689, 4)

In [4]:
# Class balance of the target label.
df["News"].value_counts()

News
fake    23478
real    21211
Name: count, dtype: int64

In [5]:
# Missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

title      0
text       0
subject    0
News       0
dtype: int64

In [6]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(4)

In [7]:
# Keep the first occurrence of each duplicated row; rebinding the name avoids in-place mutation.
df = df.drop_duplicates()

In [8]:
# Re-count duplicate rows after the drop; this should now be zero.
df.duplicated().sum()

np.int64(0)

In [9]:
# Join headline and body, separated by one space, into a single text field.
df["merged_txt"] = df['title'].str.cat(df['text'], sep=" ")
df.head()

Unnamed: 0,title,text,subject,News,merged_txt
0,Tim Kaine Shows Trump DECENT Way To Respond A...,"Senator Tim Kaine (D VA) , Hillary Clinton s...",News,fake,Tim Kaine Shows Trump DECENT Way To Respond A...
1,AMERICAN NANNY Helping Muslim Refugees Found B...,Is the government of Austria harboring violent...,Government News,fake,AMERICAN NANNY Helping Muslim Refugees Found B...
2,"Trump presidency faces longer odds after Iowa,...",NEW YORK - Republican candidate Donald Trump...,politicsNews,real,"Trump presidency faces longer odds after Iowa,..."
3,Red Cross says life has stopped in Myanmar's R...,GENEVA - Life has stopped in its tracks in M...,worldnews,real,Red Cross says life has stopped in Myanmar's R...
4,Obama And Justin Trudeau Had Dinner Tuesday; ...,As the nation is embroiled in one scandal afte...,News,fake,Obama And Justin Trudeau Had Dinner Tuesday; ...


In [10]:
# Binary target: 1 where the label is exactly 'real', 0 for anything else.
df['real'] = (df['News'] == 'real').astype('int64')
df.head()

Unnamed: 0,title,text,subject,News,merged_txt,real
0,Tim Kaine Shows Trump DECENT Way To Respond A...,"Senator Tim Kaine (D VA) , Hillary Clinton s...",News,fake,Tim Kaine Shows Trump DECENT Way To Respond A...,0
1,AMERICAN NANNY Helping Muslim Refugees Found B...,Is the government of Austria harboring violent...,Government News,fake,AMERICAN NANNY Helping Muslim Refugees Found B...,0
2,"Trump presidency faces longer odds after Iowa,...",NEW YORK - Republican candidate Donald Trump...,politicsNews,real,"Trump presidency faces longer odds after Iowa,...",1
3,Red Cross says life has stopped in Myanmar's R...,GENEVA - Life has stopped in its tracks in M...,worldnews,real,Red Cross says life has stopped in Myanmar's R...,1
4,Obama And Justin Trudeau Had Dinner Tuesday; ...,As the nation is embroiled in one scandal afte...,News,fake,Obama And Justin Trudeau Had Dinner Tuesday; ...,0


In [11]:
# Load the small English pipeline. Only stop-word flags and lemmas are read
# downstream, so the dependency parser and named-entity recogniser are disabled
# to avoid running them on every document.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def preprocess(text):
    """Normalise raw article text into a space-joined string of lemmas.

    Steps:
      1. Replace every character that is not a word character, whitespace or
         the typographic apostrophe (’) with a space.
      2. Collapse every run of whitespace (spaces, newlines, tabs, carriage
         returns, non-breaking spaces, ...) into a single space.
      3. Strip and lower-case the text.
      4. Run the module-level spaCy pipeline ``nlp`` and keep the lemma of
         every token that is not flagged as a stop word.

    Args:
        text: The raw text to clean (a ``str``).

    Returns:
        The lemmas of the remaining tokens joined by single spaces.
    """
    text = re.sub(r'[^\w\s’]', " ", text)
    # `\s+` instead of `[ \n]+`: tabs, carriage returns and non-breaking spaces
    # would otherwise be left in place and reach the tokenizer.
    text = re.sub(r'\s+', " ", text)
    text = text.strip().lower()

    return " ".join(token.lemma_ for token in nlp(text) if not token.is_stop)

In [12]:
# Clean and lemmatise every merged article, one row at a time.
df['processed_txt'] = df['merged_txt'].map(preprocess)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.processed_txt,
    df.real,
    test_size=0.2,
    random_state=42,
)
# Show the four shapes separated by "  ...  ".
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep="  ...  ")

(35748,)  ...  (8937,)  ...  (35748,)  ...  (8937,)


In [14]:
# Filled by the model-fitting loop: model name -> test accuracy.
accuracy_scores = {}
# TF-IDF over unigrams and bigrams.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Learn vocabulary and IDF weights from the training texts only ...
X_train_vec = vectorizer.fit_transform(X_train)
# ... and reuse them unchanged for the test texts.
X_test_vec = vectorizer.transform(X_test)


In [15]:
# Candidate classifiers, all evaluated on the same TF-IDF features.
models = {
    'XGBoost': XGBClassifier(random_state=42),
    'Multinomial Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
}

for model_name, model in models.items():
    print(f'fitting {model_name} model...')
    # fit() returns the estimator itself, so training and prediction are chained.
    y_pred = model.fit(X_train_vec, y_train).predict(X_test_vec)
    # Record hold-out accuracy under the model's display name.
    accuracy_scores[model_name] = accuracy_score(y_test, y_pred)
    

fitting XGBoost model...
fitting Multinomial Naive Bayes model...
fitting Random Forest model...
fitting Decision Tree model...


In [16]:
# Report each model's test accuracy, rounded to four decimals.
print("Accuracy Scores:")
for model_name, score in accuracy_scores.items():
    print(f"{model_name}: {score:.4f}")


Accuracy Scores:
XGBoost: 0.9932
Multinomial Naive Bayes: 0.9515
Random Forest: 0.9794
Decision Tree: 0.9593


In [17]:
# Mean cross-validation accuracy per model name.
cv_scores = {}

# Use a separate dict for the cross-validated estimators. Rebinding `models`
# here would replace the estimators fitted on the full training split with
# ones whose last fit saw only a single CV fold.
cv_models = {
    'XGBoost': XGBClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
}

# Fold indices from KFold are positional, so the labels must be a plain array
# rather than a Series that keeps the original row labels as its index.
y_train = np.asarray(y_train)

# 5-Fold Cross Validation using KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for model_name, model in cv_models.items():
    print(f"Training {model_name} model...")

    fold_accuracies = []  # accuracy of each fold, in split order

    for train_index, val_index in kf.split(X_train_vec):
        # Slice features and labels for this fold.
        X_train_fold, X_val_fold = X_train_vec[train_index], X_train_vec[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

        # Fit on the fold's training part and score on its validation part.
        model.fit(X_train_fold, y_train_fold)
        y_pred = model.predict(X_val_fold)
        fold_accuracies.append(accuracy_score(y_val_fold, y_pred))

    # Store the mean accuracy for the model
    mean_accuracy = np.mean(fold_accuracies)
    cv_scores[model_name] = mean_accuracy

    # Print the fold accuracies and mean accuracy
    print(f"{model_name} Fold Accuracies: {fold_accuracies}")
    print(f"{model_name} Mean Cross-Validation Accuracy: {mean_accuracy:.4f}")

# Print the mean Cross-Validation Accuracy Scores for all models
print("\nOverall Cross-Validation Accuracy Scores:")
for model_name, score in cv_scores.items():
    print(f"{model_name}: {score:.4f}")

Training XGBoost model...
XGBoost Fold Accuracies: [0.9889510489510489, 0.99006993006993, 0.9934265734265735, 0.9874108266890474, 0.9906280598685131]
XGBoost Mean Cross-Validation Accuracy: 0.9901
Training Random Forest model...
Random Forest Fold Accuracies: [0.9791608391608392, 0.9783216783216783, 0.9822377622377623, 0.97580081130228, 0.9801370821093859]
Random Forest Mean Cross-Validation Accuracy: 0.9791

Overall Cross-Validation Accuracy Scores:
XGBoost: 0.9901
Random Forest: 0.9791


In [None]:
import os

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parameters
embedding_dim = 100  # Size of the embedding vectors (must match the GloVe file)
max_len = 100  # Maximum sequence length
vocab_size = 20000  # Example vocabulary size (can be changed)
# Location of the GloVe vectors. Set the GLOVE_PATH environment variable to
# override the machine-specific default below.
glove_path = os.environ.get(
    "GLOVE_PATH",
    r'E:\Repositories_Data-Science\Downloads\glove.6B.100d.txt',
)

# Assuming X_train, X_test, y_train, y_test are already available
# Tokenization and padding
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)  # Fit tokenizer on training data only
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
# Pad (or cut) every sequence to exactly max_len ids, padding at the end.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Load GloVe embeddings: row i holds the vector of the word with tokenizer
# index i; words missing from the GloVe file keep an all-zero row.
# NOTE(review): the matrix is sized from the whole word_index, which is
# presumably larger than vocab_size — confirm whether capping is wanted.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines instead of failing on values[0]
        word = values[0]
        idx = tokenizer.word_index.get(word)
        if idx is None:
            continue  # word not in our vocabulary: skip the float parsing
        embedding_matrix[idx] = np.asarray(values[1:], dtype='float32')


None
Epoch 1/15
1118/1118 - 64s - 58ms/step - accuracy: 0.8626 - loss: 0.3241 - val_accuracy: 0.9194 - val_loss: 0.2193
Epoch 2/15
1118/1118 - 66s - 59ms/step - accuracy: 0.9126 - loss: 0.2235 - val_accuracy: 0.9063 - val_loss: 0.2399
Epoch 3/15
1118/1118 - 63s - 56ms/step - accuracy: 0.9426 - loss: 0.1546 - val_accuracy: 0.9631 - val_loss: 0.1020
Epoch 4/15
1118/1118 - 58s - 52ms/step - accuracy: 0.9578 - loss: 0.1147 - val_accuracy: 0.9670 - val_loss: 0.0932
Epoch 5/15
1118/1118 - 60s - 53ms/step - accuracy: 0.9679 - loss: 0.0886 - val_accuracy: 0.9676 - val_loss: 0.0809
Epoch 6/15
1118/1118 - 67s - 60ms/step - accuracy: 0.9746 - loss: 0.0696 - val_accuracy: 0.9800 - val_loss: 0.0543
Epoch 7/15
1118/1118 - 61s - 55ms/step - accuracy: 0.9794 - loss: 0.0565 - val_accuracy: 0.9844 - val_loss: 0.0465
Epoch 8/15
1118/1118 - 65s - 58ms/step - accuracy: 0.9825 - loss: 0.0502 - val_accuracy: 0.9861 - val_loss: 0.0448
Epoch 9/15
1118/1118 - 67s - 60ms/step - accuracy: 0.9844 - loss: 0.0426 - 

In [None]:

# Define the LSTM model using Sequential
lstm = Sequential([
    Embedding(
        input_dim=len(tokenizer.word_index) + 1,  # One row per tokenizer index
        output_dim=embedding_dim,  # Embedding vector size
        weights=[embedding_matrix],  # Pre-trained GloVe embeddings
        trainable=False,  # Keep embeddings fixed (the missing comma here was a SyntaxError)
    ),
    Dropout(0.3),
    LSTM(128),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
lstm.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)
# Give build() the (batch, sequence length) shape explicitly; this replaces the
# deprecated `input_length` argument of Embedding.
lstm.build(input_shape=(None, max_len))
# summary() prints the table itself and returns None, so it is not wrapped in print().
lstm.summary()

# Train the model
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch val_* metrics are not an independent estimate — consider a
# validation split carved out of the training data instead.
history = lstm.fit(
    X_train_pad, np.asarray(y_train),
    validation_data=(X_test_pad, np.asarray(y_test)),
    epochs=15,  # Adjust epochs as needed
    batch_size=32,  # Adjust batch size as needed
    verbose=2
)

# Evaluate the model: threshold the sigmoid outputs at 0.5 and flatten the
# (n, 1) prediction array to 1-D to match y_test.
y_pred = (lstm.predict(X_test_pad) > 0.5).astype("int32").ravel()
accuracy = accuracy_score(y_test, y_pred)
print(f"LSTM Model Accuracy: {accuracy:.4f}")


In [32]:
# Per-class precision/recall/F1 for whatever predictions `y_pred` currently holds
# (presumably the LSTM's, if the cells were run in order — verify).
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      4703
           1       0.99      0.98      0.99      4234

    accuracy                           0.99      8937
   macro avg       0.99      0.99      0.99      8937
weighted avg       0.99      0.99      0.99      8937



In [None]:
# Fit a dedicated XGBoost estimator on the full training split so this report
# does not depend on which data a shared `models` entry was last fitted on.
xgb_final = XGBClassifier(random_state=42)
xgb_final.fit(X_train_vec, y_train)
y_pred = xgb_final.predict(X_test_vec)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      4703
           1       0.99      0.99      0.99      4234

    accuracy                           0.99      8937
   macro avg       0.99      0.99      0.99      8937
weighted avg       0.99      0.99      0.99      8937

