In [3]:
import pandas as pd

data ingestion

In [4]:
fake = pd.read_csv('.\data\Fake.csv')
true = pd.read_csv('.\data\True.csv')

data validation

In [5]:
print(fake.shape,true.shape)

(23481, 4) (21417, 4)


adding label to datasets

In [6]:
fake['label'] = 0
true['label'] = 1

concating fake and real data

In [7]:
data = pd.concat([fake,true], ignore_index=True)

saving uprocessed data for future use

In [8]:
# data.to_csv('.\data\data.csv')


creating a copy of data for experimentation

In [9]:
df = data.copy()

cheching data info for datatypes and count of null values

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


adding all the valuable content to a single column

In [11]:
df['content'] = df['subject'] + ' '  + df['title'] + ' ' + df['text']

In [12]:
df.drop(columns=['title','subject','date', 'text'], inplace=True)

Shuffling dataset

In [13]:
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [14]:
df.head()

Unnamed: 0,label,content
0,0,US_News Ben Stein Calls Out 9th Circuit Court:...
1,1,politicsNews Trump drops Steve Bannon from Nat...
2,1,politicsNews Puerto Rico expects U.S. to lift ...
3,0,News OOPS: Trump Just Accidentally Confirmed ...
4,1,politicsNews Donald Trump heads for Scotland t...


preprocessing

In [15]:
df['content'] = df['content'].str.lower() #lower casing

In [16]:
# removing punctuation

import string

def remove_punctuation(text):
    """Remove punctuation from a given text."""
    return ''.join(char for char in text if char not in string.punctuation)

df['content'] = df['content'].apply(remove_punctuation)

In [17]:
# tokenizing
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def tokenize_text_nltk(text):
    """Tokenize text using NLTK."""
    tokens = [word for word in word_tokenize(text) if word.isalpha()]
    return tokens

# Apply to DataFrame
df['tokens'] = df['content'].apply(tokenize_text_nltk)

[nltk_data] Downloading package punkt to C:\Users\Shiwang
[nltk_data]     Upadhyay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
# removing stopwords

import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Remove stopwords."""
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens


df['filtered_tokens'] = df['tokens'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to C:\Users\Shiwang
[nltk_data]     Upadhyay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# stemming

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_tokens(tokens):
    """Apply stemming."""
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    return stemmed_tokens

df['stemmed_tokens'] = df['filtered_tokens'].apply(stem_tokens)

In [20]:
#vectorizing usinf TF-IDF vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_vectorize(corpus, max_features=5000):
    """Convert corpus into TF-IDF vectors."""
    vectorizer = TfidfVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(corpus)
    return X, vectorizer

df['processed_text'] = df['stemmed_tokens'].apply(" ".join)
X, vectorizer = tfidf_vectorize(df['processed_text'])
print(f"TF-IDF Shape: {X.shape}")

TF-IDF Shape: (44898, 5000)


Feature selection and train test split

In [21]:
from sklearn.model_selection import train_test_split
Y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

## model training, hyperparameter tuning and evaluation

Logistic regrssion

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

In [23]:
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)

In [24]:
# hyperparameter tuning

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l2'],  # L1 not supported in some solvers
    'solver': ['liblinear', 'lbfgs'],
}
grid_search = GridSearchCV(LogisticRegression(max_iter=1000, random_state=42), param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [25]:
# evaluation

y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])

In [26]:
# printing results

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Best Parameters: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.9970
ROC AUC: 0.9999
Confusion Matrix:
 [[4682   14]
 [  13 4271]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4696
           1       1.00      1.00      1.00      4284

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



Support Vector Classifier

In [27]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

In [28]:
# svm = SVC(random_state=42)
# svm.fit(X_train, y_train)

In [29]:
# param_grid = {
#     'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
#     'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
# }
# grid_search = GridSearchCV(SVC(probability=True, random_state=42), param_grid, cv=5, verbose=1, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# best_model = grid_search.best_estimator_

In [30]:
# y_pred = best_model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# conf_matrix = confusion_matrix(y_test, y_pred)
# class_report = classification_report(y_test, y_pred)
# roc_auc = roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1])

In [31]:
# print(f"Best Parameters: {grid_search.best_params_}")
# print(f"Accuracy: {accuracy:.4f}")
# print(f"ROC AUC: {roc_auc:.4f}")
# print("Confusion Matrix:\n", conf_matrix)
# print("Classification Report:\n", class_report)

Decision tree

In [32]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [33]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

In [34]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [35]:
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [36]:
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}
Accuracy: 0.9998
Confusion Matrix:
 [[4696    0]
 [   2 4282]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4696
           1       1.00      1.00      1.00      4284

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



Random Forest Classifier

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [39]:
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [40]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [41]:
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [42]:
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.9991
Confusion Matrix:
 [[4688    8]
 [   0 4284]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4696
           1       1.00      1.00      1.00      4284

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



Naive Bayes Classifier

In [43]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [44]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

In [45]:
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]
}
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [46]:
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [47]:
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Best Parameters: {'alpha': 0.1}
Accuracy: 0.9408
Confusion Matrix:
 [[4440  256]
 [ 276 4008]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.94      4696
           1       0.94      0.94      0.94      4284

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980

