In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pickle  # Import pickle for model persistence
import numpy as np
import os
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
# Root folder of the project. This is the ONLY path that has to be edited when
# the notebook runs on another machine: every data path below is derived from it.
new_directory = r"C:\Users\thevi\Downloads\Project"

# Change the working directory so files opened with relative names
# (the model / vectorizer pickles written later) live inside the project folder.
os.chdir(new_directory)

# Verify the change
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)

# Load the train / test / validation CSV splits from <project>/archive
archive_directory = os.path.join(new_directory, "archive")
train_df = pd.read_csv(os.path.join(archive_directory, "train_liar.csv"))
test_df = pd.read_csv(os.path.join(archive_directory, "test_liar.csv"))
valid_df = pd.read_csv(os.path.join(archive_directory, "valid_liar.csv"))

# Name of the column that holds the statement text fed to the vectorizer
column_name = 'statement'  # Adjust if the column name is different in your dataset

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled cache for the stop-word set and the lemmatizer, so they are
# created once instead of once per statement.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """
    Function to preprocess text data for fake news detection.

    Steps, in order: lowercase, strip ASCII punctuation, drop English stop
    words, lemmatize each remaining whitespace-separated token.

    Parameters
    ----------
    text : str or scalar
        Raw statement. Missing values (None / NaN) yield an empty string;
        any other non-string scalar is converted with ``str()`` first.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.

    Notes
    -----
    Uses the NLTK English stop-word list and ``WordNetLemmatizer``, so the
    corresponding NLTK data must be available when the first real string is
    processed.
    """
    # Missing cells would otherwise fail on .lower(); they carry no tokens.
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)

    # Lowercase the text and remove punctuation
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # Remove stop words and lemmatize in a single pass over the tokens
    stop_words, lemmatizer = _get_preprocess_resources()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Clean the text column of each split in place; when a split lacks the column,
# report it and leave that dataframe untouched.
split_checks = (
    (train_df, f"Column {column_name} not found in train dataframe."),
    (test_df, f"Column {column_name} not found in test dataframe."),
    (valid_df, f"Column {column_name} not found in validation dataframe."),
)
for split_df, missing_message in split_checks:
    if column_name not in split_df.columns:
        print(missing_message)
        continue
    split_df[column_name] = split_df[column_name].apply(preprocess_text)

# TF-IDF over 1- to 4-grams (stop words were already removed during preprocessing).
# The vectorizer is fitted on the training statements only.
tfidf_ngram = TfidfVectorizer(ngram_range=(1,4), use_idf=True, smooth_idf=True)
train_statements = train_df[column_name].values
train_tfidf_ngram = tfidf_ngram.fit_transform(train_statements)

# Show the first 25 n-gram features of the fitted vocabulary
ngram_features = tfidf_ngram.get_feature_names_out()
print(ngram_features[:25])

Current Working Directory: C:\Users\thevi\Downloads\Project
['005' '005 percent' '005 percent inspected'
 '005 percent inspected dealer' '005 standard' '005 standard drunken'
 '005 standard drunken driving' '01' '01 03' '01 03 percent'
 '01 03 percent reduction' '01 percent' '01 percent taxpayer'
 '01 percent taxpayer people' '02' '02 04' '02 04 06' '02 pension'
 '02 pension system' '02 pension system funded' '025' '025 percent'
 '025 percent rd' '03' '03 percent']


In [2]:


# Fixed seed for the estimators that use randomness, so reruns give the same scores
RANDOM_STATE = 42

# Preprocess and encode the labels
label_encoder = LabelEncoder()
Y_train = label_encoder.fit_transform(train_df['label'])
# Encoded class ids in a fixed order; passing them to confusion_matrix keeps every
# fold's matrix the same shape so the per-fold matrices can be summed safely.
class_ids = np.arange(len(label_encoder.classes_))

# Set up stratified k-fold cross-validation
# NOTE(review): train_tfidf_ngram was fitted on the whole training set before
# splitting, so each validation fold has influenced the vocabulary / IDF weights.
# Confirm this optimistic bias is acceptable or move the vectorizer into a Pipeline.
kf = StratifiedKFold(n_splits=5)

models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "k-NN": KNeighborsClassifier()
}

scores = {}               # model name -> mean accuracy over the folds
trained_models = {}       # model name -> estimator refitted on the full training set
confusion_matrices = {}   # model name -> confusion matrix summed over the folds

for name, model in models.items():
    model_scores = []
    conf_matrix = None

    for train_index, val_index in kf.split(train_tfidf_ngram, Y_train):
        X_train_kf, X_val_kf = train_tfidf_ngram[train_index], train_tfidf_ngram[val_index]
        Y_train_kf, Y_val_kf = Y_train[train_index], Y_train[val_index]

        # fit() discards any previous state, so reusing the instance per fold is safe
        model.fit(X_train_kf, Y_train_kf)
        predictions = model.predict(X_val_kf)
        score = accuracy_score(Y_val_kf, predictions)
        model_scores.append(score)

        # Accumulate the confusion matrix of this fold
        current_conf_matrix = confusion_matrix(Y_val_kf, predictions, labels=class_ids)
        conf_matrix = current_conf_matrix if conf_matrix is None else conf_matrix + current_conf_matrix

    scores[name] = np.mean(model_scores)
    confusion_matrices[name] = conf_matrix

    # After the loop the estimator only knows the last fold's training part.
    # Refit on all training data so the persisted model is not missing 1/5 of it.
    model.fit(train_tfidf_ngram, Y_train)
    trained_models[name] = model

    # Save the trained model using pickle
    with open(f"{name}_model.pkl", 'wb') as f:
        pickle.dump(model, f)

# Print the average accuracy and confusion matrix for each model
for model_name, matrix in confusion_matrices.items():
    print(f"{model_name} Accuracy: {scores[model_name]}")
    print(f"{model_name} Confusion Matrix:\n{matrix}\n")

Naive Bayes Accuracy: 0.5943939492244749
Naive Bayes Confusion Matrix:
[[ 581 3906]
 [ 247 5505]]

Logistic Regression Accuracy: 0.6045512964857108
Logistic Regression Confusion Matrix:
[[1029 3458]
 [ 591 5161]]

SVM Accuracy: 0.5920508194156082
SVM Confusion Matrix:
[[ 604 3883]
 [ 294 5458]]

Random Forest Accuracy: 0.597519521708598
Random Forest Confusion Matrix:
[[1069 3418]
 [ 703 5049]]

Decision Tree Accuracy: 0.5593315389136542
Decision Tree Confusion Matrix:
[[2185 2302]
 [2210 3542]]

k-NN Accuracy: 0.5722242153150953
k-NN Confusion Matrix:
[[2119 2368]
 [2012 3740]]



In [3]:
# Fixed seed for the estimators that use randomness, so reruns give the same scores
RANDOM_STATE = 42

# Transform the test and validation datasets using the same TF-IDF vectorizer
test_tfidf_ngram = tfidf_ngram.transform(test_df[column_name].values)
valid_tfidf_ngram = tfidf_ngram.transform(valid_df[column_name].values)
# Preprocess and encode the labels
label_encoder = LabelEncoder()
Y_train = label_encoder.fit_transform(train_df['label'])
Y_valid = label_encoder.transform(valid_df['label'])  # Ensure labels for validation are encoded similarly
# Encoded class ids in a fixed order, so every confusion matrix has the same layout
class_ids = np.arange(len(label_encoder.classes_))

models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "k-NN": KNeighborsClassifier()
}

scores = {}               # model name -> accuracy on the validation split
trained_models = {}       # model name -> estimator fitted on the full training set
confusion_matrices = {}   # model name -> confusion matrix on the validation split

# Fit models on the entire training set and evaluate on the validation set
for name, model in models.items():
    # Fit the model
    model.fit(train_tfidf_ngram, Y_train)
    trained_models[name] = model
    # Save the trained model using pickle
    with open(f"{name}_model.pkl", 'wb') as f:
        pickle.dump(model, f)

    # Predict on the validation set
    predictions = model.predict(valid_tfidf_ngram)
    score = accuracy_score(Y_valid, predictions)
    scores[name] = score

    # Compute the confusion matrix
    conf_matrix = confusion_matrix(Y_valid, predictions, labels=class_ids)
    confusion_matrices[name] = conf_matrix

# Print the validation accuracy and confusion matrix for each model
for model_name, matrix in confusion_matrices.items():
    print(f"{model_name} Accuracy: {scores[model_name]}")
    print(f"{model_name} Confusion Matrix:\n{matrix}\n")


Naive Bayes Accuracy: 0.583008573655495
Naive Bayes Confusion Matrix:
[[115 500]
 [ 35 633]]

Logistic Regression Accuracy: 0.6250974279033515
Logistic Regression Confusion Matrix:
[[262 353]
 [128 540]]

SVM Accuracy: 0.6157443491816056
SVM Confusion Matrix:
[[212 403]
 [ 90 578]]

Random Forest Accuracy: 0.6149649259547935
Random Forest Confusion Matrix:
[[318 297]
 [197 471]]

Decision Tree Accuracy: 0.5533904910366328
Decision Tree Confusion Matrix:
[[306 309]
 [264 404]]

k-NN Accuracy: 0.5635229929851909
k-NN Confusion Matrix:
[[299 316]
 [244 424]]



In [5]:
# Re-encode the labels so this cell does not depend on values left by other cells
Y_train = label_encoder.fit_transform(train_df['label'])
Y_valid = label_encoder.transform(valid_df['label'])

# NOTE: pickle.load can execute arbitrary code. Only load the pickle files
# written by this notebook; never point these paths at untrusted files.

# Setup for Logistic Regression
with open('Logistic Regression_model.pkl', 'rb') as f:
    logistic_model = pickle.load(f)

logistic_params = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}
logistic_grid = GridSearchCV(logistic_model, logistic_params, cv=5, scoring='accuracy')
logistic_grid.fit(train_tfidf_ngram, Y_train)
print("Best parameters for Logistic Regression:", logistic_grid.best_params_)

# Save the improved Logistic Regression model
with open('Logistic_Regression_model_tuned.pkl', 'wb') as f:
    pickle.dump(logistic_grid.best_estimator_, f)

# Setup for SVM
with open('SVM_model.pkl', 'rb') as f:
    svm_model = pickle.load(f)

# Two sub-grids instead of one full cross product: gamma has no effect on the
# linear kernel, so crossing it with kernel='linear' only repeats identical fits.
svm_c_values = [0.1, 1, 10, 100]
svm_params = [
    {'kernel': ['linear'], 'C': svm_c_values},
    {'kernel': ['rbf'], 'C': svm_c_values, 'gamma': ['scale', 'auto']},
]
svm_grid = GridSearchCV(svm_model, svm_params, cv=5, scoring='accuracy')
svm_grid.fit(train_tfidf_ngram, Y_train)
print("Best parameters for SVM:", svm_grid.best_params_)

# Save the improved SVM model
with open('SVM_model_tuned.pkl', 'wb') as f:
    pickle.dump(svm_grid.best_estimator_, f)

# Evaluate the improved models on the validation set
models = {
    "Logistic Regression Tuned": logistic_grid.best_estimator_,
    "SVM Tuned": svm_grid.best_estimator_
}

for name, model in models.items():
    predictions = model.predict(valid_tfidf_ngram)
    score = accuracy_score(Y_valid, predictions)
    conf_matrix = confusion_matrix(Y_valid, predictions)
    print(f"{name} Accuracy: {score}")
    print(f"{name} Confusion Matrix:\n{conf_matrix}\n")

Best parameters for Logistic Regression: {'C': 10, 'solver': 'liblinear'}
Best parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Logistic Regression Tuned Accuracy: 0.6212003117692907
Logistic Regression Tuned Confusion Matrix:
[[295 320]
 [166 502]]

SVM Tuned Accuracy: 0.6219797349961029
SVM Tuned Confusion Matrix:
[[284 331]
 [154 514]]



In [9]:
# Vectorize the test statements with the already-fitted TF-IDF vectorizer
test_statements = test_df[column_name].values
test_tfidf_ngram = tfidf_ngram.transform(test_statements)

# Restore the tuned Logistic Regression model from disk
with open('Logistic_Regression_model_tuned.pkl', 'rb') as lr_file:
    logistic_model_tuned = pickle.load(lr_file)

# Restore the tuned SVM model from disk
with open('SVM_model_tuned.pkl', 'rb') as svm_file:
    svm_model_tuned = pickle.load(svm_file)

# Encode the test labels with the label encoder fitted in the earlier cells
Y_test = label_encoder.transform(test_df['label'])

# Models to score, keyed by the name used in the report
models = {
    "Logistic Regression Tuned": logistic_model_tuned,
    "SVM Tuned": svm_model_tuned
}

# Score each model on the test set and collect its metrics
results = {}
for clf_name, clf in models.items():
    test_predictions = clf.predict(test_tfidf_ngram)
    results[clf_name] = {
        'accuracy': accuracy_score(Y_test, test_predictions),
        'confusion_matrix': confusion_matrix(Y_test, test_predictions),
    }

# Report the collected metrics, one model after another
for clf_name, metrics in results.items():
    print(f"{clf_name} Accuracy: {metrics['accuracy']}")
    print(f"{clf_name} Confusion Matrix:\n{metrics['confusion_matrix']}\n")

Logistic Regression Tuned Accuracy: 0.6216429699842022
Logistic Regression Tuned Confusion Matrix:
[[246 307]
 [172 541]]

SVM Tuned Accuracy: 0.6153238546603476
SVM Tuned Confusion Matrix:
[[230 323]
 [164 549]]



In [10]:
# The tuned Logistic Regression model was selected as the main model for the demo.
# File the selected model is written to (relative to the working directory)
model_path = 'best_logistic_regression_model.pkl'

# Serialize the tuned Logistic Regression model into that file
with open(model_path, mode='wb') as model_file:
    pickle.dump(logistic_model_tuned, model_file)

print(f"Model saved successfully at {model_path}")

Model saved successfully at best_logistic_regression_model.pkl


In [13]:
# Persist the TF-IDF vectorizer that was fitted on the training statements at the
# top of the notebook. Saving that very instance (instead of fitting a second,
# copy-pasted vectorizer) guarantees the pickled vocabulary matches the features
# the saved models were trained on, and avoids repeating the 1- to 4-gram fit.
if not hasattr(tfidf_ngram, 'vocabulary_'):
    raise RuntimeError("tfidf_ngram is not fitted; run the TF-IDF cell before saving it.")

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_ngram, f)